From 00000d963882baaca5f42415d28ba26f86d5aa2b Mon Sep 17 00:00:00 2001 From: doms9 <96013514+doms9@users.noreply.github.com> Date: Fri, 23 Jan 2026 23:44:59 -0500 Subject: [PATCH] e - add tflix.py - remove strmd.py - modify playwright browser/context usage - misc. edits --- M3U8/fetch.py | 71 +++++----- M3U8/scrapers/cdnlivetv.py | 21 ++- M3U8/scrapers/embedhd.py | 21 ++- M3U8/scrapers/fawa.py | 2 +- M3U8/scrapers/istreameast.py | 2 +- M3U8/scrapers/pawa.py | 2 +- M3U8/scrapers/pixel.py | 25 ++-- M3U8/scrapers/ppv.py | 21 ++- M3U8/scrapers/roxie.py | 4 +- M3U8/scrapers/shark.py | 4 +- M3U8/scrapers/sport9.py | 19 +-- M3U8/scrapers/streambtw.py | 2 +- M3U8/scrapers/streamcenter.py | 21 ++- M3U8/scrapers/streamfree.py | 2 +- M3U8/scrapers/streamhub.py | 21 ++- M3U8/scrapers/streamsgate.py | 21 ++- M3U8/scrapers/strmd.py | 196 --------------------------- M3U8/scrapers/tflix.py | 234 +++++++++++++++++++++++++++++++++ M3U8/scrapers/totalsportek.py | 2 +- M3U8/scrapers/tvpass.py | 2 +- M3U8/scrapers/utils/caching.py | 4 +- M3U8/scrapers/utils/webwork.py | 195 +++++++++++++++------------ M3U8/scrapers/watchfooty.py | 30 ++--- M3U8/scrapers/webcast.py | 21 ++- 24 files changed, 481 insertions(+), 462 deletions(-) delete mode 100644 M3U8/scrapers/strmd.py create mode 100644 M3U8/scrapers/tflix.py diff --git a/M3U8/fetch.py b/M3U8/fetch.py index 602371d8..816cd5dd 100644 --- a/M3U8/fetch.py +++ b/M3U8/fetch.py @@ -3,6 +3,7 @@ import asyncio import re from pathlib import Path +from playwright.async_api import async_playwright from scrapers import ( cdnlivetv, embedhd, @@ -19,7 +20,7 @@ from scrapers import ( streamfree, streamhub, streamsgate, - strmd, + tflix, totalsportek, tvpass, watchfooty, @@ -53,31 +54,46 @@ async def main() -> None: base_m3u8, tvg_chno = load_base() - tasks = [ - asyncio.create_task(cdnlivetv.scrape()), - asyncio.create_task(embedhd.scrape()), - asyncio.create_task(fawa.scrape()), - asyncio.create_task(istreameast.scrape()), - asyncio.create_task(pawa.scrape()), - asyncio.create_task(pixel.scrape()), - asyncio.create_task(ppv.scrape()), - asyncio.create_task(roxie.scrape()), - asyncio.create_task(shark.scrape()), - asyncio.create_task(sport9.scrape()), - asyncio.create_task(streambtw.scrape()), - asyncio.create_task(streamcenter.scrape()), - asyncio.create_task(streamfree.scrape()), - asyncio.create_task(streamhub.scrape()), - asyncio.create_task(streamsgate.scrape()), - # asyncio.create_task(strmd.scrape()), - asyncio.create_task(totalsportek.scrape()), - asyncio.create_task(tvpass.scrape()), - asyncio.create_task(webcast.scrape()), - ] + async with async_playwright() as p: + try: + hdl_brwsr = await network.browser(p) - await asyncio.gather(*tasks) + xtrnl_brwsr = await network.browser(p, external=True) - await watchfooty.scrape() + pw_tasks = [ + asyncio.create_task(cdnlivetv.scrape(hdl_brwsr)), + asyncio.create_task(embedhd.scrape(hdl_brwsr)), + asyncio.create_task(pixel.scrape(hdl_brwsr)), + asyncio.create_task(ppv.scrape(xtrnl_brwsr)), + asyncio.create_task(sport9.scrape(xtrnl_brwsr)), + asyncio.create_task(streamcenter.scrape(xtrnl_brwsr)), + asyncio.create_task(streamhub.scrape(xtrnl_brwsr)), + asyncio.create_task(streamsgate.scrape(xtrnl_brwsr)), + asyncio.create_task(tflix.scrape(xtrnl_brwsr)), + asyncio.create_task(webcast.scrape(hdl_brwsr)), + asyncio.create_task(watchfooty.scrape(xtrnl_brwsr)), + ] + + httpx_tasks = [ + asyncio.create_task(fawa.scrape()), + asyncio.create_task(istreameast.scrape()), + asyncio.create_task(pawa.scrape()), + 
asyncio.create_task(roxie.scrape()), + asyncio.create_task(shark.scrape()), + asyncio.create_task(streambtw.scrape()), + asyncio.create_task(streamfree.scrape()), + asyncio.create_task(totalsportek.scrape()), + asyncio.create_task(tvpass.scrape()), + ] + + await asyncio.gather(*(pw_tasks + httpx_tasks)) + + finally: + await hdl_brwsr.close() + + await xtrnl_brwsr.close() + + await network.client.aclose() additions = ( cdnlivetv.urls @@ -95,7 +111,7 @@ async def main() -> None: | streamfree.urls | streamhub.urls | streamsgate.urls - | strmd.urls + | tflix.urls | totalsportek.urls | tvpass.urls | watchfooty.urls @@ -153,8 +169,3 @@ async def main() -> None: if __name__ == "__main__": asyncio.run(main()) - - try: - asyncio.run(network.client.aclose()) - except Exception: - pass diff --git a/M3U8/scrapers/cdnlivetv.py b/M3U8/scrapers/cdnlivetv.py index 8a8ef522..13f5446a 100644 --- a/M3U8/scrapers/cdnlivetv.py +++ b/M3U8/scrapers/cdnlivetv.py @@ -1,6 +1,6 @@ from functools import partial -from playwright.async_api import async_playwright +from playwright.async_api import BrowserContext from .utils import Cache, Time, get_logger, leagues, network @@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "CDNTV" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) -API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800) +API_FILE = Cache(f"{TAG}-api", exp=19_800) API_URL = "https://api.cdn-live.tv/api/v1/events/sports" @@ -85,7 +85,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: return events -async def scrape() -> None: +async def scrape(browser: BrowserContext) -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) @@ -101,16 +101,14 @@ async def scrape() -> None: log.info(f"Processing {len(events)} new URL(s)") if events: - async with async_playwright() as p: - browser, context = await network.browser(p) - - try: - for i, ev in enumerate(events, start=1): + async with network.event_context(browser) as context: + for i, ev in enumerate(events, start=1): + async with network.event_page(context) as page: handler = partial( network.process_event, url=ev["link"], url_num=i, - context=context, + page=page, log=log, ) @@ -144,9 +142,6 @@ async def scrape() -> None: urls[key] = cached_urls[key] = entry - finally: - await browser.close() - if new_count := len(cached_urls) - cached_count: log.info(f"Collected and cached {new_count} new event(s)") diff --git a/M3U8/scrapers/embedhd.py b/M3U8/scrapers/embedhd.py index edb68703..851af796 100644 --- a/M3U8/scrapers/embedhd.py +++ b/M3U8/scrapers/embedhd.py @@ -1,6 +1,6 @@ from functools import partial -from playwright.async_api import async_playwright +from playwright.async_api import BrowserContext from .utils import Cache, Time, get_logger, leagues, network @@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "EMBEDHD" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=5_400) +CACHE_FILE = Cache(TAG, exp=5_400) -API_CACHE = Cache(f"{TAG.lower()}-api.json", exp=28_800) +API_CACHE = Cache(f"{TAG}-api", exp=28_800) BASE_URL = "https://embedhd.org/api-event.php" @@ -75,7 +75,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: return events -async def scrape() -> None: +async def scrape(browser: BrowserContext) -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) @@ -91,16 +91,14 @@ async def scrape() -> None: log.info(f"Processing {len(events)} new URL(s)") if events: - async with async_playwright() as p: - 
browser, context = await network.browser(p) - - try: - for i, ev in enumerate(events, start=1): + async with network.event_context(browser) as context: + for i, ev in enumerate(events, start=1): + async with network.event_page(context) as page: handler = partial( network.process_event, url=ev["link"], url_num=i, - context=context, + page=page, log=log, ) @@ -134,9 +132,6 @@ async def scrape() -> None: urls[key] = cached_urls[key] = entry - finally: - await browser.close() - if new_count := len(cached_urls) - cached_count: log.info(f"Collected and cached {new_count} new event(s)") diff --git a/M3U8/scrapers/fawa.py b/M3U8/scrapers/fawa.py index aec9edd8..c454c234 100644 --- a/M3U8/scrapers/fawa.py +++ b/M3U8/scrapers/fawa.py @@ -12,7 +12,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "FAWA" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) BASE_URL = "http://www.fawanews.sc/" diff --git a/M3U8/scrapers/istreameast.py b/M3U8/scrapers/istreameast.py index 3d5cf511..fe7a089c 100644 --- a/M3U8/scrapers/istreameast.py +++ b/M3U8/scrapers/istreameast.py @@ -12,7 +12,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "iSTRMEAST" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) BASE_URL = "https://istreameast.app" diff --git a/M3U8/scrapers/pawa.py b/M3U8/scrapers/pawa.py index 974df149..51d6f333 100644 --- a/M3U8/scrapers/pawa.py +++ b/M3U8/scrapers/pawa.py @@ -13,7 +13,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "PAWA" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) BASE_URL = "https://pawastreams.net/feed" diff --git a/M3U8/scrapers/pixel.py b/M3U8/scrapers/pixel.py index 7a8fa87f..44409628 100644 --- a/M3U8/scrapers/pixel.py +++ b/M3U8/scrapers/pixel.py @@ -1,7 +1,7 @@ import json from functools import partial -from playwright.async_api import BrowserContext, async_playwright +from playwright.async_api import BrowserContext, Page from .utils import Cache, Time, get_logger, leagues, network @@ -11,15 +11,13 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "PIXEL" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=19_800) +CACHE_FILE = Cache(TAG, exp=19_800) BASE_URL = "https://pixelsport.tv/backend/livetv/events" -async def get_api_data(context: BrowserContext) -> dict[str, list[dict, str, str]]: +async def get_api_data(page: Page) -> dict[str, list[dict, str, str]]: try: - page = await context.new_page() - await page.goto( BASE_URL, wait_until="domcontentloaded", @@ -35,10 +33,10 @@ async def get_api_data(context: BrowserContext) -> dict[str, list[dict, str, str return json.loads(raw_json) -async def get_events(context: BrowserContext) -> dict[str, dict[str, str | float]]: +async def get_events(page: Page) -> dict[str, dict[str, str | float]]: now = Time.clean(Time.now()) - api_data = await get_api_data(context) + api_data = await get_api_data(page) events = {} @@ -75,7 +73,7 @@ async def get_events(context: BrowserContext) -> dict[str, dict[str, str | float return events -async def scrape() -> None: +async def scrape(browser: BrowserContext) -> None: if cached := CACHE_FILE.load(): urls.update(cached) @@ -85,11 +83,9 @@ async def scrape() -> None: log.info(f'Scraping from "{BASE_URL}"') - async with async_playwright() as p: - browser, context = await network.browser(p) - - try: - handler = partial(get_events, context=context) + async with network.event_context(browser) as context: + async with network.event_page(context) as page: + handler 
= partial(get_events, page=page) events = await network.safe_process( handler, @@ -98,9 +94,6 @@ async def scrape() -> None: log=log, ) - finally: - await browser.close() - urls.update(events or {}) CACHE_FILE.write(urls) diff --git a/M3U8/scrapers/ppv.py b/M3U8/scrapers/ppv.py index 5f7b301b..613bbfde 100644 --- a/M3U8/scrapers/ppv.py +++ b/M3U8/scrapers/ppv.py @@ -1,6 +1,6 @@ from functools import partial -from playwright.async_api import async_playwright +from playwright.async_api import BrowserContext from .utils import Cache, Time, get_logger, leagues, network @@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "PPV" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) -API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800) +API_FILE = Cache(f"{TAG}-api", exp=19_800) MIRRORS = [ "https://old.ppv.to/api/streams", @@ -78,7 +78,7 @@ async def get_events(url: str, cached_keys: list[str]) -> list[dict[str, str]]: return events -async def scrape() -> None: +async def scrape(browser: BrowserContext) -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) @@ -101,16 +101,14 @@ async def scrape() -> None: log.info(f"Processing {len(events)} new URL(s)") if events: - async with async_playwright() as p: - browser, context = await network.browser(p, browser="external") - - try: - for i, ev in enumerate(events, start=1): + async with network.event_context(browser) as context: + for i, ev in enumerate(events, start=1): + async with network.event_page(context) as page: handler = partial( network.process_event, url=ev["link"], url_num=i, - context=context, + page=page, timeout=6, log=log, ) @@ -146,9 +144,6 @@ async def scrape() -> None: urls[key] = cached_urls[key] = entry - finally: - await browser.close() - if new_count := len(cached_urls) - cached_count: log.info(f"Collected and cached {new_count} new event(s)") diff --git a/M3U8/scrapers/roxie.py b/M3U8/scrapers/roxie.py index c7cf6efb..07b72d25 100644 --- a/M3U8/scrapers/roxie.py +++ b/M3U8/scrapers/roxie.py @@ -13,9 +13,9 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "ROXIE" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) -HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=19_800) +HTML_CACHE = Cache(f"{TAG}-html", exp=19_800) BASE_URL = "https://roxiestreams.live" diff --git a/M3U8/scrapers/shark.py b/M3U8/scrapers/shark.py index bc42cc32..f2a50a5d 100644 --- a/M3U8/scrapers/shark.py +++ b/M3U8/scrapers/shark.py @@ -11,9 +11,9 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "SHARK" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) -HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=19_800) +HTML_CACHE = Cache(f"{TAG}-html", exp=19_800) BASE_URL = "https://sharkstreams.net" diff --git a/M3U8/scrapers/sport9.py b/M3U8/scrapers/sport9.py index 418b4b30..c1bc70d9 100644 --- a/M3U8/scrapers/sport9.py +++ b/M3U8/scrapers/sport9.py @@ -2,7 +2,7 @@ import asyncio from functools import partial from urllib.parse import urljoin -from playwright.async_api import async_playwright +from playwright.async_api import BrowserContext from selectolax.parser import HTMLParser from .utils import Cache, Time, get_logger, leagues, network @@ -13,7 +13,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "SPORT9" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=5_400) +CACHE_FILE = Cache(TAG, exp=5_400) BASE_URL = "https://sport9.ru/" @@ -88,7 +88,7 @@ async def get_events(cached_keys: 
list[str]) -> list[dict[str, str]]: return events -async def scrape() -> None: +async def scrape(browser: BrowserContext) -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) @@ -106,16 +106,14 @@ async def scrape() -> None: if events: now = Time.clean(Time.now()).timestamp() - async with async_playwright() as p: - browser, context = await network.browser(p, browser="external") - - try: - for i, ev in enumerate(events, start=1): + async with network.event_context(browser) as context: + for i, ev in enumerate(events, start=1): + async with network.event_page(context) as page: handler = partial( network.process_event, url=ev["link"], url_num=i, - context=context, + page=page, log=log, ) @@ -148,9 +146,6 @@ async def scrape() -> None: urls[key] = cached_urls[key] = entry - finally: - await browser.close() - if new_count := len(cached_urls) - cached_count: log.info(f"Collected and cached {new_count} new event(s)") diff --git a/M3U8/scrapers/streambtw.py b/M3U8/scrapers/streambtw.py index 1f4f04ff..5af7abd6 100644 --- a/M3U8/scrapers/streambtw.py +++ b/M3U8/scrapers/streambtw.py @@ -13,7 +13,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "STRMBTW" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=3_600) +CACHE_FILE = Cache(TAG, exp=3_600) BASE_URLS = ["https://hiteasport.info/", "https://streambtw.com/"] diff --git a/M3U8/scrapers/streamcenter.py b/M3U8/scrapers/streamcenter.py index d2c5fb47..fa66502d 100644 --- a/M3U8/scrapers/streamcenter.py +++ b/M3U8/scrapers/streamcenter.py @@ -1,6 +1,6 @@ from functools import partial -from playwright.async_api import async_playwright +from playwright.async_api import BrowserContext from .utils import Cache, Time, get_logger, leagues, network @@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "STRMCNTR" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) -API_FILE = Cache(f"{TAG.lower()}-api.json", exp=28_800) +API_FILE = Cache(f"{TAG}-api", exp=28_800) BASE_URL = "https://backend.streamcenter.live/api/Parties" @@ -90,7 +90,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: return events -async def scrape() -> None: +async def scrape(browser: BrowserContext) -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) @@ -106,16 +106,14 @@ async def scrape() -> None: log.info(f"Processing {len(events)} new URL(s)") if events: - async with async_playwright() as p: - browser, context = await network.browser(p, browser="external") - - try: - for i, ev in enumerate(events, start=1): + async with network.event_context(browser) as context: + for i, ev in enumerate(events, start=1): + async with network.event_page(context) as page: handler = partial( network.process_event, url=ev["link"], url_num=i, - context=context, + page=page, log=log, ) @@ -149,9 +147,6 @@ async def scrape() -> None: urls[key] = cached_urls[key] = entry - finally: - await browser.close() - if new_count := len(cached_urls) - cached_count: log.info(f"Collected and cached {new_count} new event(s)") diff --git a/M3U8/scrapers/streamfree.py b/M3U8/scrapers/streamfree.py index ee9a1c2e..966eb4f1 100644 --- a/M3U8/scrapers/streamfree.py +++ b/M3U8/scrapers/streamfree.py @@ -8,7 +8,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "STRMFREE" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=19_800) +CACHE_FILE = Cache(TAG, exp=19_800) BASE_URL = "https://streamfree.to/" diff --git a/M3U8/scrapers/streamhub.py b/M3U8/scrapers/streamhub.py index 4a47006e..9e858d19 100644 --- 
a/M3U8/scrapers/streamhub.py +++ b/M3U8/scrapers/streamhub.py @@ -2,7 +2,7 @@ import asyncio from functools import partial from urllib.parse import urljoin -from playwright.async_api import async_playwright +from playwright.async_api import BrowserContext from selectolax.parser import HTMLParser from .utils import Cache, Time, get_logger, leagues, network @@ -13,9 +13,9 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "STRMHUB" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) -HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=28_800) +HTML_CACHE = Cache(f"{TAG}-html", exp=28_800) BASE_URL = "https://streamhub.pro/" @@ -132,7 +132,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: return live -async def scrape() -> None: +async def scrape(browser: BrowserContext) -> None: cached_urls = CACHE_FILE.load() valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} @@ -150,16 +150,14 @@ async def scrape() -> None: log.info(f"Processing {len(events)} new URL(s)") if events: - async with async_playwright() as p: - browser, context = await network.browser(p, browser="external") - - try: - for i, ev in enumerate(events, start=1): + async with network.event_context(browser) as context: + for i, ev in enumerate(events, start=1): + async with network.event_page(context) as page: handler = partial( network.process_event, url=ev["link"], url_num=i, - context=context, + page=page, timeout=5, log=log, ) @@ -199,9 +197,6 @@ async def scrape() -> None: urls[key] = entry - finally: - await browser.close() - if new_count := valid_count - cached_count: log.info(f"Collected and cached {new_count} new event(s)") diff --git a/M3U8/scrapers/streamsgate.py b/M3U8/scrapers/streamsgate.py index ee07b36b..f31d8820 100644 --- a/M3U8/scrapers/streamsgate.py +++ b/M3U8/scrapers/streamsgate.py @@ -4,7 +4,7 @@ from itertools import chain from typing import Any from urllib.parse import urljoin -from playwright.async_api import async_playwright +from playwright.async_api import BrowserContext from .utils import Cache, Time, get_logger, leagues, network @@ -14,9 +14,9 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "STRMSGATE" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) -API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800) +API_FILE = Cache(f"{TAG}-api", exp=19_800) BASE_URL = "https://streamingon.org" @@ -120,7 +120,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: return events -async def scrape() -> None: +async def scrape(browser: BrowserContext) -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) @@ -136,16 +136,14 @@ async def scrape() -> None: log.info(f"Processing {len(events)} new URL(s)") if events: - async with async_playwright() as p: - browser, context = await network.browser(p, browser="external") - - try: - for i, ev in enumerate(events, start=1): + async with network.event_context(browser) as context: + for i, ev in enumerate(events, start=1): + async with network.event_page(context) as page: handler = partial( network.process_event, url=ev["link"], url_num=i, - context=context, + page=page, log=log, ) @@ -179,9 +177,6 @@ async def scrape() -> None: urls[key] = cached_urls[key] = entry - finally: - await browser.close() - if new_count := len(cached_urls) - cached_count: log.info(f"Collected and cached {new_count} new event(s)") diff --git a/M3U8/scrapers/strmd.py b/M3U8/scrapers/strmd.py deleted file mode 100644 index 
8b892998..00000000 --- a/M3U8/scrapers/strmd.py +++ /dev/null @@ -1,196 +0,0 @@ -import re -from functools import partial -from urllib.parse import urljoin - -from playwright.async_api import async_playwright - -from .utils import Cache, Time, get_logger, leagues, network - -log = get_logger(__name__) - -urls: dict[str, dict[str, str | float]] = {} - -TAG = "STRMD" - -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) - -API_FILE = Cache(f"{TAG.lower()}-api.json", exp=28_800) - -MIRRORS = [ - "https://streami.su", - # "https://streamed.st", - "https://streamed.pk", -] - - -def fix_sport(s: str) -> str: - if "-" in s: - return " ".join(i.capitalize() for i in s.split("-")) - - elif s == "fight": - return "Fight (UFC/Boxing)" - - return s.capitalize() if len(s) >= 4 else s.upper() - - -async def get_events(url: str, cached_keys: list[str]) -> list[dict[str, str]]: - now = Time.clean(Time.now()) - - if not (api_data := API_FILE.load(per_entry=False, index=-1)): - log.info("Refreshing API cache") - - api_data = [{"timestamp": now.timestamp()}] - - if r := await network.request( - urljoin(url, "api/matches/all-today"), - log=log, - ): - api_data: list[dict] = r.json() - - api_data[-1]["timestamp"] = now.timestamp() - - API_FILE.write(api_data) - - events = [] - - pattern = re.compile(r"[\n\r]+|\s{2,}") - - start_dt = now.delta(minutes=-30) - end_dt = now.delta(minutes=30) - - for event in api_data: - if (category := event.get("category")) == "other": - continue - - if not (ts := event["date"]): - continue - - start_ts = float(f"{ts}"[:-3]) - - event_dt = Time.from_ts(start_ts) - - if not start_dt <= event_dt <= end_dt: - continue - - sport = fix_sport(category) - - parts = pattern.split(event["title"].strip()) - - name = " | ".join(p.strip() for p in parts if p.strip()) - - logo = urljoin(url, poster) if (poster := event.get("poster")) else None - - if f"[{sport}] {name} ({TAG})" in cached_keys: - continue - - sources: list[dict[str, str]] = event["sources"] - - if not sources: - continue - - skip_types = ["alpha", "bravo"] - - valid_sources = [d for d in sources if d.get("source") not in skip_types] - - if not valid_sources: - continue - - srce = valid_sources[0] - - source_type = srce.get("source") - - stream_id = srce.get("id") - - if not (source_type and stream_id): - continue - - events.append( - { - "sport": sport, - "event": name, - "link": f"https://embedsports.top/embed/{source_type}/{stream_id}/1", - "logo": logo, - "timestamp": event_dt.timestamp(), - } - ) - - return events - - -async def scrape() -> None: - cached_urls = CACHE_FILE.load() - - cached_count = len(cached_urls) - - urls.update(cached_urls) - - log.info(f"Loaded {cached_count} event(s) from cache") - - if not (base_url := await network.get_base(MIRRORS)): - log.warning("No working STRMD mirrors") - - CACHE_FILE.write(cached_urls) - - return - - log.info(f'Scraping from "{base_url}"') - - events = await get_events(base_url, cached_urls.keys()) - - log.info(f"Processing {len(events)} new URL(s)") - - if events: - async with async_playwright() as p: - browser, context = await network.browser(p, browser="external") - - try: - for i, ev in enumerate(events, start=1): - handler = partial( - network.process_event, - url=ev["link"], - url_num=i, - context=context, - log=log, - ) - - url = await network.safe_process( - handler, - url_num=i, - semaphore=network.PW_S, - log=log, - ) - - if url: - sport, event, logo, ts, link = ( - ev["sport"], - ev["event"], - ev["logo"], - ev["timestamp"], - ev["link"], - ) - - key = 
f"[{sport}] {event} ({TAG})" - - tvg_id, pic = leagues.get_tvg_info(sport, event) - - entry = { - "url": url, - "logo": logo or pic, - "base": "https://embedsports.top/", - "timestamp": ts, - "id": tvg_id or "Live.Event.us", - "link": link, - } - - urls[key] = cached_urls[key] = entry - - finally: - await browser.close() - - if new_count := len(cached_urls) - cached_count: - log.info(f"Collected and cached {new_count} new event(s)") - - else: - log.info("No new events found") - - CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/tflix.py b/M3U8/scrapers/tflix.py new file mode 100644 index 00000000..c344e10e --- /dev/null +++ b/M3U8/scrapers/tflix.py @@ -0,0 +1,234 @@ +import asyncio +from functools import partial +from urllib.parse import urljoin + +import feedparser +from playwright.async_api import BrowserContext, Error, Page, TimeoutError + +from .utils import Cache, Time, get_logger, leagues, network + +log = get_logger(__name__) + +urls: dict[str, dict[str, str | float]] = {} + +TAG = "TFLIX" + +CACHE_FILE = Cache(TAG, exp=28_800) + +BASE_URL = "https://tv.tflix.app/" + +SPORT_ENDPOINTS = ["football", "nba", "nfl", "nhl"] + + +async def process_event( + url: str, + url_num: int, + page: Page, +) -> tuple[str | None, str | None]: + try: + await page.goto( + url, + wait_until="domcontentloaded", + timeout=15_000, + ) + + try: + iframe = await page.wait_for_selector( + "iframe.metaframe.rptss", + timeout=3_500, + ) + except TimeoutError: + log.warning(f"URL {url_num}) No iframe element.") + + return None, None + + if (old_src := await iframe.get_attribute("src")) and old_src.startswith( + "https://kloxmkhs.site/stream" + ): + new_src = old_src + + else: + try: + option = await page.wait_for_selector( + 'li.dooplay_player_option >> span.title:has-text("TFLIX HD - iOS")', + timeout=3_000, + ) + + await option.scroll_into_view_if_needed() + + await option.evaluate("el => el.click()") + + await page.wait_for_function( + """ + (oldSrc) => { + const iframe = document.querySelector('iframe.metaframe.rptss'); + return iframe && iframe.src && iframe.src !== oldSrc; + }; + """, + arg=old_src, + timeout=5_000, + ) + + iframe_2 = await page.wait_for_selector("iframe.metaframe.rptss") + + if not iframe_2 or not (new_src := await iframe_2.get_attribute("src")): + log.warning(f"URL {url_num}) No iframe source.") + + return None, None + except TimeoutError: + log.warning(f"URL {url_num}) No valid TFLIX source.") + + return None, None + + try: + await page.goto( + new_src, + wait_until="domcontentloaded", + timeout=10_000, + referer=url, + ) + except Error: + log.warning( + f"URL {url_num}) HTTP 403/404 error while redirecting to iframe source." 
+ ) + + return None, None + + try: + play_btn = await page.wait_for_selector( + 'button[data-url][onclick*="startPlcb"]', + timeout=5_000, + ) + except TimeoutError: + log.warning(f"URL {url_num}) No play button found.") + + return None, None + + if not (data_url := await play_btn.get_attribute("data-url")): + log.warning(f"URL {url_num}) No PBID found.") + + return None, None + + log.info(f"URL {url_num}) Captured M3U8") + + return ( + f"https://kloxmkhs.site/stream/stream.m3u8?id={data_url}&format=.m3u8", + new_src, + ) + + except Exception as e: + log.warning(f"URL {url_num}) Exception while processing: {e}") + + return None, None + + +async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: + tasks = [ + network.request(urljoin(BASE_URL, f"genre/{sport}/feed"), log=log) + for sport in SPORT_ENDPOINTS + ] + + results = await asyncio.gather(*tasks) + + events = [] + + if not (feeds := [feedparser.parse(html.content) for html in results if html]): + return events + + for feed in feeds: + title: str = feed["feed"]["title"] + + sport = title.split("Archives")[0].strip() + + for entry in feed.entries: + if not (link := entry.get("link")): + continue + + if not (title := entry.get("title")): + continue + + if f"[{sport}] {title} ({TAG})" in cached_keys: + continue + + events.append( + { + "sport": sport, + "event": title, + "link": link, + } + ) + + return events + + +async def scrape(browser: BrowserContext) -> None: + cached_urls = CACHE_FILE.load() + + valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} + + valid_count = cached_count = len(cached_urls) + + urls.update(valid_urls) + + log.info(f"Loaded {cached_count} event(s) from cache") + + log.info(f'Scraping from "{BASE_URL}"') + + events = await get_events(cached_urls.keys()) + + log.info(f"Processing {len(events)} new URL(s)") + + if events: + now = Time.clean(Time.now()).timestamp() + + async with network.event_context(browser) as context: + for i, ev in enumerate(events, start=1): + async with network.event_page(context) as page: + handler = partial( + process_event, + url=ev["link"], + url_num=i, + page=page, + ) + + url, iframe = await network.safe_process( + handler, + url_num=i, + semaphore=network.PW_S, + log=log, + timeout=20, + ) + + sport, event, link = ( + ev["sport"], + ev["event"], + ev["link"], + ) + + key = f"[{sport}] {event} ({TAG})" + + tvg_id, logo = leagues.get_tvg_info(sport, event) + + entry = { + "url": url, + "logo": logo, + "base": iframe, + "timestamp": now, + "id": tvg_id or "Live.Event.us", + "link": link, + } + + cached_urls[key] = entry + + if url: + valid_count += 1 + + urls[key] = entry + + if new_count := valid_count - cached_count: + log.info(f"Collected and cached {new_count} new event(s)") + + else: + log.info("No new events found") + + CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/totalsportek.py b/M3U8/scrapers/totalsportek.py index c866f3dc..fecb29d0 100644 --- a/M3U8/scrapers/totalsportek.py +++ b/M3U8/scrapers/totalsportek.py @@ -12,7 +12,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "TOTALSPRTK" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=28_800) +CACHE_FILE = Cache(TAG, exp=28_800) MIRRORS = [ { diff --git a/M3U8/scrapers/tvpass.py b/M3U8/scrapers/tvpass.py index f99db2ed..390c557f 100644 --- a/M3U8/scrapers/tvpass.py +++ b/M3U8/scrapers/tvpass.py @@ -8,7 +8,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "TVPASS" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=86_400) +CACHE_FILE = Cache(TAG, exp=86_400) BASE_URL = 
"https://tvpass.org/playlist/m3u" diff --git a/M3U8/scrapers/utils/caching.py b/M3U8/scrapers/utils/caching.py index d9a47567..3a86f5db 100644 --- a/M3U8/scrapers/utils/caching.py +++ b/M3U8/scrapers/utils/caching.py @@ -7,8 +7,8 @@ from .config import Time class Cache: now_ts: float = Time.now().timestamp() - def __init__(self, file: str, exp: int | float) -> None: - self.file = Path(__file__).parent.parent / "caches" / file + def __init__(self, filename: str, exp: int | float) -> None: + self.file = Path(__file__).parent.parent / "caches" / f"{filename.lower()}.json" self.exp = exp diff --git a/M3U8/scrapers/utils/webwork.py b/M3U8/scrapers/utils/webwork.py index d499972d..19d3fdc0 100644 --- a/M3U8/scrapers/utils/webwork.py +++ b/M3U8/scrapers/utils/webwork.py @@ -3,12 +3,13 @@ import logging import random import re from collections.abc import Awaitable, Callable +from contextlib import asynccontextmanager from functools import partial -from typing import TypeVar +from typing import AsyncGenerator, TypeVar from urllib.parse import urlencode, urljoin import httpx -from playwright.async_api import Browser, BrowserContext, Playwright, Request +from playwright.async_api import Browser, BrowserContext, Page, Playwright, Request from .logger import get_logger @@ -123,6 +124,112 @@ class Network: return + @staticmethod + @asynccontextmanager + async def event_context( + browser: Browser, + stealth: bool = True, + ) -> AsyncGenerator[BrowserContext, None]: + context: BrowserContext | None = None + + try: + context = await browser.new_context( + user_agent=Network.UA if stealth else None, + viewport={"width": 1366, "height": 768}, + device_scale_factor=1, + locale="en-US", + timezone_id="America/New_York", + color_scheme="dark", + permissions=["geolocation"], + extra_http_headers=( + { + "Accept-Language": "en-US,en;q=0.9", + "Upgrade-Insecure-Requests": "1", + } + if stealth + else None + ), + ) + + if stealth: + await context.add_init_script(""" + Object.defineProperty(navigator, "webdriver", { get: () => undefined }); + + Object.defineProperty(navigator, "languages", { + get: () => ["en-US", "en"], + }); + + Object.defineProperty(navigator, "plugins", { + get: () => [1, 2, 3, 4], + }); + + const elementDescriptor = Object.getOwnPropertyDescriptor( + HTMLElement.prototype, + "offsetHeight" + ); + + Object.defineProperty(HTMLDivElement.prototype, "offsetHeight", { + ...elementDescriptor, + get: function () { + if (this.id === "modernizr") { + return 24; + } + return elementDescriptor.get.apply(this); + }, + }); + + Object.defineProperty(window.screen, "width", { get: () => 1366 }); + Object.defineProperty(window.screen, "height", { get: () => 768 }); + + const getParameter = WebGLRenderingContext.prototype.getParameter; + + WebGLRenderingContext.prototype.getParameter = function (param) { + if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL + if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL + return getParameter.apply(this, [param]); + }; + + const observer = new MutationObserver((mutations) => { + mutations.forEach((mutation) => { + mutation.addedNodes.forEach((node) => { + if (node.tagName === "IFRAME" && node.hasAttribute("sandbox")) { + node.removeAttribute("sandbox"); + } + }); + }); + }); + + observer.observe(document.documentElement, { childList: true, subtree: true }); + """) + + else: + context = await browser.new_context() + + yield context + + finally: + if context: + await context.close() + + @staticmethod + @asynccontextmanager + 
async def event_page(context: BrowserContext) -> AsyncGenerator[Page, None]: + page = await context.new_page() + + try: + yield page + + finally: + await page.close() + + @staticmethod + async def browser(playwright: Playwright, external: bool = False) -> Browser: + return ( + await playwright.chromium.connect_over_cdp("http://localhost:9222") + if external + else await playwright.firefox.launch(headless=True) + ) + @staticmethod def capture_req( req: Request, @@ -147,15 +254,13 @@ class Network: self, url: str, url_num: int, - context: BrowserContext, + page: Page, timeout: int | float = 10, log: logging.Logger | None = None, ) -> str | None: log = log or logger - page = await context.new_page() - captured: list[str] = [] got_one = asyncio.Event() @@ -212,86 +317,6 @@ class Network: await page.close() - @staticmethod - async def browser( - playwright: Playwright, browser: str = "internal" - ) -> tuple[Browser, BrowserContext]: - if browser == "external": - brwsr = await playwright.chromium.connect_over_cdp("http://localhost:9222") - - context = brwsr.contexts[0] - - else: - brwsr = await playwright.firefox.launch(headless=True) - - context = await brwsr.new_context( - user_agent=Network.UA, - ignore_https_errors=False, - viewport={"width": 1366, "height": 768}, - device_scale_factor=1, - locale="en-US", - timezone_id="America/New_York", - color_scheme="dark", - permissions=["geolocation"], - extra_http_headers={ - "Accept-Language": "en-US,en;q=0.9", - "Upgrade-Insecure-Requests": "1", - }, - ) - - await context.add_init_script(""" - Object.defineProperty(navigator, "webdriver", { get: () => undefined }); - - Object.defineProperty(navigator, "languages", { - get: () => ["en-US", "en"], - }); - - Object.defineProperty(navigator, "plugins", { - get: () => [1, 2, 3, 4], - }); - - const elementDescriptor = Object.getOwnPropertyDescriptor( - HTMLElement.prototype, - "offsetHeight" - ); - - Object.defineProperty(HTMLDivElement.prototype, "offsetHeight", { - ...elementDescriptor, - get: function () { - if (this.id === "modernizr") { - return 24; - } - return elementDescriptor.get.apply(this); - }, - }); - - Object.defineProperty(window.screen, "width", { get: () => 1366 }); - Object.defineProperty(window.screen, "height", { get: () => 768 }); - - const getParameter = WebGLRenderingContext.prototype.getParameter; - - WebGLRenderingContext.prototype.getParameter = function (param) { - if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL - if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL - return getParameter.apply(this, [param]); - }; - - const observer = new MutationObserver((mutations) => { - mutations.forEach((mutation) => { - mutation.addedNodes.forEach((node) => { - if (node.tagName === "IFRAME" && node.hasAttribute("sandbox")) { - node.removeAttribute("sandbox"); - } - }); - }); - }); - - observer.observe(document.documentElement, { childList: true, subtree: true }); - - """) - - return brwsr, context - network = Network() diff --git a/M3U8/scrapers/watchfooty.py b/M3U8/scrapers/watchfooty.py index 61c71ec5..e9dba8f1 100644 --- a/M3U8/scrapers/watchfooty.py +++ b/M3U8/scrapers/watchfooty.py @@ -5,7 +5,7 @@ from itertools import chain from typing import Any from urllib.parse import urljoin -from playwright.async_api import BrowserContext, async_playwright +from playwright.async_api import BrowserContext, Page, TimeoutError from .utils import Cache, Time, get_logger, leagues, network @@ -15,9 +15,9 @@ urls: dict[str, dict[str, str | float]] = {} 
TAG = "WATCHFTY" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) -API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800) +API_FILE = Cache(f"{TAG}-api.json", exp=19_800) API_URL = "https://api.watchfooty.st" @@ -73,7 +73,7 @@ async def refresh_api_cache(now: Time) -> list[dict[str, Any]]: async def process_event( url: str, url_num: int, - context: BrowserContext, + page: Page, ) -> tuple[str | None, str | None]: pattern = re.compile(r"\((\d+)\)") @@ -82,8 +82,6 @@ async def process_event( got_one = asyncio.Event() - page = await context.new_page() - handler = partial( network.capture_req, captured=captured, @@ -117,7 +115,8 @@ async def process_event( try: first_available = await page.wait_for_selector( - 'a[href*="/stream/"]', timeout=3_000 + 'a[href*="/stream/"]', + timeout=3_000, ) except TimeoutError: log.warning(f"URL {url_num}) No available stream links.") @@ -176,8 +175,6 @@ async def process_event( finally: page.remove_listener("request", handler) - await page.close() - async def get_events(base_url: str, cached_keys: list[str]) -> list[dict[str, str]]: now = Time.clean(Time.now()) @@ -235,7 +232,7 @@ async def get_events(base_url: str, cached_keys: list[str]) -> list[dict[str, st return events -async def scrape() -> None: +async def scrape(browser: BrowserContext) -> None: cached_urls = CACHE_FILE.load() valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} @@ -260,16 +257,14 @@ async def scrape() -> None: log.info(f"Processing {len(events)} new URL(s)") if events: - async with async_playwright() as p: - browser, context = await network.browser(p, browser="external") - - try: - for i, ev in enumerate(events, start=1): + async with network.event_context(browser) as context: + for i, ev in enumerate(events, start=1): + async with network.event_page(context) as page: handler = partial( process_event, url=ev["link"], url_num=i, - context=context, + page=page, ) url, iframe = await network.safe_process( @@ -307,9 +302,6 @@ async def scrape() -> None: urls[key] = entry - finally: - await browser.close() - if new_count := valid_count - cached_count: log.info(f"Collected and cached {new_count} new event(s)") diff --git a/M3U8/scrapers/webcast.py b/M3U8/scrapers/webcast.py index cc3f7f56..71de8d5f 100644 --- a/M3U8/scrapers/webcast.py +++ b/M3U8/scrapers/webcast.py @@ -1,7 +1,7 @@ import asyncio from functools import partial -from playwright.async_api import async_playwright +from playwright.async_api import BrowserContext from selectolax.parser import HTMLParser from .utils import Cache, Time, get_logger, leagues, network @@ -12,9 +12,9 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "WEBCAST" -CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) +CACHE_FILE = Cache(TAG, exp=10_800) -HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=86_400) +HTML_CACHE = Cache(f"{TAG}-html", exp=86_400) BASE_URLS = {"NFL": "https://nflwebcast.com", "NHL": "https://slapstreams.com"} @@ -110,7 +110,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: return live -async def scrape() -> None: +async def scrape(browser: BrowserContext) -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) @@ -126,16 +126,14 @@ async def scrape() -> None: log.info(f"Processing {len(events)} new URL(s)") if events: - async with async_playwright() as p: - browser, context = await network.browser(p) - - try: - for i, ev in enumerate(events, start=1): + async with network.event_context(browser) as context: + for i, ev in 
enumerate(events, start=1): + async with network.event_page(context) as page: handler = partial( network.process_event, url=ev["link"], url_num=i, - context=context, + page=page, log=log, ) @@ -169,9 +167,6 @@ async def scrape() -> None: urls[key] = cached_urls[key] = entry - finally: - await browser.close() - if new_count := len(cached_urls) - cached_count: log.info(f"Collected and cached {new_count} new event(s)")
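
A minimal sketch (not part of the patch) of the browser/context/page ownership the converted scrapers follow after this change: fetch.py holds the Playwright lifetime and the shared browsers, each scraper receives a Browser, opens one context per run via network.event_context, and one short-lived page per event via network.event_page. "example_scraper", its placeholder event list, and the example URL are hypothetical, and the import path from scrapers.utils is assumed from the repo layout.

# Minimal sketch only -- not part of the patch. Shows the ownership model the
# converted scrapers follow: the caller owns the Playwright lifetime and the
# shared Browser, the scraper opens one context per run and one page per event.
# "example_scraper", its event list, and the URL are hypothetical; import paths
# are assumed to match the repo layout (M3U8/scrapers/utils).
import asyncio
from functools import partial

from playwright.async_api import Browser, async_playwright

from scrapers.utils import get_logger, network

log = get_logger(__name__)


async def example_scraper(browser: Browser) -> None:
    # Real scrapers build this list from their own APIs/feeds.
    events = [{"link": "https://example.com/event/1"}]

    async with network.event_context(browser) as context:    # stealth context per run
        for i, ev in enumerate(events, start=1):
            async with network.event_page(context) as page:  # short-lived page per event
                handler = partial(
                    network.process_event,
                    url=ev["link"],
                    url_num=i,
                    page=page,
                    log=log,
                )

                url = await network.safe_process(
                    handler,
                    url_num=i,
                    semaphore=network.PW_S,
                    log=log,
                )

                if url:
                    log.info(f"URL {i}) {url}")


async def main() -> None:
    async with async_playwright() as p:
        # external=True would instead connect over CDP to http://localhost:9222
        browser = await network.browser(p)
        try:
            await example_scraper(browser)
        finally:
            await browser.close()
            await network.client.aclose()


if __name__ == "__main__":
    asyncio.run(main())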
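
Likewise, a two-line sketch of the Cache naming convention introduced in utils/caching.py, using tag values that appear in this patch (ppv.py); the import path is again assumed from the repo layout.

# Illustrative only: Cache now takes a tag, lowercases it, and appends ".json"
# itself, so callers no longer build the filename by hand.
from scrapers.utils import Cache  # import path assumed from the repo layout

CACHE_FILE = Cache("PPV", exp=10_800)      # resolves to M3U8/scrapers/caches/ppv.json
API_FILE = Cache("PPV-api", exp=19_800)    # resolves to M3U8/scrapers/caches/ppv-api.json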