From 00000d98552f8aaac73d4877b5151850efbd14d4 Mon Sep 17 00:00:00 2001
From: doms9 <96013514+doms9@users.noreply.github.com>
Date: Wed, 3 Sep 2025 00:00:22 -0400
Subject: [PATCH] Cache livetvsx events; run vanilla_fetch concurrently

---
 M3U8/fetch.py           |   9 +--
 M3U8/scrape/ace.py      |   2 +-
 M3U8/scrape/fstv.py     |   5 +-
 M3U8/scrape/livetvsx.py | 120 ++++++++++++++++++++++++++++------------
 M3U8/scrape/tvpass.py   |   4 +-
 5 files changed, 98 insertions(+), 42 deletions(-)

diff --git a/M3U8/fetch.py b/M3U8/fetch.py
index 1090c34..80f1a3d 100644
--- a/M3U8/fetch.py
+++ b/M3U8/fetch.py
@@ -41,13 +41,14 @@ async def main() -> None:
     tasks = [
         # ace.main(client),
         # fstv.main(client),
-        livetvsx.main(CLIENT),
-        tvpass.main(CLIENT),
+        asyncio.create_task(livetvsx.main(CLIENT)),
+        asyncio.create_task(tvpass.main(CLIENT)),
+        vanilla_fetch(),
     ]
 
-    await asyncio.gather(*tasks)
+    results = await asyncio.gather(*tasks)
 
-    base_m3u8, tvg_chno = await vanilla_fetch()
+    base_m3u8, tvg_chno = results[-1]
 
     additions = ace.urls | fstv.urls | livetvsx.urls | tvpass.urls
 
diff --git a/M3U8/scrape/ace.py b/M3U8/scrape/ace.py
index 48c0d84..3080531 100644
--- a/M3U8/scrape/ace.py
+++ b/M3U8/scrape/ace.py
@@ -50,7 +50,7 @@ async def get_schedule(client: httpx.AsyncClient, base_url: str) -> list[dict]:
             )
 
             if " - " in after_time:
-                sport, event_name = [x.strip() for x in after_time.split(" - ", 1)]
+                sport, event_name = (x.strip() for x in after_time.split(" - ", 1))
             else:
                 sport, event_name = "", after_time
 
diff --git a/M3U8/scrape/fstv.py b/M3U8/scrape/fstv.py
index ef03466..8e6ebb1 100644
--- a/M3U8/scrape/fstv.py
+++ b/M3U8/scrape/fstv.py
@@ -131,4 +131,7 @@ async def main(client: httpx.AsyncClient) -> None:
             ),
         }
 
-    log.info(f"Collected {len(urls)} live events")
+    log.info(f"Collected {len(urls)} live event(s)")
+
+
+# add caching
diff --git a/M3U8/scrape/livetvsx.py b/M3U8/scrape/livetvsx.py
index c2bd0bf..0ef6cfe 100644
--- a/M3U8/scrape/livetvsx.py
+++ b/M3U8/scrape/livetvsx.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import asyncio
 import io
+import json
 import ssl
 import xml.etree.ElementTree as ET
 from datetime import datetime, timedelta
@@ -30,12 +31,15 @@ CERT_BUNDL_URLS = [
 
 CERT_FILE = Path(__file__).parent / "cached-ca.pem"
 
+CACHE_FILE = Path(__file__).parent / "livetvsx.json"
 
-async def safe_process_event(fn, timeout_sec=20) -> Any | None:
+
+async def safe_process_event(fn, url_num: int, timeout=20) -> Any | None:
     try:
-        return await asyncio.wait_for(fn(), timeout=timeout_sec)
+        return await asyncio.wait_for(fn(), timeout=timeout)
     except asyncio.TimeoutError:
-        log.warning(f"Timed out after {timeout_sec}s, skipping event")
+        log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event")
+        return
 
 
 async def write_to_cert(client: httpx.AsyncClient, url: str, cert: Path) -> None:
@@ -71,6 +75,21 @@ async def get_cert(client: httpx.AsyncClient) -> ssl.SSLContext:
     return ssl.create_default_context(cafile=CERT_FILE)
 
 
+def load_cache() -> dict[str, dict[str, str | float]]:
+    try:
+        data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
+
+        now = datetime.now().timestamp()
+
+        return {
+            k: v
+            for k, v in data.items()
+            if now - v.get("timestamp", 0) < timedelta(hours=4).total_seconds()
+        }
+    except (FileNotFoundError, json.JSONDecodeError):
+        return {}
+
+
 async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO:
     buffer = io.BytesIO()
 
@@ -91,8 +110,13 @@ async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO:
     return io.BytesIO(b"")
 
 
-async def parse_feed(url: str, ssl_ctx: ssl.SSLContext) -> dict[str, dict[str, str]]:
-    events = []
+async def parse_feed( + url: str, + ssl_ctx: ssl.SSLContext, + cached_keys: set[str], +) -> list[dict[str, str]]: + + events: list[dict[str, str]] = [] pub_date_format = "%a, %d %b %Y %H:%M:%S %z" now = datetime.now(TZ) @@ -124,21 +148,28 @@ async def parse_feed(url: str, ssl_ctx: ssl.SSLContext) -> dict[str, dict[str, s else ("", "") ) - events.append( - { - "sport": sport, - "event": event, - "title": title, - "link": link, - } - ) + key = f"[{sport}: {event}] {title}" + + if key in cached_keys: + elem.clear() + continue + + elif not tvp_sports & {sport, event}: + events.append( + { + "sport": sport, + "event": event, + "title": title, + "link": link, + } + ) elem.clear() return events -async def process_event(url: str, max_wait_ms=15_000) -> str | None: +async def process_event(url: str, url_num: int, max_wait_ms=15_000) -> str | None: async with async_playwright() as p: browser = await p.firefox.launch(headless=True) @@ -179,16 +210,18 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None: await ev_page.wait_for_timeout(500) except Exception as e: - log.debug(f"Failed to click Browser Links tab: {e}") + log.debug(f"URL {url_num}) Failed to click Browser Links tab: {e}") + return else: - log.warning("Browser Links tab not found") + log.warning(f"URL {url_num}) Browser Links tab not found") link_img = await ev_page.query_selector( "tr:nth-child(2) > td:nth-child(1) td:nth-child(6) img" ) if not link_img: - log.warning("No browser link to click.") + log.warning(f"URL {url_num}) No browser link to click.") + return ev_page.on("request", capture_req) @@ -198,7 +231,7 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None: await link_img.click() except Exception as e: log.debug( - f"Click failed (popup might have already been opened): {e}" + f"URL {url_num}) Click failed (popup might have already been opened): {e}" ) popup = await popup_info.value @@ -209,7 +242,8 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None: try: await link_img.click() except Exception as e: - log.debug(f"Fallback click failed: {e}") + log.debug(f"URL {url_num}) Fallback click failed: {e}") + return wait_task = asyncio.create_task(got_one.wait()) @@ -217,7 +251,8 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None: await asyncio.wait_for(wait_task, timeout=max_wait_ms / 1000) except asyncio.TimeoutError: - log.warning("Timed out waiting for m3u8.") + log.warning(f"URL {url_num}) Timed out waiting for m3u8.") + return finally: if not wait_task.done(): @@ -238,10 +273,12 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None: await ev_page.close() if captured: + log.info(f"URL {url_num}) Captured M3U8") + return captured[-1] - log.warning("No m3u8 captured in popup or inline playback.") - + log.warning(f"URL {url_num}) No m3u8 captured in popup or inline playback.") + return except Exception as e: try: ev_page.remove_listener("request", capture_req) @@ -263,29 +300,44 @@ async def main(client: httpx.AsyncClient) -> None: cert = await get_cert(client) - events = await parse_feed(BASE_URL, cert) + cached_urls = load_cache() + cached_keys = set(cached_urls.keys()) + cached_count = len(cached_urls) - log.info(f"Processing {len(events)} events") + events = await parse_feed(BASE_URL, cert, cached_keys) - for ev in events: - if tvp_sports & { - sport := ev["sport"], - event := ev["event"], - }: # already in tvpass - continue + log.info(f"Processing {len(events)} URLs") - url = await safe_process_event(lambda: process_event(ev["link"])) + 
now_ts = datetime.now().timestamp() + + for num, ev in enumerate(events, start=1): + sport = ev["sport"] + event = ev["event"] + title = ev["title"] + link = ev["link"] + + key = f"[{sport}: {event}] {title}" + + url = await safe_process_event( + lambda: process_event(link, url_num=num), url_num=num + ) if url: - urls[f"[{sport}: {event}] {ev['title']}"] = { + entry = { "url": url, "logo": logos.get( sport, "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png", ), + "timestamp": now_ts, } - log.info(f"Collected {len(urls)} live events") + urls[key] = cached_urls[key] = entry + CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8") -# add caching + new_count = len(cached_urls) - cached_count + + log.info(f"Cached {cached_count} event(s)") + + log.info(f"Collected {new_count} new event(s)") diff --git a/M3U8/scrape/tvpass.py b/M3U8/scrape/tvpass.py index 752fddd..cc2262e 100644 --- a/M3U8/scrape/tvpass.py +++ b/M3U8/scrape/tvpass.py @@ -51,7 +51,7 @@ async def fetch_m3u8(client: httpx.AsyncClient) -> list[str] | None: async def main(client: httpx.AsyncClient) -> None: if cached := load_cache(): urls.update(cached) - log.info(f"Collected {len(urls)} events from cache") + log.info(f"Collected {len(urls)} event(s) from cache") return log.info(f'Scraping from "{base_url}"') @@ -87,4 +87,4 @@ async def main(client: httpx.AsyncClient) -> None: if urls: base_file.write_text(json.dumps(urls, indent=2), encoding="utf-8") - log.info(f"Cached {len(urls)} events") + log.info(f"Cached {len(urls)} event(s)")
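
A note on the fetch.py hunk above: asyncio.gather() returns results in the same
order the awaitables were passed in, which is why results[-1] can be unpacked as
(base_m3u8, tvg_chno) even though the scrapers run concurrently. A minimal,
self-contained sketch under that assumption; scraper() and vanilla_fetch() here
are stand-ins, not the project's real functions:

    import asyncio


    async def scraper() -> None:
        # stand-in for livetvsx.main(CLIENT) / tvpass.main(CLIENT), which
        # fill module-level dicts and return nothing
        await asyncio.sleep(0.2)


    async def vanilla_fetch() -> tuple[str, int]:
        # stand-in for the real vanilla_fetch(); returns (base_m3u8, tvg_chno)
        await asyncio.sleep(0.1)
        return "#EXTM3U", 1


    async def main() -> None:
        tasks = [
            asyncio.create_task(scraper()),
            asyncio.create_task(scraper()),
            vanilla_fetch(),
        ]

        # gather() preserves input order, so the last slot always holds
        # vanilla_fetch()'s return value even if it finishes first
        results = await asyncio.gather(*tasks)

        base_m3u8, tvg_chno = results[-1]
        print(base_m3u8, tvg_chno)


    asyncio.run(main())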
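The livetvsx cache introduced above is a JSON file keyed by event title, where
each entry carries a "timestamp" and anything older than four hours is dropped
on load. A standalone sketch of that round trip, assuming the same livetvsx.json
layout; save_cache() is a hypothetical helper (the patch writes the file inline
in main()), and the sample entry is illustrative only:

    import json
    from datetime import datetime, timedelta
    from pathlib import Path

    CACHE_FILE = Path(__file__).parent / "livetvsx.json"
    TTL = timedelta(hours=4)


    def load_cache() -> dict[str, dict[str, str | float]]:
        # read the cache and keep only entries newer than the TTL
        try:
            data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
        except (FileNotFoundError, json.JSONDecodeError):
            return {}

        now = datetime.now().timestamp()

        return {
            key: entry
            for key, entry in data.items()
            if now - entry.get("timestamp", 0) < TTL.total_seconds()
        }


    def save_cache(entries: dict[str, dict[str, str | float]]) -> None:
        # hypothetical helper; the patch calls CACHE_FILE.write_text() directly
        CACHE_FILE.write_text(json.dumps(entries, indent=2), encoding="utf-8")


    if __name__ == "__main__":
        cache = load_cache()

        cache["[Soccer: Demo League] Home vs Away"] = {
            "url": "https://example.com/stream.m3u8",
            "logo": "https://example.com/logo.png",
            "timestamp": datetime.now().timestamp(),
        }

        save_cache(cache)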