doms9 2025-09-03 00:00:22 -04:00
parent a02d30459a
commit 00000d9855
5 changed files with 98 additions and 42 deletions

View file

@@ -50,7 +50,7 @@ async def get_schedule(client: httpx.AsyncClient, base_url: str) -> list[dict]:
)
if " - " in after_time:
sport, event_name = [x.strip() for x in after_time.split(" - ", 1)]
sport, event_name = (x.strip() for x in after_time.split(" - ", 1))
else:
sport, event_name = "", after_time
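
Note on the change above: the right-hand side switches from a list comprehension to a generator expression; tuple unpacking consumes either one the same way, the generator just skips building a throwaway list. A minimal check, using a made-up schedule string:

```python
# Both spellings unpack identically; the generator expression avoids
# materialising an intermediate list. The sample string is made up.
after_time = "Soccer - Premier League: Arsenal vs Chelsea"

sport, event_name = [x.strip() for x in after_time.split(" - ", 1)]  # old
sport, event_name = (x.strip() for x in after_time.split(" - ", 1))  # new

assert (sport, event_name) == ("Soccer", "Premier League: Arsenal vs Chelsea")
```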

View file

@@ -131,4 +131,7 @@ async def main(client: httpx.AsyncClient) -> None:
),
}
log.info(f"Collected {len(urls)} live events")
log.info(f"Collected {len(urls)} live event(s)")
# add caching
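
The `# add caching` marker above is only a TODO in this file. If it ends up following the pattern this same commit implements for livetvsx.py below (a JSON file keyed by event title, each entry stamped on write and dropped after 4 hours on load), it might look roughly like this; the file name and `save_cache` helper are assumptions, not code from the repo:

```python
# Hypothetical sketch of the "# add caching" TODO, mirroring the
# load_cache()/CACHE_FILE pattern added to livetvsx.py in this commit.
# CACHE_FILE's name and save_cache() are assumptions.
import json
from datetime import datetime, timedelta
from pathlib import Path

CACHE_FILE = Path(__file__).parent / "live-events.json"  # assumed name


def load_cache() -> dict[str, dict]:
    try:
        data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
    now = datetime.now().timestamp()
    return {
        k: v
        for k, v in data.items()
        if now - v.get("timestamp", 0) < timedelta(hours=4).total_seconds()
    }


def save_cache(entries: dict[str, dict]) -> None:
    CACHE_FILE.write_text(json.dumps(entries, indent=2), encoding="utf-8")
```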

View file

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import asyncio
import io
import json
import ssl
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
@@ -30,12 +31,15 @@ CERT_BUNDL_URLS = [
CERT_FILE = Path(__file__).parent / "cached-ca.pem"
CACHE_FILE = Path(__file__).parent / "livetvsx.json"
async def safe_process_event(fn, timeout_sec=20) -> Any | None:
async def safe_process_event(fn, url_num: int, timeout=20) -> Any | None:
try:
return await asyncio.wait_for(fn(), timeout=timeout_sec)
return await asyncio.wait_for(fn(), timeout=timeout)
except asyncio.TimeoutError:
log.warning(f"Timed out after {timeout_sec}s, skipping event")
log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event")
return
async def write_to_cert(client: httpx.AsyncClient, url: str, cert: Path) -> None:
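
`safe_process_event` now also takes the event's position in the batch, so timeout warnings say which URL was skipped; the first argument is still a zero-argument coroutine factory, created lazily inside `asyncio.wait_for`. A small usage sketch, assuming this module's `safe_process_event` and `log` are in scope and using a made-up `slow_job` coroutine:

```python
# Usage sketch: the lambda defers coroutine creation until wait_for runs it,
# and a timeout yields None plus a numbered warning instead of raising.
# slow_job is a stand-in, not part of the repo.
import asyncio


async def slow_job() -> str:
    await asyncio.sleep(30)  # longer than the timeout below
    return "never reached"


async def demo() -> None:
    result = await safe_process_event(lambda: slow_job(), url_num=1, timeout=2)
    assert result is None  # logged as: "URL 1) Timed out after 2s, skipping event"


# asyncio.run(demo())
```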
@@ -71,6 +75,21 @@ async def get_cert(client: httpx.AsyncClient) -> ssl.SSLContext:
return ssl.create_default_context(cafile=CERT_FILE)
def load_cache() -> dict[str, dict[str, str | float]]:
try:
data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
now = datetime.now().timestamp()
return {
k: v
for k, v in data.items()
if now - v.get("timestamp", 0) < timedelta(hours=4).total_seconds()
}
except (FileNotFoundError, json.JSONDecodeError):
return {}
async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO:
buffer = io.BytesIO()
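
`load_cache()` keeps only entries whose `timestamp` is less than 4 hours old and quietly falls back to an empty dict when livetvsx.json is missing or unparsable. For reference, an entry written later in `main()` has this shape; the key text, stream URL, and timestamp below are illustrative, only the fallback logo URL is taken from the code:

```python
# Illustrative shape of a livetvsx.json entry. Keys follow the
# "[sport: event] title" format built in main(); "timestamp" is
# datetime.now().timestamp() at write time and drives the 4-hour expiry.
example_cache = {
    "[Football: Premier League] Arsenal vs Chelsea": {
        "url": "https://example.com/stream/master.m3u8",
        "logo": "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
        "timestamp": 1756872022.0,
    },
}
```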
@@ -91,8 +110,13 @@ async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO:
return io.BytesIO(b"")
async def parse_feed(url: str, ssl_ctx: ssl.SSLContext) -> dict[str, dict[str, str]]:
events = []
async def parse_feed(
url: str,
ssl_ctx: ssl.SSLContext,
cached_keys: set[str],
) -> list[dict[str, str]]:
events: list[dict[str, str]] = []
pub_date_format = "%a, %d %b %Y %H:%M:%S %z"
now = datetime.now(TZ)
@@ -124,21 +148,28 @@ async def parse_feed(url: str, ssl_ctx: ssl.SSLContext) -> dict[str, dict[str, s
else ("", "")
)
events.append(
{
"sport": sport,
"event": event,
"title": title,
"link": link,
}
)
key = f"[{sport}: {event}] {title}"
if key in cached_keys:
elem.clear()
continue
elif not tvp_sports & {sport, event}:
events.append(
{
"sport": sport,
"event": event,
"title": title,
"link": link,
}
)
elem.clear()
return events
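
`parse_feed` now receives the set of already-cached keys, so events captured on a previous run are skipped before Playwright is ever launched for them, and `elem.clear()` keeps memory flat while the feed is streamed. A rough standalone sketch of that skip-and-clear pattern, assuming the RSS items are walked with `xml.etree.ElementTree.iterparse` over the buffer from `fetch_xml_stream` (the iteration itself isn't visible in this hunk, and the key/field extraction is simplified):

```python
# Rough sketch of the dedup + memory pattern, under the assumption that the
# RSS items are streamed with ET.iterparse over the BytesIO buffer returned
# by fetch_xml_stream. The field extraction and key format are simplified.
import io
import xml.etree.ElementTree as ET


def iter_new_events(buffer: io.BytesIO, cached_keys: set[str]) -> list[dict[str, str]]:
    events: list[dict[str, str]] = []
    for _, elem in ET.iterparse(buffer, events=("end",)):
        if elem.tag != "item":
            continue
        title = elem.findtext("title", default="")
        link = elem.findtext("link", default="")
        key = title  # the real key is "[sport: event] title"
        if key not in cached_keys:
            events.append({"title": title, "link": link})
        elem.clear()  # drop parsed children so large feeds stay cheap
    return events
```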
async def process_event(url: str, max_wait_ms=15_000) -> str | None:
async def process_event(url: str, url_num: int, max_wait_ms=15_000) -> str | None:
async with async_playwright() as p:
browser = await p.firefox.launch(headless=True)
@@ -179,16 +210,18 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None:
await ev_page.wait_for_timeout(500)
except Exception as e:
log.debug(f"Failed to click Browser Links tab: {e}")
log.debug(f"URL {url_num}) Failed to click Browser Links tab: {e}")
return
else:
log.warning("Browser Links tab not found")
log.warning(f"URL {url_num}) Browser Links tab not found")
link_img = await ev_page.query_selector(
"tr:nth-child(2) > td:nth-child(1) td:nth-child(6) img"
)
if not link_img:
log.warning("No browser link to click.")
log.warning(f"URL {url_num}) No browser link to click.")
return
ev_page.on("request", capture_req)
@@ -198,7 +231,7 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None:
await link_img.click()
except Exception as e:
log.debug(
f"Click failed (popup might have already been opened): {e}"
f"URL {url_num}) Click failed (popup might have already been opened): {e}"
)
popup = await popup_info.value
@@ -209,7 +242,8 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None:
try:
await link_img.click()
except Exception as e:
log.debug(f"Fallback click failed: {e}")
log.debug(f"URL {url_num}) Fallback click failed: {e}")
return
wait_task = asyncio.create_task(got_one.wait())
@@ -217,7 +251,8 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None:
await asyncio.wait_for(wait_task, timeout=max_wait_ms / 1000)
except asyncio.TimeoutError:
log.warning("Timed out waiting for m3u8.")
log.warning(f"URL {url_num}) Timed out waiting for m3u8.")
return
finally:
if not wait_task.done():
@@ -238,10 +273,12 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None:
await ev_page.close()
if captured:
log.info(f"URL {url_num}) Captured M3U8")
return captured[-1]
log.warning("No m3u8 captured in popup or inline playback.")
log.warning(f"URL {url_num}) No m3u8 captured in popup or inline playback.")
return
except Exception as e:
try:
ev_page.remove_listener("request", capture_req)
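
The `url_num` threaded through `process_event` only prefixes the log lines with the event's position in the batch; the capture itself still works by listening for `request` events on the popup or page and keeping any URL containing `.m3u8`. A stripped-down sketch of that pattern with Playwright's async API; the page URL and flow here are placeholders rather than the repo's exact logic (the real function also handles the Browser Links tab, the popup window, and inline-playback fallbacks):

```python
# Stripped-down sketch of the request-capture pattern used in process_event:
# register a "request" listener, load the page, and wait (with a deadline)
# until an .m3u8 request shows up. The URL here is a placeholder.
import asyncio

from playwright.async_api import async_playwright


async def capture_m3u8(page_url: str, max_wait_ms: int = 15_000) -> str | None:
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        page = await browser.new_page()
        captured: list[str] = []
        got_one = asyncio.Event()

        def capture_req(request) -> None:
            if ".m3u8" in request.url:
                captured.append(request.url)
                got_one.set()

        page.on("request", capture_req)
        await page.goto(page_url)
        try:
            await asyncio.wait_for(got_one.wait(), timeout=max_wait_ms / 1000)
        except asyncio.TimeoutError:
            return None
        finally:
            await browser.close()
        return captured[-1]
```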
@@ -263,29 +300,44 @@ async def main(client: httpx.AsyncClient) -> None:
cert = await get_cert(client)
events = await parse_feed(BASE_URL, cert)
cached_urls = load_cache()
cached_keys = set(cached_urls.keys())
cached_count = len(cached_urls)
log.info(f"Processing {len(events)} events")
events = await parse_feed(BASE_URL, cert, cached_keys)
for ev in events:
if tvp_sports & {
sport := ev["sport"],
event := ev["event"],
}: # already in tvpass
continue
log.info(f"Processing {len(events)} URLs")
url = await safe_process_event(lambda: process_event(ev["link"]))
now_ts = datetime.now().timestamp()
for num, ev in enumerate(events, start=1):
sport = ev["sport"]
event = ev["event"]
title = ev["title"]
link = ev["link"]
key = f"[{sport}: {event}] {title}"
url = await safe_process_event(
lambda: process_event(link, url_num=num), url_num=num
)
if url:
urls[f"[{sport}: {event}] {ev['title']}"] = {
entry = {
"url": url,
"logo": logos.get(
sport,
"https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
),
"timestamp": now_ts,
}
log.info(f"Collected {len(urls)} live events")
urls[key] = cached_urls[key] = entry
CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")
# add caching
new_count = len(cached_urls) - cached_count
log.info(f"Cached {cached_count} event(s)")
log.info(f"Collected {new_count} new event(s)")

View file

@@ -51,7 +51,7 @@ async def fetch_m3u8(client: httpx.AsyncClient) -> list[str] | None:
async def main(client: httpx.AsyncClient) -> None:
if cached := load_cache():
urls.update(cached)
log.info(f"Collected {len(urls)} events from cache")
log.info(f"Collected {len(urls)} event(s) from cache")
return
log.info(f'Scraping from "{base_url}"')
@@ -87,4 +87,4 @@ async def main(client: httpx.AsyncClient) -> None:
if urls:
base_file.write_text(json.dumps(urls, indent=2), encoding="utf-8")
log.info(f"Cached {len(urls)} events")
log.info(f"Cached {len(urls)} event(s)")