e

cache all events for streamhub instead of live events
2026-04-24 20:16:59 +02:00 · 2025-12-15 02:06:46 -05:00 · 2025-12-15 02:06:46 -05:00 · 00000d9cc1
commit 00000d9cc1
parent f755ffc78b
2 changed files with 111 additions and 64 deletions
--- a/M3U8/scrapers/roxie.py
+++ b/M3U8/scrapers/roxie.py
@ -57,6 +57,18 @@ async def process_event(
    return match[1]
 async def get_html_data(client: httpx.AsyncClient, url: str) -> bytes:
    """Fetch ``url`` with ``client`` and return the raw response body.

    Any exception raised while issuing the request or checking the
    response status is logged, and empty bytes are returned instead.
    """
    try:
        response = await client.get(url)
        response.raise_for_status()
    except Exception as exc:
        log.error(f'Failed to fetch "{url}": {exc}')
        payload = b""
    else:
        # Read the body outside the guarded section, as before, so an
        # error while accessing it is not swallowed by the handler above.
        payload = response.content
    return payload
 async def refresh_html_cache(
    client: httpx.AsyncClient,
    url: str,
@ -64,15 +76,9 @@ async def refresh_html_cache(
    now_ts: float,
 ) -> dict[str, dict[str, str | float]]:
-    try:
+    html_data = await get_html_data(client, url)
        r = await client.get(url)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{url}": {e}')
-        return {}
+    soup = HTMLParser(html_data)
    soup = HTMLParser(r.content)
    events = {}
@ -108,16 +114,15 @@ async def refresh_html_cache(
 async def get_events(
-    client: httpx.AsyncClient,
+    client: httpx.AsyncClient, cached_keys: set[str]
    sport_urls: dict[str, str],
    cached_keys: set[str],
 ) -> list[dict[str, str]]:
    now = Time.clean(Time.now())
    if not (events := HTML_CACHE.load()):
        log.info("Refreshing HTML cache")
        sport_urls = {sport: urljoin(BASE_URL, sport) for sport in SPORT_ENDPOINTS}
        tasks = [
            refresh_html_cache(
                client,
@ -160,13 +165,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
    log.info(f'Scraping from "{BASE_URL}"')
-    sport_urls = {sport: urljoin(BASE_URL, sport) for sport in SPORT_ENDPOINTS}
+    events = await get_events(client, set(cached_urls.keys()))
    events = await get_events(
        client,
        sport_urls,
        set(cached_urls.keys()),
    )
    log.info(f"Processing {len(events)} new URL(s)")
--- a/M3U8/scrapers/streamhub.py
+++ b/M3U8/scrapers/streamhub.py
@ -1,5 +1,6 @@
 import asyncio
 from functools import partial
 from urllib.parse import urljoin
 import httpx
 from playwright.async_api import async_playwright
@ -15,7 +16,9 @@ TAG = "STRMHUB"
 CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
-BASE_URL = "https://streamhub.pro/live-now"
+HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=28_800)
 BASE_URL = "https://streamhub.pro/"
 CATEGORIES = {
@ -33,31 +36,33 @@ CATEGORIES = {
 }
-async def get_html_data(client: httpx.AsyncClient, sport: str) -> bytes:
+async def get_html_data(client: httpx.AsyncClient, sport_id: str) -> bytes:
    try:
-        r = await client.get(BASE_URL, params={"sport_id": sport})
+        url = urljoin(BASE_URL, f"events/{Time.now().date()}")
        r = await client.get(url, params={"sport_id": sport_id})
        r.raise_for_status()
    except Exception as e:
-        log.error(f'Failed to fetch "{BASE_URL}": {e}')
+        log.error(f'Failed to fetch "{url}": {e}')
        return b""
    return r.content
-async def get_events(
+async def refresh_html_cache(
-    client: httpx.AsyncClient, cached_keys: set[str]
+    client: httpx.AsyncClient,
-) -> list[dict[str, str]]:
+    sport_id: str,
    ts: float,
 ) -> dict[str, dict[str, str | float]]:
-    tasks = [get_html_data(client, sport) for sport in CATEGORIES.values()]
+    html_data = await get_html_data(client, sport_id)
-    results = await asyncio.gather(*tasks)
+    soup = HTMLParser(html_data)
-    soups = [HTMLParser(html) for html in results]
+    events = {}
    events = []
    for soup in soups:
    for section in soup.css(".events-section"):
        if not (sport_node := section.css_first(".section-titlte")):
            continue
@ -74,28 +79,72 @@ async def get_events(
                event_name = f"{away} vs {home}"
-                if not (event_button := event.css_first("div.event-button a")) or not (
+            if not (event_button := event.css_first(".event-button a")) or not (
                href := event_button.attributes.get("href")
            ):
                continue
            event_date = event.css_first(".event-countdown").attributes.get(
                "data-start"
            )
            event_dt = Time.from_str(event_date, timezone="UTC")
            key = f"[{sport}] {event_name} ({TAG})"
-                if cached_keys & {key}:
+            events[key] = {
                    continue
                events.append(
                    {
                "sport": sport,
                "event": event_name,
                "link": href,
                "logo": logo,
                "timestamp": ts,
                "event_ts": event_dt.timestamp(),
            }
                )
    return events
 async def get_events(
    client: httpx.AsyncClient,
    cached_keys: set[str],
 ) -> list[dict[str, str]]:
    """Return the cached events that are inside the live window and not yet scraped.

    The event table is read from ``HTML_CACHE``. When that load yields
    nothing, every sport id in ``CATEGORIES`` is fetched concurrently via
    ``refresh_html_cache``, the per-sport results are merged into a single
    mapping, and the mapping is written back to ``HTML_CACHE``.

    Args:
        client: HTTP client handed through to ``refresh_html_cache``.
        cached_keys: Event keys that must be left out of the result.

    Returns:
        Shallow copies of the events whose key is not in ``cached_keys`` and
        whose ``event_ts`` lies between ``now.delta(hours=-1)`` and
        ``now.delta(minutes=5)``, both bounds inclusive.
    """
    now = Time.clean(Time.now())

    if not (events := HTML_CACHE.load()):
        log.info("Refreshing HTML cache")

        tasks = [
            refresh_html_cache(client, sport_id, now.timestamp())
            for sport_id in CATEGORIES.values()
        ]

        results = await asyncio.gather(*tasks)

        # Later results overwrite earlier ones when two sports yield the same key.
        events = {k: v for data in results for k, v in data.items()}

        HTML_CACHE.write(events)

    start_ts = now.delta(hours=-1).timestamp()
    end_ts = now.delta(minutes=5).timestamp()

    # A plain membership test replaces the per-key ``cached_keys & {k}``
    # intersection, which built a throwaway set on every iteration.
    return [
        {**event}
        for key, event in events.items()
        if key not in cached_keys and start_ts <= event["event_ts"] <= end_ts
    ]
 async def scrape(client: httpx.AsyncClient) -> None:
    cached_urls = CACHE_FILE.load()
    valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
@ -111,8 +160,6 @@ async def scrape(client: httpx.AsyncClient) -> None:
    log.info(f"Processing {len(events)} new URL(s)")
    if events:
        now = Time.now().timestamp()
        async with async_playwright() as p:
            browser, context = await network.browser(p)
@ -132,11 +179,12 @@ async def scrape(client: httpx.AsyncClient) -> None:
                    log=log,
                )
-                sport, event, logo, link = (
+                sport, event, logo, link, ts = (
                    ev["sport"],
                    ev["event"],
                    ev["logo"],
                    ev["link"],
                    ev["timestamp"],
                )
                key = f"[{sport}] {event} ({TAG})"
@ -147,7 +195,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
                    "url": url,
                    "logo": logo or pic,
                    "base": "https://storytrench.net/",
-                    "timestamp": now,
+                    "timestamp": ts,
                    "id": tvg_id or "Live.Event.us",
                    "link": link,
                }