From 00000d9cc121e27cca585a0b5b0a1c97d9206ad2 Mon Sep 17 00:00:00 2001
From: doms9 <96013514+doms9@users.noreply.github.com>
Date: Mon, 15 Dec 2025 02:06:46 -0500
Subject: [PATCH] cache all events for streamhub instead of live events

---
 M3U8/scrapers/roxie.py     |  37 +++++-----
 M3U8/scrapers/streamhub.py | 138 +++++++++++++++++++++++++------------
 2 files changed, 111 insertions(+), 64 deletions(-)

diff --git a/M3U8/scrapers/roxie.py b/M3U8/scrapers/roxie.py
index 005b8ee..b7d17ac 100644
--- a/M3U8/scrapers/roxie.py
+++ b/M3U8/scrapers/roxie.py
@@ -57,6 +57,18 @@ async def process_event(
     return match[1]
 
 
+async def get_html_data(client: httpx.AsyncClient, url: str) -> bytes:
+    try:
+        r = await client.get(url)
+        r.raise_for_status()
+    except Exception as e:
+        log.error(f'Failed to fetch "{url}": {e}')
+
+        return b""
+
+    return r.content
+
+
 async def refresh_html_cache(
     client: httpx.AsyncClient,
     url: str,
@@ -64,15 +76,9 @@ async def refresh_html_cache(
     now_ts: float,
 ) -> dict[str, dict[str, str | float]]:
 
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{url}": {e}')
+    html_data = await get_html_data(client, url)
 
-        return {}
-
-    soup = HTMLParser(r.content)
+    soup = HTMLParser(html_data)
 
     events = {}
 
@@ -108,16 +114,15 @@ async def refresh_html_cache(
 
 
 async def get_events(
-    client: httpx.AsyncClient,
-    sport_urls: dict[str, str],
-    cached_keys: set[str],
+    client: httpx.AsyncClient, cached_keys: set[str]
 ) -> list[dict[str, str]]:
-
     now = Time.clean(Time.now())
 
     if not (events := HTML_CACHE.load()):
         log.info("Refreshing HTML cache")
 
+        sport_urls = {sport: urljoin(BASE_URL, sport) for sport in SPORT_ENDPOINTS}
+
         tasks = [
             refresh_html_cache(
                 client,
@@ -160,13 +165,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
 
     log.info(f'Scraping from "{BASE_URL}"')
 
-    sport_urls = {sport: urljoin(BASE_URL, sport) for sport in SPORT_ENDPOINTS}
-
-    events = await get_events(
-        client,
-        sport_urls,
-        set(cached_urls.keys()),
-    )
+    events = await get_events(client, set(cached_urls.keys()))
 
     log.info(f"Processing {len(events)} new URL(s)")
 
diff --git a/M3U8/scrapers/streamhub.py b/M3U8/scrapers/streamhub.py
index c8f0bbf..283be92 100644
--- a/M3U8/scrapers/streamhub.py
+++ b/M3U8/scrapers/streamhub.py
@@ -1,5 +1,6 @@
 import asyncio
 from functools import partial
+from urllib.parse import urljoin
 
 import httpx
 from playwright.async_api import async_playwright
@@ -15,7 +16,9 @@ TAG = "STRMHUB"
 
 CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
 
-BASE_URL = "https://streamhub.pro/live-now"
+HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=28_800)
+
+BASE_URL = "https://streamhub.pro/"
 
 
 CATEGORIES = {
@@ -33,69 +36,115 @@ CATEGORIES = {
 }
 
 
-async def get_html_data(client: httpx.AsyncClient, sport: str) -> bytes:
+async def get_html_data(client: httpx.AsyncClient, sport_id: str) -> bytes:
     try:
-        r = await client.get(BASE_URL, params={"sport_id": sport})
+        url = urljoin(BASE_URL, f"events/{Time.now().date()}")
+
+        r = await client.get(url, params={"sport_id": sport_id})
+
         r.raise_for_status()
     except Exception as e:
-        log.error(f'Failed to fetch "{BASE_URL}": {e}')
+        log.error(f'Failed to fetch "{url}": {e}')
 
         return b""
 
     return r.content
 
 
-async def get_events(
-    client: httpx.AsyncClient, cached_keys: set[str]
-) -> list[dict[str, str]]:
+async def refresh_html_cache(
+    client: httpx.AsyncClient,
+    sport_id: str,
+    ts: float,
+) -> dict[str, dict[str, str | float]]:
 
-    tasks = [get_html_data(client, sport) for sport in CATEGORIES.values()]
+    html_data = await get_html_data(client, sport_id)
 
-    results = await asyncio.gather(*tasks)
+    soup = HTMLParser(html_data)
 
-    soups = [HTMLParser(html) for html in results]
+    events = {}
 
-    events = []
+    for section in soup.css(".events-section"):
+        if not (sport_node := section.css_first(".section-titlte")):
+            continue
 
-    for soup in soups:
-        for section in soup.css(".events-section"):
-            if not (sport_node := section.css_first(".section-titlte")):
+        sport = sport_node.text(strip=True)
+
+        logo = section.css_first(".league-icon img").attributes.get("src")
+
+        for event in section.css(".section-event"):
+            event_name = "Live Event"
+
+            if teams := event.css_first(".event-competitors"):
+                home, away = teams.text(strip=True).split("vs.")
+
+                event_name = f"{away} vs {home}"
+
+            if not (event_button := event.css_first(".event-button a")) or not (
+                href := event_button.attributes.get("href")
+            ):
                 continue
 
-            sport = sport_node.text(strip=True)
+            event_date = event.css_first(".event-countdown").attributes.get(
+                "data-start"
+            )
 
-            logo = section.css_first(".league-icon img").attributes.get("src")
+            event_dt = Time.from_str(event_date, timezone="UTC")
 
-            for event in section.css(".section-event"):
-                event_name = "Live Event"
+            key = f"[{sport}] {event_name} ({TAG})"
 
-                if teams := event.css_first(".event-competitors"):
-                    home, away = teams.text(strip=True).split("vs.")
-
-                    event_name = f"{away} vs {home}"
-
-                if not (event_button := event.css_first("div.event-button a")) or not (
-                    href := event_button.attributes.get("href")
-                ):
-                    continue
-
-                key = f"[{sport}] {event_name} ({TAG})"
-
-                if cached_keys & {key}:
-                    continue
-
-                events.append(
-                    {
-                        "sport": sport,
-                        "event": event_name,
-                        "link": href,
-                        "logo": logo,
-                    }
-                )
+            events[key] = {
+                "sport": sport,
+                "event": event_name,
+                "link": href,
+                "logo": logo,
+                "timestamp": ts,
+                "event_ts": event_dt.timestamp(),
+            }
 
     return events
 
 
+async def get_events(
+    client: httpx.AsyncClient,
+    cached_keys: set[str],
+) -> list[dict[str, str]]:
+    now = Time.clean(Time.now())
+
+    if not (events := HTML_CACHE.load()):
+        log.info("Refreshing HTML cache")
+
+        tasks = [
+            refresh_html_cache(
+                client,
+                sport_id,
+                now.timestamp(),
+            )
+            for sport_id in CATEGORIES.values()
+        ]
+
+        results = await asyncio.gather(*tasks)
+
+        events = {k: v for data in results for k, v in data.items()}
+
+        HTML_CACHE.write(events)
+
+    live = []
+
+    start_ts = now.delta(hours=-1).timestamp()
+    end_ts = now.delta(minutes=5).timestamp()
+
+    for k, v in events.items():
+        if cached_keys & {k}:
+            continue
+
+        if not start_ts <= v["event_ts"] <= end_ts:
+            continue
+
+        live.append({**v})
+
+    return live
+
+
 async def scrape(client: httpx.AsyncClient) -> None:
     cached_urls = CACHE_FILE.load()
     valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
@@ -111,8 +160,6 @@ async def scrape(client: httpx.AsyncClient) -> None:
 
     log.info(f"Processing {len(events)} new URL(s)")
 
     if events:
-        now = Time.now().timestamp()
-
         async with async_playwright() as p:
             browser, context = await network.browser(p)
@@ -132,11 +179,12 @@ async def scrape(client: httpx.AsyncClient) -> None:
                     log=log,
                 )
 
-                sport, event, logo, link = (
+                sport, event, logo, link, ts = (
                     ev["sport"],
                     ev["event"],
                     ev["logo"],
                     ev["link"],
+                    ev["timestamp"],
                 )
 
                 key = f"[{sport}] {event} ({TAG})"
@@ -147,7 +195,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
                     "url": url,
                     "logo": logo or pic,
                     "base": "https://storytrench.net/",
-                    "timestamp": now,
+                    "timestamp": ts,
                     "id": tvg_id or "Live.Event.us",
                     "link": link,
                 }
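
The patch moves streamhub to the cache-then-filter flow already visible in roxie.py: refresh_html_cache writes every scraped event to HTML_CACHE together with its start time ("event_ts"), and get_events then returns only uncached entries whose start time falls inside a live window, from one hour before now to five minutes after now. A minimal sketch of that read-time filter follows, using plain dicts and epoch seconds instead of the project's Cache and Time helpers; window_live and the sample data are illustrative only, not part of the patch.

import time


def window_live(
    events: dict[str, dict],
    cached_keys: set[str],
    now: float | None = None,
) -> list[dict]:
    # Mirror of the patched get_events() filter: keep events that are not
    # already in the URL cache and whose "event_ts" lies in the window
    # [now - 1 hour, now + 5 minutes].
    now = time.time() if now is None else now
    start_ts = now - 3600  # now.delta(hours=-1) in the patch
    end_ts = now + 300     # now.delta(minutes=5) in the patch

    return [
        dict(v)
        for k, v in events.items()
        if k not in cached_keys and start_ts <= v["event_ts"] <= end_ts
    ]


if __name__ == "__main__":
    now = time.time()
    cached = {
        "[Soccer] B vs A (STRMHUB)": {"event": "B vs A", "event_ts": now - 600},
        "[Hockey] D vs C (STRMHUB)": {"event": "D vs C", "event_ts": now + 7200},
    }
    # Only the soccer event (started 10 minutes ago) is inside the live window.
    print(window_live(cached, cached_keys=set()))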