diff --git a/M3U8/scrapers/streamsgate.py b/M3U8/scrapers/streamsgate.py
index 1b8f0975..ffe3a4dc 100644
--- a/M3U8/scrapers/streamsgate.py
+++ b/M3U8/scrapers/streamsgate.py
@@ -1,10 +1,10 @@
-import json
-import re
+import asyncio
 from functools import partial
+from itertools import chain
 from typing import Any
+from urllib.parse import urljoin
 
 from playwright.async_api import Browser
-from selectolax.parser import HTMLParser
 
 from .utils import Cache, Time, get_logger, leagues, network
 
@@ -16,95 +16,101 @@ TAG = "STRMSGATE"
 
 CACHE_FILE = Cache(TAG, exp=10_800)
 
-BASE_URL = "https://streamingon.org/index.php"
+API_FILE = Cache(f"{TAG}-api", exp=19_800)
+
+BASE_URL = "https://streamingon.org"
+
+SPORT_URLS = [
+    urljoin(BASE_URL, f"data/{sport}.json")
+    for sport in [
+        # "cfb",
+        "mlb",
+        "nba",
+        # "nfl",
+        "nhl",
+        "soccer",
+        "ufc",
+    ]
+]
+
+
+def get_event(t1: str, t2: str) -> str:
+    match t1:
+        case "RED ZONE":
+            return "NFL RedZone"
+
+        case "TBD":
+            return "TBD"
+
+        case _:
+            return f"{t1.strip()} vs {t2.strip()}"
+
+
+async def refresh_api_cache(now_ts: float) -> list[dict[str, Any]]:
+    tasks = [network.request(url, log=log) for url in SPORT_URLS]
+
+    results = await asyncio.gather(*tasks)
+
+    if not (data := [*chain.from_iterable(r.json() for r in results if r)]):
+        return [{"timestamp": now_ts}]
+
+    for ev in data:
+        ev["ts"] = ev.pop("timestamp")
+
+    data[-1]["timestamp"] = now_ts
+
+    return data
 
 
 async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     now = Time.clean(Time.now())
 
+    if not (api_data := API_FILE.load(per_entry=False, index=-1)):
+        log.info("Refreshing API cache")
+
+        api_data = await refresh_api_cache(now.timestamp())
+
+        API_FILE.write(api_data)
+
     events = []
 
-    if not (
-        html_data := await network.request(
-            BASE_URL,
-            params={
-                "sport": "all",
-                "league": "all",
-                "sort": "time",
-                "stream": "available",
-                "day": "all",
-            },
-            log=log,
-        )
-    ):
-        return events
+    start_dt = now.delta(hours=-1)
+    end_dt = now.delta(minutes=5)
 
-    link_data_ptrn = re.compile(r"var\s+linkData\s+=\s+({.*?});", re.I | re.S)
+    for stream_group in api_data:
+        date = stream_group.get("time")
 
-    if not (match := link_data_ptrn.search(html_data.text)):
-        log.warning("No `linkData` variable found.")
-        return events
+        sport = stream_group.get("league")
 
-    link_data: dict[str, dict[str, Any]] = json.loads(match[1])
+        t1, t2 = stream_group.get("away"), stream_group.get("home")
 
-    start_dt = now.delta(minutes=-30)
-    end_dt = now.delta(minutes=30)
+        event = get_event(t1, t2)
 
-    soup = HTMLParser(html_data.content)
-
-    for body in soup.css(".sport-body"):
-        if not (date_elem := body.css_first(".date-label")):
+        if not (date and sport):
             continue
 
-        event_date = date_elem.text(strip=True)
+        if f"[{sport}] {event} ({TAG})" in cached_keys:
+            continue
 
-        for card in soup.css(".game-card"):
-            if not (event_id := card.attributes.get("data-id")):
-                continue
+        event_dt = Time.from_str(date, timezone="UTC")
 
-            if not (league_elem := card.css_first(".card-league")):
-                continue
+        if not start_dt <= event_dt <= end_dt:
+            continue
 
-            if not (teams := card.css(".card-teams .card-team-name")):
-                continue
+        if not (streams := stream_group.get("streams")):
+            continue
 
-            if not (time_elem := card.css_first(".card-time")):
-                continue
+        if not (url := streams[0].get("url")):
+            continue
 
-            event_dt = Time.from_str(
-                f"{event_date} {time_elem.text(strip=True)}",
-                timezone="CET",
-            )
-
-            if not start_dt <= event_dt <= end_dt:
-                continue
-
-            sport = league_elem.text(strip=True)
-
-            team_1, team_2 = (team.text(strip=True) for team in teams)
-
-            event_name = f"{team_2} vs {team_1}"
-
-            if f"[{sport}] {event_name} ({TAG})" in cached_keys:
-                continue
-
-            if not (event_info := link_data.get(event_id)):
-                continue
-
-            if not (stream_links := event_info.get("streamLinks")):
-                continue
-
-            if not (url := stream_links[0].get("url")):
-                continue
-
-            events.append(
-                {
-                    "sport": sport,
-                    "event": event_name,
-                    "link": url,
-                    "timestamp": now.timestamp(),
-                }
-            )
+        events.append(
+            {
+                "sport": sport,
+                "event": event,
+                "link": url,
+                "timestamp": event_dt.timestamp(),
+            }
+        )
 
     return events
 
@@ -167,6 +173,8 @@ async def scrape(browser: Browser) -> None:
         if url:
             valid_count += 1
 
+            entry["url"] = url.split("&e")[0]
+
             urls[key] = entry
 
     log.info(f"Collected and cached {valid_count - cached_count} new event(s)")
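
Note on the cache layout the new code relies on: refresh_api_cache moves each
event's own "timestamp" to a "ts" key, then writes the refresh time under
"timestamp" on the last entry only, and API_FILE.load(per_entry=False, index=-1)
appears to use that last-entry sentinel to decide whether the whole payload is
still fresh. A minimal standalone sketch of that check follows; is_fresh and EXP
are hypothetical names for illustration, the real logic lives in Cache in .utils:

    import time

    EXP = 19_800  # seconds, mirroring Cache(f"{TAG}-api", exp=19_800)

    def is_fresh(data: list[dict], now: float | None = None) -> bool:
        # refresh_api_cache stores the refresh time in the LAST entry's
        # "timestamp" (per-event times were already moved to "ts"), so a
        # single index=-1 lookup answers freshness for the whole payload.
        now = now if now is not None else time.time()
        return (now - data[-1].get("timestamp", 0)) < EXP

This also appears to explain the `return [{"timestamp": now_ts}]` fallback in
refresh_api_cache: even a fetch that yields no events gets a sentinel entry, so
the empty result is cached rather than refetched on every get_events call.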