From 00000d9b7428e78d064c006466afa5318d89e89c Mon Sep 17 00:00:00 2001 From: doms9 <96013514+doms9@users.noreply.github.com> Date: Thu, 19 Mar 2026 14:13:33 -0400 Subject: [PATCH] e - add march madness for roxie.py - edit scraping for streamsgate.py --- M3U8/scrapers/roxie.py | 13 ++- M3U8/scrapers/streamsgate.py | 178 ++++++++++++++++------------------ M3U8/scrapers/utils/config.py | 1 + 3 files changed, 96 insertions(+), 96 deletions(-) diff --git a/M3U8/scrapers/roxie.py b/M3U8/scrapers/roxie.py index 35871b84..49d77cdc 100644 --- a/M3U8/scrapers/roxie.py +++ b/M3U8/scrapers/roxie.py @@ -1,4 +1,5 @@ import asyncio +import re from functools import partial from urllib.parse import urljoin @@ -20,6 +21,7 @@ HTML_CACHE = Cache(f"{TAG}-html", exp=19_800) BASE_URL = "https://roxiestreams.info" SPORT_URLS = { + "March Madness": urljoin(BASE_URL, "march-madness"), "Racing": urljoin(BASE_URL, "motorsports"), # "American Football": urljoin(BASE_URL, "nfl"), } | { @@ -57,9 +59,16 @@ async def refresh_html_cache( if not (span := row.css_first("span.countdown-timer")): continue - data_start = span.attributes["data-start"].rsplit(":", 1)[0] + if not (data_start := span.attributes.get("data-start")): + continue - event_dt = Time.from_str(data_start, timezone="PST") + event_time = ( + data_start.rsplit(":", 1)[0] + if re.search(r"\d+:\d+:\d+", data_start) + else data_start + ) + + event_dt = Time.from_str(event_time, timezone="PST") event_sport = next((k for k, v in SPORT_URLS.items() if v == url), "Live Event") diff --git a/M3U8/scrapers/streamsgate.py b/M3U8/scrapers/streamsgate.py index f9b506a2..1b8f0975 100644 --- a/M3U8/scrapers/streamsgate.py +++ b/M3U8/scrapers/streamsgate.py @@ -1,10 +1,10 @@ -import asyncio +import json +import re from functools import partial -from itertools import chain from typing import Any -from urllib.parse import urljoin from playwright.async_api import Browser +from selectolax.parser import HTMLParser from .utils import Cache, Time, get_logger, leagues, network @@ -16,103 +16,95 @@ TAG = "STRMSGATE" CACHE_FILE = Cache(TAG, exp=10_800) -API_FILE = Cache(f"{TAG}-api", exp=19_800) - -BASE_URL = "https://streamingon.org" - -SPORT_URLS = [ - urljoin(BASE_URL, f"data/{sport}.json") - for sport in [ - "boxing", - # "cfb", - "f1", - "mlb", - "nba", - # "nfl", - "nhl", - "soccer", - "ufc", - ] -] - - -def get_event(t1: str, t2: str) -> str: - match t1: - case "RED ZONE": - return "NFL RedZone" - - case "TBD": - return "TBD" - - case _: - return f"{t1.strip()} vs {t2.strip()}" - - -async def refresh_api_cache(now_ts: float) -> list[dict[str, Any]]: - tasks = [network.request(url, log=log) for url in SPORT_URLS] - - results = await asyncio.gather(*tasks) - - if not (data := [*chain.from_iterable(r.json() for r in results if r)]): - return [{"timestamp": now_ts}] - - for ev in data: - ev["ts"] = ev.pop("timestamp") - - data[-1]["timestamp"] = now_ts - - return data +BASE_URL = "https://streamingon.org/index.php" async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: now = Time.clean(Time.now()) - if not (api_data := API_FILE.load(per_entry=False, index=-1)): - log.info("Refreshing API cache") - - api_data = await refresh_api_cache(now.timestamp()) - - API_FILE.write(api_data) - events = [] - start_dt = now.delta(hours=-1) - end_dt = now.delta(minutes=5) - - for stream_group in api_data: - date = stream_group.get("time") - - sport = stream_group.get("league") - - t1, t2 = stream_group.get("away"), stream_group.get("home") - - event = get_event(t1, t2) - - if not (date and sport): - continue - - if f"[{sport}] {event} ({TAG})" in cached_keys: - continue - - event_dt = Time.from_str(date, timezone="UTC") - - if not start_dt <= event_dt <= end_dt: - continue - - if not (streams := stream_group.get("streams")): - continue - - if not (url := streams[0].get("url")): - continue - - events.append( - { - "sport": sport, - "event": event, - "link": url, - "timestamp": event_dt.timestamp(), - } + if not ( + html_data := await network.request( + BASE_URL, + params={ + "sport": "all", + "league": "all", + "sort": "time", + "stream": "available", + "day": "all", + }, + log=log, ) + ): + return events + + link_data_ptrn = re.compile(r"var\s+linkData\s+=\s+({.*?});", re.I | re.S) + + if not (match := link_data_ptrn.search(html_data.text)): + log.warning("No `linkData` variable found.") + return events + + link_data: dict[str, dict[str, Any]] = json.loads(match[1]) + + start_dt = now.delta(minutes=-30) + end_dt = now.delta(minutes=30) + + soup = HTMLParser(html_data.content) + + for body in soup.css(".sport-body"): + if not (date_elem := body.css_first(".date-label")): + continue + + event_date = date_elem.text(strip=True) + + for card in soup.css(".game-card"): + if not (event_id := card.attributes.get("data-id")): + continue + + if not (league_elem := card.css_first(".card-league")): + continue + + if not (teams := card.css(".card-teams .card-team-name")): + continue + + if not (time_elem := card.css_first(".card-time")): + continue + + event_dt = Time.from_str( + f"{event_date} {time_elem.text(strip=True)}", + timezone="CET", + ) + + if not start_dt <= event_dt <= end_dt: + continue + + sport = league_elem.text(strip=True) + + team_1, team_2 = (team.text(strip=True) for team in teams) + + event_name = f"{team_2} vs {team_1}" + + if f"[{sport}] {event_name} ({TAG})" in cached_keys: + continue + + if not (event_info := link_data.get(event_id)): + continue + + if not (stream_links := event_info.get("streamLinks")): + continue + + if not (url := stream_links[0].get("url")): + continue + + events.append( + { + "sport": sport, + "event": event_name, + "link": url, + "timestamp": now.timestamp(), + } + ) return events @@ -175,8 +167,6 @@ async def scrape(browser: Browser) -> None: if url: valid_count += 1 - entry["url"] = url.split("&e")[0] - urls[key] = entry log.info(f"Collected and cached {valid_count - cached_count} new event(s)") diff --git a/M3U8/scrapers/utils/config.py b/M3U8/scrapers/utils/config.py index 5cda4646..0aba68dc 100644 --- a/M3U8/scrapers/utils/config.py +++ b/M3U8/scrapers/utils/config.py @@ -90,6 +90,7 @@ class Time(datetime): "%m/%d/%Y %H:%M", "%m/%d/%Y %I:%M %p", "%m/%d/%Y %H:%M:%S", + "%a, %d %b %Y %H:%M", "%a, %d %b %Y %H:%M:%S %z", "%A, %b %d, %Y %H:%M", ]