- edit scraping for streamsgate.py
This commit is contained in:
doms9 2026-03-20 12:52:11 -04:00
parent 0ae675a583
commit 00000d9411

View file

@ -1,10 +1,10 @@
import json import asyncio
import re
from functools import partial from functools import partial
from itertools import chain
from typing import Any from typing import Any
from urllib.parse import urljoin
from playwright.async_api import Browser from playwright.async_api import Browser
from selectolax.parser import HTMLParser
from .utils import Cache, Time, get_logger, leagues, network from .utils import Cache, Time, get_logger, leagues, network
@ -16,95 +16,101 @@ TAG = "STRMSGATE"
CACHE_FILE = Cache(TAG, exp=10_800) CACHE_FILE = Cache(TAG, exp=10_800)
BASE_URL = "https://streamingon.org/index.php" API_FILE = Cache(f"{TAG}-api", exp=19_800)
BASE_URL = "https://streamingon.org"

# One JSON schedule feed per sport, served from the site's /data/ directory.
# Commented-out entries are intentionally disabled, not forgotten.
SPORT_URLS = [
    urljoin(BASE_URL, f"data/{sport}.json")
    for sport in (
        # "cfb",
        "mlb",
        "nba",
        # "nfl",
        "nhl",
        "soccer",
        "ufc",
    )
]
def get_event(t1: str, t2: str) -> str:
    """Return the display name for a matchup.

    A couple of feed labels are not real team pairs and map to fixed
    names; everything else becomes "<away> vs <home>" with surrounding
    whitespace trimmed from each side.
    """
    special_names = {"RED ZONE": "NFL RedZone", "TBD": "TBD"}
    if (name := special_names.get(t1)) is not None:
        return name
    return f"{t1.strip()} vs {t2.strip()}"
async def refresh_api_cache(now_ts: float) -> list[dict[str, Any]]:
    """Fetch every sport feed concurrently and flatten them into one list.

    Each event's own timestamp is moved from "timestamp" to "ts" so that
    the key "timestamp" is free to act as the cache-refresh marker: it is
    written with *now_ts* onto the final entry (or onto a lone placeholder
    dict when every fetch failed or returned nothing).
    """
    responses = await asyncio.gather(
        *(network.request(url, log=log) for url in SPORT_URLS)
    )

    flattened = list(chain.from_iterable(resp.json() for resp in responses if resp))
    if not flattened:
        # Nothing usable came back; cache only the marker entry.
        return [{"timestamp": now_ts}]

    # NOTE(review): assumes every feed entry carries a "timestamp" key —
    # a missing key would raise KeyError here; confirm against the feeds.
    for event in flattened:
        event["ts"] = event.pop("timestamp")
    flattened[-1]["timestamp"] = now_ts

    return flattened
async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
    """Collect upcoming events from the cached (or freshly refreshed) API data.

    Args:
        cached_keys: keys of events already cached, formatted as
            "[<sport>] <event> (<TAG>)"; matching events are skipped.

    Returns:
        A list of event dicts with "sport", "event", "link" and
        "timestamp" keys for events starting between one hour ago and
        five minutes from now that have at least one stream URL.
    """
    now = Time.clean(Time.now())

    # The API cache stores its refresh marker on the last entry; an empty
    # load means the cache is missing or expired.
    if not (api_data := API_FILE.load(per_entry=False, index=-1)):
        log.info("Refreshing API cache")
        api_data = await refresh_api_cache(now.timestamp())
        API_FILE.write(api_data)

    events = []

    start_dt = now.delta(hours=-1)
    end_dt = now.delta(minutes=5)

    for stream_group in api_data:
        date = stream_group.get("time")
        sport = stream_group.get("league")

        # Validate before building the event name (previously get_event()
        # ran first, so a group missing "away"/"home" crashed on
        # None.strip()). Missing team names are coerced to "" instead.
        if not (date and sport):
            continue

        t1, t2 = stream_group.get("away"), stream_group.get("home")
        event = get_event(t1 or "", t2 or "")

        if f"[{sport}] {event} ({TAG})" in cached_keys:
            continue

        # Feed times are UTC strings — presumably ISO-like; Time.from_str
        # handles the parsing.
        event_dt = Time.from_str(date, timezone="UTC")

        if not start_dt <= event_dt <= end_dt:
            continue

        if not (streams := stream_group.get("streams")):
            continue

        # Only the first listed stream is used.
        if not (url := streams[0].get("url")):
            continue

        events.append(
            {
                "sport": sport,
                "event": event,
                "link": url,
                "timestamp": event_dt.timestamp(),
            }
        )

    return events
@ -167,6 +173,8 @@ async def scrape(browser: Browser) -> None:
if url: if url:
valid_count += 1 valid_count += 1
entry["url"] = url.split("&e")[0]
urls[key] = entry urls[key] = entry
log.info(f"Collected and cached {valid_count - cached_count} new event(s)") log.info(f"Collected and cached {valid_count - cached_count} new event(s)")