cache all events for streamhub instead of live events
doms9 2025-12-15 02:06:46 -05:00
parent f755ffc78b
commit 00000d9cc1
2 changed files with 111 additions and 64 deletions
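The new HTML cache goes through the same small Cache helper already used for CACHE_FILE; its implementation is not part of this diff. Purely for orientation, here is a minimal sketch of a TTL-backed JSON cache with the load()/write() surface the code below relies on (names and behaviour are assumptions inferred from usage, not the project's actual helper):

# Hypothetical sketch only -- the real Cache helper is not shown in this commit.
# Assumed behaviour (from usage below): load() returns {} when the file is
# missing or older than `exp` seconds, write() persists a JSON-serialisable dict.
import json
import time
from pathlib import Path

class Cache:
    def __init__(self, name: str, exp: int) -> None:
        self.path = Path(name)
        self.exp = exp

    def load(self) -> dict:
        if not self.path.exists():
            return {}
        if time.time() - self.path.stat().st_mtime > self.exp:
            return {}  # an expired file counts as a cache miss
        return json.loads(self.path.read_text())

    def write(self, data: dict) -> None:
        self.path.write_text(json.dumps(data))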

View file

@@ -57,6 +57,18 @@ async def process_event(
     return match[1]


+async def get_html_data(client: httpx.AsyncClient, url: str) -> bytes:
+    try:
+        r = await client.get(url)
+        r.raise_for_status()
+
+    except Exception as e:
+        log.error(f'Failed to fetch "{url}": {e}')
+        return b""
+
+    return r.content
+
+
 async def refresh_html_cache(
     client: httpx.AsyncClient,
     url: str,
@@ -64,15 +76,9 @@ async def refresh_html_cache(
     now_ts: float,
 ) -> dict[str, dict[str, str | float]]:
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-
-    except Exception as e:
-        log.error(f'Failed to fetch "{url}": {e}')
-        return {}
-
-    soup = HTMLParser(r.content)
+    html_data = await get_html_data(client, url)
+
+    soup = HTMLParser(html_data)

     events = {}
@@ -108,16 +114,15 @@ async def refresh_html_cache(
 async def get_events(
-    client: httpx.AsyncClient,
-    sport_urls: dict[str, str],
-    cached_keys: set[str],
+    client: httpx.AsyncClient, cached_keys: set[str]
 ) -> list[dict[str, str]]:
     now = Time.clean(Time.now())

     if not (events := HTML_CACHE.load()):
         log.info("Refreshing HTML cache")

+        sport_urls = {sport: urljoin(BASE_URL, sport) for sport in SPORT_ENDPOINTS}
+
         tasks = [
             refresh_html_cache(
                 client,
@@ -160,13 +165,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')

-    sport_urls = {sport: urljoin(BASE_URL, sport) for sport in SPORT_ENDPOINTS}
-
-    events = await get_events(
-        client,
-        sport_urls,
-        set(cached_urls.keys()),
-    )
+    events = await get_events(client, set(cached_urls.keys()))

     log.info(f"Processing {len(events)} new URL(s)")

View file

@@ -1,5 +1,6 @@
 import asyncio
 from functools import partial
+from urllib.parse import urljoin

 import httpx
 from playwright.async_api import async_playwright
@@ -15,7 +16,9 @@ TAG = "STRMHUB"
 CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
+HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=28_800)

-BASE_URL = "https://streamhub.pro/live-now"
+BASE_URL = "https://streamhub.pro/"

 CATEGORIES = {
@@ -33,69 +36,115 @@ CATEGORIES = {
 }


-async def get_html_data(client: httpx.AsyncClient, sport: str) -> bytes:
+async def get_html_data(client: httpx.AsyncClient, sport_id: str) -> bytes:
     try:
-        r = await client.get(BASE_URL, params={"sport_id": sport})
+        url = urljoin(BASE_URL, f"events/{Time.now().date()}")
+        r = await client.get(url, params={"sport_id": sport_id})
         r.raise_for_status()

     except Exception as e:
-        log.error(f'Failed to fetch "{BASE_URL}": {e}')
+        log.error(f'Failed to fetch "{url}": {e}')
         return b""

     return r.content


-async def get_events(
-    client: httpx.AsyncClient, cached_keys: set[str]
-) -> list[dict[str, str]]:
-    tasks = [get_html_data(client, sport) for sport in CATEGORIES.values()]
-    results = await asyncio.gather(*tasks)
-
-    soups = [HTMLParser(html) for html in results]
-
-    events = []
-
-    for soup in soups:
-        for section in soup.css(".events-section"):
-            if not (sport_node := section.css_first(".section-titlte")):
-                continue
-
-            sport = sport_node.text(strip=True)
-
-            logo = section.css_first(".league-icon img").attributes.get("src")
-
-            for event in section.css(".section-event"):
-                event_name = "Live Event"
-
-                if teams := event.css_first(".event-competitors"):
-                    home, away = teams.text(strip=True).split("vs.")
-
-                    event_name = f"{away} vs {home}"
-
-                if not (event_button := event.css_first("div.event-button a")) or not (
-                    href := event_button.attributes.get("href")
-                ):
-                    continue
-
-                key = f"[{sport}] {event_name} ({TAG})"
-
-                if cached_keys & {key}:
-                    continue
-
-                events.append(
-                    {
-                        "sport": sport,
-                        "event": event_name,
-                        "link": href,
-                        "logo": logo,
-                    }
-                )
-
-    return events
+async def refresh_html_cache(
+    client: httpx.AsyncClient,
+    sport_id: str,
+    ts: float,
+) -> dict[str, dict[str, str | float]]:
+    html_data = await get_html_data(client, sport_id)
+
+    soup = HTMLParser(html_data)
+
+    events = {}
+
+    for section in soup.css(".events-section"):
+        if not (sport_node := section.css_first(".section-titlte")):
+            continue
+
+        sport = sport_node.text(strip=True)
+
+        logo = section.css_first(".league-icon img").attributes.get("src")
+
+        for event in section.css(".section-event"):
+            event_name = "Live Event"
+
+            if teams := event.css_first(".event-competitors"):
+                home, away = teams.text(strip=True).split("vs.")
+
+                event_name = f"{away} vs {home}"
+
+            if not (event_button := event.css_first(".event-button a")) or not (
+                href := event_button.attributes.get("href")
+            ):
+                continue
+
+            event_date = event.css_first(".event-countdown").attributes.get(
+                "data-start"
+            )
+
+            event_dt = Time.from_str(event_date, timezone="UTC")
+
+            key = f"[{sport}] {event_name} ({TAG})"
+
+            events[key] = {
+                "sport": sport,
+                "event": event_name,
+                "link": href,
+                "logo": logo,
+                "timestamp": ts,
+                "event_ts": event_dt.timestamp(),
+            }
+
+    return events
+
+
+async def get_events(
+    client: httpx.AsyncClient,
+    cached_keys: set[str],
+) -> list[dict[str, str]]:
+    now = Time.clean(Time.now())
+
+    if not (events := HTML_CACHE.load()):
+        log.info("Refreshing HTML cache")
+
+        tasks = [
+            refresh_html_cache(
+                client,
+                sport_id,
+                now.timestamp(),
+            )
+            for sport_id in CATEGORIES.values()
+        ]
+
+        results = await asyncio.gather(*tasks)
+
+        events = {k: v for data in results for k, v in data.items()}
+
+        HTML_CACHE.write(events)
+
+    live = []
+
+    start_ts = now.delta(hours=-1).timestamp()
+    end_ts = now.delta(minutes=5).timestamp()
+
+    for k, v in events.items():
+        if cached_keys & {k}:
+            continue
+
+        if not start_ts <= v["event_ts"] <= end_ts:
+            continue
+
+        live.append({**v})
+
+    return live


 async def scrape(client: httpx.AsyncClient) -> None:
     cached_urls = CACHE_FILE.load()
     valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
@@ -111,8 +160,6 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f"Processing {len(events)} new URL(s)")

     if events:
-        now = Time.now().timestamp()
-
         async with async_playwright() as p:
             browser, context = await network.browser(p)
@@ -132,11 +179,12 @@ async def scrape(client: httpx.AsyncClient) -> None:
                     log=log,
                 )

-                sport, event, logo, link = (
+                sport, event, logo, link, ts = (
                     ev["sport"],
                     ev["event"],
                     ev["logo"],
                     ev["link"],
+                    ev["timestamp"],
                 )

                 key = f"[{sport}] {event} ({TAG})"
@@ -147,7 +195,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
                     "url": url,
                     "logo": logo or pic,
                     "base": "https://storytrench.net/",
-                    "timestamp": now,
+                    "timestamp": ts,
                     "id": tvg_id or "Live.Event.us",
                     "link": link,
                 }
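With the whole schedule cached, the new get_events() only surfaces entries whose kickoff falls inside a narrow window around the current time (one hour back to five minutes ahead, per now.delta(hours=-1) and now.delta(minutes=5)). A minimal standalone illustration of that window check, using plain datetime in place of the project's Time helper (an assumption, since Time is not shown in this diff):

# Standalone illustration of the live-window filter in the new get_events();
# the project's Time helper is not shown here, so plain datetime stands in.
from datetime import datetime, timedelta, timezone

now = datetime.now(timezone.utc)
start_ts = (now - timedelta(hours=1)).timestamp()   # kicked off up to an hour ago
end_ts = (now + timedelta(minutes=5)).timestamp()   # or starting within five minutes

cached = {
    "[Soccer] Team A vs Team B (STRMHUB)": {"event_ts": now.timestamp()},
    "[Soccer] Team C vs Team D (STRMHUB)": {"event_ts": (now + timedelta(hours=6)).timestamp()},
}

live = [k for k, v in cached.items() if start_ts <= v["event_ts"] <= end_ts]
print(live)  # only the first event falls inside the live window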