doms9 2025-09-03 00:00:22 -04:00
parent a02d30459a
commit 00000d9855
5 changed files with 98 additions and 42 deletions

View file

@@ -50,7 +50,7 @@ async def get_schedule(client: httpx.AsyncClient, base_url: str) -> list[dict]:
)
if " - " in after_time:
sport, event_name = [x.strip() for x in after_time.split(" - ", 1)]
sport, event_name = (x.strip() for x in after_time.split(" - ", 1))
else:
sport, event_name = "", after_time
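
Note on the change above: the right-hand side switches from a list comprehension to a generator expression; tuple unpacking consumes either one the same way, the generator just skips building a throwaway list. A minimal check, using a made-up schedule string:

```python
# Both spellings unpack identically; the generator expression avoids
# materialising an intermediate list. The sample string is made up.
after_time = "Soccer - Premier League: Arsenal vs Chelsea"

sport, event_name = [x.strip() for x in after_time.split(" - ", 1)]  # old
sport, event_name = (x.strip() for x in after_time.split(" - ", 1))  # new

assert (sport, event_name) == ("Soccer", "Premier League: Arsenal vs Chelsea")
```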

View file

@@ -131,4 +131,7 @@ async def main(client: httpx.AsyncClient) -> None:
),
}
log.info(f"Collected {len(urls)} live events")
log.info(f"Collected {len(urls)} live event(s)")
# add caching
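
The `# add caching` marker above is only a TODO in this file. If it ends up following the pattern this same commit implements for livetvsx.py below (a JSON file keyed by event title, each entry stamped on write and dropped after 4 hours on load), it might look roughly like this; the file name and `save_cache` helper are assumptions, not code from the repo:

```python
# Hypothetical sketch of the "# add caching" TODO, mirroring the
# load_cache()/CACHE_FILE pattern added to livetvsx.py in this commit.
# CACHE_FILE's name and save_cache() are assumptions.
import json
from datetime import datetime, timedelta
from pathlib import Path

CACHE_FILE = Path(__file__).parent / "live-events.json"  # assumed name


def load_cache() -> dict[str, dict]:
    try:
        data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
    now = datetime.now().timestamp()
    return {
        k: v
        for k, v in data.items()
        if now - v.get("timestamp", 0) < timedelta(hours=4).total_seconds()
    }


def save_cache(entries: dict[str, dict]) -> None:
    CACHE_FILE.write_text(json.dumps(entries, indent=2), encoding="utf-8")
```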

View file

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import asyncio
import io
import json
import ssl
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
@@ -30,12 +31,15 @@ CERT_BUNDL_URLS = [
CERT_FILE = Path(__file__).parent / "cached-ca.pem"
CACHE_FILE = Path(__file__).parent / "livetvsx.json"
async def safe_process_event(fn, timeout_sec=20) -> Any | None:
async def safe_process_event(fn, url_num: int, timeout=20) -> Any | None:
try:
return await asyncio.wait_for(fn(), timeout=timeout_sec)
return await asyncio.wait_for(fn(), timeout=timeout)
except asyncio.TimeoutError:
log.warning(f"Timed out after {timeout_sec}s, skipping event")
log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event")
return
async def write_to_cert(client: httpx.AsyncClient, url: str, cert: Path) -> None:
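
`safe_process_event` now also takes the event's position in the batch, so timeout warnings say which URL was skipped; the first argument is still a zero-argument coroutine factory, created lazily inside `asyncio.wait_for`. A small usage sketch, assuming this module's `safe_process_event` and `log` are in scope and using a made-up `slow_job` coroutine:

```python
# Usage sketch: the lambda defers coroutine creation until wait_for runs it,
# and a timeout yields None plus a numbered warning instead of raising.
# slow_job is a stand-in, not part of the repo.
import asyncio


async def slow_job() -> str:
    await asyncio.sleep(30)  # longer than the timeout below
    return "never reached"


async def demo() -> None:
    result = await safe_process_event(lambda: slow_job(), url_num=1, timeout=2)
    assert result is None  # logged as: "URL 1) Timed out after 2s, skipping event"


# asyncio.run(demo())
```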
@@ -71,6 +75,21 @@ async def get_cert(client: httpx.AsyncClient) -> ssl.SSLContext:
return ssl.create_default_context(cafile=CERT_FILE)
def load_cache() -> dict[str, dict[str, str | float]]:
try:
data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
now = datetime.now().timestamp()
return {
k: v
for k, v in data.items()
if now - v.get("timestamp", 0) < timedelta(hours=4).total_seconds()
}
except (FileNotFoundError, json.JSONDecodeError):
return {}
async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO:
buffer = io.BytesIO()
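
`load_cache()` keeps only entries whose `timestamp` is less than 4 hours old and quietly falls back to an empty dict when livetvsx.json is missing or unparsable. For reference, an entry written later in `main()` has this shape; the key text, stream URL, and timestamp below are illustrative, only the fallback logo URL is taken from the code:

```python
# Illustrative shape of a livetvsx.json entry. Keys follow the
# "[sport: event] title" format built in main(); "timestamp" is
# datetime.now().timestamp() at write time and drives the 4-hour expiry.
example_cache = {
    "[Football: Premier League] Arsenal vs Chelsea": {
        "url": "https://example.com/stream/master.m3u8",
        "logo": "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
        "timestamp": 1756872022.0,
    },
}
```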
@@ -91,8 +110,13 @@ async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO:
return io.BytesIO(b"")
async def parse_feed(url: str, ssl_ctx: ssl.SSLContext) -> dict[str, dict[str, str]]:
events = []
async def parse_feed(
url: str,
ssl_ctx: ssl.SSLContext,
cached_keys: set[str],
) -> list[dict[str, str]]:
events: list[dict[str, str]] = []
pub_date_format = "%a, %d %b %Y %H:%M:%S %z"
now = datetime.now(TZ)
@@ -124,21 +148,28 @@ async def parse_feed(url: str, ssl_ctx: ssl.SSLContext) -> dict[str, dict[str, s
else ("", "")
)
events.append(
{
"sport": sport,
"event": event,
"title": title,
"link": link,
}
)
key = f"[{sport}: {event}] {title}"
if key in cached_keys:
elem.clear()
continue
elif not tvp_sports & {sport, event}:
events.append(
{
"sport": sport,
"event": event,
"title": title,
"link": link,
}
)
elem.clear()
return events
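
`parse_feed` now receives the set of already-cached keys, so events captured on a previous run are skipped before Playwright is ever launched for them, and `elem.clear()` keeps memory flat while the feed is streamed. A rough standalone sketch of that skip-and-clear pattern, assuming the RSS items are walked with `xml.etree.ElementTree.iterparse` over the buffer from `fetch_xml_stream` (the iteration itself isn't visible in this hunk, and the key/field extraction is simplified):

```python
# Rough sketch of the dedup + memory pattern, under the assumption that the
# RSS items are streamed with ET.iterparse over the BytesIO buffer returned
# by fetch_xml_stream. The field extraction and key format are simplified.
import io
import xml.etree.ElementTree as ET


def iter_new_events(buffer: io.BytesIO, cached_keys: set[str]) -> list[dict[str, str]]:
    events: list[dict[str, str]] = []
    for _, elem in ET.iterparse(buffer, events=("end",)):
        if elem.tag != "item":
            continue
        title = elem.findtext("title", default="")
        link = elem.findtext("link", default="")
        key = title  # the real key is "[sport: event] title"
        if key not in cached_keys:
            events.append({"title": title, "link": link})
        elem.clear()  # drop parsed children so large feeds stay cheap
    return events
```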
async def process_event(url: str, max_wait_ms=15_000) -> str | None:
async def process_event(url: str, url_num: int, max_wait_ms=15_000) -> str | None:
async with async_playwright() as p:
browser = await p.firefox.launch(headless=True)
@@ -179,16 +210,18 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None:
await ev_page.wait_for_timeout(500)
except Exception as e:
log.debug(f"Failed to click Browser Links tab: {e}")
log.debug(f"URL {url_num}) Failed to click Browser Links tab: {e}")
return
else:
log.warning("Browser Links tab not found")
log.warning(f"URL {url_num}) Browser Links tab not found")
link_img = await ev_page.query_selector(
"tr:nth-child(2) > td:nth-child(1) td:nth-child(6) img"
)
if not link_img:
log.warning("No browser link to click.")
log.warning(f"URL {url_num}) No browser link to click.")
return
ev_page.on("request", capture_req)
@@ -198,7 +231,7 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None:
await link_img.click()
except Exception as e:
log.debug(
f"Click failed (popup might have already been opened): {e}"
f"URL {url_num}) Click failed (popup might have already been opened): {e}"
)
popup = await popup_info.value
@@ -209,7 +242,8 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None:
try:
await link_img.click()
except Exception as e:
log.debug(f"Fallback click failed: {e}")
log.debug(f"URL {url_num}) Fallback click failed: {e}")
return
wait_task = asyncio.create_task(got_one.wait())
@@ -217,7 +251,8 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None:
await asyncio.wait_for(wait_task, timeout=max_wait_ms / 1000)
except asyncio.TimeoutError:
log.warning("Timed out waiting for m3u8.")
log.warning(f"URL {url_num}) Timed out waiting for m3u8.")
return
finally:
if not wait_task.done():
@@ -238,10 +273,12 @@ async def process_event(url: str, max_wait_ms=15_000) -> str | None:
await ev_page.close()
if captured:
log.info(f"URL {url_num}) Captured M3U8")
return captured[-1]
log.warning("No m3u8 captured in popup or inline playback.")
log.warning(f"URL {url_num}) No m3u8 captured in popup or inline playback.")
return
except Exception as e:
try:
ev_page.remove_listener("request", capture_req)
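
The `url_num` threaded through `process_event` only prefixes the log lines with the event's position in the batch; the capture itself still works by listening for `request` events on the popup or page and keeping any URL containing `.m3u8`. A stripped-down sketch of that pattern with Playwright's async API; the page URL and flow here are placeholders rather than the repo's exact logic (the real function also handles the Browser Links tab, the popup window, and inline-playback fallbacks):

```python
# Stripped-down sketch of the request-capture pattern used in process_event:
# register a "request" listener, load the page, and wait (with a deadline)
# until an .m3u8 request shows up. The URL here is a placeholder.
import asyncio

from playwright.async_api import async_playwright


async def capture_m3u8(page_url: str, max_wait_ms: int = 15_000) -> str | None:
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        page = await browser.new_page()
        captured: list[str] = []
        got_one = asyncio.Event()

        def capture_req(request) -> None:
            if ".m3u8" in request.url:
                captured.append(request.url)
                got_one.set()

        page.on("request", capture_req)
        await page.goto(page_url)
        try:
            await asyncio.wait_for(got_one.wait(), timeout=max_wait_ms / 1000)
        except asyncio.TimeoutError:
            return None
        finally:
            await browser.close()
        return captured[-1]
```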
@@ -263,29 +300,44 @@ async def main(client: httpx.AsyncClient) -> None:
cert = await get_cert(client)
events = await parse_feed(BASE_URL, cert)
cached_urls = load_cache()
cached_keys = set(cached_urls.keys())
cached_count = len(cached_urls)
log.info(f"Processing {len(events)} events")
events = await parse_feed(BASE_URL, cert, cached_keys)
for ev in events:
if tvp_sports & {
sport := ev["sport"],
event := ev["event"],
}: # already in tvpass
continue
log.info(f"Processing {len(events)} URLs")
url = await safe_process_event(lambda: process_event(ev["link"]))
now_ts = datetime.now().timestamp()
for num, ev in enumerate(events, start=1):
sport = ev["sport"]
event = ev["event"]
title = ev["title"]
link = ev["link"]
key = f"[{sport}: {event}] {title}"
url = await safe_process_event(
lambda: process_event(link, url_num=num), url_num=num
)
if url:
urls[f"[{sport}: {event}] {ev['title']}"] = {
entry = {
"url": url,
"logo": logos.get(
sport,
"https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
),
"timestamp": now_ts,
}
log.info(f"Collected {len(urls)} live events")
urls[key] = cached_urls[key] = entry
CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")
# add caching
new_count = len(cached_urls) - cached_count
log.info(f"Cached {cached_count} event(s)")
log.info(f"Collected {new_count} new event(s)")

View file

@@ -51,7 +51,7 @@ async def fetch_m3u8(client: httpx.AsyncClient) -> list[str] | None:
async def main(client: httpx.AsyncClient) -> None:
if cached := load_cache():
urls.update(cached)
log.info(f"Collected {len(urls)} events from cache")
log.info(f"Collected {len(urls)} event(s) from cache")
return
log.info(f'Scraping from "{base_url}"')
@@ -87,4 +87,4 @@ async def main(client: httpx.AsyncClient) -> None:
if urls:
base_file.write_text(json.dumps(urls, indent=2), encoding="utf-8")
log.info(f"Cached {len(urls)} events")
log.info(f"Cached {len(urls)} event(s)")