doms9 2025-09-03 18:41:07 -04:00
parent 26eac184ee
commit 00000d96a5
3 changed files with 76 additions and 77 deletions


@@ -97,67 +97,6 @@ async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO:
        return io.BytesIO(b"")


async def parse_feed(
    url: str,
    ssl_ctx: ssl.SSLContext,
    cached_keys: set[str],
) -> list[dict[str, str]]:
    events: list[dict[str, str]] = []
    pub_date_format = "%a, %d %b %Y %H:%M:%S %z"
    now = datetime.now(TZ)
    window_start, window_end = now - timedelta(hours=3), now + timedelta(hours=1)
    buffer = await fetch_xml_stream(url, ssl_ctx)
    for _, elem in ET.iterparse(buffer, events=("end",)):
        if elem.tag == "item":
            title = elem.findtext("title")
            desc = elem.findtext("description")
            pub_date = elem.findtext("pubDate")
            link = elem.findtext("link")
            try:
                dt = datetime.strptime(pub_date, pub_date_format)
                dt = dt.astimezone(TZ)
            except Exception:
                elem.clear()
                continue
            if window_start <= dt <= window_end:
                sport, event = (
                    (
                        desc.split(".")[0].strip(),
                        " ".join(p.strip() for p in desc.split(".")[1:]),
                    )
                    if desc
                    else ("", "")
                )
                key = f"[{sport}: {event}] {title}"
                if key in cached_keys:
                    elem.clear()
                    continue
                if exist_sprts & {sport, event}:
                    continue
                events.append(
                    {
                        "sport": sport,
                        "event": event,
                        "title": title,
                        "link": link,
                    }
                )
            elem.clear()
    return events


async def process_event(url: str, url_num: int) -> str | None:
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
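
The removed `parse_feed` (re-added below as `get_events`) streams the RSS feed with `xml.etree.ElementTree.iterparse` and calls `elem.clear()` on each finished `<item>`, so memory stays flat on large feeds. A minimal standalone sketch of that pattern; the in-memory feed bytes are illustrative, the real input comes from `fetch_xml_stream()`:

```python
import io
import xml.etree.ElementTree as ET

# Illustrative feed; the real one is fetched over the network.
SAMPLE = io.BytesIO(
    b"<rss><channel><item>"
    b"<title>Team A vs Team B</title>"
    b"<pubDate>Wed, 03 Sep 2025 18:00:00 -0400</pubDate>"
    b"</item></channel></rss>"
)

# Stream "end" events so each <item> can be read and released immediately.
for _, elem in ET.iterparse(SAMPLE, events=("end",)):
    if elem.tag == "item":
        print(elem.findtext("title"), elem.findtext("pubDate"))
        elem.clear()  # release children so the partial tree stays small
```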
@@ -176,10 +115,8 @@ async def process_event(url: str, url_num: int) -> str | None:
".m3u8" in req.url
and "amazonaws" not in req.url
and "knitcdn" not in req.url
and not captured
):
captured.append(req.url)
got_one.set()
popup = None
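
The hunk above filters network requests for an HLS manifest and signals a waiter once one is captured. A sketch of how that capture plausibly fits together with Playwright's async API; only the URL filter comes from the diff, while `TIMEOUT_S` and the surrounding structure are assumptions:

```python
import asyncio

from playwright.async_api import async_playwright

TIMEOUT_S = 15  # illustrative; the diff does not show the script's timeout


async def sniff_m3u8(url: str) -> str | None:
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        page = await browser.new_page()
        captured: list[str] = []
        got_one = asyncio.Event()

        def on_request(req) -> None:
            # Same filter as the diff: first .m3u8 not served by skipped hosts.
            if ".m3u8" in req.url and "amazonaws" not in req.url and not captured:
                captured.append(req.url)
                got_one.set()

        page.on("request", on_request)
        await page.goto(url)
        try:
            await asyncio.wait_for(got_one.wait(), timeout=TIMEOUT_S)
        except asyncio.TimeoutError:
            pass  # no manifest seen in time; fall through with empty capture
        await browser.close()
        return captured[0] if captured else None
```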
@@ -284,6 +221,67 @@ async def process_event(url: str, url_num: int) -> str | None:
        await browser.close()


async def get_events(
    url: str,
    ssl_ctx: ssl.SSLContext,
    cached_keys: set[str],
) -> list[dict[str, str]]:
    events: list[dict[str, str]] = []
    pub_date_format = "%a, %d %b %Y %H:%M:%S %z"
    now = datetime.now(TZ)
    window_start, window_end = now - timedelta(hours=3), now + timedelta(hours=1)
    buffer = await fetch_xml_stream(url, ssl_ctx)
    for _, elem in ET.iterparse(buffer, events=("end",)):
        if elem.tag == "item":
            title = elem.findtext("title")
            desc = elem.findtext("description")
            pub_date = elem.findtext("pubDate")
            link = elem.findtext("link")
            try:
                dt = datetime.strptime(pub_date, pub_date_format)
                dt = dt.astimezone(TZ)
            except Exception:
                elem.clear()
                continue
            if window_start <= dt <= window_end:
                sport, event = (
                    (
                        desc.split(".")[0].strip(),
                        " ".join(p.strip() for p in desc.split(".")[1:]),
                    )
                    if desc
                    else ("", "")
                )
                key = f"[{sport}: {event}] {title}"
                if key in cached_keys:
                    elem.clear()
                    continue
                if exist_sprts & {sport, event}:
                    continue
                events.append(
                    {
                        "sport": sport,
                        "event": event,
                        "title": title,
                        "link": link,
                    }
                )
            elem.clear()
    return events


async def main(client: httpx.AsyncClient) -> None:
    log.info(f'Scraping from "{BASE_URL}"')
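
`get_events` keeps only items whose RFC 822 `pubDate` falls inside a sliding window from three hours in the past to one hour ahead. The date handling in isolation, with `timezone.utc` standing in for the script's `TZ`:

```python
from datetime import datetime, timedelta, timezone

TZ = timezone.utc  # stand-in; the script defines its own TZ
PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"  # RFC 822, as in the diff


def in_window(pub_date: str) -> bool:
    """True if pub_date lies within [now - 3h, now + 1h]."""
    dt = datetime.strptime(pub_date, PUB_DATE_FORMAT).astimezone(TZ)
    now = datetime.now(TZ)
    return now - timedelta(hours=3) <= dt <= now + timedelta(hours=1)


print(in_window("Wed, 03 Sep 2025 18:41:07 -0400"))  # True only near that time
```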
@@ -292,13 +290,13 @@ async def main(client: httpx.AsyncClient) -> None:
    cached_urls = load_cache()
    cached_count = len(cached_urls)
    events = await parse_feed(BASE_URL, cert, set(cached_urls.keys()))
    events = await get_events(BASE_URL, cert, set(cached_urls.keys()))
    log.info(f"Processing {len(events)} URLs")
    now_ts = datetime.now(TZ).timestamp()
    for num, ev in enumerate(events, start=1):
    for i, ev in enumerate(events, start=1):
        sport = ev["sport"]
        event = ev["event"]
        title = ev["title"]
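
`load_cache()` is called but not shown in the diff; judging by `set(cached_urls.keys())` and the key check in `get_events`, it maps `[sport: event] title` strings to some per-event value. A toy reconstruction under that assumption; the JSON file path and format are hypothetical:

```python
import json
from pathlib import Path

CACHE_FILE = Path("cache.json")  # hypothetical path and format


def load_cache() -> dict[str, float]:
    """'[sport: event] title' -> timestamp; shape inferred, not shown in the diff."""
    if CACHE_FILE.exists():
        return json.loads(CACHE_FILE.read_text())
    return {}


cached_urls = load_cache()
cached_keys = set(cached_urls.keys())  # matches the call in main()
key = "[Soccer: Premier League] Team A vs Team B"
print("cached" if key in cached_keys else "new event")
```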
@@ -307,8 +305,8 @@ async def main(client: httpx.AsyncClient) -> None:
key = f"[{sport}: {event}] {title}"
url = await safe_process_event(
lambda: process_event(link, url_num=num),
url_num=num,
lambda: process_event(link, url_num=i),
url_num=i,
log=log,
)
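
`safe_process_event` is likewise not shown in the diff; it evidently wraps `process_event` so one failing URL cannot abort the loop. A hypothetical wrapper matching the call shape used above; the retry count and sleep are assumptions:

```python
import asyncio
import logging
from collections.abc import Awaitable, Callable


async def safe_process_event(
    factory: Callable[[], Awaitable[str | None]],
    url_num: int,
    log: logging.Logger,
    retries: int = 2,  # assumed; the real value is not in the diff
) -> str | None:
    """Run factory(), logging failures instead of raising, so one bad
    URL cannot abort main()'s loop. Hypothetical reconstruction."""
    for attempt in range(1, retries + 1):
        try:
            return await factory()
        except Exception as exc:
            log.warning(f"URL {url_num}, attempt {attempt} failed: {exc}")
            await asyncio.sleep(1)
    return None
```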
@@ -330,4 +328,4 @@ async def main(client: httpx.AsyncClient) -> None:
log.info(f"Cached {cached_count} event(s)")
log.info(f"Collected {new_count} event(s)")
log.info(f"Collected {new_count} new event(s)")