From 00000d9e294441ba19b2677d148b450ade61c85b Mon Sep 17 00:00:00 2001 From: doms9 <96013514+doms9@users.noreply.github.com> Date: Fri, 27 Feb 2026 18:35:33 -0500 Subject: [PATCH] e edit scraping method for webcast.py --- M3U8/fetch.py | 2 +- M3U8/scrapers/utils/config.py | 2 +- M3U8/scrapers/utils/webwork.py | 5 +- M3U8/scrapers/webcast.py | 119 ++++++++++++++++++++++----------- 4 files changed, 84 insertions(+), 44 deletions(-) diff --git a/M3U8/fetch.py b/M3U8/fetch.py index 910aa988..480da8c0 100644 --- a/M3U8/fetch.py +++ b/M3U8/fetch.py @@ -71,7 +71,6 @@ async def main() -> None: asyncio.create_task(streamcenter.scrape(xtrnl_brwsr)), asyncio.create_task(streamhub.scrape(xtrnl_brwsr)), asyncio.create_task(streamsgate.scrape(xtrnl_brwsr)), - asyncio.create_task(webcast.scrape(hdl_brwsr)), ] httpx_tasks = [ @@ -84,6 +83,7 @@ async def main() -> None: asyncio.create_task(tvapp.scrape()), asyncio.create_task(volokit.scrape()), asyncio.create_task(xstreameast.scrape()), + asyncio.create_task(webcast.scrape()), ] await asyncio.gather(*(pw_tasks + httpx_tasks)) diff --git a/M3U8/scrapers/utils/config.py b/M3U8/scrapers/utils/config.py index 3bd8b603..4a43ae6d 100644 --- a/M3U8/scrapers/utils/config.py +++ b/M3U8/scrapers/utils/config.py @@ -149,7 +149,7 @@ class Leagues: league: str, ) -> bool: - pattern = re.compile(r"\s+(?:-|vs\.?|at|@)\s+", re.IGNORECASE) + pattern = re.compile(r"\s+(?:-|vs\.?|at|@)\s+", re.I) if pattern.search(event): t1, t2 = re.split(pattern, event) diff --git a/M3U8/scrapers/utils/webwork.py b/M3U8/scrapers/utils/webwork.py index 5b4c93b2..7d502a6f 100644 --- a/M3U8/scrapers/utils/webwork.py +++ b/M3U8/scrapers/utils/webwork.py @@ -215,10 +215,7 @@ class Network: escaped = [re.escape(i) for i in invalids] - pattern = re.compile( - rf"^(?!.*({'|'.join(escaped)})).*\.m3u8", - re.IGNORECASE, - ) + pattern = re.compile(rf"^(?!.*({'|'.join(escaped)})).*\.m3u8", re.I) if pattern.search(req.url): captured.append(req.url) diff --git a/M3U8/scrapers/webcast.py b/M3U8/scrapers/webcast.py index a2477424..d0d0aa7a 100644 --- a/M3U8/scrapers/webcast.py +++ b/M3U8/scrapers/webcast.py @@ -1,7 +1,7 @@ import asyncio +import re from functools import partial -from playwright.async_api import Browser from selectolax.parser import HTMLParser from .utils import Cache, Time, get_logger, leagues, network @@ -27,6 +27,47 @@ def fix_event(s: str) -> str: return " vs ".join(s.split("@")) +async def process_event(url: str, url_num: int) -> str | None: + if not (event_data := await network.request(url, log=log)): + log.info(f"URL {url_num}) Failed to load url.") + + return + + soup = HTMLParser(event_data.content) + + if not (iframe := soup.css_first('iframe[name="srcFrame"]')): + log.warning(f"URL {url_num}) No iframe element found.") + + return + + if not (iframe_src := iframe.attributes.get("src")): + log.warning(f"URL {url_num}) No iframe source found.") + + return + + if not ( + iframe_src_data := await network.request( + iframe_src, + headers={"Referer": url}, + log=log, + ) + ): + log.info(f"URL {url_num}) Failed to load iframe source.") + + return + + pattern = re.compile(r"source:\s+(\'|\")(.*)(\'|\")", re.I) + + if not (match := pattern.search(iframe_src_data.text)): + log.warning(f"URL {url_num}) No Clappr source found.") + + return + + log.info(f"URL {url_num}) Captured M3U8") + + return match[2] + + async def refresh_html_cache(url: str) -> dict[str, dict[str, str | float]]: events = {} @@ -112,12 +153,14 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: return live -async def scrape(browser: Browser) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() - cached_count = len(cached_urls) + valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} - urls.update(cached_urls) + valid_count = cached_count = len(valid_urls) + + urls.update(valid_urls) log.info(f"Loaded {cached_count} event(s) from cache") @@ -128,47 +171,47 @@ async def scrape(browser: Browser) -> None: if events: log.info(f"Processing {len(events)} new URL(s)") - async with network.event_context(browser) as context: - for i, ev in enumerate(events, start=1): - async with network.event_page(context) as page: - handler = partial( - network.process_event, - url=(link := ev["link"]), - url_num=i, - page=page, - log=log, - ) + for i, ev in enumerate(events, start=1): + handler = partial( + process_event, + url=(link := ev["link"]), + url_num=i, + ) - url = await network.safe_process( - handler, - url_num=i, - semaphore=network.PW_S, - log=log, - ) + url = await network.safe_process( + handler, + url_num=i, + semaphore=network.PW_S, + log=log, + ) - if url: - sport, event, ts = ( - ev["sport"], - ev["event"], - ev["event_ts"], - ) + sport, event, ts = ( + ev["sport"], + ev["event"], + ev["event_ts"], + ) - key = f"[{sport}] {event} ({TAG})" + key = f"[{sport}] {event} ({TAG})" - tvg_id, logo = leagues.get_tvg_info(sport, event) + tvg_id, logo = leagues.get_tvg_info(sport, event) - entry = { - "url": url, - "logo": logo, - "base": BASE_URLS[sport], - "timestamp": ts, - "id": tvg_id or "Live.Event.us", - "link": link, - } + entry = { + "url": url, + "logo": logo, + "base": BASE_URLS[sport], + "timestamp": ts, + "id": tvg_id or "Live.Event.us", + "link": link, + } - urls[key] = cached_urls[key] = entry + cached_urls[key] = entry - if new_count := len(cached_urls) - cached_count: + if url: + valid_count += 1 + + urls[key] = entry + + if new_count := valid_count - cached_count: log.info(f"Collected and cached {new_count} new event(s)") else: