diff --git a/M3U8/scrape/livetvsx.py b/M3U8/scrape/livetvsx.py
index 51fc7fa..87e7e53 100644
--- a/M3U8/scrape/livetvsx.py
+++ b/M3U8/scrape/livetvsx.py
@@ -97,67 +97,6 @@ async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO:
     return io.BytesIO(b"")
 
 
-async def parse_feed(
-    url: str,
-    ssl_ctx: ssl.SSLContext,
-    cached_keys: set[str],
-) -> list[dict[str, str]]:
-
-    events: list[dict[str, str]] = []
-    pub_date_format = "%a, %d %b %Y %H:%M:%S %z"
-    now = datetime.now(TZ)
-
-    window_start, window_end = now - timedelta(hours=3), now + timedelta(hours=1)
-
-    buffer = await fetch_xml_stream(url, ssl_ctx)
-
-    for _, elem in ET.iterparse(buffer, events=("end",)):
-        if elem.tag == "item":
-            title = elem.findtext("title")
-            desc = elem.findtext("description")
-            pub_date = elem.findtext("pubDate")
-            link = elem.findtext("link")
-
-            try:
-                dt = datetime.strptime(pub_date, pub_date_format)
-                dt = dt.astimezone(TZ)
-            except Exception:
-                elem.clear()
-                continue
-
-            if window_start <= dt <= window_end:
-                sport, event = (
-                    (
-                        desc.split(".")[0].strip(),
-                        " ".join(p.strip() for p in desc.split(".")[1:]),
-                    )
-                    if desc
-                    else ("", "")
-                )
-
-                key = f"[{sport}: {event}] {title}"
-
-                if key in cached_keys:
-                    elem.clear()
-                    continue
-
-                if exist_sprts & {sport, event}:
-                    continue
-
-                events.append(
-                    {
-                        "sport": sport,
-                        "event": event,
-                        "title": title,
-                        "link": link,
-                    }
-                )
-
-        elem.clear()
-
-    return events
-
-
 async def process_event(url: str, url_num: int) -> str | None:
     async with async_playwright() as p:
         browser = await p.firefox.launch(headless=True)
@@ -176,10 +115,8 @@ async def process_event(url: str, url_num: int) -> str | None:
                 ".m3u8" in req.url
                 and "amazonaws" not in req.url
                 and "knitcdn" not in req.url
-                and not captured
             ):
                 captured.append(req.url)
-                got_one.set()
 
         popup = None
 
@@ -284,6 +221,67 @@ async def process_event(url: str, url_num: int) -> str | None:
         await browser.close()
 
 
+async def get_events(
+    url: str,
+    ssl_ctx: ssl.SSLContext,
+    cached_keys: set[str],
+) -> list[dict[str, str]]:
+
+    events: list[dict[str, str]] = []
+    pub_date_format = "%a, %d %b %Y %H:%M:%S %z"
+    now = datetime.now(TZ)
+
+    window_start, window_end = now - timedelta(hours=3), now + timedelta(hours=1)
+
+    buffer = await fetch_xml_stream(url, ssl_ctx)
+
+    for _, elem in ET.iterparse(buffer, events=("end",)):
+        if elem.tag == "item":
+            title = elem.findtext("title")
+            desc = elem.findtext("description")
+            pub_date = elem.findtext("pubDate")
+            link = elem.findtext("link")
+
+            try:
+                dt = datetime.strptime(pub_date, pub_date_format)
+                dt = dt.astimezone(TZ)
+            except Exception:
+                elem.clear()
+                continue
+
+            if window_start <= dt <= window_end:
+                sport, event = (
+                    (
+                        desc.split(".")[0].strip(),
+                        " ".join(p.strip() for p in desc.split(".")[1:]),
+                    )
+                    if desc
+                    else ("", "")
+                )
+
+                key = f"[{sport}: {event}] {title}"
+
+                if key in cached_keys:
+                    elem.clear()
+                    continue
+
+                if exist_sprts & {sport, event}:
+                    continue
+
+                events.append(
+                    {
+                        "sport": sport,
+                        "event": event,
+                        "title": title,
+                        "link": link,
+                    }
+                )
+
+        elem.clear()
+
+    return events
+
+
 async def main(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')
 
@@ -292,13 +290,13 @@ async def main(client: httpx.AsyncClient) -> None:
     cached_urls = load_cache()
     cached_count = len(cached_urls)
 
-    events = await parse_feed(BASE_URL, cert, set(cached_urls.keys()))
+    events = await get_events(BASE_URL, cert, set(cached_urls.keys()))
 
     log.info(f"Processing {len(events)} URLs")
 
     now_ts = datetime.now(TZ).timestamp()
 
-    for num, ev in enumerate(events, start=1):
+    for i, ev in enumerate(events, start=1):
         sport = ev["sport"]
         event = ev["event"]
         title = ev["title"]
@@ -307,8 +305,8 @@ async def main(client: httpx.AsyncClient) -> None:
         key = f"[{sport}: {event}] {title}"
 
         url = await safe_process_event(
-            lambda: process_event(link, url_num=num),
-            url_num=num,
+            lambda: process_event(link, url_num=i),
+            url_num=i,
             log=log,
         )
 
@@ -330,4 +328,4 @@ async def main(client: httpx.AsyncClient) -> None:
 
     log.info(f"Cached {cached_count} event(s)")
 
-    log.info(f"Collected {new_count} event(s)")
+    log.info(f"Collected {new_count} new event(s)")
diff --git a/M3U8/scrape/ppv.py b/M3U8/scrape/ppv.py
index e445ef8..dcc41bb 100644
--- a/M3U8/scrape/ppv.py
+++ b/M3U8/scrape/ppv.py
@@ -71,7 +71,6 @@ async def process_event(url: str, url_num: int) -> str | None:
                 ".m3u8" in req.url
                 and "amazonaws" not in req.url
                 and "knitcdn" not in req.url
-                and not captured
             ):
                 captured.append(req.url)
                 got_one.set()
@@ -119,12 +118,15 @@ async def process_event(url: str, url_num: int) -> str | None:
 async def get_events(
     client: httpx.AsyncClient,
     api_url: str,
-    cached_keys: list[str],
+    cached_keys: set[str],
 ) -> dict[str, dict[str, str | str]]:
-    events = []
+
+    events: list[dict[str, str]] = []
 
     base_url = re.match(r"(https?://.+?)/", api_url)[1]
 
+    now = datetime.now(TZ)
+
     if not (api_data := load_api_cache()):
         api_data = await refresh_api_cache(client, api_url)
         API_FILE.write_text(json.dumps(api_data, indent=2), encoding="utf-8")
@@ -156,7 +158,7 @@ async def get_events(
 
         end_dt = datetime.fromtimestamp(end_ts, tz=TZ)
 
-        if not start_dt <= datetime.now(TZ) < end_dt:
+        if not start_dt <= now < end_dt:
             continue
 
         events.append(
@@ -189,10 +191,10 @@ async def main(client: httpx.AsyncClient) -> None:
 
     log.info(f"Processing {len(events)} URLs")
 
-    for num, ev in enumerate(events, start=1):
+    for i, ev in enumerate(events, start=1):
         url = await safe_process_event(
-            lambda: process_event(ev["link"], url_num=num),
-            url_num=num,
+            lambda: process_event(ev["link"], url_num=i),
+            url_num=i,
             log=log,
         )
 
@@ -212,7 +214,7 @@ async def main(client: httpx.AsyncClient) -> None:
 
     log.info(f"Cached {cached_count} event(s)")
 
-    log.info(f"Collected {new_count} event(s)")
+    log.info(f"Collected {new_count} new event(s)")
 
 
 # works if no cloudflare bot detection
diff --git a/M3U8/scrape/tvpass.py b/M3U8/scrape/tvpass.py
index fa37182..d15f92e 100644
--- a/M3U8/scrape/tvpass.py
+++ b/M3U8/scrape/tvpass.py
@@ -19,7 +19,6 @@ CACHE_FILE = Path(__file__).parent / "caches" / "tvpass.json"
 def load_cache() -> dict[str, str]:
     try:
         data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
-        return {} if 8 <= datetime.now(TZ).hour <= 12 else data
     except (FileNotFoundError, json.JSONDecodeError):
         return {}
 
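
Note on the request-capture change: both livetvsx.py and ppv.py drop the "and not captured" guard from the Playwright request handler, so every matching .m3u8 request is now appended to captured instead of only the first one. The sketch below is a minimal, self-contained illustration of that pattern, not the repo's actual process_event; the function name capture_m3u8, the wait_s timeout, and the standalone asyncio.Event waiter are illustrative assumptions. It mirrors the ppv.py variant, which still sets got_one on the first hit.

import asyncio

from playwright.async_api import async_playwright


async def capture_m3u8(url: str, wait_s: float = 15.0) -> list[str]:
    # Collects every matching .m3u8 request seen on the page, not just the first.
    captured: list[str] = []
    got_one = asyncio.Event()

    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        page = await browser.new_page()

        def on_request(req) -> None:
            # Same filter as in the diff: keep .m3u8 URLs, skip known CDN hosts.
            if (
                ".m3u8" in req.url
                and "amazonaws" not in req.url
                and "knitcdn" not in req.url
            ):
                captured.append(req.url)
                got_one.set()  # wake the waiter on the first hit

        page.on("request", on_request)
        await page.goto(url, wait_until="domcontentloaded")

        try:
            # Wait for at least one hit, then give the page a moment to fire
            # any additional .m3u8 requests before shutting down.
            await asyncio.wait_for(got_one.wait(), timeout=wait_s)
            await asyncio.sleep(2)
        except asyncio.TimeoutError:
            pass

        await browser.close()

    return captured

A caller would use it as, for example, urls = asyncio.run(capture_m3u8("https://example.com/event-page")) and then pick or deduplicate from the full list of captured URLs.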