This commit is contained in:
parent
cb9d5637fc
commit
00000d905f
7 changed files with 224 additions and 93 deletions
@@ -4,13 +4,22 @@ import asyncio
 import json
 import re
 from datetime import datetime, timedelta
+from functools import partial
 from pathlib import Path
 from urllib.parse import urljoin
 
 import httpx
-from playwright.async_api import Request, async_playwright
+from playwright.async_api import async_playwright
 
-from .utils import TZ, get_base, get_logger, now, safe_process_event
+from .utils import (
+    TZ,
+    capture_req,
+    get_base,
+    get_logger,
+    load_ts_cache,
+    now,
+    safe_process_event,
+)
 
 log = get_logger(__name__)
 
@@ -20,7 +29,13 @@ API_FILE = Path(__file__).parent / "caches" / "ppv_api.json"
 
 CACHE_FILE = Path(__file__).parent / "caches" / "ppv.json"
 
-MIRRORS = ["https://ppv.to", "https://ppvs.su"]
+MIRRORS = [
+    "https://ppvs.su",
+    "https://ppv.to",
+    "https://ppv.wtf",
+    "https://ppv.land",
+    "https://freeppv.fun",
+]
 
 
 async def refresh_api_cache(client: httpx.AsyncClient, url: str) -> dict:
@ -36,21 +51,6 @@ async def refresh_api_cache(client: httpx.AsyncClient, url: str) -> dict:
|
|||
return r.json()
|
||||
|
||||
|
||||
def load_cache() -> dict[str, dict[str, str | float]]:
|
||||
try:
|
||||
data: dict[str, dict[str, str | float]] = json.loads(
|
||||
CACHE_FILE.read_text(encoding="utf-8")
|
||||
)
|
||||
|
||||
return {
|
||||
k: v
|
||||
for k, v in data.items()
|
||||
if now.timestamp() - data[k].get("timestamp", 0) < 14400 # 4 hours
|
||||
}
|
||||
except (FileNotFoundError, json.JSONDecodeError):
|
||||
return {}
|
||||
|
||||
|
||||
def load_api_cache() -> dict[str, dict[str, str | str]]:
|
||||
try:
|
||||
data: dict = json.loads(API_FILE.read_text(encoding="utf-8"))
|
||||
|
|
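The removed load_cache() is superseded by a shared load_ts_cache helper now imported from .utils and called later in this diff as load_ts_cache(CACHE_FILE, 14400). Its definition is not part of the visible hunks; a minimal sketch consistent with the old load_cache body and that call site (path and max-age in seconds as parameters) might look like the following. The original filter used the project's `now` object from .utils; time.time() stands in here to keep the sketch self-contained.

# Hypothetical sketch only: the real helper lives in .utils and is not shown here.
import json
import time
from pathlib import Path


def load_ts_cache(path: Path, max_age: float) -> dict[str, dict[str, str | float]]:
    """Load a JSON cache file, keeping only entries newer than max_age seconds."""
    try:
        data: dict[str, dict[str, str | float]] = json.loads(
            path.read_text(encoding="utf-8")
        )
        # Drop entries whose stored "timestamp" has aged past max_age.
        return {
            k: v
            for k, v in data.items()
            if time.time() - v.get("timestamp", 0) < max_age
        }
    except (FileNotFoundError, json.JSONDecodeError):
        return {}

Parameterizing the path and the expiry window lets the same helper back both per-scraper caches instead of each module hard-coding its own 4-hour constant.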
@ -74,16 +74,9 @@ async def process_event(url: str, url_num: int) -> str | None:
|
|||
|
||||
got_one = asyncio.Event()
|
||||
|
||||
def capture_req(req: Request) -> None:
|
||||
if (
|
||||
".m3u8" in req.url
|
||||
and "amazonaws" not in req.url
|
||||
and "knitcdn" not in req.url
|
||||
):
|
||||
captured.append(req.url)
|
||||
got_one.set()
|
||||
handler = partial(capture_req, captured=captured, got_one=got_one)
|
||||
|
||||
page.on("request", capture_req)
|
||||
page.on("request", handler)
|
||||
|
||||
try:
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=15_000)
|
||||
|
|
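The inline capture_req closure is replaced by a module-level helper in .utils, bound to this page's state through functools.partial. Its definition is not included in the visible hunks; judging from the old closure body and the keyword arguments bound in the partial, it plausibly looks like:

# Hypothetical sketch: the shared capture_req now lives in .utils and is not
# shown in this diff. Signature inferred from the old inline closure plus
# partial(capture_req, captured=..., got_one=...).
import asyncio

from playwright.async_api import Request


def capture_req(req: Request, captured: list[str], got_one: asyncio.Event) -> None:
    # Record any .m3u8 request not served from a CDN we want to skip,
    # then signal the waiting coroutine that a candidate URL arrived.
    if (
        ".m3u8" in req.url
        and "amazonaws" not in req.url
        and "knitcdn" not in req.url
    ):
        captured.append(req.url)
        got_one.set()

Binding the mutable captured list and the Event via partial keeps the handler reusable across scrapers while page.on still receives a single-argument callback.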
@ -93,8 +86,8 @@ async def process_event(url: str, url_num: int) -> str | None:
|
|||
try:
|
||||
await asyncio.wait_for(wait_task, timeout=10)
|
||||
except asyncio.TimeoutError:
|
||||
log.warning(f"URL {url_num}) Timed out waiting for m3u8.")
|
||||
return None
|
||||
log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
|
||||
return
|
||||
|
||||
finally:
|
||||
if not wait_task.done():
|
||||
|
|
@ -110,15 +103,15 @@ async def process_event(url: str, url_num: int) -> str | None:
|
|||
|
||||
return captured[-1]
|
||||
|
||||
log.warning(f"URL {url_num}) No m3u8 captured after waiting.")
|
||||
return None
|
||||
log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
|
||||
return
|
||||
|
||||
except Exception as e:
|
||||
log.warning(f"URL {url_num}) Exception while processing: {e}")
|
||||
return None
|
||||
return
|
||||
|
||||
finally:
|
||||
page.remove_listener("request", capture_req)
|
||||
page.remove_listener("request", handler)
|
||||
await page.close()
|
||||
await browser.close()
|
||||
|
||||
|
|
@ -127,7 +120,7 @@ async def get_events(
|
|||
client: httpx.AsyncClient,
|
||||
api_url: str,
|
||||
cached_keys: set[str],
|
||||
) -> dict[str, dict[str, str | str]]:
|
||||
) -> list[dict[str, str]]:
|
||||
|
||||
events: list[dict[str, str]] = []
|
||||
|
||||
|
|
@ -186,7 +179,7 @@ async def main(client: httpx.AsyncClient) -> None:
|
|||
|
||||
log.info(f'Scraping from "{base_url}"')
|
||||
|
||||
cached_urls = load_cache()
|
||||
cached_urls = load_ts_cache(CACHE_FILE, 14400)
|
||||
cached_count = len(cached_urls)
|
||||
|
||||
log.info(f"Collected {cached_count} event(s) from cache")
|
||||
|
|