iptv/M3U8/scrape/ppv.py

#!/usr/bin/env python3
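"""Scrape live-event m3u8 stream URLs from the ppv.to / ppvs.su mirrors.

Live events are listed via the site's /api/streams endpoint (cached in
caches/ppv_api.json), each event page is opened in headless Firefox through
Playwright to capture the m3u8 request, and results are written to
caches/ppv.json and exposed via the module-level `urls` dict.
"""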
import asyncio
import json
import re
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import httpx
from playwright.async_api import Request, async_playwright

from .utils import TZ, get_base, get_logger, safe_process_event

log = get_logger(__name__)

urls: dict[str, dict[str, str]] = {}
API_FILE = Path(__file__).parent / "caches" / "ppv_api.json"
CACHE_FILE = Path(__file__).parent / "caches" / "ppv.json"
MIRRORS = ["https://ppv.to", "https://ppvs.su"]


async def refresh_api_cache(client: httpx.AsyncClient, url: str) -> dict:
    """Fetch the streams API and return the parsed JSON, or {} on failure."""
    log.info("Refreshing API cache")
    try:
        r = await client.get(url)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{url}"\n{e}')
        return {}
    return r.json()


def load_cache() -> dict[str, dict[str, str]]:
    """Load previously scraped events from disk, or {} if missing/corrupt."""
    try:
        return json.loads(CACHE_FILE.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError):
        return {}


def load_api_cache() -> dict:
    """Load the cached API response if it is less than 24 hours old, else {}."""
    try:
        data: dict = json.loads(API_FILE.read_text(encoding="utf-8"))
        age: float = datetime.now(TZ).timestamp() - data.get("timestamp", 0)
        return data if age < 86400 else {}  # 24 hours
    except (FileNotFoundError, json.JSONDecodeError):
        return {}


async def process_event(url: str, url_num: int) -> str | None:
    """Open an event page in headless Firefox and return the captured m3u8 URL."""
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        captured: list[str] = []
        got_one = asyncio.Event()

        def capture_req(req: Request) -> None:
            # Keep only playlist requests; ignore known CDN noise.
            if (
                ".m3u8" in req.url
                and "amazonaws" not in req.url
                and "knitcdn" not in req.url
            ):
                captured.append(req.url)
                got_one.set()

        page.on("request", capture_req)
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=10_000)
            wait_task = asyncio.create_task(got_one.wait())
            try:
                await asyncio.wait_for(wait_task, timeout=10)
            except asyncio.TimeoutError:
                log.warning(f"URL {url_num}) Timed out waiting for m3u8.")
                return None
            finally:
                if not wait_task.done():
                    wait_task.cancel()
                    try:
                        await wait_task
                    except asyncio.CancelledError:
                        pass
            if captured:
                log.info(f"URL {url_num}) Captured M3U8")
                return captured[-1]
            log.warning(f"URL {url_num}) No m3u8 captured after waiting.")
            return None
        except Exception as e:
            log.warning(f"URL {url_num}) Exception while processing: {e}")
            return None
        finally:
            page.remove_listener("request", capture_req)
            await page.close()
            await browser.close()


async def get_events(
    client: httpx.AsyncClient,
    api_url: str,
    cached_keys: set[str],
) -> list[dict[str, str]]:
    """Return currently live events from the API, skipping cached and 24/7 entries."""
    events: list[dict[str, str]] = []
    base_url = re.match(r"(https?://.+?)/", api_url)[1]
    now = datetime.now(TZ)
    if not (api_data := load_api_cache()):
        api_data = await refresh_api_cache(client, api_url)
        # Stamp the refresh time so load_api_cache() can age the file out.
        api_data.setdefault("timestamp", now.timestamp())
        API_FILE.write_text(json.dumps(api_data, indent=2), encoding="utf-8")
    for stream_group in api_data.get("streams", []):
        sport = stream_group["category"]
        if sport == "24/7 Streams":
            continue
        for event in stream_group["streams"]:
            name, start_ts, end_ts, logo, uri_name = (
                event["name"],
                event["starts_at"],
                event["ends_at"],
                event.get(
                    "poster",
                    "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
                ),
                event["uri_name"],
            )
            key = f"[{sport}] {name}"
            if key in cached_keys:
                continue
            start_dt = datetime.fromtimestamp(start_ts, tz=TZ)
            end_dt = datetime.fromtimestamp(end_ts, tz=TZ)
            if not start_dt <= now < end_dt:
                continue
            events.append(
                {
                    "sport": sport,
                    "event": name,
                    "link": urljoin(base_url, f"/live/{uri_name}"),
                    "logo": logo,
                }
            )
    return events


async def main(client: httpx.AsyncClient) -> None:
    """Scrape all currently live PPV events and update the m3u8 cache."""
    if not (base_url := await get_base(client, MIRRORS)):
        log.warning("No working PPV mirrors")
        return
    log.info(f'Scraping from "{base_url}"')
    cached_urls = load_cache()
    cached_count = len(cached_urls)
    events = await get_events(
        client,
        urljoin(base_url, "/api/streams"),
        set(cached_urls.keys()),
    )
    log.info(f"Processing {len(events)} URLs")
    for i, ev in enumerate(events, start=1):
        url = await safe_process_event(
            lambda: process_event(ev["link"], url_num=i),
            url_num=i,
            log=log,
        )
        if url:
            entry = {
                "url": url,
                "logo": ev["logo"],
            }
            key = f"[{ev['sport']}] {ev['event']}"
            urls[key] = cached_urls[key] = entry
    CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")
    new_count = len(cached_urls) - cached_count
    log.info(f"Cached {cached_count} event(s)")
    log.info(f"Collected {new_count} new event(s)")


# Works only when the mirrors are not behind Cloudflare bot detection.
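

if __name__ == "__main__":
    # Minimal standalone entry-point sketch, not part of the original module:
    # it assumes the file is run as a package module (e.g. via `python -m`)
    # so the relative `.utils` import resolves, and that a plain
    # httpx.AsyncClient (no custom headers, cookies, or proxy) is acceptable;
    # the wider scraper may normally pass in its own shared client.
    async def _run() -> None:
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            await main(client)

    asyncio.run(_run())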