From 00000d9440cb4c6ab3abb6b7fd667ea83984b085 Mon Sep 17 00:00:00 2001 From: doms9 <96013514+doms9@users.noreply.github.com> Date: Wed, 3 Sep 2025 15:00:17 -0400 Subject: [PATCH] e --- .github/workflows/epg.yml | 4 +- .github/workflows/health.yml | 2 +- .github/workflows/m3u8.yml | 8 +- M3U8/fetch.py | 13 +- M3U8/scrape/ace.py | 6 +- M3U8/scrape/fstv.py | 22 +--- M3U8/scrape/livetvsx.py | 67 ++++------- M3U8/scrape/ppv.py | 218 ++++++++++++++++++++++++++++++++++ M3U8/scrape/tvpass.py | 17 +-- M3U8/scrape/utils/__init__.py | 3 + M3U8/scrape/utils/config.py | 106 +++++++++++++++++ M3U8/scrape/utils/logger.py | 38 ------ 12 files changed, 371 insertions(+), 133 deletions(-) create mode 100644 M3U8/scrape/ppv.py create mode 100644 M3U8/scrape/utils/__init__.py create mode 100644 M3U8/scrape/utils/config.py delete mode 100644 M3U8/scrape/utils/logger.py diff --git a/.github/workflows/epg.yml b/.github/workflows/epg.yml index 4ba8b28..5e80740 100644 --- a/.github/workflows/epg.yml +++ b/.github/workflows/epg.yml @@ -10,12 +10,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Cache venv - uses: actions/cache@v4 + uses: actions/cache@v3 with: path: .venv key: shared-venv-${{ runner.os }}-${{ hashFiles('uv.lock') }} diff --git a/.github/workflows/health.yml b/.github/workflows/health.yml index adbf339..60294ff 100644 --- a/.github/workflows/health.yml +++ b/.github/workflows/health.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/m3u8.yml b/.github/workflows/m3u8.yml index cfc639b..806ca95 100644 --- a/.github/workflows/m3u8.yml +++ b/.github/workflows/m3u8.yml @@ -18,13 +18,13 @@ jobs: - name: Checkout if: steps.check_time.outputs.run == 'true' - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Cache venv if: steps.check_time.outputs.run == 'true' - uses: actions/cache@v4 + uses: actions/cache@v3 with: path: .venv key: shared-venv-${{ runner.os }}-${{ hashFiles('uv.lock') }} @@ -32,7 +32,7 @@ jobs: shared-venv-${{ runner.os }}- - name: Cache cert - uses: actions/cache@v4 + uses: actions/cache@v3 with: path: M3U8/scrape/utils/cached-ca.pem key: cert-cache-${{ runner.os }}-${{ hashFiles('M3U8/scrape/utils/cached-ca.pem') }} @@ -57,7 +57,7 @@ jobs: - name: Cache Playwright browsers id: cache-pw if: steps.check_time.outputs.run == 'true' - uses: actions/cache@v4 + uses: actions/cache@v3 with: path: ~/.cache/ms-playwright key: ${{ runner.os }}-playwright diff --git a/M3U8/fetch.py b/M3U8/fetch.py index d0fb83a..918ebe1 100644 --- a/M3U8/fetch.py +++ b/M3U8/fetch.py @@ -3,10 +3,10 @@ import asyncio from pathlib import Path import httpx -from scrape import ace, fstv, livetvsx, tvpass -from scrape.utils import logger +from scrape import ace, fstv, livetvsx, ppv, tvpass +from scrape.utils import get_logger -log = logger.get_logger(__name__) +log = get_logger(__name__) BASE_URL = "https://s.id/ePwXT" @@ -40,9 +40,10 @@ async def vanilla_fetch() -> tuple[list[str], int]: async def main() -> None: tasks = [ - # ace.main(client), - # fstv.main(client), + # asyncio.create_task(ace.main(client)), + # asyncio.create_task(fstv.main(client)), asyncio.create_task(livetvsx.main(CLIENT)), + asyncio.create_task(ppv.main(CLIENT)), asyncio.create_task(tvpass.main(CLIENT)), vanilla_fetch(), ] @@ -51,7 +52,7 @@ async def main() -> None: base_m3u8, tvg_chno = 
results[-1] - additions = ace.urls | fstv.urls | livetvsx.urls | tvpass.urls + additions = ace.urls | fstv.urls | livetvsx.urls | ppv.urls | tvpass.urls lines = [ f'#EXTINF:-1 tvg-chno="{chnl_num}" tvg-id="(N/A)" tvg-name="{event}" tvg-logo="{info["logo"]}" group-title="Live Events",{event}\n{info["url"]}' diff --git a/M3U8/scrape/ace.py b/M3U8/scrape/ace.py index 94848b9..04114a1 100644 --- a/M3U8/scrape/ace.py +++ b/M3U8/scrape/ace.py @@ -5,9 +5,7 @@ from urllib.parse import urljoin import httpx from selectolax.parser import HTMLParser, Node -from .fstv import get_base -from .tvpass import logos -from .utils.logger import get_logger +from .utils import LOGOS, get_base, get_logger log = get_logger(__name__) @@ -116,7 +114,7 @@ async def main(client: httpx.AsyncClient) -> None: urls[f"[{sport}] {event} (S{i})"] = { "url": link, - "logo": logos.get( + "logo": LOGOS.get( sport, "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png", ), diff --git a/M3U8/scrape/fstv.py b/M3U8/scrape/fstv.py index 703116d..1abda54 100644 --- a/M3U8/scrape/fstv.py +++ b/M3U8/scrape/fstv.py @@ -4,8 +4,7 @@ from urllib.parse import urljoin import httpx from selectolax.parser import HTMLParser -from .tvpass import logos -from .utils.logger import get_logger +from .utils import LOGOS, get_base, get_logger log = get_logger(__name__) @@ -19,23 +18,6 @@ MIRRORS = [ ] -async def check_status(client: httpx.AsyncClient, url: str) -> bool: - try: - r = await client.get(url) - r.raise_for_status() - except Exception: - return False - - return r.status_code == 200 - - -async def get_base(client: httpx.AsyncClient, mirrors: list[str]) -> str: - tasks = [check_status(client, link) for link in mirrors] - results = await asyncio.gather(*tasks) - - return [url for url, ok in zip(mirrors, results) if ok][0] - - async def get_hrefs(client: httpx.AsyncClient, base_url: str) -> list[tuple[str, str]]: log.info(f'Scraping from "{base_url}"') @@ -125,7 +107,7 @@ async def main(client: httpx.AsyncClient) -> None: urls[key] = { "url": link, - "logo": logos.get( + "logo": LOGOS.get( event, "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png", ), diff --git a/M3U8/scrape/livetvsx.py b/M3U8/scrape/livetvsx.py index bcb7f6f..f9b9333 100644 --- a/M3U8/scrape/livetvsx.py +++ b/M3U8/scrape/livetvsx.py @@ -5,20 +5,16 @@ import ssl import xml.etree.ElementTree as ET from datetime import datetime, timedelta from pathlib import Path -from typing import Any import httpx from playwright.async_api import Request, async_playwright -from .tvpass import TZ, logos -from .utils.logger import get_logger +from .utils import LOGOS, TZ, get_logger, safe_process_event log = get_logger(__name__) urls: dict[str, dict[str, str]] = {} -tvp_sports = set(logos.keys()) - BASE_URL = "https://cdn.livetv861.me/rss/upcoming_en.xml" CERT_BUNDL_URLS = [ @@ -32,23 +28,7 @@ CERT_FILE = Path(__file__).parent / "utils" / "cached-ca.pem" CACHE_FILE = Path(__file__).parent / "caches" / "livetvsx.json" - -async def safe_process_event(fn, url_num: int, timeout=20) -> Any | None: - task = asyncio.create_task(fn()) - - try: - return await asyncio.wait_for(task, timeout=timeout) - except asyncio.TimeoutError: - log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event") - - task.cancel() - - try: - await task - except asyncio.CancelledError: - pass - except Exception as e: - log.debug(f"URL {url_num}) Ignore exception after timeout: {e}") +exist_sprts = set(LOGOS.keys()) async def write_to_cert(client: httpx.AsyncClient, url: str, cert: Path) -> None: @@ -86,15 
+66,13 @@ async def get_cert(client: httpx.AsyncClient) -> ssl.SSLContext: def load_cache() -> dict[str, dict[str, str | str]]: try: - data = json.loads(CACHE_FILE.read_text(encoding="utf-8")) + data: dict = json.loads(CACHE_FILE.read_text(encoding="utf-8")) now = datetime.now(TZ).timestamp() - return { - k: v - for k, v in data.items() - if now - v.get("timestamp", 0) < timedelta(hours=4).total_seconds() - } + age: float = now - data.get("timestamp", 0) + + return {k: v for k, v in data.items() if age < 14400} # 4 hours except (FileNotFoundError, json.JSONDecodeError): return {} @@ -163,15 +141,17 @@ async def parse_feed( elem.clear() continue - elif not tvp_sports & {sport, event}: - events.append( - { - "sport": sport, - "event": event, - "title": title, - "link": link, - } - ) + if not exist_sprts & {sport, event}: + continue + + events.append( + { + "sport": sport, + "event": event, + "title": title, + "link": link, + } + ) elem.clear() @@ -288,7 +268,7 @@ async def process_event(url: str, url_num: int) -> str | None: log.warning(f"URL {url_num}) No m3u8 captured in popup or inline playback.") return - except Exception as e: + except Exception: try: ev_page.remove_listener("request", capture_req) @@ -310,10 +290,9 @@ async def main(client: httpx.AsyncClient) -> None: cert = await get_cert(client) cached_urls = load_cache() - cached_keys = set(cached_urls.keys()) cached_count = len(cached_urls) - events = await parse_feed(BASE_URL, cert, cached_keys) + events = await parse_feed(BASE_URL, cert, set(cached_urls.keys())) log.info(f"Processing {len(events)} URLs") @@ -328,13 +307,15 @@ async def main(client: httpx.AsyncClient) -> None: key = f"[{sport}: {event}] {title}" url = await safe_process_event( - lambda: process_event(link, url_num=num), url_num=num + lambda: process_event(link, url_num=num), + url_num=num, + log=log, ) if url: entry = { "url": url, - "logo": logos.get( + "logo": LOGOS.get( sport, "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png", ), @@ -349,4 +330,4 @@ async def main(client: httpx.AsyncClient) -> None: log.info(f"Cached {cached_count} event(s)") - log.info(f"Collected {new_count} new event(s)") + log.info(f"Collected {new_count} event(s)") diff --git a/M3U8/scrape/ppv.py b/M3U8/scrape/ppv.py new file mode 100644 index 0000000..e445ef8 --- /dev/null +++ b/M3U8/scrape/ppv.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 + +import asyncio +import json +import re +from datetime import datetime +from pathlib import Path +from urllib.parse import urljoin + +import httpx +from playwright.async_api import Request, async_playwright + +from .utils import TZ, get_base, get_logger, safe_process_event + +log = get_logger(__name__) + +urls: dict[str, dict[str, str]] = {} + +API_FILE = Path(__file__).parent / "caches" / "ppv_api.json" + +CACHE_FILE = Path(__file__).parent / "caches" / "ppv.json" + +MIRRORS = ["https://ppv.to", "https://ppvs.su"] + + +async def refresh_api_cache(client: httpx.AsyncClient, url: str) -> dict: + log.info("Refreshing API cache") + + try: + r = await client.get(url) + r.raise_for_status() + except Exception as e: + log.error(f'Failed to fetch "{url}"\n{e}') + return {} + + return r.json() + + +def load_cache() -> dict[str, dict[str, str | str]]: + try: + return json.loads(CACHE_FILE.read_text(encoding="utf-8")) + except (FileNotFoundError, json.JSONDecodeError): + return {} + + +def load_api_cache() -> dict[str, dict[str, str | str]]: + try: + data: dict = json.loads(API_FILE.read_text(encoding="utf-8")) + + age: float = 
datetime.now(TZ).timestamp() - data.get("timestamp", 0) + + return data if age < 86400 else {} # 24 hours + except (FileNotFoundError, json.JSONDecodeError): + return {} + + +async def process_event(url: str, url_num: int) -> str | None: + async with async_playwright() as p: + browser = await p.firefox.launch(headless=True) + + context = await browser.new_context() + + page = await context.new_page() + + captured: list[str] = [] + + got_one = asyncio.Event() + + def capture_req(req: Request) -> None: + if ( + ".m3u8" in req.url + and "amazonaws" not in req.url + and "knitcdn" not in req.url + and not captured + ): + captured.append(req.url) + got_one.set() + + page.on("request", capture_req) + + try: + await page.goto(url, wait_until="domcontentloaded", timeout=10_000) + + wait_task = asyncio.create_task(got_one.wait()) + + try: + await asyncio.wait_for(wait_task, timeout=10) + except asyncio.TimeoutError: + log.warning(f"URL {url_num}) Timed out waiting for m3u8.") + return None + + finally: + if not wait_task.done(): + wait_task.cancel() + + try: + await wait_task + except asyncio.CancelledError: + pass + + if captured: + log.info(f"URL {url_num}) Captured M3U8") + + return captured[-1] + + log.warning(f"URL {url_num}) No m3u8 captured after waiting.") + return None + + except Exception as e: + log.warning(f"URL {url_num}) Exception while processing: {e}") + return None + + finally: + page.remove_listener("request", capture_req) + await page.close() + await browser.close() + + +async def get_events( + client: httpx.AsyncClient, + api_url: str, + cached_keys: list[str], +) -> dict[str, dict[str, str | str]]: + events = [] + + base_url = re.match(r"(https?://.+?)/", api_url)[1] + + if not (api_data := load_api_cache()): + api_data = await refresh_api_cache(client, api_url) + API_FILE.write_text(json.dumps(api_data, indent=2), encoding="utf-8") + + for stream_group in api_data["streams"]: + sport = stream_group["category"] + + if sport == "24/7 Streams": + continue + + for event in stream_group["streams"]: + name, start_ts, end_ts, logo, uri_name = ( + event["name"], + event["starts_at"], + event["ends_at"], + event.get( + "poster", + "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png", + ), + event["uri_name"], + ) + + key = f"[{sport}] {name}" + + if key in cached_keys: + continue + + start_dt = datetime.fromtimestamp(start_ts, tz=TZ) + + end_dt = datetime.fromtimestamp(end_ts, tz=TZ) + + if not start_dt <= datetime.now(TZ) < end_dt: + continue + + events.append( + { + "sport": sport, + "event": name, + "link": urljoin(base_url, f"/live/{uri_name}"), + "logo": logo, + } + ) + + return events + + +async def main(client: httpx.AsyncClient) -> None: + if not (base_url := await get_base(client, MIRRORS)): + log.warning("No working PPV mirrors") + return + + log.info(f'Scraping from "{base_url}"') + + cached_urls = load_cache() + cached_count = len(cached_urls) + + events = await get_events( + client, + urljoin(base_url, "/api/streams"), + set(cached_urls.keys()), + ) + + log.info(f"Processing {len(events)} URLs") + + for num, ev in enumerate(events, start=1): + url = await safe_process_event( + lambda: process_event(ev["link"], url_num=num), + url_num=num, + log=log, + ) + + if url: + entry = { + "url": url, + "logo": ev["logo"], + } + + key = f"[{ev['sport']}] {ev['event']}" + + urls[key] = cached_urls[key] = entry + + CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8") + + new_count = len(cached_urls) - cached_count + + log.info(f"Cached {cached_count} 
event(s)") + + log.info(f"Collected {new_count} event(s)") + + +# works if no cloudflare bot detection diff --git a/M3U8/scrape/tvpass.py b/M3U8/scrape/tvpass.py index 72127f7..fa37182 100644 --- a/M3U8/scrape/tvpass.py +++ b/M3U8/scrape/tvpass.py @@ -4,9 +4,8 @@ from datetime import datetime from pathlib import Path import httpx -import pytz -from .utils.logger import get_logger +from .utils import LOGOS, TZ, get_logger log = get_logger(__name__) @@ -16,18 +15,6 @@ BASE_URL = "https://tvpass.org/playlist/m3u" CACHE_FILE = Path(__file__).parent / "caches" / "tvpass.json" -logos = { - "MLB": "https://i.gyazo.com/0fe7865ef2f06c9507791b24f04dbca8.png", - "NBA": "https://i.gyazo.com/773c23570f095a5d549c23b9401d83f4.png", - "NCAAF": "https://i.gyazo.com/ca63b40c86e757436de9d34d369b24f8.png", - "NCAAB": "https://i.gyazo.com/ca63b40c86e757436de9d34d369b24f8.png", - "NFL": "https://i.gyazo.com/fb4956d7a2fe54a1bac54cd81e1b3f11.png", - "NHL": "https://i.gyazo.com/526607d4e886d5ed1fecca4bff3115e2.png", - "WNBA": "https://i.gyazo.com/02d665a5704118d195dbcd5fa20d5462.png", -} - -TZ = pytz.timezone("America/New_York") - def load_cache() -> dict[str, str]: try: @@ -78,7 +65,7 @@ async def main(client: httpx.AsyncClient) -> None: if url.endswith("/hd"): urls[f"[{sport}] {tvg_name}"] = { "url": f"http://origin.thetvapp.to/hls/{url.split('/')[-2]}/mono.m3u8", - "logo": logos.get( + "logo": LOGOS.get( sport, "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png", ), diff --git a/M3U8/scrape/utils/__init__.py b/M3U8/scrape/utils/__init__.py new file mode 100644 index 0000000..50cbf88 --- /dev/null +++ b/M3U8/scrape/utils/__init__.py @@ -0,0 +1,3 @@ +from .config import LOGOS, TZ, get_base, get_logger, safe_process_event + +__all__ = ["LOGOS", "TZ", "get_base", "get_logger", "safe_process_event"] diff --git a/M3U8/scrape/utils/config.py b/M3U8/scrape/utils/config.py new file mode 100644 index 0000000..8e64fe1 --- /dev/null +++ b/M3U8/scrape/utils/config.py @@ -0,0 +1,106 @@ +import asyncio +import logging +from pathlib import Path +from typing import Any + +import httpx +import pytz + +TZ = pytz.timezone("America/New_York") + +LOGOS = { + "MLB": "https://i.gyazo.com/0fe7865ef2f06c9507791b24f04dbca8.png", + "NBA": "https://i.gyazo.com/773c23570f095a5d549c23b9401d83f4.png", + "NCAAF": "https://i.gyazo.com/ca63b40c86e757436de9d34d369b24f8.png", + "NCAAB": "https://i.gyazo.com/ca63b40c86e757436de9d34d369b24f8.png", + "NFL": "https://i.gyazo.com/fb4956d7a2fe54a1bac54cd81e1b3f11.png", + "NHL": "https://i.gyazo.com/526607d4e886d5ed1fecca4bff3115e2.png", + "WNBA": "https://i.gyazo.com/02d665a5704118d195dbcd5fa20d5462.png", +} + +LOG_FMT = ( + "[%(asctime)s] " + "%(levelname)-8s " + "[%(name)s] " + "%(message)-70s " + "(%(filename)s:%(lineno)d)" +) + +COLORS = { + "DEBUG": "\033[37m", + "INFO": "\033[32m", + "WARNING": "\033[33m", + "ERROR": "\033[31m", + "CRITICAL": "\033[41m", + "reset": "\033[0m", +} + + +class ColorFormatter(logging.Formatter): + def format(self, record) -> str: + color = COLORS.get(record.levelname, "") + levelname = record.levelname + record.levelname = f"{color}{levelname}{COLORS['reset']}" + formatted = super().format(record) + record.levelname = levelname + return formatted + + +def get_logger(name: str | None = None) -> logging.Logger: + if not name: + name = Path(__file__).stem + + logger = logging.getLogger(name) + + if not logger.hasHandlers(): + handler = logging.StreamHandler() + formatter = ColorFormatter(LOG_FMT, datefmt="%Y-%m-%d | %H:%M:%S") + handler.setFormatter(formatter) + 
logger.addHandler(handler) + logger.setLevel(logging.INFO) + + return logger + + +async def safe_process_event( + fn, + url_num: int, + timeout=20, + log: logging.Logger | None = None, +) -> Any | None: + + if not log: + log = logging.getLogger(__name__) + + task = asyncio.create_task(fn()) + + try: + return await asyncio.wait_for(task, timeout=timeout) + except asyncio.TimeoutError: + log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event") + + task.cancel() + + try: + await task + except asyncio.CancelledError: + pass + except Exception as e: + log.debug(f"URL {url_num}) Ignore exception after timeout: {e}") + + +async def check_status(client: httpx.AsyncClient, url: str) -> bool: + try: + r = await client.get(url) + r.raise_for_status() + except Exception: + return False + + return r.status_code == 200 + + +async def get_base(client: httpx.AsyncClient, mirrors: list[str]) -> str: + tasks = [check_status(client, link) for link in mirrors] + results = await asyncio.gather(*tasks) + + return [url for url, ok in zip(mirrors, results) if ok][0] diff --git a/M3U8/scrape/utils/logger.py b/M3U8/scrape/utils/logger.py deleted file mode 100644 index 738f316..0000000 --- a/M3U8/scrape/utils/logger.py +++ /dev/null @@ -1,38 +0,0 @@ -import logging - -log_format = "[%(asctime)s] %(levelname)-8s %(message)-70s %(filename)s:%(lineno)d" - -colors = { - "DEBUG": "\033[37m", - "INFO": "\033[32m", - "WARNING": "\033[33m", - "ERROR": "\033[31m", - "CRITICAL": "\033[41m", - "reset": "\033[0m", -} - - -class ColorFormatter(logging.Formatter): - def format(self, record) -> str: - color = colors.get(record.levelname, "") - - record.levelname = f"{color}{record.levelname}{colors['reset']}" - - return super().format(record) - - -def get_logger(name: str = __name__) -> logging.Logger: - logger = logging.getLogger(name) - - if not logger.hasHandlers(): - handler = logging.StreamHandler() - - formatter = ColorFormatter(log_format, datefmt="%Y-%m-%d | %H:%M:%S") - - handler.setFormatter(formatter) - - logger.addHandler(handler) - - logger.setLevel(logging.INFO) - - return logger
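
Usage note (appended for reference, not part of the diff above): the patch moves the shared helpers into M3U8/scrape/utils/config.py and re-exports them from scrape.utils (LOGOS, TZ, get_base, get_logger, safe_process_event), replacing the copies previously living in tvpass.py, fstv.py and utils/logger.py. Below is a minimal, hypothetical sketch of how a new scraper module would consume those helpers, in the same shape as ppv.py; the module itself, EXAMPLE_MIRRORS and the process_event body are illustrative placeholders, and only the imported names and their signatures come from this patch.

#!/usr/bin/env python3
# Hypothetical scraper skeleton; EXAMPLE_MIRRORS and the event details are placeholders.

import asyncio

import httpx

from scrape.utils import LOGOS, get_base, get_logger, safe_process_event

log = get_logger(__name__)

urls: dict[str, dict[str, str]] = {}

EXAMPLE_MIRRORS = ["https://mirror-a.example", "https://mirror-b.example"]


async def process_event(url: str, url_num: int) -> str | None:
    # Placeholder: a real scraper captures an .m3u8 URL here (see ppv.py / livetvsx.py).
    log.info(f"URL {url_num}) Visiting {url}")
    return None


async def main(client: httpx.AsyncClient) -> None:
    # get_base() returns the first mirror that answers 200; note it indexes the
    # filtered list directly, so it assumes at least one mirror is reachable.
    base_url = await get_base(client, EXAMPLE_MIRRORS)

    # safe_process_event() wraps the per-event coroutine in a timeout (20s by
    # default) and logs through the logger passed via the new `log` keyword.
    link = await safe_process_event(
        lambda: process_event(base_url, url_num=1),
        url_num=1,
        log=log,
    )

    if link:
        urls["[NFL] Example event"] = {
            "url": link,
            "logo": LOGOS.get(
                "NFL",
                "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
            ),
        }


if __name__ == "__main__":

    async def run() -> None:
        async with httpx.AsyncClient(timeout=10, follow_redirects=True) as client:
            await main(client)

    asyncio.run(run())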