From 00000d979521822849617ee3a1c3621d645dc103 Mon Sep 17 00:00:00 2001
From: doms9 <96013514+doms9@users.noreply.github.com>
Date: Wed, 17 Sep 2025 22:52:40 -0400
Subject: [PATCH] refactor: split utils/config.py into cache, logger, and
 network modules; stop committing scraper caches

---
 .github/workflows/m3u8.yml      |   2 +-
 .gitignore                      |   2 +-
 M3U8/scrapers/utils/__init__.py |  17 +--
 M3U8/scrapers/utils/cache.py    |  48 +++++++
 M3U8/scrapers/utils/config.py   | 229 --------------------------------
 M3U8/scrapers/utils/logger.py   |  45 ++++++
 M3U8/scrapers/utils/network.py  | 144 ++++++++++++++++++++
 7 files changed, 243 insertions(+), 244 deletions(-)
 create mode 100644 M3U8/scrapers/utils/cache.py
 create mode 100644 M3U8/scrapers/utils/logger.py
 create mode 100644 M3U8/scrapers/utils/network.py

diff --git a/.github/workflows/m3u8.yml b/.github/workflows/m3u8.yml
index ed87277..6cb3b7b 100644
--- a/.github/workflows/m3u8.yml
+++ b/.github/workflows/m3u8.yml
@@ -79,7 +79,7 @@ jobs:
         uses: stefanzweifel/git-auto-commit-action@v6
         with:
           commit_message: "update M3U8"
-          file_pattern: "M3U8/TV.m3u8 M3U8/scrapers/caches/*.json"
+          file_pattern: "M3U8/TV.m3u8"
           commit_author: "GitHub Actions Bot <actions@github.com>"
           commit_user_name: "GitHub Actions Bot"
           commit_user_email: "actions@github.com"
diff --git a/.gitignore b/.gitignore
index 7ba9518..dd7c7c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,4 @@ wheels/
 .python-version
 stuff/
 cached-ca.pem
-*.json
+M3U8/scrapers/caches/*.json
diff --git a/M3U8/scrapers/utils/__init__.py b/M3U8/scrapers/utils/__init__.py
index 66a2fa0..a090b51 100644
--- a/M3U8/scrapers/utils/__init__.py
+++ b/M3U8/scrapers/utils/__init__.py
@@ -1,16 +1,7 @@
-from .config import (
-    LOGOS,
-    TZ,
-    UA,
-    capture_req,
-    get_base,
-    get_logger,
-    load_cache,
-    new_browser,
-    now,
-    safe_process_event,
-    write_cache,
-)
+from .cache import load_cache, write_cache
+from .config import LOGOS, TZ, UA, now
+from .logger import get_logger
+from .network import capture_req, get_base, new_browser, safe_process_event
 
 __all__ = [
     "LOGOS",
diff --git a/M3U8/scrapers/utils/cache.py b/M3U8/scrapers/utils/cache.py
new file mode 100644
index 0000000..3ee4310
--- /dev/null
+++ b/M3U8/scrapers/utils/cache.py
@@ -0,0 +1,48 @@
+import json
+from datetime import datetime
+from pathlib import Path
+
+from .config import now
+
+
+def near_hr(dt: datetime) -> float:
+    return dt.replace(minute=0, second=0, microsecond=0).timestamp()
+
+
+def is_fresh(
+    entry: dict,
+    nearest_hr: bool,
+    exp: int,
+) -> bool:
+    ts: float | int = entry.get("timestamp", 31496400)
+
+    if nearest_hr:
+        ts = near_hr(datetime.fromtimestamp(ts))
+
+    return now.timestamp() - ts < exp
+
+
+def load_cache(
+    file: Path,
+    exp: int | float,
+    nearest_hr: bool = False,
+    per_entry: bool = True,
+) -> dict[str, dict[str, str | float]]:
+    try:
+        data: dict = json.loads(file.read_text(encoding="utf-8"))
+    except (FileNotFoundError, json.JSONDecodeError):
+        return {}
+
+    if per_entry:
+        return {k: v for k, v in data.items() if is_fresh(v, nearest_hr, exp)}
+
+    ts: float | int = data.get("timestamp", 31496400)
+
+    if nearest_hr:
+        ts = near_hr(datetime.fromtimestamp(ts))
+
+    return data if now.timestamp() - ts < exp else {}
+
+
+def write_cache(file: Path, data: dict) -> None:
+    file.write_text(json.dumps(data, indent=2), encoding="utf-8")
diff --git a/M3U8/scrapers/utils/config.py b/M3U8/scrapers/utils/config.py
index 05d48ca..042dd38 100644
--- a/M3U8/scrapers/utils/config.py
+++ b/M3U8/scrapers/utils/config.py
@@ -1,15 +1,6 @@
-import asyncio
-import json
-import logging
-import re
-from collections.abc import Callable
 from datetime import datetime
-from pathlib import Path
-from typing import Any
 
-import httpx
 import pytz
-from playwright.async_api import Browser, BrowserContext, Playwright, Request
 
 TZ = pytz.timezone("America/New_York")
 
@@ -53,223 +44,3 @@ alias_map = {
 for base, aliases in alias_map.items():
     for alias in aliases:
         LOGOS[alias] = LOGOS[base]
-
-LOG_FMT = (
-    "[%(asctime)s] "
-    "%(levelname)-8s "
-    "[%(name)s] "
-    "%(message)-70s "
-    "(%(filename)s:%(lineno)d)"
-)
-
-COLORS = {
-    "DEBUG": "\033[37m",
-    "INFO": "\033[32m",
-    "WARNING": "\033[33m",
-    "ERROR": "\033[31m",
-    "CRITICAL": "\033[41m",
-    "reset": "\033[0m",
-}
-
-
-class ColorFormatter(logging.Formatter):
-    def format(self, record) -> str:
-        color = COLORS.get(record.levelname, "")
-        levelname = record.levelname
-        record.levelname = f"{color}{levelname}{COLORS['reset']}"
-        formatted = super().format(record)
-        record.levelname = levelname
-        return formatted
-
-
-def get_logger(name: str | None = None) -> logging.Logger:
-    if not name:
-        name = Path(__file__).stem
-
-    logger = logging.getLogger(name)
-
-    if not logger.hasHandlers():
-        handler = logging.StreamHandler()
-        formatter = ColorFormatter(LOG_FMT, datefmt="%Y-%m-%d | %H:%M:%S")
-        handler.setFormatter(formatter)
-        logger.addHandler(handler)
-        logger.setLevel(logging.INFO)
-
-    return logger
-
-
-def near_hr(dt: datetime) -> float:
-    return dt.replace(minute=0, second=0, microsecond=0).timestamp()
-
-
-def is_fresh(
-    entry: dict,
-    nearest_hr: bool,
-    exp: int,
-) -> bool:
-    ts: float | int = entry.get("timestamp", 31496400)
-
-    if nearest_hr:
-        ts = near_hr(datetime.fromtimestamp(ts))
-
-    return now.timestamp() - ts < exp
-
-
-def load_cache(
-    file: Path,
-    exp: int | float,
-    nearest_hr: bool = False,
-    per_entry: bool = True,
-) -> dict[str, dict[str, str | float]]:
-    try:
-        data: dict = json.loads(file.read_text(encoding="utf-8"))
-    except (FileNotFoundError, json.JSONDecodeError):
-        return {}
-
-    if per_entry:
-        return {k: v for k, v in data.items() if is_fresh(v, nearest_hr, exp)}
-
-    ts: float | int = data.get("timestamp", 31496400)
-
-    if nearest_hr:
-        ts = near_hr(datetime.fromtimestamp(ts))
-
-    return data if now.timestamp() - ts < exp else {}
-
-
-def write_cache(file: Path, data: dict) -> None:
-    file.write_text(json.dumps(data, indent=2), encoding="utf-8")
-
-
-async def safe_process_event(
-    fn: Callable,
-    url_num: int,
-    timeout: int | float = 20,
-    log: logging.Logger | None = None,
-) -> Any | None:
-
-    if not log:
-        log = logging.getLogger(__name__)
-
-    task = asyncio.create_task(fn())
-
-    try:
-        return await asyncio.wait_for(task, timeout=timeout)
-    except asyncio.TimeoutError:
-        log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event")
-
-        task.cancel()
-
-        try:
-            await task
-        except asyncio.CancelledError:
-            pass
-        except Exception as e:
-            log.debug(f"URL {url_num}) Ignore exception after timeout: {e}")
-
-
-async def check_status(client: httpx.AsyncClient, url: str) -> bool:
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception:
-        return False
-
-    return r.status_code == 200
-
-
-async def get_base(client: httpx.AsyncClient, mirrors: list[str]) -> str | None:
-    tasks = [check_status(client, link) for link in mirrors]
-    results = await asyncio.gather(*tasks)
-
-    try:
-        return [url for url, ok in zip(mirrors, results) if ok][0]
-    except IndexError:
-        return
-
-
-def capture_req(
-    req: Request,
-    captured: list[str],
-    got_one: asyncio.Event,
-) -> None:
-    valid_m3u8 = re.compile(r"^(?!.*(amazonaws|knitcdn)).*\.m3u8")
-
-    if valid_m3u8.search(req.url):
-        captured.append(req.url)
-        got_one.set()
-
-
-async def new_browser(
-    playwright: Playwright,
-    browser: str = "firefox",
-    ignore_https_errors: bool = False,
-) -> tuple[Browser, BrowserContext]:
-
-    if browser == "brave":
-        brwsr = await playwright.chromium.connect_over_cdp("http://localhost:9222")
-        context = brwsr.contexts[0]
-    else:
-        brwsr = await playwright.firefox.launch(headless=True)
-
-        context = await brwsr.new_context(
-            user_agent=UA,
-            ignore_https_errors=ignore_https_errors,
-            viewport={"width": 1366, "height": 768},
-            device_scale_factor=1,
-            locale="en-US",
-            timezone_id="America/New_York",
-            color_scheme="dark",
-            permissions=["geolocation"],
-            extra_http_headers={
-                "Accept-Language": "en-US,en;q=0.9",
-                "Upgrade-Insecure-Requests": "1",
-            },
-        )
-
-    await context.add_init_script(
-        """
-        Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
-
-        Object.defineProperty(navigator, 'languages', {
-            get: () => ['en-US', 'en']
-        });
-
-        Object.defineProperty(navigator, 'plugins', {
-            get: () => [1, 2, 3, 4]
-        });
-
-        const elementDescriptor = Object.getOwnPropertyDescriptor(HTMLElement.prototype, 'offsetHeight');
-        Object.defineProperty(HTMLDivElement.prototype, 'offsetHeight', {
-            ...elementDescriptor,
-            get: function() {
-                if (this.id === 'modernizr') { return 24; }
-                return elementDescriptor.get.apply(this);
-            }
-        });
-
-        Object.defineProperty(window.screen, 'width', { get: () => 1366 });
-        Object.defineProperty(window.screen, 'height', { get: () => 768 });
-
-        const getParameter = WebGLRenderingContext.prototype. getParameter;
-        WebGLRenderingContext.prototype.getParameter = function (param) {
-            if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL
-            if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL
-            return getParameter.apply(this, [param]);
-        };
-
-        const observer = new MutationObserver(mutations => {
-            mutations.forEach(mutation => {
-                mutation.addedNodes.forEach(node => {
-                    if (node.tagName === 'IFRAME' && node.hasAttribute('sandbox')) {
-                        node.removeAttribute('sandbox');
-                    }
-                });
-            });
-        });
-
-        observer.observe(document.documentElement, { childList: true, subtree: true });
-        """
-    )
-
-    return brwsr, context
diff --git a/M3U8/scrapers/utils/logger.py b/M3U8/scrapers/utils/logger.py
new file mode 100644
index 0000000..872498e
--- /dev/null
+++ b/M3U8/scrapers/utils/logger.py
@@ -0,0 +1,45 @@
+import logging
+from pathlib import Path
+
+LOG_FMT = (
+    "[%(asctime)s] "
+    "%(levelname)-8s "
+    "[%(name)s] "
+    "%(message)-70s "
+    "(%(filename)s:%(lineno)d)"
+)
+
+COLORS = {
+    "DEBUG": "\033[37m",
+    "INFO": "\033[32m",
+    "WARNING": "\033[33m",
+    "ERROR": "\033[31m",
+    "CRITICAL": "\033[41m",
+    "reset": "\033[0m",
+}
+
+
+class ColorFormatter(logging.Formatter):
+    def format(self, record) -> str:
+        color = COLORS.get(record.levelname, "")
+        levelname = record.levelname
+        record.levelname = f"{color}{levelname}{COLORS['reset']}"
+        formatted = super().format(record)
+        record.levelname = levelname
+        return formatted
+
+
+def get_logger(name: str | None = None) -> logging.Logger:
+    if not name:
+        name = Path(__file__).stem
+
+    logger = logging.getLogger(name)
+
+    if not logger.hasHandlers():
+        handler = logging.StreamHandler()
+        formatter = ColorFormatter(LOG_FMT, datefmt="%Y-%m-%d | %H:%M:%S")
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+        logger.setLevel(logging.INFO)
+
+    return logger
diff --git a/M3U8/scrapers/utils/network.py b/M3U8/scrapers/utils/network.py
new file mode 100644
index 0000000..ca8ac39
--- /dev/null
+++ b/M3U8/scrapers/utils/network.py
@@ -0,0 +1,144 @@
+import asyncio
+import logging
+import re
+from collections.abc import Callable
+from typing import Any
+
+import httpx
+from playwright.async_api import Browser, BrowserContext, Playwright, Request
+
+from .config import UA
+
+
+async def check_status(client: httpx.AsyncClient, url: str) -> bool:
+    try:
+        r = await client.get(url)
+        r.raise_for_status()
+    except Exception:
+        return False
+
+    return r.status_code == 200
+
+
+async def get_base(client: httpx.AsyncClient, mirrors: list[str]) -> str | None:
+    tasks = [check_status(client, link) for link in mirrors]
+    results = await asyncio.gather(*tasks)
+
+    try:
+        return [url for url, ok in zip(mirrors, results) if ok][0]
+    except IndexError:
+        return
+
+
+async def safe_process_event(
+    fn: Callable,
+    url_num: int,
+    timeout: int | float = 20,
+    log: logging.Logger | None = None,
+) -> Any | None:
+
+    if not log:
+        log = logging.getLogger(__name__)
+
+    task = asyncio.create_task(fn())
+
+    try:
+        return await asyncio.wait_for(task, timeout=timeout)
+    except asyncio.TimeoutError:
+        log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event")
+
+        task.cancel()
+
+        try:
+            await task
+        except asyncio.CancelledError:
+            pass
+        except Exception as e:
+            log.debug(f"URL {url_num}) Ignoring exception after timeout: {e}")
+
+
+def capture_req(
+    req: Request,
+    captured: list[str],
+    got_one: asyncio.Event,
+) -> None:
+    valid_m3u8 = re.compile(r"^(?!.*(amazonaws|knitcdn)).*\.m3u8")
+
+    if valid_m3u8.search(req.url):
+        captured.append(req.url)
+        got_one.set()
+
+
+async def new_browser(
+    playwright: Playwright,
+    browser: str = "firefox",
+    ignore_https_errors: bool = False,
+) -> tuple[Browser, BrowserContext]:
+
+    if browser == "brave":
+        brwsr = await playwright.chromium.connect_over_cdp("http://localhost:9222")
+        context = brwsr.contexts[0]
+    else:
+        brwsr = await playwright.firefox.launch(headless=True)
+
+        context = await brwsr.new_context(
+            user_agent=UA,
+            ignore_https_errors=ignore_https_errors,
+            viewport={"width": 1366, "height": 768},
+            device_scale_factor=1,
+            locale="en-US",
+            timezone_id="America/New_York",
+            color_scheme="dark",
+            permissions=["geolocation"],
+            extra_http_headers={
+                "Accept-Language": "en-US,en;q=0.9",
+                "Upgrade-Insecure-Requests": "1",
+            },
+        )
+
+    await context.add_init_script(
+        """
+        Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
+
+        Object.defineProperty(navigator, 'languages', {
+            get: () => ['en-US', 'en']
+        });
+
+        Object.defineProperty(navigator, 'plugins', {
+            get: () => [1, 2, 3, 4]
+        });
+
+        const elementDescriptor = Object.getOwnPropertyDescriptor(HTMLElement.prototype, 'offsetHeight');
+        Object.defineProperty(HTMLDivElement.prototype, 'offsetHeight', {
+            ...elementDescriptor,
+            get: function() {
+                if (this.id === 'modernizr') { return 24; }
+                return elementDescriptor.get.apply(this);
+            }
+        });
+
+        Object.defineProperty(window.screen, 'width', { get: () => 1366 });
+        Object.defineProperty(window.screen, 'height', { get: () => 768 });
+
+        const getParameter = WebGLRenderingContext.prototype.getParameter;
+        WebGLRenderingContext.prototype.getParameter = function (param) {
+            if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL
+            if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL
+            return getParameter.apply(this, [param]);
+        };
+
+        const observer = new MutationObserver(mutations => {
+            mutations.forEach(mutation => {
+                mutation.addedNodes.forEach(node => {
+                    if (node.tagName === 'IFRAME' && node.hasAttribute('sandbox')) {
+                        node.removeAttribute('sandbox');
+                    }
+                });
+            });
+        });
+
+        observer.observe(document.documentElement, { childList: true, subtree: true });
+        """
+    )
+
+    return brwsr, context
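
Usage sketch for reviewers: roughly how the split cache module is consumed after this change. This is a minimal sketch, not code from the repo; the cache filename, channel key, and URL are hypothetical placeholders, and it assumes the repository root is on sys.path so the package imports resolve.

    from pathlib import Path

    from M3U8.scrapers.utils import load_cache, now, write_cache

    # Hypothetical cache file under the directory the workflow no longer commits.
    CACHE = Path("M3U8/scrapers/caches/example.json")

    def refresh_streams() -> None:
        # Entries whose "timestamp" is more than an hour old are dropped on load.
        cache = load_cache(CACHE, exp=3600)

        if "espn" not in cache:  # hypothetical channel key
            cache["espn"] = {
                "url": "https://example.com/stream.m3u8",  # placeholder URL
                "timestamp": now.timestamp(),
            }

        CACHE.parent.mkdir(parents=True, exist_ok=True)
        write_cache(CACHE, cache)

    refresh_streams()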
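
And the network half wired end to end: new_browser() builds the hardened Firefox context, and capture_req() is attached as a Playwright "request" listener to record .m3u8 URLs (excluding amazonaws/knitcdn hosts). Again a sketch under assumptions: Playwright and its Firefox build must be installed, the page URL is a placeholder, and the 15-second wait is an arbitrary choice.

    import asyncio

    from playwright.async_api import async_playwright

    from M3U8.scrapers.utils import capture_req, get_logger, new_browser

    log = get_logger("example")

    async def grab_m3u8(page_url: str) -> list[str]:
        captured: list[str] = []
        got_one = asyncio.Event()

        async with async_playwright() as pw:
            browser, context = await new_browser(pw)
            page = await context.new_page()

            # capture_req appends matching .m3u8 request URLs and sets the event.
            page.on("request", lambda req: capture_req(req, captured, got_one))

            await page.goto(page_url, wait_until="domcontentloaded")

            try:
                await asyncio.wait_for(got_one.wait(), timeout=15)
            except asyncio.TimeoutError:
                log.warning("no .m3u8 request captured before timeout")

            await browser.close()

        return captured

    # asyncio.run(grab_m3u8("https://example.com/watch"))  # placeholder URL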