This commit is contained in:
doms9 2025-09-17 22:52:40 -04:00
parent 4761a15bdf
commit 00000d9795
7 changed files with 243 additions and 244 deletions

View file

@ -79,7 +79,7 @@ jobs:
uses: stefanzweifel/git-auto-commit-action@v6 uses: stefanzweifel/git-auto-commit-action@v6
with: with:
commit_message: "update M3U8" commit_message: "update M3U8"
file_pattern: "M3U8/TV.m3u8 M3U8/scrapers/caches/*.json" file_pattern: "M3U8/TV.m3u8"
commit_author: "GitHub Actions Bot <actions@github.com>" commit_author: "GitHub Actions Bot <actions@github.com>"
commit_user_name: "GitHub Actions Bot" commit_user_name: "GitHub Actions Bot"
commit_user_email: "actions@github.com" commit_user_email: "actions@github.com"

2
.gitignore vendored
View file

@ -13,4 +13,4 @@ wheels/
.python-version .python-version
stuff/ stuff/
cached-ca.pem cached-ca.pem
*.json M3U8/scrapers/caches/*.json

View file

@ -1,16 +1,7 @@
from .config import ( from .cache import load_cache, write_cache
LOGOS, from .config import LOGOS, TZ, UA, now
TZ, from .logger import get_logger
UA, from .network import capture_req, get_base, new_browser, safe_process_event
capture_req,
get_base,
get_logger,
load_cache,
new_browser,
now,
safe_process_event,
write_cache,
)
__all__ = [ __all__ = [
"LOGOS", "LOGOS",

View file

@ -0,0 +1,48 @@
import json
from datetime import datetime
from pathlib import Path
from .config import now
def near_hr(dt: datetime) -> float:
    """Return the POSIX timestamp of *dt* truncated to the start of its hour."""
    top_of_hour = dt.replace(minute=0, second=0, microsecond=0)
    return top_of_hour.timestamp()
def is_fresh(
    entry: dict,
    nearest_hr: bool,
    exp: int,
) -> bool:
    """Tell whether a cache entry is younger than *exp* seconds.

    Args:
        entry: Cache record; its ``"timestamp"`` key holds a POSIX timestamp.
        nearest_hr: When true, the entry's timestamp is first truncated to the
            start of its hour (local time) before the age is computed.
        exp: Maximum allowed age in seconds.

    Returns:
        ``True`` if the entry's age, measured against the shared ``now``
        value, is strictly below *exp*.
    """
    # Entries without a timestamp fall back to a fixed value at the end of
    # 1970, so they count as expired for any realistic *exp*.
    stamp: float | int = entry.get("timestamp", 31496400)
    stamp = near_hr(datetime.fromtimestamp(stamp)) if nearest_hr else stamp

    age = now.timestamp() - stamp
    return age < exp
def load_cache(
file: Path,
exp: int | float,
nearest_hr: bool = False,
per_entry: bool = True,
) -> dict[str, dict[str, str | float]]:
try:
data: dict = json.loads(file.read_text(encoding="utf-8"))
except (FileNotFoundError, json.JSONDecodeError):
return {}
if per_entry:
return {k: v for k, v in data.items() if is_fresh(v, nearest_hr, exp)}
ts: float | int = data.get("timestamp", 31496400)
if nearest_hr:
ts = near_hr(datetime.fromtimestamp(ts))
return data if now.timestamp() - ts < exp else {}
def write_cache(file: Path, data: dict) -> None:
    """Serialise *data* as indented JSON and write it to *file* (UTF-8).

    Missing parent directories are created first, so the first write into a
    cache directory that does not exist yet succeeds instead of raising
    ``FileNotFoundError``.
    """
    file.parent.mkdir(parents=True, exist_ok=True)
    file.write_text(json.dumps(data, indent=2), encoding="utf-8")

View file

@ -1,15 +1,6 @@
import asyncio
import json
import logging
import re
from collections.abc import Callable
from datetime import datetime from datetime import datetime
from pathlib import Path
from typing import Any
import httpx
import pytz import pytz
from playwright.async_api import Browser, BrowserContext, Playwright, Request
TZ = pytz.timezone("America/New_York") TZ = pytz.timezone("America/New_York")
@ -53,223 +44,3 @@ alias_map = {
for base, aliases in alias_map.items(): for base, aliases in alias_map.items():
for alias in aliases: for alias in aliases:
LOGOS[alias] = LOGOS[base] LOGOS[alias] = LOGOS[base]
# Layout of one log line: timestamp, padded level, logger name, padded
# message, then the emitting source location.
LOG_FMT = " ".join(
    [
        "[%(asctime)s]",
        "%(levelname)-8s",
        "[%(name)s]",
        "%(message)-70s",
        "(%(filename)s:%(lineno)d)",
    ]
)

# ANSI escape sequence per level name; "reset" switches colouring back off.
COLORS = {
    "DEBUG": "\033[37m",
    "INFO": "\033[32m",
    "WARNING": "\033[33m",
    "ERROR": "\033[31m",
    "CRITICAL": "\033[41m",
    "reset": "\033[0m",
}
class ColorFormatter(logging.Formatter):
    """Formatter that wraps the record's level name in an ANSI colour code."""

    def format(self, record: logging.LogRecord) -> str:
        """Format *record* with a coloured level name.

        The record's ``levelname`` is swapped for a coloured version only for
        the duration of the call; level names without an entry in ``COLORS``
        get no colour prefix but still receive the reset suffix.
        """
        plain_level = record.levelname
        color = COLORS.get(plain_level, "")
        record.levelname = f"{color}{plain_level}{COLORS['reset']}"
        try:
            return super().format(record)
        finally:
            # The record object is shared with every other handler, so the
            # plain level name must come back even if formatting raises.
            record.levelname = plain_level
def get_logger(name: str | None = None) -> logging.Logger:
if not name:
name = Path(__file__).stem
logger = logging.getLogger(name)
if not logger.hasHandlers():
handler = logging.StreamHandler()
formatter = ColorFormatter(LOG_FMT, datefmt="%Y-%m-%d | %H:%M:%S")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
return logger
def near_hr(dt: datetime) -> float:
    """Return the POSIX timestamp of *dt* truncated to the start of its hour."""
    top_of_hour = dt.replace(minute=0, second=0, microsecond=0)
    return top_of_hour.timestamp()
def is_fresh(
    entry: dict,
    nearest_hr: bool,
    exp: int,
) -> bool:
    """Tell whether a cache entry is younger than *exp* seconds.

    Args:
        entry: Cache record; its ``"timestamp"`` key holds a POSIX timestamp.
        nearest_hr: When true, the entry's timestamp is first truncated to the
            start of its hour (local time) before the age is computed.
        exp: Maximum allowed age in seconds.

    Returns:
        ``True`` if the entry's age, measured against the shared ``now``
        value, is strictly below *exp*.
    """
    # Entries without a timestamp fall back to a fixed value at the end of
    # 1970, so they count as expired for any realistic *exp*.
    stamp: float | int = entry.get("timestamp", 31496400)
    stamp = near_hr(datetime.fromtimestamp(stamp)) if nearest_hr else stamp

    age = now.timestamp() - stamp
    return age < exp
def load_cache(
file: Path,
exp: int | float,
nearest_hr: bool = False,
per_entry: bool = True,
) -> dict[str, dict[str, str | float]]:
try:
data: dict = json.loads(file.read_text(encoding="utf-8"))
except (FileNotFoundError, json.JSONDecodeError):
return {}
if per_entry:
return {k: v for k, v in data.items() if is_fresh(v, nearest_hr, exp)}
ts: float | int = data.get("timestamp", 31496400)
if nearest_hr:
ts = near_hr(datetime.fromtimestamp(ts))
return data if now.timestamp() - ts < exp else {}
def write_cache(file: Path, data: dict) -> None:
    """Serialise *data* as indented JSON and write it to *file* (UTF-8).

    Missing parent directories are created first, so the first write into a
    cache directory that does not exist yet succeeds instead of raising
    ``FileNotFoundError``.
    """
    file.parent.mkdir(parents=True, exist_ok=True)
    file.write_text(json.dumps(data, indent=2), encoding="utf-8")
async def safe_process_event(
fn: Callable,
url_num: int,
timeout: int | float = 20,
log: logging.Logger | None = None,
) -> Any | None:
if not log:
log = logging.getLogger(__name__)
task = asyncio.create_task(fn())
try:
return await asyncio.wait_for(task, timeout=timeout)
except asyncio.TimeoutError:
log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event")
task.cancel()
try:
await task
except asyncio.CancelledError:
pass
except Exception as e:
log.debug(f"URL {url_num}) Ignore exception after timeout: {e}")
async def check_status(client: httpx.AsyncClient, url: str) -> bool:
    """Report whether a GET of *url* succeeds with status 200.

    Any exception raised while requesting or by ``raise_for_status`` is
    treated as "not reachable" and yields ``False``.
    """
    try:
        response = await client.get(url)
        response.raise_for_status()
    except Exception:
        return False
    else:
        return response.status_code == 200
async def get_base(client: httpx.AsyncClient, mirrors: list[str]) -> str | None:
    """Return the first mirror, in list order, that answers with status 200.

    All mirrors are probed concurrently; ``None`` is returned when none of
    them responds successfully (including when *mirrors* is empty).
    """
    probes = (check_status(client, mirror) for mirror in mirrors)
    statuses = await asyncio.gather(*probes)

    return next((mirror for mirror, ok in zip(mirrors, statuses) if ok), None)
def capture_req(
    req: Request,
    captured: list[str],
    got_one: asyncio.Event,
) -> None:
    """Record the URL of *req* if it looks like a wanted ``.m3u8`` request.

    URLs containing ``amazonaws`` or ``knitcdn`` are ignored. On a match the
    URL is appended to *captured* and *got_one* is set.
    """
    valid_m3u8 = re.compile(r"^(?!.*(amazonaws|knitcdn)).*\.m3u8")

    url = req.url
    if valid_m3u8.search(url) is None:
        return

    captured.append(url)
    got_one.set()
async def new_browser(
    playwright: Playwright,
    browser: str = "firefox",
    ignore_https_errors: bool = False,
) -> tuple[Browser, BrowserContext]:
    """Return a ``(browser, context)`` pair to open pages in.

    With ``browser == "brave"`` this attaches over CDP to the endpoint
    ``http://localhost:9222`` and reuses that browser's first context. Any
    other value launches headless Firefox and creates a new context with a
    fixed desktop profile.

    Args:
        playwright: Playwright driver object used to reach the browser types.
        browser: ``"brave"`` to attach over CDP; anything else uses Firefox.
        ignore_https_errors: Forwarded to ``new_context`` on the Firefox path.

    Returns:
        The browser handle and the browser context.
    """
    # NOTE(review): indentation was lost in the reviewed copy of this code;
    # the context creation and init script are assumed to belong to the
    # Firefox branch only -- confirm against the real file.
    if browser == "brave":
        # Raises IndexError if the attached browser exposes no context.
        brwsr = await playwright.chromium.connect_over_cdp("http://localhost:9222")
        context = brwsr.contexts[0]
    else:
        brwsr = await playwright.firefox.launch(headless=True)
        # Fixed 1366x768, en-US, New York, dark-scheme profile.
        context = await brwsr.new_context(
            user_agent=UA,
            ignore_https_errors=ignore_https_errors,
            viewport={"width": 1366, "height": 768},
            device_scale_factor=1,
            locale="en-US",
            timezone_id="America/New_York",
            color_scheme="dark",
            permissions=["geolocation"],
            extra_http_headers={
                "Accept-Language": "en-US,en;q=0.9",
                "Upgrade-Insecure-Requests": "1",
            },
        )
        # The script below overrides navigator.webdriver / languages / plugins,
        # the offsetHeight of the element with id "modernizr", the screen
        # size and the WebGL vendor/renderer strings, and strips the
        # "sandbox" attribute from iframes as they are added to the page.
        await context.add_init_script(
            """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en']
});
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4]
});
const elementDescriptor = Object.getOwnPropertyDescriptor(HTMLElement.prototype, 'offsetHeight');
Object.defineProperty(HTMLDivElement.prototype, 'offsetHeight', {
...elementDescriptor,
get: function() {
if (this.id === 'modernizr') { return 24; }
return elementDescriptor.get.apply(this);
}
});
Object.defineProperty(window.screen, 'width', { get: () => 1366 });
Object.defineProperty(window.screen, 'height', { get: () => 768 });
const getParameter = WebGLRenderingContext.prototype. getParameter;
WebGLRenderingContext.prototype.getParameter = function (param) {
if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL
if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL
return getParameter.apply(this, [param]);
};
const observer = new MutationObserver(mutations => {
mutations.forEach(mutation => {
mutation.addedNodes.forEach(node => {
if (node.tagName === 'IFRAME' && node.hasAttribute('sandbox')) {
node.removeAttribute('sandbox');
}
});
});
});
observer.observe(document.documentElement, { childList: true, subtree: true });
"""
        )
    return brwsr, context

View file

@ -0,0 +1,45 @@
import logging
from pathlib import Path
# Layout of one log line: timestamp, padded level, logger name, padded
# message, then the emitting source location.
LOG_FMT = " ".join(
    [
        "[%(asctime)s]",
        "%(levelname)-8s",
        "[%(name)s]",
        "%(message)-70s",
        "(%(filename)s:%(lineno)d)",
    ]
)

# ANSI escape sequence per level name; "reset" switches colouring back off.
COLORS = {
    "DEBUG": "\033[37m",
    "INFO": "\033[32m",
    "WARNING": "\033[33m",
    "ERROR": "\033[31m",
    "CRITICAL": "\033[41m",
    "reset": "\033[0m",
}
class ColorFormatter(logging.Formatter):
    """Formatter that wraps the record's level name in an ANSI colour code."""

    def format(self, record: logging.LogRecord) -> str:
        """Format *record* with a coloured level name.

        The record's ``levelname`` is swapped for a coloured version only for
        the duration of the call; level names without an entry in ``COLORS``
        get no colour prefix but still receive the reset suffix.
        """
        plain_level = record.levelname
        color = COLORS.get(plain_level, "")
        record.levelname = f"{color}{plain_level}{COLORS['reset']}"
        try:
            return super().format(record)
        finally:
            # The record object is shared with every other handler, so the
            # plain level name must come back even if formatting raises.
            record.levelname = plain_level
def get_logger(name: str | None = None) -> logging.Logger:
if not name:
name = Path(__file__).stem
logger = logging.getLogger(name)
if not logger.hasHandlers():
handler = logging.StreamHandler()
formatter = ColorFormatter(LOG_FMT, datefmt="%Y-%m-%d | %H:%M:%S")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
return logger

View file

@ -0,0 +1,144 @@
import asyncio
import logging
import re
from collections.abc import Callable
from typing import Any
import httpx
from playwright.async_api import Browser, BrowserContext, Playwright, Request
from .config import UA
async def check_status(client: httpx.AsyncClient, url: str) -> bool:
    """Report whether a GET of *url* succeeds with status 200.

    Any exception raised while requesting or by ``raise_for_status`` is
    treated as "not reachable" and yields ``False``.
    """
    try:
        response = await client.get(url)
        response.raise_for_status()
    except Exception:
        return False
    else:
        return response.status_code == 200
async def get_base(client: httpx.AsyncClient, mirrors: list[str]) -> str | None:
    """Return the first mirror, in list order, that answers with status 200.

    All mirrors are probed concurrently; ``None`` is returned when none of
    them responds successfully (including when *mirrors* is empty).
    """
    probes = (check_status(client, mirror) for mirror in mirrors)
    statuses = await asyncio.gather(*probes)

    return next((mirror for mirror, ok in zip(mirrors, statuses) if ok), None)
async def safe_process_event(
fn: Callable,
url_num: int,
timeout: int | float = 20,
log: logging.Logger | None = None,
) -> Any | None:
if not log:
log = logging.getLogger(__name__)
task = asyncio.create_task(fn())
try:
return await asyncio.wait_for(task, timeout=timeout)
except asyncio.TimeoutError:
log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event")
task.cancel()
try:
await task
except asyncio.CancelledError:
pass
except Exception as e:
log.debug(f"URL {url_num}) Ignore exception after timeout: {e}")
def capture_req(
    req: Request,
    captured: list[str],
    got_one: asyncio.Event,
) -> None:
    """Record the URL of *req* if it looks like a wanted ``.m3u8`` request.

    URLs containing ``amazonaws`` or ``knitcdn`` are ignored. On a match the
    URL is appended to *captured* and *got_one* is set.
    """
    valid_m3u8 = re.compile(r"^(?!.*(amazonaws|knitcdn)).*\.m3u8")

    url = req.url
    if valid_m3u8.search(url) is None:
        return

    captured.append(url)
    got_one.set()
async def new_browser(
    playwright: Playwright,
    browser: str = "firefox",
    ignore_https_errors: bool = False,
) -> tuple[Browser, BrowserContext]:
    """Return a ``(browser, context)`` pair to open pages in.

    With ``browser == "brave"`` this attaches over CDP to the endpoint
    ``http://localhost:9222`` and reuses that browser's first context. Any
    other value launches headless Firefox and creates a new context with a
    fixed desktop profile.

    Args:
        playwright: Playwright driver object used to reach the browser types.
        browser: ``"brave"`` to attach over CDP; anything else uses Firefox.
        ignore_https_errors: Forwarded to ``new_context`` on the Firefox path.

    Returns:
        The browser handle and the browser context.
    """
    # NOTE(review): indentation was lost in the reviewed copy of this code;
    # the context creation and init script are assumed to belong to the
    # Firefox branch only -- confirm against the real file.
    if browser == "brave":
        # Raises IndexError if the attached browser exposes no context.
        brwsr = await playwright.chromium.connect_over_cdp("http://localhost:9222")
        context = brwsr.contexts[0]
    else:
        brwsr = await playwright.firefox.launch(headless=True)
        # Fixed 1366x768, en-US, New York, dark-scheme profile.
        context = await brwsr.new_context(
            user_agent=UA,
            ignore_https_errors=ignore_https_errors,
            viewport={"width": 1366, "height": 768},
            device_scale_factor=1,
            locale="en-US",
            timezone_id="America/New_York",
            color_scheme="dark",
            permissions=["geolocation"],
            extra_http_headers={
                "Accept-Language": "en-US,en;q=0.9",
                "Upgrade-Insecure-Requests": "1",
            },
        )
        # The script below overrides navigator.webdriver / languages / plugins,
        # the offsetHeight of the element with id "modernizr", the screen
        # size and the WebGL vendor/renderer strings, and strips the
        # "sandbox" attribute from iframes as they are added to the page.
        await context.add_init_script(
            """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en']
});
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4]
});
const elementDescriptor = Object.getOwnPropertyDescriptor(HTMLElement.prototype, 'offsetHeight');
Object.defineProperty(HTMLDivElement.prototype, 'offsetHeight', {
...elementDescriptor,
get: function() {
if (this.id === 'modernizr') { return 24; }
return elementDescriptor.get.apply(this);
}
});
Object.defineProperty(window.screen, 'width', { get: () => 1366 });
Object.defineProperty(window.screen, 'height', { get: () => 768 });
const getParameter = WebGLRenderingContext.prototype. getParameter;
WebGLRenderingContext.prototype.getParameter = function (param) {
if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL
if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL
return getParameter.apply(this, [param]);
};
const observer = new MutationObserver(mutations => {
mutations.forEach(mutation => {
mutation.addedNodes.forEach(node => {
if (node.tagName === 'IFRAME' && node.hasAttribute('sandbox')) {
node.removeAttribute('sandbox');
}
});
});
});
observer.observe(document.documentElement, { childList: true, subtree: true });
"""
        )
    return brwsr, context