From 00000d905f2f40df8e630a81fe7c9e37772e0980 Mon Sep 17 00:00:00 2001
From: doms9 <96013514+doms9@users.noreply.github.com>
Date: Thu, 4 Sep 2025 19:53:27 -0400
Subject: [PATCH] Add streambtw scraper and shared scrape.utils helpers

Add a scraper for streambtw.com, promote the duplicated load_cache()
logic and the inline capture_req() request callbacks into scrape.utils
(as load_ts_cache() and capture_req()), drop the retired ace/fstv
scrapers from fetch.py, and expand the ppv mirror list.
---
 M3U8/fetch.py                 |   7 +--
 M3U8/scrape/livetvsx.py       |  73 ++++++++++-----------
 M3U8/scrape/ppv.py            |  63 +++++++++---------
 M3U8/scrape/streambtw.py      | 112 ++++++++++++++++++++++++++++++++++
 M3U8/scrape/tvpass.py         |  15 ++---
 M3U8/scrape/utils/__init__.py |  13 +++-
 M3U8/scrape/utils/config.py   |  33 ++++++++++
 7 files changed, 224 insertions(+), 92 deletions(-)
 create mode 100644 M3U8/scrape/streambtw.py

diff --git a/M3U8/fetch.py b/M3U8/fetch.py
index 918ebe1..8f4c8bf 100644
--- a/M3U8/fetch.py
+++ b/M3U8/fetch.py
@@ -3,7 +3,7 @@ import asyncio
 from pathlib import Path
 
 import httpx
-from scrape import ace, fstv, livetvsx, ppv, tvpass
+from scrape import livetvsx, ppv, streambtw, tvpass
 from scrape.utils import get_logger
 
 log = get_logger(__name__)
@@ -40,10 +40,9 @@ async def vanilla_fetch() -> tuple[list[str], int]:
 
 async def main() -> None:
     tasks = [
-        # asyncio.create_task(ace.main(client)),
-        # asyncio.create_task(fstv.main(client)),
         asyncio.create_task(livetvsx.main(CLIENT)),
         asyncio.create_task(ppv.main(CLIENT)),
+        asyncio.create_task(streambtw.main(CLIENT)),
         asyncio.create_task(tvpass.main(CLIENT)),
         vanilla_fetch(),
     ]
@@ -52,7 +51,7 @@ async def main() -> None:
 
     base_m3u8, tvg_chno = results[-1]
 
-    additions = ace.urls | fstv.urls | livetvsx.urls | ppv.urls | tvpass.urls
+    additions = livetvsx.urls | ppv.urls | streambtw.urls | tvpass.urls
 
     lines = [
         f'#EXTINF:-1 tvg-chno="{chnl_num}" tvg-id="(N/A)" tvg-name="{event}" tvg-logo="{info["logo"]}" group-title="Live Events",{event}\n{info["url"]}'
diff --git a/M3U8/scrape/livetvsx.py b/M3U8/scrape/livetvsx.py
index 0aa9a97..6f46e14 100644
--- a/M3U8/scrape/livetvsx.py
+++ b/M3U8/scrape/livetvsx.py
@@ -4,12 +4,21 @@ import json
 import ssl
 import xml.etree.ElementTree as ET
 from datetime import datetime, timedelta
+from functools import partial
 from pathlib import Path
 
 import httpx
-from playwright.async_api import Request, async_playwright
+from playwright.async_api import async_playwright
 
-from .utils import LOGOS, TZ, get_logger, now, safe_process_event
+from .utils import (
+    LOGOS,
+    TZ,
+    capture_req,
+    get_logger,
+    load_ts_cache,
+    now,
+    safe_process_event,
+)
 
 log = get_logger(__name__)
 
@@ -62,21 +71,6 @@ async def get_cert(client: httpx.AsyncClient) -> ssl.SSLContext:
 
     return ssl.create_default_context(cafile=CERT_FILE)
 
 
-def load_cache() -> dict[str, dict[str, str | float]]:
-    try:
-        data: dict[str, dict[str, str | float]] = json.loads(
-            CACHE_FILE.read_text(encoding="utf-8")
-        )
-
-        return {
-            k: v
-            for k, v in data.items()
-            if now.timestamp() - data[k].get("timestamp", 0) < 14400  # 4 hours
-        }
-    except (FileNotFoundError, json.JSONDecodeError):
-        return {}
-
-
 async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO:
     buffer = io.BytesIO()
 
@@ -104,44 +98,38 @@ async def process_event(url: str, url_num: int) -> str | None:
         context = await browser.new_context(
             ignore_https_errors=True  # website doesn't send valid certs
         )
-        ev_page = await context.new_page()
+
+        page = await context.new_page()
 
         captured: list[str] = []
 
         got_one = asyncio.Event()
 
-        def capture_req(req: Request) -> None:
-            if (
-                ".m3u8" in req.url
-                and "amazonaws" not in req.url
-                and "knitcdn" not in req.url
-            ):
-                captured.append(req.url)
-                got_one.set()
+        handler = partial(capture_req, captured=captured, got_one=got_one)
 
         popup = None
 
         try:
-            await ev_page.goto(
+            await page.goto(
                 url,
                 wait_until="domcontentloaded",
                 timeout=10_000,
             )
 
-            btn = await ev_page.query_selector(".lnkhdr > tbody > tr > td:nth-child(2)")
+            btn = await page.query_selector(".lnkhdr > tbody > tr > td:nth-child(2)")
 
             if btn:
                 try:
                     await btn.click()
-                    await ev_page.wait_for_timeout(500)
+                    await page.wait_for_timeout(500)
                 except Exception as e:
                     log.debug(f"URL {url_num}) Failed to click Browser Links tab: {e}")
                     return
             else:
                 log.warning(f"URL {url_num}) Browser Links tab not found")
 
-            link_img = await ev_page.query_selector(
+            link_img = await page.query_selector(
                 "tr:nth-child(2) > td:nth-child(1) td:nth-child(6) img"
             )
@@ -149,10 +137,10 @@ async def process_event(url: str, url_num: int) -> str | None:
             log.warning(f"URL {url_num}) No browser link to click.")
             return
 
-        ev_page.on("request", capture_req)
+        page.on("request", handler)
 
         try:
-            async with ev_page.expect_popup(timeout=5_000) as popup_info:
+            async with page.expect_popup(timeout=5_000) as popup_info:
                 try:
                     await link_img.click()
                 except Exception as e:
@@ -162,22 +150,21 @@ async def process_event(url: str, url_num: int) -> str | None:
 
             popup = await popup_info.value
 
-            popup.on("request", capture_req)
+            popup.on("request", handler)
 
         except Exception:
             try:
                 await link_img.click()
             except Exception as e:
                 log.debug(f"URL {url_num}) Fallback click failed: {e}")
-                return
 
         wait_task = asyncio.create_task(got_one.wait())
 
         try:
             await asyncio.wait_for(wait_task, timeout=1.5e1)
 
         except asyncio.TimeoutError:
-            log.warning(f"URL {url_num}) Timed out waiting for m3u8.")
+            log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
             return
 
         finally:
@@ -189,32 +176,32 @@ async def process_event(url: str, url_num: int) -> str | None:
             except asyncio.CancelledError:
                 pass
 
-        ev_page.remove_listener("request", capture_req)
+        page.remove_listener("request", handler)
 
         if popup:
-            popup.remove_listener("request", capture_req)
+            popup.remove_listener("request", handler)
 
             await popup.close()
 
-        await ev_page.close()
+        await page.close()
 
         if captured:
             log.info(f"URL {url_num}) Captured M3U8")
 
             return captured[-1]
 
-        log.warning(f"URL {url_num}) No m3u8 captured in popup or inline playback.")
+        log.warning(f"URL {url_num}) No M3U8 captured in popup or inline playback.")
         return
 
     except Exception:
         try:
-            ev_page.remove_listener("request", capture_req)
+            page.remove_listener("request", handler)
 
             if popup:
-                popup.remove_listener("request", capture_req)
+                popup.remove_listener("request", handler)
 
                 await popup.close()
 
-            await ev_page.close()
+            await page.close()
         except Exception:
             pass
@@ -283,7 +270,7 @@ async def main(client: httpx.AsyncClient) -> None:
 
     cert = await get_cert(client)
 
-    cached_urls = load_cache()
+    cached_urls = load_ts_cache(CACHE_FILE, 14400)
 
     cached_count = len(cached_urls)
     log.info(f"Collected {cached_count} event(s) from cache")
diff --git a/M3U8/scrape/ppv.py b/M3U8/scrape/ppv.py
index 5b79eb9..7c104af 100644
--- a/M3U8/scrape/ppv.py
+++ b/M3U8/scrape/ppv.py
@@ -4,13 +4,22 @@ import asyncio
 import json
 import re
 from datetime import datetime, timedelta
+from functools import partial
 from pathlib import Path
 from urllib.parse import urljoin
 
 import httpx
-from playwright.async_api import Request, async_playwright
+from playwright.async_api import async_playwright
 
-from .utils import TZ, get_base, get_logger, now, safe_process_event
+from .utils import (
+    TZ,
+    capture_req,
+    get_base,
+    get_logger,
+    load_ts_cache,
+    now,
+    safe_process_event,
+)
 
 log = get_logger(__name__)
 
@@ -20,7 +29,13 @@ API_FILE = Path(__file__).parent / "caches" / "ppv_api.json"
 CACHE_FILE = Path(__file__).parent / "caches" / "ppv.json"
 
-MIRRORS = ["https://ppv.to", "https://ppvs.su"]
+MIRRORS = [
+    "https://ppvs.su",
+    "https://ppv.to",
+    "https://ppv.wtf",
+    "https://ppv.land",
+    "https://freeppv.fun",
+]
 
 
 async def refresh_api_cache(client: httpx.AsyncClient, url: str) -> dict:
@@ -36,21 +51,6 @@ async def refresh_api_cache(client: httpx.AsyncClient, url: str) -> dict:
 
     return r.json()
 
-def load_cache() -> dict[str, dict[str, str | float]]:
-    try:
-        data: dict[str, dict[str, str | float]] = json.loads(
-            CACHE_FILE.read_text(encoding="utf-8")
-        )
-
-        return {
-            k: v
-            for k, v in data.items()
-            if now.timestamp() - data[k].get("timestamp", 0) < 14400  # 4 hours
-        }
-    except (FileNotFoundError, json.JSONDecodeError):
-        return {}
-
-
 def load_api_cache() -> dict[str, dict[str, str | str]]:
     try:
         data: dict = json.loads(API_FILE.read_text(encoding="utf-8"))
@@ -74,16 +74,9 @@ async def process_event(url: str, url_num: int) -> str | None:
 
         got_one = asyncio.Event()
 
-        def capture_req(req: Request) -> None:
-            if (
-                ".m3u8" in req.url
-                and "amazonaws" not in req.url
-                and "knitcdn" not in req.url
-            ):
-                captured.append(req.url)
-                got_one.set()
+        handler = partial(capture_req, captured=captured, got_one=got_one)
 
-        page.on("request", capture_req)
+        page.on("request", handler)
 
         try:
             await page.goto(url, wait_until="domcontentloaded", timeout=15_000)
@@ -93,8 +86,8 @@ async def process_event(url: str, url_num: int) -> str | None:
         try:
             await asyncio.wait_for(wait_task, timeout=10)
         except asyncio.TimeoutError:
-            log.warning(f"URL {url_num}) Timed out waiting for m3u8.")
-            return None
+            log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
+            return
 
         finally:
             if not wait_task.done():
@@ -110,15 +103,15 @@ async def process_event(url: str, url_num: int) -> str | None:
 
             return captured[-1]
 
-        log.warning(f"URL {url_num}) No m3u8 captured after waiting.")
-        return None
+        log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
+        return
 
     except Exception as e:
         log.warning(f"URL {url_num}) Exception while processing: {e}")
-        return None
+        return
 
     finally:
-        page.remove_listener("request", capture_req)
+        page.remove_listener("request", handler)
 
         await page.close()
         await browser.close()
@@ -127,7 +120,7 @@ async def get_events(
     client: httpx.AsyncClient,
     api_url: str,
     cached_keys: set[str],
-) -> dict[str, dict[str, str | str]]:
+) -> list[dict[str, str]]:
 
     events: list[dict[str, str]] = []
 
@@ -186,7 +179,7 @@ async def main(client: httpx.AsyncClient) -> None:
 
     log.info(f'Scraping from "{base_url}"')
 
-    cached_urls = load_cache()
+    cached_urls = load_ts_cache(CACHE_FILE, 14400)
 
     cached_count = len(cached_urls)
     log.info(f"Collected {cached_count} event(s) from cache")
diff --git a/M3U8/scrape/streambtw.py b/M3U8/scrape/streambtw.py
new file mode 100644
index 0000000..2726b9f
--- /dev/null
+++ b/M3U8/scrape/streambtw.py
@@ -0,0 +1,112 @@
+import json
+import re
+from pathlib import Path
+from urllib.parse import urljoin
+
+import httpx
+from selectolax.parser import HTMLParser
+
+from .utils import get_logger, load_ts_cache, now, safe_process_event
+
+log = get_logger(__name__)
+
+urls: dict[str, dict[str, str | float]] = {}
+
+BASE_URL = "https://streambtw.com/"
+
+CACHE_FILE = Path(__file__).parent / "caches" / "streambtw.json"
+
+
+async def process_event(
+    client: httpx.AsyncClient,
+    url: str,
+    url_num: int,
+) -> str | None:
+
+    try:
+        r = await client.get(url)
+        r.raise_for_status()
+    except Exception as e:
+        log.error(f'URL {url_num}) Failed to fetch "{url}"\n{e}')
+        return
+
+    valid_m3u8 = re.compile(
+        r'var\s+randomM3u8\s*=\s*[\'"]([^\'"]+)[\'"]',
+        re.IGNORECASE,
+    )
+
+    if match := valid_m3u8.search(r.text):
+        log.info(f"URL {url_num}) Captured M3U8")
+        return match[1]
+
+    log.info(f"URL {url_num}) No M3U8 found")
+
+
+async def get_events(client: httpx.AsyncClient) -> list[dict[str, str]]:
+    try:
+        r = await client.get(BASE_URL)
+        r.raise_for_status()
+    except Exception as e:
+        log.error(f'Failed to fetch "{BASE_URL}"\n{e}')
+
+        return []
+
+    soup = HTMLParser(r.text)
+
+    events = []
+
+    for card in soup.css("div.container div.card"):
+        img = card.css_first("img.league-logo")
+
+        logo = img.attrs.get("src") if img else None
+
+        sport = card.css_first("h5.card-title").text(strip=True)
+
+        name = card.css_first("p.card-text").text(strip=True)
+
+        link = card.css_first("a.btn.btn-primary")
+
+        if link and (href := link.attrs.get("href")):
+            events.append(
+                {
+                    "sport": sport,
+                    "event": name,
+                    "link": urljoin(BASE_URL, href),
+                    "logo": logo,
+                }
+            )
+
+    return events
+
+
+async def main(client: httpx.AsyncClient) -> None:
+    if cached := load_ts_cache(CACHE_FILE, 86400):  # find out when site updates
+        urls.update(cached)
+        log.info(f"Collected {len(urls)} event(s) from cache")
+        return
+
+    log.info(f'Scraping from "{BASE_URL}"')
+
+    events = await get_events(client)
+
+    log.info(f"Processing {len(events)} new URLs")
+
+    for i, ev in enumerate(events, start=1):
+        url = await safe_process_event(
+            lambda: process_event(client, url=ev["link"], url_num=i),
+            url_num=i,
+            log=log,
+        )
+
+        if url:
+            entry = {
+                "url": url,
+                "logo": ev["logo"],
+                "timestamp": now.timestamp(),
+            }
+
+            urls[f"[{ev['sport']}] {ev['event']}"] = entry
+
+    log.info(f"Collected {len(urls)} event(s)")
+
+    CACHE_FILE.write_text(json.dumps(urls, indent=2), encoding="utf-8")
diff --git a/M3U8/scrape/tvpass.py b/M3U8/scrape/tvpass.py
index 7b075dc..51559a0 100644
--- a/M3U8/scrape/tvpass.py
+++ b/M3U8/scrape/tvpass.py
@@ -18,17 +18,18 @@ CACHE_FILE = Path(__file__).parent / "caches" / "tvpass.json"
 
 def load_cache() -> dict[str, str]:
     try:
         data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
-        return {} if 8 <= now.hour <= 12 else data
+        return {} if now.hour <= 12 else data
     except (FileNotFoundError, json.JSONDecodeError):
         return {}
 
 
-async def fetch_m3u8(client: httpx.AsyncClient) -> list[str] | None:
+async def fetch_m3u8(client: httpx.AsyncClient) -> list[str]:
     try:
         r = await client.get(BASE_URL)
         r.raise_for_status()
     except Exception as e:
         log.error(f'Failed to fetch "{BASE_URL}"\n{e}')
+        return []
 
     return r.text.splitlines()
@@ -41,10 +42,7 @@ async def main(client: httpx.AsyncClient) -> None:
 
     log.info(f'Scraping from "{BASE_URL}"')
 
-    if not (data := await fetch_m3u8(client)):
-        return
-
-    for i, line in enumerate(data):
+    for i, line in enumerate(data := await fetch_m3u8(client)):
         if line.startswith("#EXTINF"):
             tvg_id_match = re.search(r'tvg-id="([^"]*)"', line)
             tvg_name_match = re.search(r'tvg-name="([^"]*)"', line)
@@ -69,7 +67,6 @@ async def main(client: httpx.AsyncClient) -> None:
                 ),
             }
 
-    if urls:
-        CACHE_FILE.write_text(json.dumps(urls, indent=2), encoding="utf-8")
+    CACHE_FILE.write_text(json.dumps(urls, indent=2), encoding="utf-8")
 
-        log.info(f"Cached {len(urls)} event(s)")
+    log.info(f"Cached {len(urls)} event(s)")
diff --git a/M3U8/scrape/utils/__init__.py b/M3U8/scrape/utils/__init__.py
index d7cfcb4..b8884d4 100644
--- a/M3U8/scrape/utils/__init__.py
+++ b/M3U8/scrape/utils/__init__.py
@@ -1,10 +1,21 @@
-from .config import LOGOS, TZ, get_base, get_logger, now, safe_process_event
+from .config import (
+    LOGOS,
+    TZ,
+    capture_req,
+    get_base,
+    get_logger,
+    load_ts_cache,
+    now,
+    safe_process_event,
+)
 
 __all__ = [
     "LOGOS",
     "TZ",
+    "capture_req",
     "get_base",
     "get_logger",
+    "load_ts_cache",
     "now",
     "safe_process_event",
 ]
diff --git a/M3U8/scrape/utils/config.py b/M3U8/scrape/utils/config.py
index 862df3f..36b4c51 100644
--- a/M3U8/scrape/utils/config.py
+++ b/M3U8/scrape/utils/config.py
@@ -1,11 +1,14 @@
 import asyncio
+import json
 import logging
+import re
 from datetime import datetime
 from pathlib import Path
 from typing import Any
 
 import httpx
 import pytz
+from playwright.async_api import Request
 
 TZ = pytz.timezone("America/New_York")
 
@@ -65,6 +68,24 @@ def get_logger(name: str | None = None) -> logging.Logger:
     return logger
 
 
+def load_ts_cache(
+    file: Path,
+    cache_exp: int | float,
+) -> dict[str, dict[str, str | float]]:
+    try:
+        data: dict[str, dict[str, str | float]] = json.loads(
+            file.read_text(encoding="utf-8")
+        )
+
+        return {
+            k: v
+            for k, v in data.items()
+            if now.timestamp() - v.get("timestamp", 0) < cache_exp
+        }
+    except (FileNotFoundError, json.JSONDecodeError):
+        return {}
+
+
 async def safe_process_event(
     fn,
     url_num: int,
@@ -107,3 +128,15 @@ async def get_base(client: httpx.AsyncClient, mirrors: list[str]) -> str:
     results = await asyncio.gather(*tasks)
 
     return [url for url, ok in zip(mirrors, results) if ok][0]
+
+
+def capture_req(
+    req: Request,
+    captured: list[str],
+    got_one: asyncio.Event,
+) -> None:
+    valid_m3u8 = re.compile(r"^(?!.*(amazonaws|knitcdn)).*\.m3u8")
+
+    if valid_m3u8.search(req.url):
+        captured.append(req.url)
+        got_one.set()
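
Reviewer note, not part of the patch: a minimal sketch of how the two helpers
promoted into scrape/utils/config.py are meant to compose on a Playwright page.
The function name first_m3u8 and the 15-second timeout are illustrative, not
from the patch; only capture_req and the functools.partial wiring come from it.

    import asyncio
    from functools import partial

    from scrape.utils import capture_req


    async def first_m3u8(page, timeout: float = 15.0) -> str | None:
        """Return the first non-CDN .m3u8 request seen on `page`, or None."""
        captured: list[str] = []
        got_one = asyncio.Event()

        # Bind per-page state onto the shared module-level callback,
        # the same functools.partial wiring livetvsx.py and ppv.py now use.
        handler = partial(capture_req, captured=captured, got_one=got_one)
        page.on("request", handler)

        try:
            await asyncio.wait_for(got_one.wait(), timeout=timeout)
        except asyncio.TimeoutError:
            return None
        finally:
            page.remove_listener("request", handler)

        return captured[-1] if captured else None

Keeping capture_req at module level (rather than a closure per scraper) is what
lets both scrapers share one filtering rule; the per-call re.compile is cheap
because the re module caches compiled patterns.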