From 00000d9ef12842ade8371f8ee1f167ba6a1ac48e Mon Sep 17 00:00:00 2001 From: doms9 <96013514+doms9@users.noreply.github.com> Date: Thu, 11 Sep 2025 14:55:53 -0400 Subject: [PATCH] e --- M3U8/fetch.py | 13 +- M3U8/scrapers/fstv.py | 8 +- M3U8/scrapers/livetvsx.py | 58 ++++----- M3U8/scrapers/ppv.py | 9 +- M3U8/scrapers/streameast.py | 206 ++++++++++++++++++++++++++++++++ M3U8/scrapers/utils/__init__.py | 4 + M3U8/scrapers/utils/config.py | 64 +++++++++- 7 files changed, 318 insertions(+), 44 deletions(-) create mode 100644 M3U8/scrapers/streameast.py diff --git a/M3U8/fetch.py b/M3U8/fetch.py index fa89b75..dd76016 100644 --- a/M3U8/fetch.py +++ b/M3U8/fetch.py @@ -3,8 +3,8 @@ import asyncio from pathlib import Path import httpx -from scrapers import livetvsx, ppv, streambtw, tvpass -from scrapers.utils import get_logger +from scrapers import livetvsx, ppv, streambtw, streameast, tvpass +from scrapers.utils import UA, get_logger log = get_logger(__name__) @@ -15,9 +15,7 @@ M3U8_FILE = Path(__file__).parent / "TV.m3u8" CLIENT = httpx.AsyncClient( timeout=5, follow_redirects=True, - headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0" - }, + headers={"User-Agent": UA}, ) @@ -41,6 +39,7 @@ async def main() -> None: asyncio.create_task(livetvsx.main(CLIENT)), asyncio.create_task(ppv.main(CLIENT)), asyncio.create_task(streambtw.main(CLIENT)), + asyncio.create_task(streameast.main(CLIENT)), asyncio.create_task(tvpass.main(CLIENT)), vanilla_fetch(), ] @@ -49,7 +48,9 @@ async def main() -> None: base_m3u8, tvg_chno = results[-1] - additions = livetvsx.urls | ppv.urls | streambtw.urls | tvpass.urls + additions = ( + livetvsx.urls | ppv.urls | streambtw.urls | streameast.urls | tvpass.urls + ) lines = [ f'#EXTINF:-1 tvg-chno="{chnl_num}" tvg-id="(N/A)" tvg-name="{event}" tvg-logo="{info["logo"]}" group-title="Live Events",{event}\n{info["url"]}' diff --git a/M3U8/scrapers/fstv.py b/M3U8/scrapers/fstv.py index 1abda54..1ab4012 100644 --- a/M3U8/scrapers/fstv.py +++ b/M3U8/scrapers/fstv.py @@ -18,7 +18,7 @@ MIRRORS = [ ] -async def get_hrefs(client: httpx.AsyncClient, base_url: str) -> list[tuple[str, str]]: +async def get_events(client: httpx.AsyncClient, base_url: str) -> list[tuple[str, str]]: log.info(f'Scraping from "{base_url}"') try: @@ -55,7 +55,7 @@ async def get_hrefs(client: httpx.AsyncClient, base_url: str) -> list[tuple[str, return events.items() -async def fetch_m3u8(client: httpx.AsyncClient, url: str) -> tuple[str, list[str]]: +async def process_events(client: httpx.AsyncClient, url: str) -> tuple[str, list[str]]: try: r = await client.get(url) r.raise_for_status() @@ -89,9 +89,9 @@ async def main(client: httpx.AsyncClient) -> None: log.warning("No working FSTV mirrors") return - events = await get_hrefs(client, base_url) + events = await get_events(client, base_url) - tasks = [fetch_m3u8(client, href) for _, href in events if href] + tasks = [process_events(client, href) for _, href in events if href] results = await asyncio.gather(*tasks) for (event, _), (match_name, m3u8_urls) in zip(events, results): diff --git a/M3U8/scrapers/livetvsx.py b/M3U8/scrapers/livetvsx.py index 7afc54e..0c062b0 100644 --- a/M3U8/scrapers/livetvsx.py +++ b/M3U8/scrapers/livetvsx.py @@ -14,6 +14,7 @@ from .utils import ( LOGOS, TZ, capture_req, + firefox, get_logger, load_cache, now, @@ -96,11 +97,7 @@ async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO | No async def 
process_event(url: str, url_num: int) -> str | None: async with async_playwright() as p: - browser = await p.firefox.launch(headless=True) - - context = await browser.new_context( - ignore_https_errors=True # website doesn't send valid certs - ) + browser, context = await firefox(p, ignore_https_errors=True) page = await context.new_page() @@ -217,7 +214,9 @@ async def get_events( ) -> list[dict[str, str]]: events: list[dict[str, str]] = [] - window_start, window_end = now - timedelta(hours=1), now + timedelta(minutes=30) + + start_dt = now - timedelta(minutes=30) + end_dt = now + timedelta(minutes=30) if buffer := await fetch_xml_stream(url, ssl_ctx): pub_date_format = "%a, %d %b %Y %H:%M:%S %z" @@ -236,30 +235,33 @@ async def get_events( elem.clear() continue - if window_start <= dt <= window_end: - sport, event = ( - ( - desc.split(".")[0].strip(), - " ".join(p.strip() for p in desc.split(".")[1:]), - ) - if desc - else ("", "") + if not start_dt <= dt <= end_dt: + elem.clear() + continue + + sport, event = ( + ( + desc.split(".")[0].strip(), + " ".join(p.strip() for p in desc.split(".")[1:]), ) + if desc + else ("", "") + ) - key = f"[{sport}: {event}] {title}" + key = f"[{sport}: {event}] {title}" - if key in cached_keys: - elem.clear() - continue + if key in cached_keys: + elem.clear() + continue - events.append( - { - "sport": sport, - "event": event, - "title": title, - "link": link, - } - ) + events.append( + { + "sport": sport, + "event": event, + "title": title, + "link": link, + } + ) elem.clear() @@ -312,8 +314,8 @@ async def main(client: httpx.AsyncClient) -> None: urls[key] = cached_urls[key] = entry if new_count := len(cached_urls) - cached_count: - CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8") - log.info(f"Collected and cached {new_count} new event(s)") else: log.info("No new events found") + + CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8") diff --git a/M3U8/scrapers/ppv.py b/M3U8/scrapers/ppv.py index 66be2e8..fbe1da5 100644 --- a/M3U8/scrapers/ppv.py +++ b/M3U8/scrapers/ppv.py @@ -14,6 +14,7 @@ from playwright.async_api import async_playwright from .utils import ( TZ, capture_req, + firefox, get_base, get_logger, load_cache, @@ -55,9 +56,7 @@ async def refresh_api_cache( async def process_event(url: str, url_num: int) -> str | None: async with async_playwright() as p: - browser = await p.firefox.launch(headless=True) - - context = await browser.new_context() + browser, context = await firefox(p) page = await context.new_page() @@ -210,11 +209,11 @@ async def main(client: httpx.AsyncClient) -> None: urls[key] = cached_urls[key] = entry if new_count := len(cached_urls) - cached_count: - CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8") - log.info(f"Collected and cached {new_count} new event(s)") else: log.info("No new events found") + CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8") + # works if no cloudflare bot detection diff --git a/M3U8/scrapers/streameast.py b/M3U8/scrapers/streameast.py new file mode 100644 index 0000000..706d3f3 --- /dev/null +++ b/M3U8/scrapers/streameast.py @@ -0,0 +1,206 @@ +import asyncio +import json +from datetime import datetime, timedelta +from functools import partial +from pathlib import Path +from urllib.parse import urljoin + +import httpx +from playwright.async_api import async_playwright +from selectolax.parser import HTMLParser + +from .utils import ( + LOGOS, + TZ, + capture_req, + firefox, + get_base, + get_logger, + 
load_cache, + now, + safe_process_event, +) + +log = get_logger(__name__) + +urls: dict[str, dict[str, str | float]] = {} + +CACHE_FILE = Path(__file__).parent / "caches" / "streameast.json" + +MIRRORS = [ + "https://streameast.ga", + "https://streameast.tw", + "https://streameast.ph", + "https://streameast.sg", + "https://streameast.ch", + "https://streameast.ec", + "https://streameast.fi", + "https://streameast.ms", + "https://streameast.ps", + "https://streameast.cf", + "https://streameast.sk", + "https://thestreameast.co", + "https://thestreameast.fun", + "https://thestreameast.ru", + "https://thestreameast.su", +] + +LOGOS["CFB"] = LOGOS["NCAAF"] +LOGOS["CBB"] = LOGOS["NCAAB"] + + +async def process_event(url: str, url_num: int) -> str | None: + async with async_playwright() as p: + browser, context = await firefox(p) + + page = await context.new_page() + + captured: list[str] = [] + + got_one = asyncio.Event() + + handler = partial(capture_req, captured=captured, got_one=got_one) + + page.on("request", handler) + + try: + await page.goto(url, wait_until="domcontentloaded", timeout=15_000) + + wait_task = asyncio.create_task(got_one.wait()) + + try: + await asyncio.wait_for(wait_task, timeout=10) + except asyncio.TimeoutError: + log.warning(f"URL {url_num}) Timed out waiting for M3U8.") + return + + finally: + if not wait_task.done(): + wait_task.cancel() + + try: + await wait_task + except asyncio.CancelledError: + pass + + if captured: + log.info(f"URL {url_num}) Captured M3U8") + + return captured[-1] + + log.warning(f"URL {url_num}) No M3U8 captured after waiting.") + return + + except Exception as e: + log.warning(f"URL {url_num}) Exception while processing: {e}") + return + + finally: + page.remove_listener("request", handler) + await page.close() + await browser.close() + + +async def get_events( + client: httpx.AsyncClient, + url: str, + cached_keys: list[str], +) -> list[dict[str, str]]: + try: + r = await client.get(url) + r.raise_for_status() + except Exception as e: + log.error(f'Failed to fetch "{url}"\n{e}') + + return [] + + soup = HTMLParser(r.text) + events = [] + + start_dt = now - timedelta(minutes=30) + end_dt = now + timedelta(minutes=30) + + for li in soup.css("li.f1-podium--item"): + a = li.css_first("a.f1-podium--link") + + if not a: + continue + + href = urljoin(url, a.attributes.get("href", "")) + + sport = a.css_first(".MacBaslikKat").text(strip=True) + + name = a.css_first(".MacIsimleri").text(strip=True) + + time_span = a.css_first(".f1-podium--time") + time_text = time_span.text(strip=True) + timestamp = int(time_span.attributes.get("data-zaman")) + + key = f"[{sport}] {name}" + + if key in cached_keys: + continue + + event_dt = datetime.fromtimestamp(timestamp, TZ) + + if time_text == "LIVE" or (start_dt <= event_dt < end_dt): + events.append( + { + "sport": sport, + "event": name, + "link": href, + "logo": LOGOS.get( + sport, + "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png", + ), + } + ) + + return events + + +async def main(client: httpx.AsyncClient) -> None: + cached_urls = load_cache(CACHE_FILE, exp=14400) + cached_count = len(cached_urls) + urls.update(cached_urls) + + log.info(f"Collected {cached_count} event(s) from cache") + + if not (base_url := await get_base(client, MIRRORS)): + log.warning("No working StreamEast mirrors") + return + + log.info(f'Scraping from "{base_url}"') + + events = await get_events( + client, + base_url, + set(cached_urls.keys()), + ) + + log.info(f"Processing {len(events)} new URL(s)") + + for i, ev in 
enumerate(events, start=1): + url = await safe_process_event( + lambda: process_event(ev["link"], url_num=i), + url_num=i, + log=log, + ) + + if url: + entry = { + "url": url, + "logo": ev["logo"], + "timestamp": now.timestamp(), + } + + key = f"[{ev['sport']}] {ev['event']}" + + urls[key] = cached_urls[key] = entry + + if new_count := len(cached_urls) - cached_count: + log.info(f"Collected and cached {new_count} new event(s)") + else: + log.info("No new events found") + + CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8") diff --git a/M3U8/scrapers/utils/__init__.py b/M3U8/scrapers/utils/__init__.py index de1bf57..855bdb0 100644 --- a/M3U8/scrapers/utils/__init__.py +++ b/M3U8/scrapers/utils/__init__.py @@ -1,7 +1,9 @@ from .config import ( LOGOS, TZ, + UA, capture_req, + firefox, get_base, get_logger, load_cache, @@ -12,7 +14,9 @@ from .config import ( __all__ = [ "LOGOS", "TZ", + "UA", "capture_req", + "firefox", "get_base", "get_logger", "load_cache", diff --git a/M3U8/scrapers/utils/config.py b/M3U8/scrapers/utils/config.py index a3eaaae..5731e0f 100644 --- a/M3U8/scrapers/utils/config.py +++ b/M3U8/scrapers/utils/config.py @@ -9,12 +9,18 @@ from typing import Any import httpx import pytz -from playwright.async_api import Request +from playwright.async_api import Request, Playwright, Browser, BrowserContext TZ = pytz.timezone("America/New_York") now = datetime.now(TZ) +UA = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0" +) + LOGOS = { "MLB": "https://i.gyazo.com/0fe7865ef2f06c9507791b24f04dbca8.png", "NBA": "https://i.gyazo.com/773c23570f095a5d549c23b9401d83f4.png", @@ -162,3 +168,59 @@ def capture_req( if valid_m3u8.search(req.url): captured.append(req.url) got_one.set() + + +async def firefox( + playwright: Playwright, ignore_https_errors: bool = False +) -> tuple[Browser, BrowserContext]: + browser = await playwright.firefox.launch(headless=True) + + context = await browser.new_context( + user_agent=UA, + viewport={"width": 1366, "height": 768}, + device_scale_factor=1, + locale="en-US", + timezone_id="America/New_York", + color_scheme="dark", + permissions=["geolocation"], + extra_http_headers={ + "Accept-Language": "en-US,en;q=0.9", + "Upgrade-Insecure-Requests": "1", + }, + ignore_https_errors=ignore_https_errors, + ) + + await context.add_init_script( + """ + Object.defineProperty(navigator, 'webdriver', {get: () => undefined}); + + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'] + }); + + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4] + }); + + const elementDescriptor = Object.getOwnPropertyDescriptor(HTMLElement.prototype, 'offsetHeight'); + Object.defineProperty(HTMLDivElement.prototype, 'offsetHeight', { + ...elementDescriptor, + get: function() { + if (this.id === 'modernizr') { return 24; } + return elementDescriptor.get.apply(this); + } + }); + + Object.defineProperty(window.screen, 'width', { get: () => 1366 }); + Object.defineProperty(window.screen, 'height', { get: () => 768 }); + + const getParameter = WebGLRenderingContext.prototype. getParameter; + WebGLRenderingContext.prototype.getParameter = function (param) { + if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL + if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL + return getParameter.apply(this, [param]); + }; + """ + ) + + return browser, context
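
Usage sketch: the scrapers touched above (ppv.py, livetvsx.py, and the new streameast.py) all pair the new firefox() helper with the existing capture_req handler to sniff the first M3U8 request off an event page. A minimal standalone version of that pattern, using an illustrative placeholder URL and timeouts rather than a real mirror, looks roughly like this:

import asyncio
from functools import partial

from playwright.async_api import async_playwright

from scrapers.utils import capture_req, firefox


async def grab_m3u8(url: str) -> str | None:
    async with async_playwright() as p:
        # firefox() launches headless Firefox and returns a hardened context
        # (shared UA, spoofed navigator/WebGL properties, en-US locale/timezone).
        browser, context = await firefox(p)
        page = await context.new_page()

        captured: list[str] = []
        got_one = asyncio.Event()

        # capture_req appends any request whose URL looks like an M3U8 playlist
        # and sets got_one so the caller can stop waiting early.
        handler = partial(capture_req, captured=captured, got_one=got_one)
        page.on("request", handler)

        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=15_000)
            try:
                await asyncio.wait_for(got_one.wait(), timeout=10)
            except asyncio.TimeoutError:
                return None
            return captured[-1] if captured else None
        finally:
            page.remove_listener("request", handler)
            await page.close()
            await browser.close()


if __name__ == "__main__":
    # Placeholder URL for illustration only; real callers pass scraped event links.
    print(asyncio.run(grab_m3u8("https://example.stream/event")))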