doms9 2025-09-11 14:55:53 -04:00
parent 8796e2dfc6
commit 00000d9ef1
7 changed files with 318 additions and 44 deletions


@@ -3,8 +3,8 @@ import asyncio
 from pathlib import Path
 
 import httpx
 
-from scrapers import livetvsx, ppv, streambtw, tvpass
-from scrapers.utils import get_logger
+from scrapers import livetvsx, ppv, streambtw, streameast, tvpass
+from scrapers.utils import UA, get_logger
 
 log = get_logger(__name__)
@@ -15,9 +15,7 @@ M3U8_FILE = Path(__file__).parent / "TV.m3u8"
 
 CLIENT = httpx.AsyncClient(
     timeout=5,
     follow_redirects=True,
-    headers={
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0"
-    },
+    headers={"User-Agent": UA},
 )
 
@@ -41,6 +39,7 @@ async def main() -> None:
         asyncio.create_task(livetvsx.main(CLIENT)),
         asyncio.create_task(ppv.main(CLIENT)),
         asyncio.create_task(streambtw.main(CLIENT)),
+        asyncio.create_task(streameast.main(CLIENT)),
         asyncio.create_task(tvpass.main(CLIENT)),
         vanilla_fetch(),
     ]
@@ -49,7 +48,9 @@ async def main() -> None:
 
     base_m3u8, tvg_chno = results[-1]
 
-    additions = livetvsx.urls | ppv.urls | streambtw.urls | tvpass.urls
+    additions = (
+        livetvsx.urls | ppv.urls | streambtw.urls | streameast.urls | tvpass.urls
+    )
 
     lines = [
         f'#EXTINF:-1 tvg-chno="{chnl_num}" tvg-id="(N/A)" tvg-name="{event}" tvg-logo="{info["logo"]}" group-title="Live Events",{event}\n{info["url"]}'


@@ -18,7 +18,7 @@ MIRRORS = [
 ]
 
 
-async def get_hrefs(client: httpx.AsyncClient, base_url: str) -> list[tuple[str, str]]:
+async def get_events(client: httpx.AsyncClient, base_url: str) -> list[tuple[str, str]]:
     log.info(f'Scraping from "{base_url}"')
 
     try:
@@ -55,7 +55,7 @@ async def get_hrefs(client: httpx.AsyncClient, base_url: str) -> list[tuple[str, str]]:
     return events.items()
 
 
-async def fetch_m3u8(client: httpx.AsyncClient, url: str) -> tuple[str, list[str]]:
+async def process_events(client: httpx.AsyncClient, url: str) -> tuple[str, list[str]]:
     try:
         r = await client.get(url)
         r.raise_for_status()
@@ -89,9 +89,9 @@ async def main(client: httpx.AsyncClient) -> None:
         log.warning("No working FSTV mirrors")
         return
 
-    events = await get_hrefs(client, base_url)
-    tasks = [fetch_m3u8(client, href) for _, href in events if href]
+    events = await get_events(client, base_url)
+    tasks = [process_events(client, href) for _, href in events if href]
     results = await asyncio.gather(*tasks)
 
     for (event, _), (match_name, m3u8_urls) in zip(events, results):


@@ -14,6 +14,7 @@ from .utils import (
     LOGOS,
     TZ,
     capture_req,
+    firefox,
     get_logger,
     load_cache,
     now,
@@ -96,11 +97,7 @@ async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO | None:
 
 
 async def process_event(url: str, url_num: int) -> str | None:
     async with async_playwright() as p:
-        browser = await p.firefox.launch(headless=True)
-        context = await browser.new_context(
-            ignore_https_errors=True  # website doesn't send valid certs
-        )
+        browser, context = await firefox(p, ignore_https_errors=True)
 
         page = await context.new_page()
@@ -217,7 +214,9 @@ async def get_events(
 ) -> list[dict[str, str]]:
     events: list[dict[str, str]] = []
 
-    window_start, window_end = now - timedelta(hours=1), now + timedelta(minutes=30)
+    start_dt = now - timedelta(minutes=30)
+    end_dt = now + timedelta(minutes=30)
 
     if buffer := await fetch_xml_stream(url, ssl_ctx):
         pub_date_format = "%a, %d %b %Y %H:%M:%S %z"
@@ -236,30 +235,33 @@ async def get_events(
                 elem.clear()
                 continue
 
-            if window_start <= dt <= window_end:
-                sport, event = (
-                    (
-                        desc.split(".")[0].strip(),
-                        " ".join(p.strip() for p in desc.split(".")[1:]),
-                    )
-                    if desc
-                    else ("", "")
-                )
+            if not start_dt <= dt <= end_dt:
+                elem.clear()
+                continue
+
+            sport, event = (
+                (
+                    desc.split(".")[0].strip(),
+                    " ".join(p.strip() for p in desc.split(".")[1:]),
+                )
+                if desc
+                else ("", "")
+            )
 
-                key = f"[{sport}: {event}] {title}"
+            key = f"[{sport}: {event}] {title}"
 
-                if key in cached_keys:
-                    elem.clear()
-                    continue
+            if key in cached_keys:
+                elem.clear()
+                continue
 
-                events.append(
-                    {
-                        "sport": sport,
-                        "event": event,
-                        "title": title,
-                        "link": link,
-                    }
-                )
+            events.append(
+                {
+                    "sport": sport,
+                    "event": event,
+                    "title": title,
+                    "link": link,
+                }
+            )
 
             elem.clear()
@@ -312,8 +314,8 @@ async def main(client: httpx.AsyncClient) -> None:
             urls[key] = cached_urls[key] = entry
 
     if new_count := len(cached_urls) - cached_count:
-        CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")
         log.info(f"Collected and cached {new_count} new event(s)")
     else:
         log.info("No new events found")
+
+    CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")


@@ -14,6 +14,7 @@ from playwright.async_api import async_playwright
 from .utils import (
     TZ,
     capture_req,
+    firefox,
     get_base,
     get_logger,
     load_cache,
@@ -55,9 +56,7 @@ async def refresh_api_cache(
 async def process_event(url: str, url_num: int) -> str | None:
     async with async_playwright() as p:
-        browser = await p.firefox.launch(headless=True)
-        context = await browser.new_context()
+        browser, context = await firefox(p)
 
         page = await context.new_page()
@@ -210,11 +209,11 @@ async def main(client: httpx.AsyncClient) -> None:
             urls[key] = cached_urls[key] = entry
 
     if new_count := len(cached_urls) - cached_count:
-        CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")
         log.info(f"Collected and cached {new_count} new event(s)")
     else:
         log.info("No new events found")
+
+    CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")
 
 
 # works if no cloudflare bot detection

M3U8/scrapers/streameast.py (new file, 206 lines)

@@ -0,0 +1,206 @@
import asyncio
import json
from datetime import datetime, timedelta
from functools import partial
from pathlib import Path
from urllib.parse import urljoin

import httpx
from playwright.async_api import async_playwright
from selectolax.parser import HTMLParser

from .utils import (
    LOGOS,
    TZ,
    capture_req,
    firefox,
    get_base,
    get_logger,
    load_cache,
    now,
    safe_process_event,
)

log = get_logger(__name__)

urls: dict[str, dict[str, str | float]] = {}

CACHE_FILE = Path(__file__).parent / "caches" / "streameast.json"

MIRRORS = [
    "https://streameast.ga",
    "https://streameast.tw",
    "https://streameast.ph",
    "https://streameast.sg",
    "https://streameast.ch",
    "https://streameast.ec",
    "https://streameast.fi",
    "https://streameast.ms",
    "https://streameast.ps",
    "https://streameast.cf",
    "https://streameast.sk",
    "https://thestreameast.co",
    "https://thestreameast.fun",
    "https://thestreameast.ru",
    "https://thestreameast.su",
]

LOGOS["CFB"] = LOGOS["NCAAF"]
LOGOS["CBB"] = LOGOS["NCAAB"]


async def process_event(url: str, url_num: int) -> str | None:
    async with async_playwright() as p:
        browser, context = await firefox(p)

        page = await context.new_page()

        captured: list[str] = []
        got_one = asyncio.Event()

        handler = partial(capture_req, captured=captured, got_one=got_one)
        page.on("request", handler)

        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=15_000)

            wait_task = asyncio.create_task(got_one.wait())

            try:
                await asyncio.wait_for(wait_task, timeout=10)
            except asyncio.TimeoutError:
                log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
                return
            finally:
                if not wait_task.done():
                    wait_task.cancel()
                    try:
                        await wait_task
                    except asyncio.CancelledError:
                        pass

            if captured:
                log.info(f"URL {url_num}) Captured M3U8")
                return captured[-1]

            log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
            return
        except Exception as e:
            log.warning(f"URL {url_num}) Exception while processing: {e}")
            return
        finally:
            page.remove_listener("request", handler)
            await page.close()
            await browser.close()


async def get_events(
    client: httpx.AsyncClient,
    url: str,
    cached_keys: list[str],
) -> list[dict[str, str]]:
    try:
        r = await client.get(url)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{url}"\n{e}')
        return []

    soup = HTMLParser(r.text)

    events = []

    start_dt = now - timedelta(minutes=30)
    end_dt = now + timedelta(minutes=30)

    for li in soup.css("li.f1-podium--item"):
        a = li.css_first("a.f1-podium--link")
        if not a:
            continue

        href = urljoin(url, a.attributes.get("href", ""))

        sport = a.css_first(".MacBaslikKat").text(strip=True)
        name = a.css_first(".MacIsimleri").text(strip=True)

        time_span = a.css_first(".f1-podium--time")
        time_text = time_span.text(strip=True)
        timestamp = int(time_span.attributes.get("data-zaman"))

        key = f"[{sport}] {name}"

        if key in cached_keys:
            continue

        event_dt = datetime.fromtimestamp(timestamp, TZ)

        if time_text == "LIVE" or (start_dt <= event_dt < end_dt):
            events.append(
                {
                    "sport": sport,
                    "event": name,
                    "link": href,
                    "logo": LOGOS.get(
                        sport,
                        "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
                    ),
                }
            )

    return events


async def main(client: httpx.AsyncClient) -> None:
    cached_urls = load_cache(CACHE_FILE, exp=14400)
    cached_count = len(cached_urls)

    urls.update(cached_urls)
    log.info(f"Collected {cached_count} event(s) from cache")

    if not (base_url := await get_base(client, MIRRORS)):
        log.warning("No working StreamEast mirrors")
        return

    log.info(f'Scraping from "{base_url}"')

    events = await get_events(
        client,
        base_url,
        set(cached_urls.keys()),
    )

    log.info(f"Processing {len(events)} new URL(s)")

    for i, ev in enumerate(events, start=1):
        url = await safe_process_event(
            lambda: process_event(ev["link"], url_num=i),
            url_num=i,
            log=log,
        )

        if url:
            entry = {
                "url": url,
                "logo": ev["logo"],
                "timestamp": now.timestamp(),
            }

            key = f"[{ev['sport']}] {ev['event']}"
            urls[key] = cached_urls[key] = entry

    if new_count := len(cached_urls) - cached_count:
        log.info(f"Collected and cached {new_count} new event(s)")
    else:
        log.info("No new events found")

    CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")
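
As an aside, the CSS selectors in get_events() imply roughly the following markup shape. This is an illustrative sketch only: the class names and the data-zaman attribute are taken from the code above, while the sample event values are invented.

# Illustrative only: sample markup guessed from the selectors used in get_events();
# the event data below is made up.
from selectolax.parser import HTMLParser

SAMPLE = """
<li class="f1-podium--item">
  <a class="f1-podium--link" href="/event/example">
    <span class="MacBaslikKat">NFL</span>
    <span class="MacIsimleri">Team A - Team B</span>
    <span class="f1-podium--time" data-zaman="1757620800">20:00</span>
  </a>
</li>
"""

soup = HTMLParser(SAMPLE)
for li in soup.css("li.f1-podium--item"):
    a = li.css_first("a.f1-podium--link")
    print(a.css_first(".MacBaslikKat").text(strip=True))                  # sport category
    print(a.css_first(".MacIsimleri").text(strip=True))                   # event name
    print(a.css_first(".f1-podium--time").attributes.get("data-zaman"))   # unix start time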


@@ -1,7 +1,9 @@
 from .config import (
     LOGOS,
     TZ,
+    UA,
     capture_req,
+    firefox,
     get_base,
     get_logger,
     load_cache,
@@ -12,7 +14,9 @@ from .config import (
 __all__ = [
     "LOGOS",
     "TZ",
+    "UA",
     "capture_req",
+    "firefox",
     "get_base",
     "get_logger",
     "load_cache",


@@ -9,12 +9,18 @@ from typing import Any
 import httpx
 import pytz
-from playwright.async_api import Request
+from playwright.async_api import Request, Playwright, Browser, BrowserContext
 
 TZ = pytz.timezone("America/New_York")
 now = datetime.now(TZ)
 
+UA = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0"
+)
+
 LOGOS = {
     "MLB": "https://i.gyazo.com/0fe7865ef2f06c9507791b24f04dbca8.png",
     "NBA": "https://i.gyazo.com/773c23570f095a5d549c23b9401d83f4.png",
@@ -162,3 +168,59 @@ def capture_req(
     if valid_m3u8.search(req.url):
         captured.append(req.url)
         got_one.set()
+
+
+async def firefox(
+    playwright: Playwright, ignore_https_errors: bool = False
+) -> tuple[Browser, BrowserContext]:
+    browser = await playwright.firefox.launch(headless=True)
+
+    context = await browser.new_context(
+        user_agent=UA,
+        viewport={"width": 1366, "height": 768},
+        device_scale_factor=1,
+        locale="en-US",
+        timezone_id="America/New_York",
+        color_scheme="dark",
+        permissions=["geolocation"],
+        extra_http_headers={
+            "Accept-Language": "en-US,en;q=0.9",
+            "Upgrade-Insecure-Requests": "1",
+        },
+        ignore_https_errors=ignore_https_errors,
+    )
+
+    await context.add_init_script(
+        """
+        Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
+
+        Object.defineProperty(navigator, 'languages', {
+            get: () => ['en-US', 'en']
+        });
+
+        Object.defineProperty(navigator, 'plugins', {
+            get: () => [1, 2, 3, 4]
+        });
+
+        const elementDescriptor = Object.getOwnPropertyDescriptor(HTMLElement.prototype, 'offsetHeight');
+
+        Object.defineProperty(HTMLDivElement.prototype, 'offsetHeight', {
+            ...elementDescriptor,
+            get: function() {
+                if (this.id === 'modernizr') { return 24; }
+                return elementDescriptor.get.apply(this);
+            }
+        });
+
+        Object.defineProperty(window.screen, 'width', { get: () => 1366 });
+        Object.defineProperty(window.screen, 'height', { get: () => 768 });
+
+        const getParameter = WebGLRenderingContext.prototype.getParameter;
+        WebGLRenderingContext.prototype.getParameter = function (param) {
+            if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL
+            if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL
+            return getParameter.apply(this, [param]);
+        };
+        """
+    )
+
+    return browser, context
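
As an aside, here is a minimal usage sketch of the new firefox() helper together with capture_req(), mirroring the process_event() functions in this commit; the target URL is a placeholder, and error handling is reduced to the essentials.

# Rough usage sketch of firefox() and capture_req() from scrapers.utils,
# modeled on the process_event() functions above; the URL is a placeholder.
import asyncio
from functools import partial

from playwright.async_api import async_playwright

from scrapers.utils import capture_req, firefox


async def first_m3u8(url: str) -> str | None:
    async with async_playwright() as p:
        browser, context = await firefox(p)  # hardened context with the shared UA
        page = await context.new_page()

        captured: list[str] = []
        got_one = asyncio.Event()
        page.on("request", partial(capture_req, captured=captured, got_one=got_one))

        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=15_000)
            await asyncio.wait_for(got_one.wait(), timeout=10)
        except asyncio.TimeoutError:
            pass  # no stream request seen within the window
        finally:
            await browser.close()

        return captured[-1] if captured else None


# asyncio.run(first_m3u8("https://example.com/some-event"))  # placeholder URL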