From 00000d922c0b107e73a90d7d45a73b2372e5733f Mon Sep 17 00:00:00 2001
From: doms9 <96013514+doms9@users.noreply.github.com>
Date: Fri, 10 Oct 2025 17:14:32 -0400
Subject: [PATCH] e

---
 M3U8/fetch.py                         |   8 +-
 M3U8/scrapers/old/ace.py              | 127 ++++++++++++
 M3U8/scrapers/{ => old}/streameast.py |   3 +
 M3U8/scrapers/streamed.py             |   2 +-
 M3U8/scrapers/strmd.py                | 272 ++++++++++++++++++++++++++
 M3U8/scrapers/utils/caching.py        |  12 +-
 6 files changed, 417 insertions(+), 7 deletions(-)
 create mode 100644 M3U8/scrapers/old/ace.py
 rename M3U8/scrapers/{ => old}/streameast.py (95%)
 create mode 100644 M3U8/scrapers/strmd.py

diff --git a/M3U8/fetch.py b/M3U8/fetch.py
index 4952458..2040b75 100644
--- a/M3U8/fetch.py
+++ b/M3U8/fetch.py
@@ -2,7 +2,7 @@ import asyncio
 
 from pathlib import Path
 
-from scrapers import fstv, livetvsx, ppv, streambtw, streameast, streamed, tvpass
+from scrapers import fstv, livetvsx, ppv, streambtw, streamed, strmd, tvpass
 from scrapers.utils import get_logger, network
 
 log = get_logger(__name__)
@@ -27,11 +27,11 @@ async def main() -> None:
 
     tasks = [
         asyncio.create_task(fstv.scrape(network.client)),
-        # asyncio.create_task(livetvsx.scrape(network.client)),
+        asyncio.create_task(livetvsx.scrape(network.client)),
         asyncio.create_task(ppv.scrape(network.client)),
         asyncio.create_task(streambtw.scrape(network.client)),
-        # asyncio.create_task(streameast.scrape(network.client)),
         asyncio.create_task(streamed.scrape(network.client)),
+        asyncio.create_task(strmd.scrape(network.client)),
         asyncio.create_task(tvpass.scrape(network.client)),
     ]
 
@@ -42,8 +42,8 @@ async def main() -> None:
         | livetvsx.urls
         | ppv.urls
         | streambtw.urls
-        | streameast.urls
         | streamed.urls
+        | strmd.urls
         | tvpass.urls
     )
 
diff --git a/M3U8/scrapers/old/ace.py b/M3U8/scrapers/old/ace.py
new file mode 100644
index 0000000..ae7e7c1
--- /dev/null
+++ b/M3U8/scrapers/old/ace.py
@@ -0,0 +1,127 @@
+import asyncio
+import re
+from urllib.parse import urljoin
+
+import httpx
+from selectolax.parser import HTMLParser, Node
+
+from .utils import get_base, get_logger, leagues
+
+log = get_logger(__name__)
+
+urls: dict[str, dict[str, str]] = {}
+
+MIRRORS = ["https://aceztrims.pages.dev/", "https://acestrlms.pages.dev/"]
+
+
+def is_valid_href(a: Node) -> bool:
+    href = a.attributes.get("href", "")
+    return href.startswith("/") and href != "/news/"
+
+
+async def get_schedule(client: httpx.AsyncClient, base_url: str) -> list[dict]:
+    log.info(f'Scraping from "{base_url}"')
+
+    try:
+        r = await client.get(base_url)
+        r.raise_for_status()
+    except Exception as e:
+        log.error(f'Failed to fetch "{base_url}": {e}')
+        return []
+
+    html = re.sub(r"<!--.*?-->", "", r.text, flags=re.DOTALL)
+
+    tree = HTMLParser(html)
+
+    events = []
+
+    for a in filter(is_valid_href, tree.css("a[href]")):
+        href = a.attributes.get("href", "")
+
+        title_text = a.text(strip=True)
+
+        after_time = (
+            title_text.split("//", 1)[1].strip() if "//" in title_text else title_text
+        )
+
+        if " - " in after_time:
+            sport, event_name = (x.strip() for x in after_time.split(" - ", 1))
+        else:
+            sport, event_name = "", after_time
+
+        events.append(
+            {"sport": sport, "event": event_name, "href": urljoin(base_url, href)}
+        )
+
+    return events
+
+
+async def get_m3u8_links(client: httpx.AsyncClient, url: str) -> list[str]:
+    try:
+        r = await client.get(url)
+        r.raise_for_status()
+    except Exception as e:
+        log.error(f'Failed to fetch "{url}": {e}')
+        return []
+
+    html = re.sub(r"<!--.*?-->", "", r.text, flags=re.DOTALL)
+
+    soup = HTMLParser(html)
+
+    m3u8_links = []
+
soup.css("button[onclick]"): + onclick = btn.attributes.get("onclick", "") + + if match := re.search(r"src\s*=\s*['\"](.*?)['\"]", onclick): + link = match[1] + + if ".m3u8" in link: + m3u8_links.append(link) + + if iframe := soup.css_first("iframe#iframe"): + src = iframe.attributes.get("src", "") + + if ".m3u8" in src and src not in m3u8_links: + m3u8_links.insert( + 0, + src.split("cors.ricohspaces.app/")[-1], + ) + + return m3u8_links + + +async def scrape(client: httpx.AsyncClient) -> None: + if not (base_url := await get_base(client, MIRRORS)): + log.warning("No working ace mirrors") + return + + schedule = await get_schedule(client, base_url) + + tasks = [get_m3u8_links(client, item["href"]) for item in schedule] + + results = await asyncio.gather(*tasks) + + for item, m3u8_urls in zip(schedule, results): + if not m3u8_urls: + continue + + for i, link in enumerate(m3u8_urls, start=1): + sport, event = item["sport"], item["event"] + + key = f"[{sport}] {event} (S{i})" + + tvg_id, logo = leagues.info(sport) + + entry = { + "url": link, + "logo": logo, + "id": tvg_id, + } + + urls[key] = entry + + log.info(f"Collected {len(urls)} events") + + +# need to update diff --git a/M3U8/scrapers/streameast.py b/M3U8/scrapers/old/streameast.py similarity index 95% rename from M3U8/scrapers/streameast.py rename to M3U8/scrapers/old/streameast.py index d0d36fe..59760f1 100644 --- a/M3U8/scrapers/streameast.py +++ b/M3U8/scrapers/old/streameast.py @@ -216,3 +216,6 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info("No new events found") CACHE_FILE.write(cached_urls) + + +# cloudflare bot protection added diff --git a/M3U8/scrapers/streamed.py b/M3U8/scrapers/streamed.py index b854d8a..5d3fa94 100644 --- a/M3U8/scrapers/streamed.py +++ b/M3U8/scrapers/streamed.py @@ -110,7 +110,7 @@ async def refresh_html_cache(client: httpx.AsyncClient, url: str) -> dict[str, s if not (m := re.search(r"openPlayerPopup\(\s*(\d+)\s*\)", onclick)): continue - key = f"[{sport}] {event} (STRMD)" + key = f"[{sport}] {event} (STRMED)" events[key] = { "sport": sport, diff --git a/M3U8/scrapers/strmd.py b/M3U8/scrapers/strmd.py new file mode 100644 index 0000000..d77534a --- /dev/null +++ b/M3U8/scrapers/strmd.py @@ -0,0 +1,272 @@ +import asyncio +import re +from functools import partial +from pathlib import Path +from typing import Any +from urllib.parse import urljoin + +import httpx +from playwright.async_api import BrowserContext, async_playwright + +from .utils import Cache, Time, get_logger, leagues, network + +log = get_logger(__name__) + +urls: dict[str, dict[str, str | float]] = {} + +API_FILE = Cache(Path(__file__).parent / "caches" / "strmd_api.json", exp=28_800) + +CACHE_FILE = Cache(Path(__file__).parent / "caches" / "strmd.json", exp=10_800) + +MIRRORS = ["https://streamed.pk", "https://streami.su", "https://streamed.st"] + + +def validate_category(s: str) -> str: + if "-" in s: + return " ".join([i.capitalize() for i in s.split("-")]) + + elif s == "fight": + return "Fight (UFC/Boxing)" + + return s.capitalize() + + +def get_tvg(sport: str, event: str) -> str: + match sport: + case "American Football": + if leagues.is_valid(event, "NFL"): + return "NFL.Dummy.us" + + else: + return "NCAA.Sports.Dummy.us" + + case "Basketball": + if leagues.is_valid(event, "NBA"): + return "NBA.Basketball.Dummy.us" + + elif leagues.is_valid(event, "WNBA"): + return "WNBA.dummy.us" + + # NCAA + + else: + return "Basketball.Dummy.us" + + case "Hockey": + return "NHL.Hockey.Dummy.us" + + case _: + return 
+            return leagues.info(sport)[0]
+
+
+async def refresh_api_cache(
+    client: httpx.AsyncClient, url: str
+) -> list[dict[str, Any]]:
+    log.info("Refreshing API cache")
+
+    try:
+        r = await client.get(url)
+        r.raise_for_status()
+    except Exception as e:
+        log.error(f'Failed to fetch "{url}"\n{e}')
+        return []
+
+    data = r.json()
+
+    data[0]["timestamp"] = Time.now().timestamp()
+
+    return data
+
+
+async def process_event(
+    url: str,
+    url_num: int,
+    context: BrowserContext,
+) -> str | None:
+    page = await context.new_page()
+
+    captured: list[str] = []
+
+    got_one = asyncio.Event()
+
+    handler = partial(network.capture_req, captured=captured, got_one=got_one)
+
+    page.on("request", handler)
+
+    try:
+        await page.goto(
+            url,
+            wait_until="domcontentloaded",
+            timeout=15_000,
+        )
+
+        wait_task = asyncio.create_task(got_one.wait())
+
+        try:
+            await asyncio.wait_for(wait_task, timeout=10)
+        except asyncio.TimeoutError:
+            log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
+            return
+
+        finally:
+            if not wait_task.done():
+                wait_task.cancel()
+
+                try:
+                    await wait_task
+                except asyncio.CancelledError:
+                    pass
+
+        if captured:
+            log.info(f"URL {url_num}) Captured M3U8")
+
+            return captured[-1]
+
+        log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
+        return
+
+    except Exception as e:
+        log.warning(f"URL {url_num}) Exception while processing: {e}")
+        return
+
+    finally:
+        page.remove_listener("request", handler)
+        await page.close()
+
+
+async def get_events(
+    client: httpx.AsyncClient,
+    base_url: str,
+    cached_keys: set[str],
+) -> list[dict[str, str]]:
+    if not (api_data := API_FILE.load(per_entry=False, index=True)):
+        api_data = await refresh_api_cache(
+            client, urljoin(base_url, "api/matches/all-today")
+        )
+
+        API_FILE.write(api_data)
+
+    events: list[dict[str, str]] = []
+
+    now = Time.clean(Time.now())
+    start_dt = now.delta(minutes=-30)
+    end_dt = now.delta(minutes=30)
+    pattern = re.compile(r"[\n\r]+|\s{2,}")
+
+    for event in api_data:
+        category = event["category"]
+
+        if category == "other":
+            continue
+
+        sport = validate_category(category)
+
+        parts = pattern.split(event["title"].strip())
+        name = " | ".join(p.strip() for p in parts if p.strip())
+
+        logo = urljoin(base_url, poster) if (poster := event.get("poster")) else None
+        key = f"[{sport}] {name} (STRMD)"
+
+        if cached_keys & {key}:
+            continue
+
+        if not (ts := event["date"]):
+            continue
+
+        start_ts = int(str(ts)[:-3])
+
+        event_dt = Time.from_ts(start_ts)
+
+        if not start_dt <= event_dt <= end_dt:
+            continue
+
+        source: list[dict[str, str]] = event["sources"]
+
+        for s in source:
+            source_type = s.get("source")
+
+            stream_id = s.get("id")
+
+            if not source_type:
+                continue
+
+            if not stream_id:
+                continue
+
+            events.append(
+                {
+                    "sport": sport,
+                    "event": name,
+                    "link": f"https://embedsports.top/embed/{source_type}/{stream_id}/1",
+                    "logo": logo,
+                    "timestamp": event_dt.timestamp(),
+                }
+            )
+
+    return events
+
+
+async def scrape(client: httpx.AsyncClient) -> None:
+    cached_urls = CACHE_FILE.load()
+    cached_count = len(cached_urls)
+    urls.update(cached_urls)
+
+    log.info(f"Loaded {cached_count} event(s) from cache")
+
+    if not (base_url := await network.get_base(MIRRORS)):
+        log.warning("No working STRMD mirrors")
+        CACHE_FILE.write(cached_urls)
+        return
+
+    log.info(f'Scraping from "{base_url}"')
+
+    events = await get_events(
+        client,
+        base_url,
+        set(cached_urls.keys()),
+    )
+
+    log.info(f"Processing {len(events)} new URL(s)")
+
+    async with async_playwright() as p:
network.browser(p, "brave") + + for i, ev in enumerate(events, start=1): + url = await network.safe_process( + lambda: process_event( + ev["link"], + url_num=i, + context=context, + ), + url_num=i, + log=log, + ) + + if url: + sport, event, logo, ts = ( + ev["sport"], + ev["event"], + ev["logo"], + ev["timestamp"], + ) + + key = f"[{sport}] {event} (STRMD)" + + entry = { + "url": url, + "logo": logo or leagues.info(sport)[1], + "base": "https://embedsports.top/", + "timestamp": ts, + "id": get_tvg(sport, event) or "Live.Event.us", + } + + urls[key] = cached_urls[key] = entry + + await browser.close() + + if new_count := len(cached_urls) - cached_count: + log.info(f"Collected and cached {new_count} new event(s)") + else: + log.info("No new events found") + + CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/utils/caching.py b/M3U8/scrapers/utils/caching.py index ac93079..e80fb3a 100644 --- a/M3U8/scrapers/utils/caching.py +++ b/M3U8/scrapers/utils/caching.py @@ -17,7 +17,11 @@ class Cache: return self.now_ts - dt_ts < self.exp - def load(self, per_entry: bool = True) -> dict[str, dict[str, str | float]]: + def load( + self, + per_entry: bool = True, + index: bool = False, + ) -> dict[str, dict[str, str | float]]: try: data: dict = json.loads(self.file.read_text(encoding="utf-8")) except (FileNotFoundError, json.JSONDecodeError): @@ -26,7 +30,11 @@ class Cache: if per_entry: return {k: v for k, v in data.items() if self.is_fresh(v)} - ts: float | int = data.get("timestamp", 31496400) + if index: + ts: float | int = data[0].get("timestamp", 31496400) + + else: + ts: float | int = data.get("timestamp", 31496400) dt_ts = Time.clean(Time.from_ts(ts)).timestamp()