From 00000d93cc9889262090c69d31714ef5094714bd Mon Sep 17 00:00:00 2001 From: doms9 <96013514+doms9@users.noreply.github.com> Date: Sun, 7 Jun 2026 23:16:49 -0400 Subject: [PATCH] e - add streambiz.py - misc. edits --- M3U8/fetch.py | 3 + M3U8/scrapers/fawa.py | 14 +-- M3U8/scrapers/streambiz.py | 208 +++++++++++++++++++++++++++++++++ M3U8/scrapers/utils/webwork.py | 2 +- M3U8/scrapers/watchfooty.py | 2 +- 5 files changed, 218 insertions(+), 11 deletions(-) create mode 100644 M3U8/scrapers/streambiz.py diff --git a/M3U8/fetch.py b/M3U8/fetch.py index 0f06ecdf..63fe4572 100644 --- a/M3U8/fetch.py +++ b/M3U8/fetch.py @@ -13,6 +13,7 @@ from scrapers import ( mainportal, roxie, shark, + streambiz, streamcenter, streamsgate, streamtpnew, @@ -59,6 +60,7 @@ async def main() -> None: asyncio.create_task(embedhd.scrape(hdl_brwsr)), asyncio.create_task(fsports.scrape(xtrnl_brwsr)), asyncio.create_task(roxie.scrape(hdl_brwsr)), + asyncio.create_task(streambiz.scrape(xtrnl_brwsr)), ] httpx_tasks = [ @@ -96,6 +98,7 @@ async def main() -> None: | mainportal.urls | roxie.urls | shark.urls + | streambiz.urls | streamcenter.urls | streamsgate.urls | streamtpnew.urls diff --git a/M3U8/scrapers/fawa.py b/M3U8/scrapers/fawa.py index 1722e58b..05cf6e19 100644 --- a/M3U8/scrapers/fawa.py +++ b/M3U8/scrapers/fawa.py @@ -38,7 +38,7 @@ async def process_event(url: str, url_num: int) -> str | None: return match[2] -async def get_events(cached_hrefs: set[str]) -> list[dict[str, str]]: +async def get_events(cached_links: set[str]) -> list[dict[str, str]]: events = [] if not (html_data := await network.request(BASE_URL, log=log)): @@ -57,9 +57,7 @@ async def get_events(cached_hrefs: set[str]) -> list[dict[str, str]]: if not (href := link.attributes.get("href")): continue - href = quote(href) - - if cached_hrefs & {href}: + elif cached_links & {link := urljoin(f"{html_data.url}", quote(href))}: continue if not (text and subtext): @@ -76,8 +74,7 @@ async def get_events(cached_hrefs: set[str]) -> list[dict[str, str]]: { "sport": sport, "event": clean_event.sub("", event_name), - "link": urljoin(f"{html_data.url}", href), - "href": href, + "link": link, } ) @@ -87,7 +84,7 @@ async def get_events(cached_hrefs: set[str]) -> list[dict[str, str]]: async def scrape() -> None: cached_urls = CACHE_FILE.load() - cached_hrefs = {entry["href"] for entry in cached_urls.values()} + cached_links = {entry["link"] for entry in cached_urls.values()} valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} @@ -99,7 +96,7 @@ async def scrape() -> None: log.info(f'Scraping from "{BASE_URL}"') - if events := await get_events(cached_hrefs): + if events := await get_events(cached_links): log.info(f"Processing {len(events)} new URL(s)") now = Time.clean(Time.now()) @@ -130,7 +127,6 @@ async def scrape() -> None: "base": BASE_URL, "timestamp": now.timestamp(), "id": tvg_id or "Live.Event.us", - "href": ev["href"], "link": link, } diff --git a/M3U8/scrapers/streambiz.py b/M3U8/scrapers/streambiz.py new file mode 100644 index 00000000..d4e507f3 --- /dev/null +++ b/M3U8/scrapers/streambiz.py @@ -0,0 +1,208 @@ +import asyncio +from functools import partial +from urllib.parse import urljoin + +from playwright.async_api import Browser, Page, TimeoutError +from selectolax.parser import HTMLParser + +from .utils import Cache, Time, get_logger, leagues, network + +log = get_logger(__name__) + +urls: dict[str, dict[str, str | float]] = {} + +TAG = "SPRTPASS" + +CACHE_FILE = Cache(TAG, exp=28_800) + +BASE_URL = "https://streamseast.biz/" + +SPORT_URLS = { + sport: urljoin(BASE_URL, sport.lower()) + for sport in [ + # "Boxing", + # "F1", + "MLB", + # "MMA", + "NBA", + # "NFL", + "NHL", + "Soccer", + ] +} + + +async def process_event( + url: str, + url_num: int, + page: Page, +) -> tuple[str | None, str | None, str | None]: + + nones = None, None + + captured: list[str] = [] + + got_one = asyncio.Event() + + handler = partial( + network.capture_req, + captured=captured, + got_one=got_one, + ) + + page.on("request", handler) + + event_name = "Sporting Event" + + try: + resp = await page.goto( + url, + wait_until="domcontentloaded", + timeout=6_000, + ) + + if not resp or resp.status != 200: + log.warning( + f"URL {url_num}) Status Code: {resp.status if resp else 'None'}" + ) + return (event_name, *nones) + + event_name_elem = page.locator("h1.match-head") + + event_name = await event_name_elem.inner_text(timeout=1_250) + + try: + ifr = page.locator("iframe.embed-responsive-item") + + await ifr.wait_for(timeout=1_250) + + ifr_src = await ifr.get_attribute("src") + except TimeoutError: + log.warning(f"URL {url_num}) No iframe found.") + return (event_name, *nones) + + await page.goto( + ifr_src, + wait_until="domcontentloaded", + timeout=2_250, + ) + + wait_task = asyncio.create_task(got_one.wait()) + + try: + await asyncio.wait_for(wait_task, timeout=5) + except asyncio.TimeoutError: + log.warning(f"URL {url_num}) Timed out waiting for M3U8.") + return (event_name, *nones) + + finally: + if not wait_task.done(): + wait_task.cancel() + + try: + await wait_task + except asyncio.CancelledError: + pass + + if captured: + log.info(f"URL {url_num}) Captured M3U8") + + return event_name, ifr_src, captured[0] + except Exception as e: + log.warning(f"URL {url_num}) {e}") + return (event_name, *nones) + + finally: + page.remove_listener("request", handler) + + +async def get_events(cached_links: set[str]) -> list[dict[str, str]]: + tasks = [network.request(url, log=log) for url in SPORT_URLS.values()] + + results = await asyncio.gather(*tasks) + + events = [] + + if not ( + soups := [(HTMLParser(html.content), html.url) for html in results if html] + ): + return events + + for soup, url in soups: + sport = next((k for k, v in SPORT_URLS.items() if v == url), "Live Event") + + for event in soup.css("a.matches"): + if not (href := event.attributes.get("href")): + continue + + elif cached_links & {link := urljoin(BASE_URL, href)}: + continue + + events.append({"sport": sport, "link": link}) + + return events + + +async def scrape(browser: Browser) -> None: + cached_urls = CACHE_FILE.load() + + cached_links = {entry["link"] for entry in cached_urls.values()} + + valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} + + valid_count = cached_count = len(valid_urls) + + urls.update(valid_urls) + + log.info(f"Loaded {cached_count} event(s) from cache") + + log.info(f'Scraping from "{BASE_URL}"') + + if events := await get_events(cached_links): + log.info(f"Processing {len(events)} URL(s)") + + now = Time.clean(Time.now()) + + async with network.event_context(browser, stealth=False) as context: + for i, ev in enumerate(events, start=1): + async with network.event_page(context) as page: + handler = partial( + process_event, + url=(link := ev["link"]), + url_num=i, + page=page, + ) + + event, ifr_src, url = await network.safe_process( + handler, + url_num=i, + semaphore=network.PW_S, + log=log, + ) + + tvg_id, logo = leagues.get_tvg_info((sport := ev["sport"]), event) + + key = f"[{sport}] {event} ({TAG})" + + entry = { + "url": url, + "logo": logo, + "base": ifr_src, + "timestamp": now.timestamp(), + "id": tvg_id or "Live.Event.us", + "link": link, + } + + cached_urls[key] = entry + + if url: + valid_count += 1 + + urls[key] = entry + + log.info(f"Collected and cached {valid_count - cached_count} event(s)") + + else: + log.info("No new events found") + + CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/utils/webwork.py b/M3U8/scrapers/utils/webwork.py index 5deb91cb..7d3047ac 100644 --- a/M3U8/scrapers/utils/webwork.py +++ b/M3U8/scrapers/utils/webwork.py @@ -222,7 +222,7 @@ class Network: escaped = [ re.escape(i) for i in { - "amazonaws", + # "amazonaws", "knitcdn", "jwpltx", } diff --git a/M3U8/scrapers/watchfooty.py b/M3U8/scrapers/watchfooty.py index f58f49ca..4de7c953 100644 --- a/M3U8/scrapers/watchfooty.py +++ b/M3U8/scrapers/watchfooty.py @@ -111,7 +111,7 @@ async def process_event( url: str, url_num: int, page: Page, -) -> str | None: +) -> tuple[str | None, str | None]: nones = None, None