diff --git a/M3U8/fetch.py b/M3U8/fetch.py index beb141bc..3bf5722b 100644 --- a/M3U8/fetch.py +++ b/M3U8/fetch.py @@ -65,6 +65,7 @@ async def main() -> None: asyncio.create_task(embedhd.scrape(hdl_brwsr)), asyncio.create_task(ppv.scrape(xtrnl_brwsr)), asyncio.create_task(roxie.scrape(hdl_brwsr)), + asyncio.create_task(streamhub.scrape(xtrnl_brwsr)), ] httpx_tasks = [ @@ -75,7 +76,6 @@ async def main() -> None: asyncio.create_task(pawa.scrape()), asyncio.create_task(shark.scrape()), asyncio.create_task(streamcenter.scrape()), - asyncio.create_task(streamhub.scrape()), asyncio.create_task(streamsgate.scrape()), asyncio.create_task(streamtpnew.scrape()), asyncio.create_task(totalsportek.scrape()), diff --git a/M3U8/scrapers/streamhub.py b/M3U8/scrapers/streamhub.py index 01c31284..3815abb2 100644 --- a/M3U8/scrapers/streamhub.py +++ b/M3U8/scrapers/streamhub.py @@ -1,8 +1,8 @@ import asyncio -import re from functools import partial -from urllib.parse import urljoin, urlparse +from urllib.parse import urljoin +from playwright.async_api import Browser, Page, TimeoutError from selectolax.parser import HTMLParser from .utils import Cache, Time, get_logger, leagues, network @@ -34,59 +34,86 @@ SPORT_ENDPOINTS = [ ] -async def process_event(url: str, url_num: int) -> tuple[str | None, str | None]: - if not (event_data := await network.request(url, log=log)): - log.warning(f"URL {url_num}) Failed to load url.") - return +async def process_event( + url: str, + url_num: int, + page: Page, +) -> str | None: - soup_1 = HTMLParser(event_data.content) + captured: list[str] = [] - ifr_1 = soup_1.css_first("iframe#playerIframe") + got_one = asyncio.Event() - if not ifr_1 or not (src := ifr_1.attributes.get("src")): - log.warning(f"URL {url_num}) No iframe element found.") - return - - parsed = urlparse(src) - - ifr_1_src = urljoin( - BASE_URL, - f"embed1/{parsed.path.split('/')[-1].split('_')[0]}.php", + handler = partial( + network.capture_req, + captured=captured, + got_one=got_one, ) - if not ( - ifr_1_src_data := await network.request( - ifr_1_src, - headers={"Referer": url}, - log=log, + page.on("request", handler) + + try: + resp = await page.goto( + url, + wait_until="domcontentloaded", + timeout=6_000, ) - ): - log.warning(f"URL {url_num}) Failed to load iframe source. (IFR1)") + + if not resp or resp.status != 200: + log.warning( + f"URL {url_num}) Status Code: {resp.status if resp else 'None'}" + ) + + return + + try: + btn = page.locator("button.btn.btn-sm.btn-success.streamLink") + + iframe_src = await btn.get_attribute("data-src", timeout=1_250) + except TimeoutError: + log.warning(f"URL {url_num}) No iframe source found.") + return + + await page.goto( + iframe_src, + wait_until="domcontentloaded", + timeout=5_000, + ) + + wait_task = asyncio.create_task(got_one.wait()) + + try: + await asyncio.wait_for(wait_task, timeout=10) + except asyncio.TimeoutError: + log.warning(f"URL {url_num}) Timed out waiting for M3U8.") + + return + + finally: + if not wait_task.done(): + wait_task.cancel() + + try: + await wait_task + except asyncio.CancelledError: + pass + + if captured: + log.info(f"URL {url_num}) Captured M3U8") + + return captured[0] + + log.warning(f"URL {url_num}) No M3U8 captured after waiting.") + return - soup_2 = HTMLParser(ifr_1_src_data.content) + except Exception as e: + log.warning(f"URL {url_num}) {e}") - ifr_2 = soup_2.css_first("center iframe") - - if not ifr_2 or not (ifr_2_src := ifr_2.attributes.get("src")): - log.warning(f"URL {url_num}) Unable to locate iframe. (IFR2)") return - ifr_2_src = f"https:{ifr_2_src}" if ifr_2_src.startswith("//") else ifr_2_src - - if not (ifr_2_src_data := await network.request(ifr_2_src, log=log)): - log.warning(f"URL {url_num}) Failed to load iframe source.") - return - - valid_m3u8 = re.compile(r"src:\s+(\'|\")([^\']+)(\'|\")", re.I) - - if not (match := valid_m3u8.search(ifr_2_src_data.text)): - log.warning(f"URL {url_num}) No source found.") - return - - log.info(f"URL {url_num}) Captured M3U8") - - return match[2] + finally: + page.remove_listener("request", handler) async def refresh_html_cache( @@ -185,7 +212,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: return live -async def scrape() -> None: +async def scrape(browser: Browser) -> None: cached_urls = CACHE_FILE.load() valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} @@ -200,48 +227,51 @@ async def scrape() -> None: if events := await get_events(cached_urls.keys()): log.info(f"Processing {len(events)} new URL(s)") + async with network.event_context(browser) as context: + for i, ev in enumerate(events, start=1): + async with network.event_page(context) as page: - for i, ev in enumerate(events, start=1): + handler = partial( + process_event, + url=(link := ev["link"]), + url_num=i, + page=page, + ) - handler = partial( - process_event, - url=(link := ev["link"]), - url_num=i, - ) + url = await network.safe_process( + handler, + url_num=i, + semaphore=network.PW_S, + log=log, + ) - url = await network.safe_process( - handler, - url_num=i, - semaphore=network.PW_S, - log=log, - ) + sport, event, ts = ( + ev["sport"], + ev["event"], + ev["event_ts"], + ) - sport, event, ts = ( - ev["sport"], - ev["event"], - ev["event_ts"], - ) + key = f"[{sport}] {event} ({TAG})" - key = f"[{sport}] {event} ({TAG})" + tvg_id, logo = leagues.get_tvg_info(sport, event) - tvg_id, logo = leagues.get_tvg_info(sport, event) + entry = { + "url": url, + "logo": logo, + "base": "https://hardsmart.click", + "timestamp": ts, + "id": tvg_id or "Live.Event.us", + "link": link, + } - entry = { - "url": url, - "logo": logo, - "base": "https://hardsmart.click", - "timestamp": ts, - "id": tvg_id or "Live.Event.us", - "link": link, - "UA": "curl/8.19.0", - } + cached_urls[key] = entry - cached_urls[key] = entry + if url: + valid_count += 1 - if url: - valid_count += 1 + entry["url"] = url.split("?st")[0] - urls[key] = entry + urls[key] = entry log.info(f"Collected and cached {valid_count - cached_count} new event(s)")