From 00000d940c22629e18ef8517a3c7b9e136461a20 Mon Sep 17 00:00:00 2001 From: doms9 <96013514+doms9@users.noreply.github.com> Date: Wed, 18 Feb 2026 15:47:50 -0500 Subject: [PATCH] e - change scrape window for istreameast.py - harden scraping method for roxie.py - catch nulls for ppv.py - change scraping method for totalsportek.py - misc edits. --- M3U8/fetch.py | 2 +- M3U8/scrapers/istreameast.py | 6 +- M3U8/scrapers/ppv.py | 54 +++++++------- M3U8/scrapers/roxie.py | 12 +++- M3U8/scrapers/streambtw.py | 3 +- M3U8/scrapers/streamhub.py | 19 ++--- M3U8/scrapers/totalsportek.py | 132 ++++++++++++++++++++++++---------- 7 files changed, 141 insertions(+), 87 deletions(-) diff --git a/M3U8/fetch.py b/M3U8/fetch.py index 1f7517af..6feb0578 100644 --- a/M3U8/fetch.py +++ b/M3U8/fetch.py @@ -71,7 +71,6 @@ async def main() -> None: asyncio.create_task(streamcenter.scrape(xtrnl_brwsr)), # asyncio.create_task(streamhub.scrape(xtrnl_brwsr)), asyncio.create_task(streamsgate.scrape(xtrnl_brwsr)), - asyncio.create_task(totalsportek.scrape(hdl_brwsr)), # asyncio.create_task(tvapp.scrape(hdl_brwsr)), asyncio.create_task(webcast.scrape(hdl_brwsr)), ] @@ -83,6 +82,7 @@ async def main() -> None: asyncio.create_task(pawa.scrape()), asyncio.create_task(shark.scrape()), asyncio.create_task(streambtw.scrape()), + asyncio.create_task(totalsportek.scrape()), asyncio.create_task(xstreameast.scrape()), ] diff --git a/M3U8/scrapers/istreameast.py b/M3U8/scrapers/istreameast.py index 787ce5fd..35046c23 100644 --- a/M3U8/scrapers/istreameast.py +++ b/M3U8/scrapers/istreameast.py @@ -58,8 +58,6 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: if not (html_data := await network.request(BASE_URL, log=log)): return events - pattern = re.compile(r"^(?:LIVE|(?:[1-9]|[12]\d|30)\s+minutes?\b)", re.I) - soup = HTMLParser(html_data.content) for link in soup.css("li.f1-podium--item > a.f1-podium--link"): @@ -71,9 +69,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: if not (time_elem := li_item.css_first(".SaatZamanBilgisi")): continue - time_text = time_elem.text(strip=True) - - if not pattern.search(time_text): + if time_elem.text(strip=True).lower() != "live": continue sport = rank_elem.text(strip=True) diff --git a/M3U8/scrapers/ppv.py b/M3U8/scrapers/ppv.py index 08568a8a..ae9ecb4a 100644 --- a/M3U8/scrapers/ppv.py +++ b/M3U8/scrapers/ppv.py @@ -81,9 +81,11 @@ async def get_events(url: str, cached_keys: list[str]) -> list[dict[str, str]]: async def scrape(browser: Browser) -> None: cached_urls = CACHE_FILE.load() - cached_count = len(cached_urls) + valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} - urls.update(cached_urls) + valid_count = cached_count = len(valid_urls) + + urls.update(valid_urls) log.info(f"Loaded {cached_count} event(s) from cache") @@ -120,31 +122,35 @@ async def scrape(browser: Browser) -> None: log=log, ) + sport, event, logo, ts, link = ( + ev["sport"], + ev["event"], + ev["logo"], + ev["timestamp"], + ev["link"], + ) + + key = f"[{sport}] {event} ({TAG})" + + tvg_id, pic = leagues.get_tvg_info(sport, event) + + entry = { + "url": url, + "logo": logo or pic, + "base": link, + "timestamp": ts, + "id": tvg_id or "Live.Event.us", + "link": link, + } + + cached_urls[key] = entry + if url: - sport, event, logo, ts, link = ( - ev["sport"], - ev["event"], - ev["logo"], - ev["timestamp"], - ev["link"], - ) + valid_count += 1 - key = f"[{sport}] {event} ({TAG})" + urls[key] = entry - tvg_id, pic = leagues.get_tvg_info(sport, event) - - entry = { - "url": url, - "logo": logo or pic, - "base": link, - "timestamp": ts, - "id": tvg_id or "Live.Event.us", - "link": link, - } - - urls[key] = cached_urls[key] = entry - - if new_count := len(cached_urls) - cached_count: + if new_count := valid_count - cached_count: log.info(f"Collected and cached {new_count} new event(s)") else: diff --git a/M3U8/scrapers/roxie.py b/M3U8/scrapers/roxie.py index 33f58cdb..b8bf32aa 100644 --- a/M3U8/scrapers/roxie.py +++ b/M3U8/scrapers/roxie.py @@ -104,7 +104,13 @@ async def process_event( "button:has-text('Stream 1')", timeout=5_000, ): - await btn.click() + await btn.click(force=True, click_count=2) + except TimeoutError: + pass + + try: + if player := await page.wait_for_selector(".play-wrapper", timeout=5_000): + await player.click(force=True, click_count=3) except TimeoutError: pass @@ -165,8 +171,8 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: live = [] - start_ts = now.delta(hours=-1).timestamp() - end_ts = now.delta(minutes=5).timestamp() + start_ts = now.delta(hours=-1.5).timestamp() + end_ts = now.delta(minutes=1).timestamp() for k, v in events.items(): if k in cached_keys: diff --git a/M3U8/scrapers/streambtw.py b/M3U8/scrapers/streambtw.py index a64476e9..6fa4c99c 100644 --- a/M3U8/scrapers/streambtw.py +++ b/M3U8/scrapers/streambtw.py @@ -79,7 +79,8 @@ async def get_events() -> list[dict[str, str]]: for event in items: event_name: str = event["title"] - link: str = event["url"] + if not (link := event.get("url")): + continue events.append( { diff --git a/M3U8/scrapers/streamhub.py b/M3U8/scrapers/streamhub.py index ebf7c728..311bcf3f 100644 --- a/M3U8/scrapers/streamhub.py +++ b/M3U8/scrapers/streamhub.py @@ -17,7 +17,7 @@ CACHE_FILE = Cache(TAG, exp=10_800) HTML_CACHE = Cache(f"{TAG}-html", exp=28_800) -MIRRORS = ["https://streamhub.pro", "https://livesports4u.net"] +BASE_URL = "https://livesports4u.net" CATEGORIES = { "Soccer": "sport_68c02a4464a38", @@ -35,7 +35,6 @@ CATEGORIES = { async def refresh_html_cache( - url: str, date: str, sport_id: str, ts: float, @@ -45,7 +44,7 @@ async def refresh_html_cache( if not ( html_data := await network.request( - urljoin(url, f"events/{date}"), + urljoin(BASE_URL, f"events/{date}"), log=log, params={"sport_id": sport_id}, ) @@ -95,7 +94,7 @@ async def refresh_html_cache( return events -async def get_events(url: str, cached_keys: list[str]) -> list[dict[str, str]]: +async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: now = Time.clean(Time.now()) if not (events := HTML_CACHE.load()): @@ -103,7 +102,6 @@ async def get_events(url: str, cached_keys: list[str]) -> list[dict[str, str]]: tasks = [ refresh_html_cache( - url, date, sport_id, now.timestamp(), @@ -146,16 +144,9 @@ async def scrape(browser: Browser) -> None: log.info(f"Loaded {cached_count} event(s) from cache") - if not (base_url := await network.get_base(MIRRORS)): - log.warning("No working PPV mirrors") + log.info(f'Scraping from "{BASE_URL}"') - CACHE_FILE.write(cached_urls) - - return - - log.info(f'Scraping from "{base_url}"') - - events = await get_events(base_url, cached_urls.keys()) + events = await get_events(cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)") diff --git a/M3U8/scrapers/totalsportek.py b/M3U8/scrapers/totalsportek.py index ec8c41f7..f8f8386c 100644 --- a/M3U8/scrapers/totalsportek.py +++ b/M3U8/scrapers/totalsportek.py @@ -1,7 +1,8 @@ +import json +import re from functools import partial from urllib.parse import urljoin, urlparse -from playwright.async_api import Browser from selectolax.parser import HTMLParser from .utils import Cache, Time, get_logger, leagues, network @@ -23,6 +24,64 @@ def fix_txt(s: str) -> str: return s.upper() if s.islower() else s +async def process_event(url: str, url_num: int) -> str | None: + if not (event_data := await network.request(url, log=log)): + log.info(f"URL {url_num}) Failed to load url.") + + return + + soup_1 = HTMLParser(event_data.content) + + if not (iframe_1 := soup_1.css_first("iframe")): + log.warning(f"URL {url_num}) No iframe element found.") + + return + + if not (iframe_1_src := iframe_1.attributes.get("src")): + log.warning(f"URL {url_num}) No iframe source found.") + + return + + if not (iframe_1_src_data := await network.request(iframe_1_src, log=log)): + log.info(f"URL {url_num}) Failed to load iframe source.") + + return + + soup_2 = HTMLParser(iframe_1_src_data.content) + + if not (iframe_2 := soup_2.css_first("iframe")): + log.warning(f"URL {url_num}) No iframe element found.") + + return + + if not (iframe_2_src := iframe_2.attributes.get("src")): + log.warning(f"URL {url_num}) No iframe source found.") + + return + + if not ( + iframe_2_src_data := await network.request( + iframe_2_src, + log=log, + headers={"Referer": iframe_1_src}, + ) + ): + log.info(f"URL {url_num}) Failed to load iframe source.") + + return + + valid_m3u8 = re.compile(r'currentStreamUrl\s+=\s+"([^"]*)"', re.I) + + if not (match := valid_m3u8.search(iframe_2_src_data.text)): + log.warning(f"URL {url_num}) No Clappr source found.") + + return + + log.info(f"URL {url_num}) Captured M3U8") + + return json.loads(f'"{match[1]}"') + + async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: events = [] @@ -54,7 +113,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: if not (time_node := node.css_first(".col-3 span")): continue - if time_node.text(strip=True) != "MatchStarted": + if time_node.text(strip=True).lower() != "matchstarted": continue event_name = fix_txt(" vs ".join(teams)) @@ -73,7 +132,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: return events -async def scrape(browser: Browser) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} @@ -93,50 +152,45 @@ async def scrape(browser: Browser) -> None: if events: now = Time.clean(Time.now()) - async with network.event_context(browser) as context: - for i, ev in enumerate(events, start=1): - async with network.event_page(context) as page: - handler = partial( - network.process_event, - url=ev["link"], - url_num=i, - page=page, - log=log, - ) + for i, ev in enumerate(events, start=1): + handler = partial( + process_event, + url=ev["link"], + url_num=i, + ) - url = await network.safe_process( - handler, - url_num=i, - semaphore=network.PW_S, - log=log, - timeout=6, - ) + url = await network.safe_process( + handler, + url_num=i, + semaphore=network.HTTP_S, + log=log, + ) - sport, event, link = ( - ev["sport"], - ev["event"], - ev["link"], - ) + sport, event, link = ( + ev["sport"], + ev["event"], + ev["link"], + ) - key = f"[{sport}] {event} ({TAG})" + key = f"[{sport}] {event} ({TAG})" - tvg_id, logo = leagues.get_tvg_info(sport, event) + tvg_id, logo = leagues.get_tvg_info(sport, event) - entry = { - "url": url, - "logo": logo, - "base": link, - "timestamp": now.timestamp(), - "id": tvg_id or "Live.Event.us", - "link": link, - } + entry = { + "url": url, + "logo": logo, + "base": link, + "timestamp": now.timestamp(), + "id": tvg_id or "Live.Event.us", + "link": link, + } - cached_urls[key] = entry + cached_urls[key] = entry - if url: - valid_count += 1 + if url: + valid_count += 1 - urls[key] = entry + urls[key] = entry if new_count := valid_count - cached_count: log.info(f"Collected and cached {new_count} new event(s)")