From 00000d9fe26784612023379f92a4e7fb934d3a06 Mon Sep 17 00:00:00 2001 From: doms9 <96013514+doms9@users.noreply.github.com> Date: Tue, 21 Apr 2026 17:48:21 -0400 Subject: [PATCH] e - edit scraping for livetvsx.py - edit caching for streamsgate.py - edit caching for streamhub.py - misc edits. --- M3U8/scrapers/livetvsx.py | 174 +++++++++++++++------------------- M3U8/scrapers/mainportal.py | 2 +- M3U8/scrapers/streamcenter.py | 2 +- M3U8/scrapers/streamhub.py | 148 +++++++++++------------------ M3U8/scrapers/streamsgate.py | 73 +++++--------- M3U8/scrapers/webcast.py | 2 +- 6 files changed, 152 insertions(+), 249 deletions(-) diff --git a/M3U8/scrapers/livetvsx.py b/M3U8/scrapers/livetvsx.py index 4dfbec88..68d773b8 100644 --- a/M3U8/scrapers/livetvsx.py +++ b/M3U8/scrapers/livetvsx.py @@ -1,7 +1,6 @@ -import asyncio +import re from functools import partial -from playwright.async_api import Browser, Page, TimeoutError from selectolax.parser import HTMLParser from .utils import Cache, Time, get_logger, leagues, network @@ -17,83 +16,61 @@ CACHE_FILE = Cache(TAG, exp=10_800) BASE_URL = "https://livetv.sx/export/webmasters.php" -async def process_event( - url: str, - url_num: int, - page: Page, -) -> str | None: +async def process_event(url: str, url_num: int) -> tuple[str | None, str | None]: + nones = None, None - captured: list[str] = [] + if not (ev_data_1 := await network.request(url, log=log)): + log.warning(f"URL {url_num}) Failed to load url. (EVD1)") + return nones - got_one = asyncio.Event() + soup_1 = HTMLParser(ev_data_1.content) - handler = partial( - network.capture_req, - captured=captured, - got_one=got_one, - ) + for a_elem in soup_1.css("a"): + if not (src_title := a_elem.attributes.get("title")) or ( + "aliez" not in src_title.lower() + ): + continue - page.on("request", handler) - - try: - resp = await page.goto( - url, - wait_until="domcontentloaded", - timeout=10_000, - ) - - if not resp or resp.status != 200: - log.warning( - f"URL {url_num}) Status Code: {resp.status if resp else 'None'}" - ) - return - - try: - event_a = page.locator('a[title*="Aliez"]').first - - href = await event_a.get_attribute("href", timeout=1_250) - except TimeoutError: - log.warning(f"URL {url_num}) No valid sources found.") - return + href = a_elem.attributes["href"] event_url = href if href.startswith("http") else f"https:{href}" + break - await page.goto( - event_url, - wait_until="domcontentloaded", - timeout=5_000, - ) + else: + log.warning(f"URL {url_num}) No valid sources found.") + return nones - wait_task = asyncio.create_task(got_one.wait()) + if not (ev_data_2 := await network.request(event_url, log=log)): + log.warning(f"URL {url_num}) Failed to load url. (EVD2)") + return nones - try: - await asyncio.wait_for(wait_task, timeout=6) - except asyncio.TimeoutError: - log.warning(f"URL {url_num}) Timed out waiting for M3U8.") - return + soup_2 = HTMLParser(ev_data_2.content) - finally: - if not wait_task.done(): - wait_task.cancel() + ifr_1 = soup_2.css_first("tr > td > iframe") - try: - await wait_task - except asyncio.CancelledError: - pass + if not ifr_1 or not (ifr_1_src := ifr_1.attributes.get("src")): + log.warning(f"URL {url_num}) No iframe element found.") + return nones - if captured: - log.info(f"URL {url_num}) Captured M3U8") - return captured[0] + ifr_1_src = "".join( + (ifr_1_src if ifr_1_src.startswith("http") else f"https:{ifr_1_src}").split() + ) - log.warning(f"URL {url_num}) No M3U8 captured after waiting.") - return + if not (ev_data_3 := await network.request(ifr_1_src, log=log)): + log.warning(f"URL {url_num}) Failed to load url. (EVD3)") + return nones - except Exception as e: - log.warning(f"URL {url_num}) {e}") - return + pattern = re.compile(r'pl\.init\((\'|\")([^"]*)(\'|\")\)', re.I) - finally: - page.remove_listener("request", handler) + if not (match := pattern.search(ev_data_3.text)): + log.warning(f"URL {url_num}) No M3U8 source found.") + return nones + + log.info(f"URL {url_num}) Captured M3U8") + + m3u: str = match[2] if match[2].startswith("http") else f"https:{match[2]}" + + return m3u, ifr_1_src async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: @@ -102,6 +79,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: php_data = await network.unvd_client.get(BASE_URL, params={"lang": "en"}) if php_data.status_code != 200: + log.warning("Failed to get php data.") return events soup = HTMLParser(php_data.content) @@ -143,7 +121,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: return events -async def scrape(browser: Browser) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} @@ -161,49 +139,45 @@ async def scrape(browser: Browser) -> None: now = Time.clean(Time.now()) - async with network.event_context(browser, ignore_https=True) as context: - for i, ev in enumerate(events, start=1): - async with network.event_page(context) as page: - handler = partial( - process_event, - url=(link := ev["link"]), - url_num=i, - page=page, - ) + for i, ev in enumerate(events, start=1): + handler = partial( + process_event, + url=(link := ev["link"]), + url_num=i, + ) - url = await network.safe_process( - handler, - url_num=i, - semaphore=network.PW_S, - log=log, - timeout=20, - ) + url, iframe = await network.safe_process( + handler, + url_num=i, + semaphore=network.HTTP_S, + log=log, + ) - sport, league, event = ( - ev["sport"], - ev["league"], - ev["event"], - ) + sport, league, event = ( + ev["sport"], + ev["league"], + ev["event"], + ) - key = f"[{sport} - {league}] {event} ({TAG})" + key = f"[{sport} - {league}] {event} ({TAG})" - tvg_id, logo = leagues.get_tvg_info(sport, event) + tvg_id, logo = leagues.get_tvg_info(sport, event) - entry = { - "url": url, - "logo": logo, - "base": "https://livetv.sx/enx/", - "timestamp": now.timestamp(), - "id": tvg_id or "Live.Event.us", - "link": link, - } + entry = { + "url": url, + "logo": logo, + "base": iframe, + "timestamp": now.timestamp(), + "id": tvg_id or "Live.Event.us", + "link": link, + } - cached_urls[key] = entry + cached_urls[key] = entry - if url: - valid_count += 1 + if url: + valid_count += 1 - urls[key] = entry + urls[key] = entry log.info(f"Collected and cached {valid_count - cached_count} new event(s)") diff --git a/M3U8/scrapers/mainportal.py b/M3U8/scrapers/mainportal.py index e1930c38..3a1873f7 100644 --- a/M3U8/scrapers/mainportal.py +++ b/M3U8/scrapers/mainportal.py @@ -163,7 +163,7 @@ async def scrape() -> None: url = await network.safe_process( handler, url_num=i, - semaphore=network.PW_S, + semaphore=network.HTTP_S, log=log, ) diff --git a/M3U8/scrapers/streamcenter.py b/M3U8/scrapers/streamcenter.py index 52e95991..297c504d 100644 --- a/M3U8/scrapers/streamcenter.py +++ b/M3U8/scrapers/streamcenter.py @@ -117,7 +117,7 @@ async def scrape() -> None: url = await network.safe_process( handler, url_num=i, - semaphore=network.PW_S, + semaphore=network.HTTP_S, log=log, ) diff --git a/M3U8/scrapers/streamhub.py b/M3U8/scrapers/streamhub.py index 738c2547..0289ccfa 100644 --- a/M3U8/scrapers/streamhub.py +++ b/M3U8/scrapers/streamhub.py @@ -13,9 +13,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "STRMHUB" -CACHE_FILE = Cache(TAG, exp=10_800) - -HTML_FILE = Cache(f"{TAG}-html", exp=19_800) +CACHE_FILE = Cache(TAG, exp=28_800) BASE_URL = "https://livesports4u.net" @@ -116,116 +114,78 @@ async def process_event( page.remove_listener("request", handler) -async def refresh_html_cache( - date: str, - sport_id: str, - ts: float, -) -> dict[str, dict[str, str | float]]: +async def get_events() -> list[dict[str, str]]: + now = Time.clean(Time.now()) - events = {} - - if not ( - html_data := await network.request( + tasks = [ + network.request( urljoin(BASE_URL, f"events/{date}"), params={"sport_id": sport_id}, log=log, ) - ): + for date in [now.date(), now.delta(days=1).date()] + for sport_id in SPORT_ENDPOINTS + ] + + results = await asyncio.gather(*tasks) + + events = [] + + if not (soups := [HTMLParser(html.content) for html in results if html]): return events - soup = HTMLParser(html_data.content) - - for section in soup.css(".events-section"): - if not (sport_node := section.css_first(".section-titlte")): - continue - - sport = sport_node.text(strip=True) - - for event in section.css(".section-event"): - event_name = "Live Event" - - if teams := event.css_first(".event-competitors"): - home, away = teams.text(strip=True).split("vs.") - - event_name = f"{away} vs {home}" - - if not (event_button := event.css_first(".event-button a")) or not ( - href := event_button.attributes.get("href") - ): + for soup in soups: + for section in soup.css(".events-section"): + if not (sport_node := section.css_first(".section-titlte")): continue - event_date = event.css_first(".event-countdown").attributes.get( - "data-start" - ) + sport = sport_node.text(strip=True) - event_dt = Time.from_str(event_date, timezone="UTC") + for event in section.css(".section-event"): + event_name = "Live Event" - key = f"[{sport}] {event_name} ({TAG})" + if teams := event.css_first(".event-competitors"): + home, away = teams.text(strip=True).split("vs.") - events[key] = { - "sport": sport, - "event": event_name, - "link": href, - "event_ts": event_dt.timestamp(), - "timestamp": ts, - } + event_name = f"{away} vs {home}" + + if not (event_button := event.css_first(".event-button a")) or not ( + href := event_button.attributes.get("href") + ): + continue + + event_date = event.css_first(".event-countdown").attributes.get( + "data-start" + ) + + event_dt = Time.from_str(event_date, timezone="UTC") + + if event_dt.date() != now.date(): + continue + + events.append( + { + "sport": sport, + "event": event_name, + "link": href, + "timestamp": now.timestamp(), + } + ) return events -async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: - now = Time.clean(Time.now()) - - if not (events := HTML_FILE.load()): - log.info("Refreshing HTML cache") - - tasks = [ - refresh_html_cache( - date, - sport_id, - now.timestamp(), - ) - for date in [now.date(), now.delta(days=1).date()] - for sport_id in SPORT_ENDPOINTS - ] - - results = await asyncio.gather(*tasks) - - events = {k: v for data in results for k, v in data.items()} - - HTML_FILE.write(events) - - live = [] - - start_ts = now.delta(minutes=-30).timestamp() - end_ts = now.delta(minutes=30).timestamp() - - for k, v in events.items(): - if k in cached_keys: - continue - - if not start_ts <= v["event_ts"] <= end_ts: - continue - - live.append(v) - - return live - - async def scrape(browser: Browser) -> None: - cached_urls = CACHE_FILE.load() + if cached_urls := CACHE_FILE.load(): + urls.update({k: v for k, v in cached_urls.items() if v["url"]}) - valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} + log.info(f"Loaded {len(urls)} event(s) from cache") - valid_count = cached_count = len(valid_urls) - - urls.update(valid_urls) - - log.info(f"Loaded {cached_count} event(s) from cache") + return log.info(f'Scraping from "{BASE_URL}"') - if events := await get_events(cached_urls.keys()): + if events := await get_events(): log.info(f"Processing {len(events)} new URL(s)") async with network.event_context(browser) as context: @@ -249,7 +209,7 @@ async def scrape(browser: Browser) -> None: sport, event, ts = ( ev["sport"], ev["event"], - ev["event_ts"], + ev["timestamp"], ) key = f"[{sport}] {event} ({TAG})" @@ -268,13 +228,11 @@ async def scrape(browser: Browser) -> None: cached_urls[key] = entry if url: - valid_count += 1 - entry["url"] = url.split("?st")[0] urls[key] = entry - log.info(f"Collected and cached {valid_count - cached_count} new event(s)") + log.info(f"Collected and cached {len(urls)} new event(s)") else: log.info("No new events found") diff --git a/M3U8/scrapers/streamsgate.py b/M3U8/scrapers/streamsgate.py index 1e703124..920ee74a 100644 --- a/M3U8/scrapers/streamsgate.py +++ b/M3U8/scrapers/streamsgate.py @@ -2,7 +2,6 @@ import asyncio import re from functools import partial from itertools import chain -from typing import Any from urllib.parse import urljoin from selectolax.parser import HTMLParser @@ -15,9 +14,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "STRMSGATE" -CACHE_FILE = Cache(TAG, exp=10_800) - -API_FILE = Cache(f"{TAG}-api", exp=19_800) +CACHE_FILE = Cache(TAG, exp=28_800) BASE_URL = "https://streamsgates.io" @@ -85,36 +82,17 @@ async def process_event(url: str, url_num: int) -> tuple[str | None, str | None] return match[3], ifr_src -async def refresh_api_cache(now_ts: float) -> list[dict[str, Any]]: +async def get_events() -> list[dict[str, str]]: + now = Time.clean(Time.now()) + tasks = [network.request(url, log=log) for url in SPORT_URLS] results = await asyncio.gather(*tasks) - if not (data := [*chain.from_iterable(r.json() for r in results if r)]): - return [{"timestamp": now_ts}] - - for ev in data: - ev["ts"] = ev.pop("timestamp") - - data[-1]["timestamp"] = now_ts - - return data - - -async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: - now = Time.clean(Time.now()) - - if not (api_data := API_FILE.load(per_entry=False, index=-1)): - log.info("Refreshing API cache") - - api_data = await refresh_api_cache(now.timestamp()) - - API_FILE.write(api_data) - events = [] - start_dt = now.delta(hours=-2.5) - end_dt = now.delta(minutes=30) + if not (api_data := [*chain.from_iterable(r.json() for r in results if r)]): + return events for stream_group in api_data: date = stream_group.get("time") @@ -123,34 +101,30 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: t1, t2 = stream_group.get("away"), stream_group.get("home") - if not (t1 and t2): - continue - - event = get_event(t1, t2) - if not (date and sport): continue - if f"[{sport}] {event} ({TAG})" in cached_keys: - continue - event_dt = Time.from_str(date, timezone="UTC") - if not start_dt <= event_dt <= end_dt: + if event_dt.date() != now.date(): continue - if not (streams := stream_group.get("streams")): + if not (streams := stream_group.get("streams")) or not ( + url := streams[0].get("url") + ): continue - if not (url := streams[0].get("url")): + if not (t1 and t2): continue + event = get_event(t1, t2) + events.append( { "sport": sport, "event": event, "link": url, - "timestamp": event_dt.timestamp(), + "timestamp": now.timestamp(), } ) @@ -158,19 +132,16 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: async def scrape() -> None: - cached_urls = CACHE_FILE.load() + if cached_urls := CACHE_FILE.load(): + urls.update({k: v for k, v in cached_urls.items() if v["url"]}) - valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} + log.info(f"Loaded {len(urls)} event(s) from cache") - valid_count = cached_count = len(valid_urls) - - urls.update(valid_urls) - - log.info(f"Loaded {cached_count} event(s) from cache") + return log.info(f'Scraping from "{BASE_URL}"') - if events := await get_events(cached_urls.keys()): + if events := await get_events(): log.info(f"Processing {len(events)} new URL(s)") for i, ev in enumerate(events, start=1): @@ -183,7 +154,7 @@ async def scrape() -> None: url, iframe = await network.safe_process( handler, url_num=i, - semaphore=network.PW_S, + semaphore=network.HTTP_S, log=log, ) @@ -209,11 +180,11 @@ async def scrape() -> None: cached_urls[key] = entry if url: - valid_count += 1 + entry["url"] = url.split("?st")[0] urls[key] = entry - log.info(f"Collected and cached {valid_count - cached_count} new event(s)") + log.info(f"Collected and cached {len(urls)} new event(s)") else: log.info("No new events found") diff --git a/M3U8/scrapers/webcast.py b/M3U8/scrapers/webcast.py index 0abf7585..90415113 100644 --- a/M3U8/scrapers/webcast.py +++ b/M3U8/scrapers/webcast.py @@ -157,7 +157,7 @@ async def scrape() -> None: url = await network.safe_process( handler, url_num=i, - semaphore=network.PW_S, + semaphore=network.HTTP_S, log=log, )