From 00000d947699e9c606da6b86c81e62ab7819ac32 Mon Sep 17 00:00:00 2001
From: doms9 <96013514+doms9@users.noreply.github.com>
Date: Tue, 24 Feb 2026 17:33:51 -0500
Subject: [PATCH] e - re-add volokit.py

---
Review note: M3U8/scrapers/volokit.py was revised during review; the blob id
on its "index" line is carried over from the original patch and is stale.

 M3U8/fetch.py                 |   3 +
 M3U8/scrapers/utils/config.py |   1 +
 M3U8/scrapers/volokit.py      | 276 ++++++++++++++++++++++++++++++++++
 3 files changed, 280 insertions(+)
 create mode 100644 M3U8/scrapers/volokit.py

diff --git a/M3U8/fetch.py b/M3U8/fetch.py
index 91d55266..cee31635 100644
--- a/M3U8/fetch.py
+++ b/M3U8/fetch.py
@@ -23,6 +23,7 @@ from scrapers import (
     streamsgate,
     totalsportek,
     tvapp,
+    volokit,
     watchfooty,
     webcast,
     xstreameast,
@@ -83,6 +84,7 @@ async def main() -> None:
         # asyncio.create_task(streambtw.scrape()),
         asyncio.create_task(totalsportek.scrape()),
         asyncio.create_task(tvapp.scrape()),
+        asyncio.create_task(volokit.scrape()),
         asyncio.create_task(xstreameast.scrape()),
     ]
 
@@ -118,6 +120,7 @@ async def main() -> None:
         | streamsgate.urls
         | totalsportek.urls
         | tvapp.urls
+        | volokit.urls
         | watchfooty.urls
         | webcast.urls
         | xstreameast.urls
diff --git a/M3U8/scrapers/utils/config.py b/M3U8/scrapers/utils/config.py
index b1898311..3bd8b603 100644
--- a/M3U8/scrapers/utils/config.py
+++ b/M3U8/scrapers/utils/config.py
@@ -91,6 +91,7 @@ class Time(datetime):
             "%m/%d/%Y %I:%M %p",
             "%m/%d/%Y %H:%M:%S",
             "%a, %d %b %Y %H:%M:%S %z",
+            "%A, %b %d, %Y %H:%M",
         ]
 
         for frmt in formats:
diff --git a/M3U8/scrapers/volokit.py b/M3U8/scrapers/volokit.py
new file mode 100644
index 00000000..22c1a0e3
--- /dev/null
+++ b/M3U8/scrapers/volokit.py
@@ -0,0 +1,276 @@
+import asyncio
+import re
+from functools import partial
+from urllib.parse import urljoin
+
+from selectolax.parser import HTMLParser
+
+from .utils import Cache, Time, get_logger, leagues, network
+
+log = get_logger(__name__)
+
+urls: dict[str, dict[str, str | float]] = {}
+
+TAG = "VOLOKIT"
+
+CACHE_FILE = Cache(TAG, exp=10_800)
+
+HTML_CACHE = Cache(f"{TAG}-html", exp=28_800)
+
+BASE_URL = "http://volokit.xyz"
+
+SPORT_ENDPOINTS = {
+    "mlb": "MLB",
+    # "nfl": "NFL",
+    # "nhl": "NHL",
+}
+
+# Compiled once at import time rather than on every process_event() call.
+SOURCE_PATTERN = re.compile(r'source:\s+"([^"]*)"', re.I)
+
+
+def fix_event(s: str) -> str:
+    """Collapse whitespace runs in *s* and apply str.capitalize() to each word."""
+    return " ".join(x.capitalize() for x in s.split())
+
+
+async def process_event(url: str, url_num: int) -> str | None:
+    """Resolve an event page to the stream source named in its player iframe.
+
+    Args:
+        url: Event page to load.
+        url_num: 1-based position of the event, used only in log messages.
+
+    Returns:
+        The first quoted ``source:`` value found in the iframe document, or
+        ``None`` when any step fails.
+    """
+    if not (event_data := await network.request(url, log=log)):
+        log.info(f"URL {url_num}) Failed to load url.")
+
+        return
+
+    soup = HTMLParser(event_data.content)
+
+    if not (iframe := soup.css_first('iframe[height="100%"]')):
+        log.warning(f"URL {url_num}) No iframe element found.")
+
+        return
+
+    if not (iframe_src := iframe.attributes.get("src")):
+        log.warning(f"URL {url_num}) No iframe source found.")
+
+        return
+
+    # The src may be relative or protocol-relative; urljoin() leaves an
+    # already-absolute URL unchanged.
+    iframe_src = urljoin(url, iframe_src)
+
+    if not (
+        iframe_src_data := await network.request(
+            iframe_src,
+            headers={"Referer": url},
+            log=log,
+        )
+    ):
+        log.info(f"URL {url_num}) Failed to load iframe source.")
+
+        return
+
+    if not (match := SOURCE_PATTERN.search(iframe_src_data.text)):
+        log.warning(f"URL {url_num}) No Clappr source found.")
+
+        return
+
+    log.info(f"URL {url_num}) Captured M3U8")
+
+    return match[1]
+
+
+async def refresh_html_cache(
+    url: str,
+    sport: str,
+    now: Time,
+) -> dict[str, dict[str, str | float]]:
+    """Scrape one sport listing page into a mapping of event records.
+
+    Args:
+        url: Listing page for the sport.
+        sport: Key into ``SPORT_ENDPOINTS``.
+        now: Current time; its timestamp is stored on every record and its
+            date is the fallback when the page has no ``tr.date`` element.
+
+    Returns:
+        Records keyed by ``"[<sport>] <event> (VOLOKIT)"``; empty when the
+        page could not be loaded.
+    """
+    events = {}
+
+    if not (html_data := await network.request(url, log=log)):
+        return events
+
+    soup = HTMLParser(html_data.content)
+
+    date = now.date()
+
+    if date_node := soup.css_first("tr.date"):
+        date = date_node.text(strip=True)
+
+    # Loop-invariant, so it is looked up once instead of once per card.
+    event_sport = SPORT_ENDPOINTS[sport]
+
+    for card in soup.css("#events .table .vevent.theevent"):
+        # css_first() yields None for a card without an anchor; guard before
+        # touching .attributes.
+        if not (anchor := card.css_first("a")):
+            continue
+
+        if not (href := anchor.attributes.get("href")):
+            continue
+
+        name_node = card.css_first(".teamtd.event")
+
+        time_node = card.css_first(".time")
+
+        if not (name_node and time_node):
+            continue
+
+        name, time = name_node.text(strip=True), time_node.text(strip=True)
+
+        event_name = fix_event(name)
+
+        event_dt = Time.from_str(f"{date} {time}", timezone="UTC")
+
+        key = f"[{event_sport}] {event_name} ({TAG})"
+
+        events[key] = {
+            "sport": event_sport,
+            "event": event_name,
+            # Resolve relative hrefs against the listing page; absolute
+            # ones pass through urljoin() unchanged.
+            "link": urljoin(url, href),
+            "event_ts": event_dt.timestamp(),
+            "timestamp": now.timestamp(),
+        }
+
+    return events
+
+
+async def get_events(cached_keys: list[str]) -> list[dict[str, str | float]]:
+    """Return the event records that start within 30 minutes of now.
+
+    The records come from ``HTML_CACHE``; when that load is empty every
+    listing page in ``SPORT_ENDPOINTS`` is scraped concurrently and the
+    merged result is written back.
+
+    Args:
+        cached_keys: Keys to skip (``scrape`` passes the URL cache's keys).
+
+    Returns:
+        Records whose key is not in *cached_keys* and whose ``event_ts``
+        lies between ``now.delta(minutes=-30)`` and
+        ``now.delta(minutes=30)``, inclusive.
+    """
+    now = Time.clean(Time.now())
+
+    if not (events := HTML_CACHE.load()):
+        log.info("Refreshing HTML cache")
+
+        sport_urls = {
+            sport: urljoin(BASE_URL, f"sport/{sport}") for sport in SPORT_ENDPOINTS
+        }
+
+        tasks = [
+            refresh_html_cache(
+                url,
+                sport,
+                now,
+            )
+            for sport, url in sport_urls.items()
+        ]
+
+        results = await asyncio.gather(*tasks)
+
+        events = {k: v for data in results for k, v in data.items()}
+
+        HTML_CACHE.write(events)
+
+    start_ts = now.delta(minutes=-30).timestamp()
+    end_ts = now.delta(minutes=30).timestamp()
+
+    return [
+        v
+        for k, v in events.items()
+        if k not in cached_keys and start_ts <= v["event_ts"] <= end_ts
+    ]
+
+
+async def scrape() -> None:
+    """Fill ``urls`` from the URL cache and from newly processed events.
+
+    Every processed event is written to ``CACHE_FILE`` -- including those for
+    which no stream URL was captured -- but only entries with a truthy
+    ``url`` are published in ``urls``.
+    """
+    cached_urls = CACHE_FILE.load()
+
+    valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
+
+    valid_count = cached_count = len(valid_urls)
+
+    urls.update(valid_urls)
+
+    log.info(f"Loaded {cached_count} event(s) from cache")
+
+    log.info(f'Scraping from "{BASE_URL}"')
+
+    events = await get_events(cached_urls.keys())
+
+    if events:
+        log.info(f"Processing {len(events)} new URL(s)")
+
+        now = Time.clean(Time.now())
+
+        for i, ev in enumerate(events, start=1):
+            handler = partial(
+                process_event,
+                url=(link := ev["link"]),
+                url_num=i,
+            )
+
+            url = await network.safe_process(
+                handler,
+                url_num=i,
+                semaphore=network.HTTP_S,
+                log=log,
+            )
+
+            sport, event = ev["sport"], ev["event"]
+
+            key = f"[{sport}] {event} ({TAG})"
+
+            tvg_id, logo = leagues.get_tvg_info(sport, event)
+
+            entry = {
+                "url": url,
+                "logo": logo,
+                "base": link,
+                "timestamp": now.timestamp(),
+                "id": tvg_id or "Live.Event.us",
+                "link": link,
+            }
+
+            cached_urls[key] = entry
+
+            if url:
+                valid_count += 1
+
+                urls[key] = entry
+
+    if new_count := valid_count - cached_count:
+        log.info(f"Collected and cached {new_count} new event(s)")
+
+    else:
+        log.info("No new events found")
+
+    CACHE_FILE.write(cached_urls)