From 00000d9079dd57dae0732057f1d777c4068b5365 Mon Sep 17 00:00:00 2001 From: doms9 <96013514+doms9@users.noreply.github.com> Date: Thu, 18 Dec 2025 03:04:11 -0500 Subject: [PATCH] e misc. edits --- M3U8/fetch.py | 30 +++++++-------- M3U8/scrapers/fawa.py | 40 ++++++------------- M3U8/scrapers/istreameast.py | 67 ++++++++++---------------------- M3U8/scrapers/lotus.py | 70 ++++++++++------------------------ M3U8/scrapers/ppv.py | 43 +++++---------------- M3U8/scrapers/roxie.py | 50 ++++++------------------ M3U8/scrapers/shark.py | 52 ++++++++----------------- M3U8/scrapers/sport9.py | 46 +++++++--------------- M3U8/scrapers/streamcenter.py | 54 +++++++++----------------- M3U8/scrapers/streamfree.py | 38 ++++++++---------- M3U8/scrapers/streamhub.py | 55 ++++++++------------------ M3U8/scrapers/streamsgate.py | 50 +++++++++--------------- M3U8/scrapers/strmd.py | 65 +++++++++---------------------- M3U8/scrapers/tvpass.py | 35 ++++++----------- M3U8/scrapers/utils/webwork.py | 38 +++++++++++------- M3U8/scrapers/watchfooty.py | 62 ++++++++++++------------------ M3U8/scrapers/webcast.py | 30 +++++---------- 17 files changed, 273 insertions(+), 552 deletions(-) diff --git a/M3U8/fetch.py b/M3U8/fetch.py index 5efbdf0..936586f 100644 --- a/M3U8/fetch.py +++ b/M3U8/fetch.py @@ -48,22 +48,22 @@ async def main() -> None: base_m3u8, tvg_chno = load_base() tasks = [ - asyncio.create_task(fawa.scrape(network.client)), - asyncio.create_task(istreameast.scrape(network.client)), - asyncio.create_task(lotus.scrape(network.client)), + asyncio.create_task(fawa.scrape()), + asyncio.create_task(istreameast.scrape()), + asyncio.create_task(lotus.scrape()), asyncio.create_task(pixel.scrape()), - asyncio.create_task(ppv.scrape(network.client)), - asyncio.create_task(roxie.scrape(network.client)), - asyncio.create_task(shark.scrape(network.client)), - asyncio.create_task(sport9.scrape(network.client)), - asyncio.create_task(streamcenter.scrape(network.client)), - asyncio.create_task(streamfree.scrape(network.client)), - asyncio.create_task(streamhub.scrape(network.client)), - asyncio.create_task(streamsgate.scrape(network.client)), - asyncio.create_task(strmd.scrape(network.client)), - asyncio.create_task(tvpass.scrape(network.client)), - asyncio.create_task(watchfooty.scrape(network.client)), - asyncio.create_task(webcast.scrape(network.client)), + asyncio.create_task(ppv.scrape()), + asyncio.create_task(roxie.scrape()), + asyncio.create_task(shark.scrape()), + asyncio.create_task(sport9.scrape()), + asyncio.create_task(streamcenter.scrape()), + asyncio.create_task(streamfree.scrape()), + asyncio.create_task(streamhub.scrape()), + asyncio.create_task(streamsgate.scrape()), + asyncio.create_task(strmd.scrape()), + asyncio.create_task(tvpass.scrape()), + asyncio.create_task(watchfooty.scrape()), + asyncio.create_task(webcast.scrape()), ] await asyncio.gather(*tasks) diff --git a/M3U8/scrapers/fawa.py b/M3U8/scrapers/fawa.py index fa6f1fb..2dba43f 100644 --- a/M3U8/scrapers/fawa.py +++ b/M3U8/scrapers/fawa.py @@ -2,7 +2,6 @@ import re from functools import partial from urllib.parse import quote, urljoin -import httpx from selectolax.parser import HTMLParser from .utils import Cache, Time, get_logger, leagues, network @@ -18,17 +17,9 @@ CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800) BASE_URL = "http://www.fawanews.sc/" -async def process_event( - client: httpx.AsyncClient, - url: str, - url_num: int, -) -> str | None: - - try: - r = await client.get(url) - r.raise_for_status() - except Exception as e: - log.error(f'URL {url_num}) Failed to fetch "{url}": {e}') +async def process_event(url: str, url_num: int) -> str | None: + if not (html_data := await network.request(url, log=log)): + log.info(f"URL {url_num}) Failed to load url.") return valid_m3u8 = re.compile( @@ -36,7 +27,7 @@ async def process_event( re.IGNORECASE, ) - if not (match := valid_m3u8.search(r.text)): + if not (match := valid_m3u8.search(html_data.text)): log.info(f"URL {url_num}) No M3U8 found") return @@ -44,25 +35,17 @@ async def process_event( return match[2] -async def get_events( - client: httpx.AsyncClient, - cached_hrefs: set[str], -) -> list[dict[str, str]]: - try: - r = await client.get(BASE_URL) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{BASE_URL}": {e}') +async def get_events(cached_hrefs: set[str]) -> list[dict[str, str]]: + events = [] - return [] + if not (html_data := await network.request(BASE_URL, log=log)): + return events - soup = HTMLParser(r.content) + soup = HTMLParser(html_data.content) valid_event = re.compile(r"\d{1,2}:\d{1,2}") clean_event = re.compile(r"\s+-+\s+\w{1,4}") - events = [] - for item in soup.css(".user-item"): text = item.css_first(".user-item__name") subtext = item.css_first(".user-item__playing") @@ -98,7 +81,7 @@ async def get_events( return events -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_hrefs = {entry["href"] for entry in cached_urls.values()} cached_count = len(cached_urls) @@ -108,7 +91,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{BASE_URL}"') - events = await get_events(client, cached_hrefs) + events = await get_events(cached_hrefs) log.info(f"Processing {len(events)} new URL(s)") @@ -118,7 +101,6 @@ async def scrape(client: httpx.AsyncClient) -> None: for i, ev in enumerate(events, start=1): handler = partial( process_event, - client=client, url=ev["link"], url_num=i, ) diff --git a/M3U8/scrapers/istreameast.py b/M3U8/scrapers/istreameast.py index 7e2a662..255992c 100644 --- a/M3U8/scrapers/istreameast.py +++ b/M3U8/scrapers/istreameast.py @@ -1,10 +1,9 @@ import base64 import re -import httpx from selectolax.parser import HTMLParser -from .utils import Cache, Time, get_logger, leagues +from .utils import Cache, Time, get_logger, leagues, network log = get_logger(__name__) @@ -17,31 +16,14 @@ CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=3_600) BASE_URL = "https://istreameast.app" -async def get_html_data(client: httpx.AsyncClient, url: str) -> str: - try: - r = await client.get(url) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{url}": {e}') - - return b"" - - return r.text - - -async def process_event( - client: httpx.AsyncClient, - url: str, - url_num: int, -) -> str | None: - +async def process_event(url: str, url_num: int) -> str | None: pattern = re.compile(r"source:\s*window\.atob\(\s*'([^']+)'\s*\)", re.IGNORECASE) - if not (event_data := await get_html_data(client, url)): - log.warning(f"URL {url_num}) Failed to load event url.") + if not (event_data := await network.request(url, log=log)): + log.info(f"URL {url_num}) Failed to load url.") return - soup = HTMLParser(event_data) + soup = HTMLParser(event_data.content) if not (iframe := soup.css_first("iframe#wp_player")): log.warning(f"URL {url_num}) No iframe element found.") @@ -51,11 +33,11 @@ async def process_event( log.warning(f"URL {url_num}) No iframe source found.") return - if not (iframe_src_data := await get_html_data(client, iframe_src)): - log.warning(f"URL {url_num}) Failed to load iframe source.") + if not (iframe_src_data := await network.request(iframe_src, log=log)): + log.info(f"URL {url_num}) Failed to load iframe source.") return - if not (match := pattern.search(iframe_src_data)): + if not (match := pattern.search(iframe_src_data.text)): log.warning(f"URL {url_num}) No Clappr source found.") return @@ -63,16 +45,15 @@ async def process_event( return base64.b64decode(match[1]).decode("utf-8") -async def get_events( - client: httpx.AsyncClient, cached_keys: set[str] -) -> list[dict[str, str]]: +async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: + events = [] + + if not (html_data := await network.request(BASE_URL, log=log)): + return events + pattern = re.compile(r"^(?:LIVE|\d+\s+(minutes?)\b)", re.IGNORECASE) - html_data = await get_html_data(client, BASE_URL) - - soup = HTMLParser(html_data) - - events = [] + soup = HTMLParser(html_data.content) for link in soup.css("li.f1-podium--item > a.f1-podium--link"): li_item = link.parent @@ -90,6 +71,9 @@ async def get_events( if inner_span := driver_elem.css_first("span.d-md-inline"): event_name = inner_span.text(strip=True) + if f"[{sport}] {event_name} ({TAG})" in cached_keys: + continue + if not (href := link.attributes.get("href")): continue @@ -101,11 +85,6 @@ async def get_events( if not pattern.search(time_text): continue - key = f"[{sport}] {event_name} ({TAG})" - - if cached_keys & {key}: - continue - events.append( { "sport": sport, @@ -117,7 +96,7 @@ async def get_events( return events -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) urls.update(cached_urls) @@ -126,7 +105,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{BASE_URL}"') - events = await get_events(client, set(cached_urls.keys())) + events = await get_events(cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)") @@ -134,11 +113,7 @@ async def scrape(client: httpx.AsyncClient) -> None: now = Time.clean(Time.now()).timestamp() for i, ev in enumerate(events, start=1): - if url := await process_event( - client, - ev["link"], - i, - ): + if url := await process_event(ev["link"], i): sport, event, link = ( ev["sport"], ev["event"], diff --git a/M3U8/scrapers/lotus.py b/M3U8/scrapers/lotus.py index 33936fa..f4be812 100644 --- a/M3U8/scrapers/lotus.py +++ b/M3U8/scrapers/lotus.py @@ -1,6 +1,5 @@ from functools import partial -import httpx from playwright.async_api import async_playwright from .utils import Cache, Time, get_logger, leagues, network @@ -22,40 +21,16 @@ def fix_league(s: str) -> str: return " ".join(x.capitalize() for x in s.split()) if len(s) > 5 else s.upper() -async def refresh_api_cache( - client: httpx.AsyncClient, - url: str, - now_ts: float, -) -> dict[str, dict[str, str]]: - log.info("Refreshing API cache") - - try: - r = await client.get(url) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{url}": {e}') - - return {} - - if not (data := r.json()): - return {} - - data["timestamp"] = now_ts - - return data - - -async def get_events( - client: httpx.AsyncClient, cached_keys: set[str] -) -> list[dict[str, str]]: - now = Time.now() +async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: + now = Time.clean(Time.now()) if not (api_data := API_CACHE.load(per_entry=False)): - api_data = await refresh_api_cache( - client, - BASE_URL, - now.timestamp(), - ) + api_data = {} + + if r := await network.request(BASE_URL, log=log): + api_data: dict = r.json() + + api_data["timestamp"] = now.timestamp() API_CACHE.write(api_data) @@ -68,9 +43,14 @@ async def get_events( continue for event in info["items"]: - event_league = event["league"] + if (event_league := event["league"]) == "channel tv": + continue - if event_league == "channel tv": + sport = fix_league(event_league) + + event_name = event["title"] + + if f"[{sport}] {event_name} ({TAG})" in cached_keys: continue event_streams: list[dict[str, str]] = event["streams"] @@ -78,26 +58,19 @@ async def get_events( if not (event_link := event_streams[0].get("link")): continue - sport = fix_league(event_league) - event_name = event["title"] - - key = f"[{sport}] {event_name} ({TAG})" - - if cached_keys & {key}: - continue - events.append( { "sport": sport, "event": event_name, "link": event_link, + "timestamp": now.timestamp(), } ) return events -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) urls.update(cached_urls) @@ -106,13 +79,11 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{BASE_URL}"') - events = await get_events(client, set(cached_urls.keys())) + events = await get_events(cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)") if events: - now = Time.clean(Time.now()).timestamp() - async with async_playwright() as p: browser, context = await network.browser(p) @@ -132,10 +103,11 @@ async def scrape(client: httpx.AsyncClient) -> None: ) if url: - sport, event, link = ( + sport, event, link, ts = ( ev["sport"], ev["event"], ev["link"], + ev["timestamp"], ) tvg_id, logo = leagues.get_tvg_info(sport, event) @@ -146,7 +118,7 @@ async def scrape(client: httpx.AsyncClient) -> None: "url": url, "logo": logo, "base": "https://vividmosaica.com/", - "timestamp": now, + "timestamp": ts, "id": tvg_id or "Live.Event.us", "link": link, } diff --git a/M3U8/scrapers/ppv.py b/M3U8/scrapers/ppv.py index 8d020c6..74dc5ad 100644 --- a/M3U8/scrapers/ppv.py +++ b/M3U8/scrapers/ppv.py @@ -1,6 +1,5 @@ from functools import partial -import httpx from playwright.async_api import async_playwright from .utils import Cache, Time, get_logger, leagues, network @@ -28,35 +27,17 @@ BASE_MIRRORS = [ ] -async def refresh_api_cache( - client: httpx.AsyncClient, - url: str, -) -> dict[str, dict[str, str]]: - log.info("Refreshing API cache") +async def get_events(api_url: str, cached_keys: list[str]) -> list[dict[str, str]]: + events = [] - try: - r = await client.get(url) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{url}": {e}') - - return {} - - return r.json() - - -async def get_events( - client: httpx.AsyncClient, - api_url: str, - cached_keys: set[str], -) -> list[dict[str, str]]: if not (api_data := API_FILE.load(per_entry=False)): - api_data = await refresh_api_cache(client, api_url) + api_data = {} + + if r := await network.request(api_url, log=log): + api_data: dict = r.json() API_FILE.write(api_data) - events = [] - now = Time.clean(Time.now()) start_dt = now.delta(minutes=-30) end_dt = now.delta(minutes=30) @@ -76,9 +57,7 @@ async def get_events( if not (name and start_ts and iframe): continue - key = f"[{sport}] {name} ({TAG})" - - if cached_keys & {key}: + if f"[{sport}] {name} ({TAG})" in cached_keys: continue event_dt = Time.from_ts(start_ts) @@ -99,7 +78,7 @@ async def get_events( return events -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) urls.update(cached_urls) @@ -117,11 +96,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{base_url}"') - events = await get_events( - client, - api_url, - set(cached_urls.keys()), - ) + events = await get_events(api_url, cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)") diff --git a/M3U8/scrapers/roxie.py b/M3U8/scrapers/roxie.py index b7d17ac..daf4b7d 100644 --- a/M3U8/scrapers/roxie.py +++ b/M3U8/scrapers/roxie.py @@ -3,7 +3,6 @@ import re from functools import partial from urllib.parse import urljoin -import httpx from selectolax.parser import HTMLParser from .utils import Cache, Time, get_logger, leagues, network @@ -31,17 +30,8 @@ SPORT_ENDPOINTS = { } -async def process_event( - client: httpx.AsyncClient, - url: str, - url_num: int, -) -> str | None: - - try: - r = await client.get(url) - r.raise_for_status() - except Exception as e: - log.error(f'URL {url_num}) Failed to fetch "{url}": {e}') +async def process_event(url: str, url_num: int) -> str | None: + if not (html_data := await network.request(url, log=log)): return valid_m3u8 = re.compile( @@ -49,7 +39,7 @@ async def process_event( re.IGNORECASE, ) - if not (match := valid_m3u8.search(r.text)): + if not (match := valid_m3u8.search(html_data.text)): log.info(f"URL {url_num}) No M3U8 found") return @@ -57,31 +47,19 @@ async def process_event( return match[1] -async def get_html_data(client: httpx.AsyncClient, url: str) -> bytes: - try: - r = await client.get(url) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{url}": {e}') - - return b"" - - return r.content - - async def refresh_html_cache( - client: httpx.AsyncClient, url: str, sport: str, now_ts: float, ) -> dict[str, dict[str, str | float]]: - html_data = await get_html_data(client, url) - - soup = HTMLParser(html_data) - events = {} + if not (html_data := await network.request(url, log=log)): + return events + + soup = HTMLParser(html_data.content) + for row in soup.css("table#eventsTable tbody tr"): if not (a_tag := row.css_first("td a")): continue @@ -113,9 +91,7 @@ async def refresh_html_cache( return events -async def get_events( - client: httpx.AsyncClient, cached_keys: set[str] -) -> list[dict[str, str]]: +async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: now = Time.clean(Time.now()) if not (events := HTML_CACHE.load()): @@ -125,7 +101,6 @@ async def get_events( tasks = [ refresh_html_cache( - client, url, sport, now.timestamp(), @@ -145,7 +120,7 @@ async def get_events( end_ts = now.delta(minutes=30).timestamp() for k, v in events.items(): - if cached_keys & {k}: + if k in cached_keys: continue if not start_ts <= v["event_ts"] <= end_ts: @@ -156,7 +131,7 @@ async def get_events( return live -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) urls.update(cached_urls) @@ -165,7 +140,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{BASE_URL}"') - events = await get_events(client, set(cached_urls.keys())) + events = await get_events(cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)") @@ -173,7 +148,6 @@ async def scrape(client: httpx.AsyncClient) -> None: for i, ev in enumerate(events, start=1): handler = partial( process_event, - client=client, url=ev["link"], url_num=i, ) diff --git a/M3U8/scrapers/shark.py b/M3U8/scrapers/shark.py index db8807a..79f0596 100644 --- a/M3U8/scrapers/shark.py +++ b/M3U8/scrapers/shark.py @@ -1,7 +1,6 @@ import re from functools import partial -import httpx from selectolax.parser import HTMLParser from .utils import Cache, Time, get_logger, leagues, network @@ -19,49 +18,32 @@ HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=19_800) BASE_URL = "https://sharkstreams.net" -async def process_event( - client: httpx.AsyncClient, - url: str, - url_num: int, -) -> str | None: - - try: - r = await client.get(url) - r.raise_for_status() - except Exception as e: - log.error(f'URL {url_num}) Failed to fetch "{url}": {e}') +async def process_event(url: str, url_num: int) -> str | None: + if not (r := await network.request(url, log=log)): + log.info(f"URL {url_num}) Failed to load url.") return data: dict[str, list[str]] = r.json() - if not data.get("urls"): + if not (urls := data.get("urls")): log.info(f"URL {url_num}) No M3U8 found") - return log.info(f"URL {url_num}) Captured M3U8") - - return data["urls"][0] + return urls[0] -async def refresh_html_cache( - client: httpx.AsyncClient, now_ts: float -) -> dict[str, dict[str, str | float]]: +async def refresh_html_cache(now_ts: float) -> dict[str, dict[str, str | float]]: log.info("Refreshing HTML cache") - try: - r = await client.get(BASE_URL) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{BASE_URL}": {e}') + events = {} - return {} + if not (html_data := await network.request(BASE_URL, log=log)): + return events pattern = re.compile(r"openEmbed\('([^']+)'\)", re.IGNORECASE) - soup = HTMLParser(r.content) - - events = {} + soup = HTMLParser(html_data.content) for row in soup.css(".row"): date_node = row.css_first(".ch-date") @@ -98,14 +80,11 @@ async def refresh_html_cache( return events -async def get_events( - client: httpx.AsyncClient, - cached_keys: set[str], -) -> list[dict[str, str]]: +async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: now = Time.clean(Time.now()) if not (events := HTML_CACHE.load()): - events = await refresh_html_cache(client, now.timestamp()) + events = await refresh_html_cache(now.timestamp()) HTML_CACHE.write(events) @@ -115,7 +94,7 @@ async def get_events( end_ts = now.delta(minutes=10).timestamp() for k, v in events.items(): - if cached_keys & {k}: + if k in cached_keys: continue if not start_ts <= v["event_ts"] <= end_ts: @@ -126,7 +105,7 @@ async def get_events( return live -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) urls.update(cached_urls) @@ -135,7 +114,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{BASE_URL}"') - events = await get_events(client, set(cached_urls.keys())) + events = await get_events(cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)") @@ -143,7 +122,6 @@ async def scrape(client: httpx.AsyncClient) -> None: for i, ev in enumerate(events, start=1): handler = partial( process_event, - client=client, url=ev["link"], url_num=i, ) diff --git a/M3U8/scrapers/sport9.py b/M3U8/scrapers/sport9.py index fe40803..4e62361 100644 --- a/M3U8/scrapers/sport9.py +++ b/M3U8/scrapers/sport9.py @@ -2,7 +2,6 @@ import asyncio from functools import partial from urllib.parse import urljoin -import httpx from playwright.async_api import async_playwright from selectolax.parser import HTMLParser @@ -16,34 +15,18 @@ TAG = "SPORT9" CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=3_600) -BASE_URL = "https://sport9.ru" +BASE_URL = "https://sport9.ru/" -async def get_html_data( - client: httpx.AsyncClient, - url: str, - date: str, -) -> bytes: - - try: - r = await client.get(url, params={"date": date}) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{r.url}": {e}') - - return b"" - - return r.content - - -async def get_events( - client: httpx.AsyncClient, - cached_keys: set[str], -) -> list[dict[str, str]]: +async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: now = Time.now() tasks = [ - get_html_data(client, BASE_URL, str(d.date())) + network.request( + BASE_URL, + log=log, + params={"date": f"{d.date()}"}, + ) for d in [ now.delta(days=-1), now, @@ -53,10 +36,11 @@ async def get_events( results = await asyncio.gather(*tasks) - soups = [HTMLParser(html) for html in results] - events = [] + if not (soups := [HTMLParser(html.content) for html in results if html]): + return events + for soup in soups: for card in soup.css("a.match-card"): live_badge = card.css_first(".live-badge") @@ -85,12 +69,10 @@ async def get_events( else: continue - if not (href := card.attributes.get("href")): + if f"[{sport}] {event} ({TAG})" in cached_keys: continue - key = f"[{sport}] {event} ({TAG})" - - if cached_keys & {key}: + if not (href := card.attributes.get("href")): continue events.append( @@ -104,7 +86,7 @@ async def get_events( return events -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) urls.update(cached_urls) @@ -113,7 +95,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{BASE_URL}"') - events = await get_events(client, set(cached_urls.keys())) + events = await get_events(cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)") diff --git a/M3U8/scrapers/streamcenter.py b/M3U8/scrapers/streamcenter.py index b9e5200..159f5df 100644 --- a/M3U8/scrapers/streamcenter.py +++ b/M3U8/scrapers/streamcenter.py @@ -1,6 +1,5 @@ from functools import partial -import httpx from playwright.async_api import async_playwright from .utils import Cache, Time, get_logger, leagues, network @@ -33,35 +32,20 @@ CATEGORIES = { } -async def refresh_api_cache( - client: httpx.AsyncClient, now_ts: float -) -> list[dict[str, str | int]]: - log.info("Refreshing API cache") - - try: - r = await client.get(BASE_URL, params={"pageNumber": 1, "pageSize": 500}) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{r.url}": {e}') - - return [] - - if not (data := r.json()): - return [] - - data[-1]["timestamp"] = now_ts - - return data - - -async def get_events( - client: httpx.AsyncClient, - cached_keys: set[str], -) -> list[dict[str, str]]: +async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: now = Time.clean(Time.now()) if not (api_data := API_FILE.load(per_entry=False, index=-1)): - api_data = await refresh_api_cache(client, now.timestamp()) + api_data = [] + + if r := await network.request( + BASE_URL, + log=log, + params={"pageNumber": 1, "pageSize": 500}, + ): + api_data: list[dict] = r.json() + + api_data[-1]["timestamp"] = now.timestamp() API_FILE.write(api_data) @@ -82,17 +66,15 @@ async def get_events( if not (name and category_id and iframe and event_time): continue - event_dt = Time.from_str(event_time, timezone="CET") - - if not start_dt <= event_dt <= end_dt: - continue - if not (sport := CATEGORIES.get(category_id)): continue - key = f"[{sport}] {name} ({TAG})" + if f"[{sport}] {name} ({TAG})" in cached_keys: + continue - if cached_keys & {key}: + event_dt = Time.from_str(event_time, timezone="CET") + + if not start_dt <= event_dt <= end_dt: continue events.append( @@ -107,7 +89,7 @@ async def get_events( return events -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) urls.update(cached_urls) @@ -116,7 +98,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info('Scraping from "https://streamcenter.xyz"') - events = await get_events(client, set(cached_urls.keys())) + events = await get_events(cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)") diff --git a/M3U8/scrapers/streamfree.py b/M3U8/scrapers/streamfree.py index 809b43e..bfb4fee 100644 --- a/M3U8/scrapers/streamfree.py +++ b/M3U8/scrapers/streamfree.py @@ -1,7 +1,5 @@ from urllib.parse import urljoin -import httpx - from .utils import Cache, Time, get_logger, leagues, network log = get_logger(__name__) @@ -15,24 +13,20 @@ CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=19_800) BASE_URL = "https://streamfree.to/" -async def refresh_api_cache(client: httpx.AsyncClient) -> dict[str, dict[str, list]]: - try: - r = await client.get(urljoin(BASE_URL, "streams")) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{r.url}": {e}') - - return {} - - return r.json() - - -async def get_events(client: httpx.AsyncClient) -> dict[str, dict[str, str | float]]: - api_data = await refresh_api_cache(client) - +async def get_events() -> dict[str, dict[str, str | float]]: events = {} - now = Time.clean(Time.now()).timestamp() + if not ( + r := await network.request( + urljoin(BASE_URL, "streams"), + log=log, + ) + ): + return events + + api_data: dict = r.json() + + now = Time.clean(Time.now()) for streams in api_data.get("streams", {}).values(): if not streams: @@ -66,14 +60,14 @@ async def get_events(client: httpx.AsyncClient) -> dict[str, dict[str, str | flo ), "logo": logo or pic, "base": BASE_URL, - "timestamp": now, + "timestamp": now.timestamp(), "id": tvg_id or "Live.Event.us", } return events -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: if cached := CACHE_FILE.load(): urls.update(cached) log.info(f"Loaded {len(urls)} event(s) from cache") @@ -81,9 +75,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{BASE_URL}"') - events = await get_events(client) - - urls.update(events) + urls.update(await get_events()) CACHE_FILE.write(urls) diff --git a/M3U8/scrapers/streamhub.py b/M3U8/scrapers/streamhub.py index 9817c8d..7d10f9d 100644 --- a/M3U8/scrapers/streamhub.py +++ b/M3U8/scrapers/streamhub.py @@ -2,7 +2,6 @@ import asyncio from functools import partial from urllib.parse import urljoin -import httpx from playwright.async_api import async_playwright from selectolax.parser import HTMLParser @@ -36,40 +35,24 @@ CATEGORIES = { } -async def get_html_data( - client: httpx.AsyncClient, - date: str, - sport_id: str, -) -> bytes: - - try: - r = await client.get( - urljoin(BASE_URL, f"events/{date}"), - params={"sport_id": sport_id}, - ) - - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{r.url}": {e}') - - return b"" - - return r.content - - async def refresh_html_cache( - client: httpx.AsyncClient, date: str, sport_id: str, ts: float, ) -> dict[str, dict[str, str | float]]: - - html_data = await get_html_data(client, date, sport_id) - - soup = HTMLParser(html_data) - events = {} + if not ( + html_data := await network.request( + urljoin(BASE_URL, f"events/{date}"), + log=log, + params={"sport_id": sport_id}, + ) + ): + return events + + soup = HTMLParser(html_data.content) + for section in soup.css(".events-section"): if not (sport_node := section.css_first(".section-titlte")): continue @@ -111,25 +94,19 @@ async def refresh_html_cache( return events -async def get_events( - client: httpx.AsyncClient, - cached_keys: set[str], -) -> list[dict[str, str]]: +async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: now = Time.clean(Time.now()) if not (events := HTML_CACHE.load()): log.info("Refreshing HTML cache") - dates = [now.date(), now.delta(days=1).date()] - tasks = [ refresh_html_cache( - client, date, sport_id, now.timestamp(), ) - for date in dates + for date in [now.date(), now.delta(days=1).date()] for sport_id in CATEGORIES.values() ] @@ -145,7 +122,7 @@ async def get_events( end_ts = now.delta(minutes=5).timestamp() for k, v in events.items(): - if cached_keys & {k}: + if k in cached_keys: continue if not start_ts <= v["event_ts"] <= end_ts: @@ -156,7 +133,7 @@ async def get_events( return live -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) urls.update(cached_urls) @@ -165,7 +142,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{BASE_URL}"') - events = await get_events(client, set(cached_urls.keys())) + events = await get_events(cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)") diff --git a/M3U8/scrapers/streamsgate.py b/M3U8/scrapers/streamsgate.py index 538b35c..ffa5806 100644 --- a/M3U8/scrapers/streamsgate.py +++ b/M3U8/scrapers/streamsgate.py @@ -4,7 +4,6 @@ from itertools import chain from typing import Any from urllib.parse import urljoin -import httpx from playwright.async_api import async_playwright from .utils import Cache, Time, get_logger, leagues, network @@ -46,32 +45,20 @@ def get_event(t1: str, t2: str) -> str: return f"{t1.strip()} vs {t2.strip()}" -async def get_api_data(client: httpx.AsyncClient, url: str) -> list[dict[str, Any]]: - try: - r = await client.get(url) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{url}": {e}') - - return [] - - return r.json() - - -async def refresh_api_cache( - client: httpx.AsyncClient, - now_ts: float, -) -> list[dict[str, Any]]: +async def refresh_api_cache(now_ts: float) -> list[dict[str, Any]]: log.info("Refreshing API cache") tasks = [ - get_api_data(client, urljoin(BASE_URL, f"data/{sport}.json")) + network.request( + urljoin(BASE_URL, f"data/{sport}.json"), + log=log, + ) for sport in SPORT_ENDPOINTS ] results = await asyncio.gather(*tasks) - if not (data := list(chain(*results))): + if not (data := list(chain.from_iterable(r.json() for r in results if r))): return [] for ev in data: @@ -82,13 +69,11 @@ async def refresh_api_cache( return data -async def get_events( - client: httpx.AsyncClient, cached_keys: set[str] -) -> list[dict[str, str]]: +async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: now = Time.clean(Time.now()) if not (api_data := API_FILE.load(per_entry=False, index=-1)): - api_data = await refresh_api_cache(client, now.timestamp()) + api_data = await refresh_api_cache(now.timestamp()) API_FILE.write(api_data) @@ -104,27 +89,28 @@ async def get_events( t1, t2 = stream_group.get("away"), stream_group.get("home") + event = get_event(t1, t2) + if not (event_ts and sport): continue + if f"[{sport}] {event} ({TAG})" in cached_keys: + continue + + if "F1 Abu Dhabi" in event: # api bug + continue + event_dt = Time.from_ts(event_ts) if not start_dt <= event_dt <= end_dt: continue - event = get_event(t1, t2) - if not (streams := stream_group.get("streams")): continue if not (url := streams[0].get("url")): continue - key = f"[{sport}] {event} ({TAG})" - - if cached_keys & {key}: - continue - events.append( { "sport": sport, @@ -137,7 +123,7 @@ async def get_events( return events -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) urls.update(cached_urls) @@ -146,7 +132,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{BASE_URL}"') - events = await get_events(client, set(cached_urls.keys())) + events = await get_events(cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)") diff --git a/M3U8/scrapers/strmd.py b/M3U8/scrapers/strmd.py index 6fc6645..54bfd6c 100644 --- a/M3U8/scrapers/strmd.py +++ b/M3U8/scrapers/strmd.py @@ -1,9 +1,7 @@ import re from functools import partial -from typing import Any from urllib.parse import urljoin -import httpx from playwright.async_api import async_playwright from .utils import Cache, Time, get_logger, leagues, network @@ -35,52 +33,28 @@ def fix_sport(s: str) -> str: return s.capitalize() if len(s) >= 4 else s.upper() -async def refresh_api_cache( - client: httpx.AsyncClient, - url: str, - now_ts: float, -) -> list[dict[str, Any]]: - - log.info("Refreshing API cache") - - try: - r = await client.get(url) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{url}": {e}') - - return [] - - if not (data := r.json()): - return [] - - data[-1]["timestamp"] = now_ts - - return data - - -async def get_events( - client: httpx.AsyncClient, - url: str, - cached_keys: set[str], -) -> list[dict[str, str]]: - +async def get_events(url: str, cached_keys: list[str]) -> list[dict[str, str]]: now = Time.clean(Time.now()) if not (api_data := API_FILE.load(per_entry=False, index=-1)): - api_data = await refresh_api_cache( - client, + api_data = [] + + if r := await network.request( urljoin(url, "api/matches/all-today"), - now.timestamp(), - ) + log=log, + ): + api_data: list[dict] = r.json() + + api_data[-1]["timestamp"] = now.timestamp() API_FILE.write(api_data) events = [] + pattern = re.compile(r"[\n\r]+|\s{2,}") + start_dt = now.delta(minutes=-30) end_dt = now.delta(minutes=30) - pattern = re.compile(r"[\n\r]+|\s{2,}") for event in api_data: if (category := event.get("category")) == "other": @@ -99,13 +73,12 @@ async def get_events( sport = fix_sport(category) parts = pattern.split(event["title"].strip()) + name = " | ".join(p.strip() for p in parts if p.strip()) logo = urljoin(url, poster) if (poster := event.get("poster")) else None - key = f"[{sport}] {name} ({TAG})" - - if cached_keys & {key}: + if f"[{sport}] {name} ({TAG})" in cached_keys: continue sources: list[dict[str, str]] = event["sources"] @@ -113,7 +86,8 @@ async def get_events( if not sources: continue - skip_types = {"alpha", "bravo"} + skip_types = ["alpha", "bravo"] + valid_sources = [d for d in sources if d.get("source") not in skip_types] if not valid_sources: @@ -122,6 +96,7 @@ async def get_events( srce = valid_sources[0] source_type = srce.get("source") + stream_id = srce.get("id") if not (source_type and stream_id): @@ -140,7 +115,7 @@ async def get_events( return events -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) urls.update(cached_urls) @@ -154,11 +129,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{base_url}"') - events = await get_events( - client, - base_url, - set(cached_urls.keys()), - ) + events = await get_events(base_url, cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)") diff --git a/M3U8/scrapers/tvpass.py b/M3U8/scrapers/tvpass.py index 1e6cb8f..ae46218 100644 --- a/M3U8/scrapers/tvpass.py +++ b/M3U8/scrapers/tvpass.py @@ -1,8 +1,6 @@ import re -import httpx - -from .utils import Cache, Time, get_logger, leagues +from .utils import Cache, Time, get_logger, leagues, network log = get_logger(__name__) @@ -15,24 +13,15 @@ CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=86_400) BASE_URL = "https://tvpass.org/playlist/m3u" -async def get_data(client: httpx.AsyncClient) -> list[str]: - try: - r = await client.get(BASE_URL) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{BASE_URL}": {e}') - - return [] - - return r.text.splitlines() - - -async def get_events(client: httpx.AsyncClient) -> dict[str, dict[str, str | float]]: - now = Time.clean(Time.now()).timestamp() - +async def get_events() -> dict[str, dict[str, str | float]]: events = {} - data = await get_data(client) + if not (r := await network.request(BASE_URL, log=log)): + return events + + now = Time.clean(Time.now()) + + data = r.text.splitlines() for i, line in enumerate(data, start=1): if line.startswith("#EXTINF"): @@ -59,13 +48,13 @@ async def get_events(client: httpx.AsyncClient) -> dict[str, dict[str, str | flo "logo": logo, "id": tvg_id or "Live.Event.us", "base": "https://tvpass.org", - "timestamp": now, + "timestamp": now.timestamp(), } return events -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: if cached := CACHE_FILE.load(): urls.update(cached) log.info(f"Loaded {len(urls)} event(s) from cache") @@ -73,9 +62,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{BASE_URL}"') - events = await get_events(client) - - urls.update(events) + urls.update(await get_events()) CACHE_FILE.write(urls) diff --git a/M3U8/scrapers/utils/webwork.py b/M3U8/scrapers/utils/webwork.py index 1a801c0..6da6ae2 100644 --- a/M3U8/scrapers/utils/webwork.py +++ b/M3U8/scrapers/utils/webwork.py @@ -51,26 +51,35 @@ class Network: else urljoin(base, f"{tag}/{path}") ) - async def check_status(self, url: str) -> bool: + async def request( + self, + url: str, + log: logging.Logger | None = None, + **kwargs, + ) -> httpx.Response | None: + + log = log or self._logger + try: - r = await self.client.get(url, timeout=5) + r = await self.client.get(url, **kwargs) r.raise_for_status() - return r.status_code == 200 - except (httpx.HTTPError, httpx.TimeoutException) as e: - self._logger.debug(f"Status check failed for {url}: {e}") - return False + except Exception as e: + log.error(f'Failed to fetch "{url}": {e}\n{kwargs = }') + return "" + + return r async def get_base(self, mirrors: list[str]) -> str | None: random.shuffle(mirrors) - tasks = [self.check_status(link) for link in mirrors] - results = await asyncio.gather(*tasks, return_exceptions=True) + for mirror in mirrors: + if not (r := await self.request(mirror)): + continue - working_mirrors = [ - mirror for mirror, success in zip(mirrors, results) if success - ] + elif r.status_code != 200: + continue - return working_mirrors[0] if working_mirrors else None + return mirror @staticmethod async def safe_process( @@ -80,8 +89,7 @@ class Network: log: logging.Logger | None = None, ) -> T | None: - if not log: - log = logging.getLogger(__name__) + log = log or get_logger("network") task = asyncio.create_task(fn()) @@ -133,6 +141,8 @@ class Network: log: logging.Logger | None = None, ) -> str | None: + log = log or self._logger + page = await context.new_page() captured: list[str] = [] diff --git a/M3U8/scrapers/watchfooty.py b/M3U8/scrapers/watchfooty.py index e15a41e..dbbbcb4 100644 --- a/M3U8/scrapers/watchfooty.py +++ b/M3U8/scrapers/watchfooty.py @@ -5,7 +5,6 @@ from itertools import chain from typing import Any from urllib.parse import urljoin -import httpx from playwright.async_api import BrowserContext, async_playwright from .utils import Cache, Time, get_logger, leagues, network @@ -42,37 +41,27 @@ SPORT_ENDPOINTS = [ ] -async def get_api_data(client: httpx.AsyncClient, url: str) -> list[dict[str, Any]]: - try: - r = await client.get(url, timeout=5) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{url}": {e}') - - return [] - - return r.json() - - -async def refresh_api_cache( - client: httpx.AsyncClient, url: str -) -> list[dict[str, Any]]: +async def refresh_api_cache(url: str, now_ts: float) -> list[dict[str, Any]]: log.info("Refreshing API cache") tasks = [ - get_api_data(client, urljoin(url, f"api/v1/matches/{sport}")) + network.request( + urljoin(url, f"api/v1/matches/{sport}"), + log=log, + timeout=5, + ) for sport in SPORT_ENDPOINTS ] results = await asyncio.gather(*tasks) - if not (data := list(chain(*results))): + if not (data := list(chain.from_iterable(r.json() for r in results if r))): return [] for ev in data: ev["ts"] = ev.pop("timestamp") - data[-1]["timestamp"] = Time.clean(Time.now()).timestamp() + data[-1]["timestamp"] = now_ts return data @@ -163,33 +152,40 @@ async def process_event( async def get_events( - client: httpx.AsyncClient, - api_url: str, base_url: str, - cached_keys: set[str], + api_url: str, + cached_keys: list[str], ) -> list[dict[str, str]]: + now = Time.clean(Time.now()) + if not (api_data := API_FILE.load(per_entry=False, index=-1)): - api_data = await refresh_api_cache(client, api_url) + api_data = await refresh_api_cache(api_url, now.timestamp()) API_FILE.write(api_data) events = [] - now = Time.clean(Time.now()) + pattern = re.compile(r"\-+|\(") + start_dt = now.delta(minutes=-30) end_dt = now.delta(minutes=5) - pattern = re.compile(r"\-+|\(") - for event in api_data: match_id = event.get("matchId") + name = event.get("title") + league = event.get("league") if not (match_id and name and league): continue + sport = pattern.split(league, 1)[0].strip() + + if f"[{sport}] {name} ({TAG})" in cached_keys: + continue + if not (ts := event.get("ts")): continue @@ -200,15 +196,8 @@ async def get_events( if not start_dt <= event_dt <= end_dt: continue - sport = pattern.split(league, 1)[0].strip() - logo = urljoin(api_url, poster) if (poster := event.get("poster")) else None - key = f"[{sport}] {name} ({TAG})" - - if cached_keys & {key}: - continue - events.append( { "sport": sport, @@ -222,7 +211,7 @@ async def get_events( return events -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} valid_count = cached_count = len(valid_urls) @@ -242,10 +231,9 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{base_url}"') events = await get_events( - client, - api_url, base_url, - set(cached_urls.keys()), + api_url, + cached_urls.keys(), ) log.info(f"Processing {len(events)} new URL(s)") diff --git a/M3U8/scrapers/webcast.py b/M3U8/scrapers/webcast.py index 6343ea3..40d9d7e 100644 --- a/M3U8/scrapers/webcast.py +++ b/M3U8/scrapers/webcast.py @@ -1,7 +1,6 @@ import asyncio from functools import partial -import httpx from playwright.async_api import async_playwright from selectolax.parser import HTMLParser @@ -24,22 +23,15 @@ def fix_event(s: str) -> str: return " vs ".join(s.split("@")) -async def refresh_html_cache( - client: httpx.AsyncClient, url: str -) -> dict[str, dict[str, str | float]]: - try: - r = await client.get(url) - r.raise_for_status() - except Exception as e: - log.error(f'Failed to fetch "{url}": {e}') +async def refresh_html_cache(url: str) -> dict[str, dict[str, str | float]]: + events = {} - return {} + if not (html_data := await network.request(url, log=log)): + return events now = Time.clean(Time.now()) - soup = HTMLParser(r.content) - - events = {} + soup = HTMLParser(html_data.content) title = soup.css_first("title").text(strip=True) @@ -87,15 +79,13 @@ async def refresh_html_cache( return events -async def get_events( - client: httpx.AsyncClient, cached_keys: set[str] -) -> list[dict[str, str]]: +async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: now = Time.clean(Time.now()) if not (events := HTML_CACHE.load()): log.info("Refreshing HTML cache") - tasks = [refresh_html_cache(client, url) for url in BASE_URLS.values()] + tasks = [refresh_html_cache(url) for url in BASE_URLS.values()] results = await asyncio.gather(*tasks) @@ -109,7 +99,7 @@ async def get_events( end_ts = now.delta(minutes=30).timestamp() for k, v in events.items(): - if cached_keys & {k}: + if k in cached_keys: continue if not start_ts <= v["event_ts"] <= end_ts: @@ -120,7 +110,7 @@ async def get_events( return live -async def scrape(client: httpx.AsyncClient) -> None: +async def scrape() -> None: cached_urls = CACHE_FILE.load() cached_count = len(cached_urls) urls.update(cached_urls) @@ -129,7 +119,7 @@ async def scrape(client: httpx.AsyncClient) -> None: log.info(f'Scraping from "{' & '.join(BASE_URLS.values())}"') - events = await get_events(client, set(cached_urls.keys())) + events = await get_events(cached_urls.keys()) log.info(f"Processing {len(events)} new URL(s)")