From 00000d9ef768e9211d58e4e92bbf4359b3040b33 Mon Sep 17 00:00:00 2001 From: doms9 <96013514+doms9@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:42:28 -0400 Subject: [PATCH] e - change caching process for select sites --- M3U8/scrapers/cdnlivetv.py | 18 ++---- M3U8/scrapers/ovogoal.py | 26 +++----- M3U8/scrapers/pawa.py | 26 +++----- M3U8/scrapers/roxie.py | 26 +++----- M3U8/scrapers/shark.py | 116 ++++++++++++++-------------------- M3U8/scrapers/streamcenter.py | 37 +++++------ M3U8/scrapers/streamtpnew.py | 26 +++----- M3U8/scrapers/tvapp.py | 45 +++++++------ M3U8/scrapers/webcast.py | 30 +++------ 9 files changed, 138 insertions(+), 212 deletions(-) diff --git a/M3U8/scrapers/cdnlivetv.py b/M3U8/scrapers/cdnlivetv.py index d4182c5c..41a8d419 100644 --- a/M3U8/scrapers/cdnlivetv.py +++ b/M3U8/scrapers/cdnlivetv.py @@ -30,8 +30,11 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: if r := await network.request( urljoin(API_URL, "api/v1/events/sports"), + params={ + "user": "cdnlivetv", + "plan": "free", + }, log=log, - params={"user": "cdnlivetv", "plan": "free"}, ): api_data = r.json().get("cdn-live-tv") @@ -68,22 +71,11 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: event_links: list[str] = [channel["url"] for channel in channels] - # if not ( - # link := ( - # event_links[0] - # if len(event_links) == 1 - # else await network.get_base(event_links) - # ) - # ): - # continue - - link = event_links[0] - events.append( { "sport": league, "event": name, - "link": link, + "link": event_links[0], "timestamp": event_dt.timestamp(), } ) diff --git a/M3U8/scrapers/ovogoal.py b/M3U8/scrapers/ovogoal.py index beeacbf6..4f419e14 100644 --- a/M3U8/scrapers/ovogoal.py +++ b/M3U8/scrapers/ovogoal.py @@ -53,7 +53,7 @@ async def process_event(url: str, url_num: int) -> tuple[str | None, str | None] return match[3], iframe_src -async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: +async def get_events() -> list[dict[str, str]]: events = [] if not (html_data := await network.request(BASE_URL, log=log)): @@ -76,9 +76,6 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: event_name = event_name_elem.text(strip=True) - if f"[{sport}] {event_name} ({TAG})" in cached_keys: - continue - events.append( { "sport": sport, @@ -91,20 +88,17 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: async def scrape() -> None: - cached_urls = CACHE_FILE.load() + if cached_urls := CACHE_FILE.load(): + urls.update(cached_urls) - valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} + log.info(f"Loaded {len(urls)} event(s) from cache") - valid_count = cached_count = len(valid_urls) - - urls.update(valid_urls) - - log.info(f"Loaded {cached_count} event(s) from cache") + return log.info(f'Scraping from "{BASE_URL}"') - if events := await get_events(cached_urls.keys()): - log.info(f"Processing {len(events)} new URL(s)") + if events := await get_events(): + log.info(f"Processing {len(events)} URL(s)") now = Time.clean(Time.now()) @@ -140,13 +134,11 @@ async def scrape() -> None: cached_urls[key] = entry if url: - valid_count += 1 - urls[key] = entry - log.info(f"Collected and cached {valid_count - cached_count} new event(s)") + log.info(f"Collected and cached {len(urls)} event(s)") else: - log.info("No new events found") + log.info("No events found") CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/pawa.py b/M3U8/scrapers/pawa.py index ddcce199..a4cf6f96 100644 --- a/M3U8/scrapers/pawa.py +++ b/M3U8/scrapers/pawa.py @@ -55,7 +55,7 @@ async def process_event(url: str, url_num: int) -> str | None: return m3u.split("&remote")[0] -async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: +async def get_events() -> list[dict[str, str]]: events = [] if not (html_data := await network.request(BASE_URL, log=log)): @@ -74,9 +74,6 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: title = title.replace(" v ", " vs ") - if f"[{sport}] {title} ({TAG})" in cached_keys: - continue - events.append( { "sport": sport, @@ -89,20 +86,17 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: async def scrape() -> None: - cached_urls = CACHE_FILE.load() + if cached_urls := CACHE_FILE.load(): + urls.update(cached_urls) - valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} + log.info(f"Loaded {len(urls)} event(s) from cache") - valid_count = cached_count = len(valid_urls) - - urls.update(valid_urls) - - log.info(f"Loaded {cached_count} event(s) from cache") + return log.info(f'Scraping from "{BASE_URL}"') - if events := await get_events(cached_urls.keys()): - log.info(f"Processing {len(events)} new URL(s)") + if events := await get_events(): + log.info(f"Processing {len(events)} URL(s)") now = Time.clean(Time.now()) @@ -138,13 +132,11 @@ async def scrape() -> None: cached_urls[key] = entry if url: - valid_count += 1 - urls[key] = entry - log.info(f"Collected and cached {valid_count - cached_count} new event(s)") + log.info(f"Collected and cached {len(urls)} event(s)") else: - log.info("No new events found") + log.info("No events found") CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/roxie.py b/M3U8/scrapers/roxie.py index 51aa23dc..7851a305 100644 --- a/M3U8/scrapers/roxie.py +++ b/M3U8/scrapers/roxie.py @@ -78,7 +78,7 @@ async def process_event( return -async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: +async def get_events() -> list[dict[str, str]]: tasks = [network.request(url, log=log) for url in SPORT_URLS.values()] results = await asyncio.gather(*tasks) @@ -102,9 +102,6 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: if not (href := a_tag.attributes.get("href")): continue - if f"[{sport}] {event} ({TAG})" in cached_keys: - continue - events.append( { "sport": sport, @@ -117,20 +114,17 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: async def scrape(browser: Browser) -> None: - cached_urls = CACHE_FILE.load() + if cached_urls := CACHE_FILE.load(): + urls.update(cached_urls) - valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} + log.info(f"Loaded {len(urls)} event(s) from cache") - valid_count = cached_count = len(valid_urls) - - urls.update(valid_urls) - - log.info(f"Loaded {cached_count} event(s) from cache") + return log.info(f'Scraping from "{BASE_URL}"') - if events := await get_events(cached_urls.keys()): - log.info(f"Processing {len(events)} new URL(s)") + if events := await get_events(): + log.info(f"Processing {len(events)} URL(s)") now = Time.clean(Time.now()) @@ -169,13 +163,11 @@ async def scrape(browser: Browser) -> None: cached_urls[key] = entry if url: - valid_count += 1 - urls[key] = entry - log.info(f"Collected and cached {valid_count - cached_count} new event(s)") + log.info(f"Collected and cached {len(urls)} event(s)") else: - log.info("No new events found") + log.info("No events found") CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/shark.py b/M3U8/scrapers/shark.py index a0985383..4bbfff09 100644 --- a/M3U8/scrapers/shark.py +++ b/M3U8/scrapers/shark.py @@ -11,9 +11,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "SHARK" -CACHE_FILE = Cache(TAG, exp=10_800) - -HTML_FILE = Cache(f"{TAG}-html", exp=19_800) +CACHE_FILE = Cache(TAG, exp=19_800) BASE_URL = "https://sharkstreams.net" @@ -38,8 +36,10 @@ async def process_event(url: str, url_num: int) -> str | None: return pattern.sub(r"chunks.m3u8", urls[0]) -async def refresh_html_cache(now_ts: float) -> dict[str, dict[str, str | float]]: - events = {} +async def get_events() -> dict[str, dict[str, str | float]]: + now = Time.clean(Time.now()) + + events = [] if not (html_data := await network.request(BASE_URL, log=log)): return events @@ -59,6 +59,9 @@ async def refresh_html_cache(now_ts: float) -> dict[str, dict[str, str | float]] event_dt = Time.from_str(date_node.text(strip=True), timezone="EST") + if event_dt.date() != now.date(): + continue + sport = sport_node.text(strip=True) event_name = name_node.text(strip=True) @@ -73,59 +76,30 @@ async def refresh_html_cache(now_ts: float) -> dict[str, dict[str, str | float]] link = match[1].replace("player.php", "get-stream.php") - key = f"[{sport}] {event_name} ({TAG})" - - events[key] = { - "sport": sport, - "event": event_name, - "link": link, - "event_ts": event_dt.timestamp(), - "timestamp": now_ts, - } + events.append( + { + "sport": sport, + "event": event_name, + "link": link, + "timestamp": now.timestamp(), + } + ) return events -async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: - now = Time.clean(Time.now()) - - if not (events := HTML_FILE.load()): - log.info("Refreshing HTML cache") - - events = await refresh_html_cache(now.timestamp()) - - HTML_FILE.write(events) - - live = [] - - start_ts = now.delta(hours=-1).timestamp() - end_ts = now.delta(minutes=10).timestamp() - - for k, v in events.items(): - if k in cached_keys: - continue - - if not start_ts <= v["event_ts"] <= end_ts: - continue - - live.append(v) - - return live - - async def scrape() -> None: - cached_urls = CACHE_FILE.load() + if cached_urls := CACHE_FILE.load(): + urls.update(cached_urls) - cached_count = len(cached_urls) + log.info(f"Loaded {len(urls)} event(s) from cache") - urls.update(cached_urls) - - log.info(f"Loaded {cached_count} event(s) from cache") + return log.info(f'Scraping from "{BASE_URL}"') - if events := await get_events(cached_urls.keys()): - log.info(f"Processing {len(events)} new URL(s)") + if events := await get_events(): + log.info(f"Processing {len(events)} URL(s)") for i, ev in enumerate(events, start=1): handler = partial( @@ -141,31 +115,33 @@ async def scrape() -> None: log=log, ) + sport, event, ts = ( + ev["sport"], + ev["event"], + ev["timestamp"], + ) + + tvg_id, logo = leagues.get_tvg_info(sport, event) + + key = f"[{sport}] {event} ({TAG})" + + entry = { + "url": url, + "logo": logo, + "base": BASE_URL, + "timestamp": ts, + "id": tvg_id or "Live.Event.us", + "link": link, + } + + cached_urls[key] = entry + if url: - sport, event, ts = ( - ev["sport"], - ev["event"], - ev["event_ts"], - ) + urls[key] = entry - tvg_id, logo = leagues.get_tvg_info(sport, event) - - key = f"[{sport}] {event} ({TAG})" - - entry = { - "url": url, - "logo": logo, - "base": BASE_URL, - "timestamp": ts, - "id": tvg_id or "Live.Event.us", - "link": link, - } - - urls[key] = cached_urls[key] = entry - - log.info(f"Collected and cached {len(cached_urls) - cached_count} new event(s)") + log.info(f"Collected and cached {len(urls)} event(s)") else: - log.info("No new events found") + log.info("No events found") CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/streamcenter.py b/M3U8/scrapers/streamcenter.py index 6bd83498..3fd6a946 100644 --- a/M3U8/scrapers/streamcenter.py +++ b/M3U8/scrapers/streamcenter.py @@ -46,7 +46,7 @@ async def process_event(url: str, url_num: int) -> str | None: return f"https://mainstreams.pro/hls/{iframe_src.rsplit("=", 1)[-1]}.m3u8" -async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: +async def get_events() -> list[dict[str, str]]: now = Time.clean(Time.now()) events = [] @@ -82,14 +82,12 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: if not (sport := CATEGORIES.get(category_id)): continue - if f"[{sport}] {name} ({TAG})" in cached_keys: - continue - events.append( { "sport": sport, "event": name, "link": iframe.split("<")[0], + "timestamp": now.timestamp(), } ) @@ -97,22 +95,17 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: async def scrape() -> None: - cached_urls = CACHE_FILE.load() + if cached_urls := CACHE_FILE.load(): + urls.update(cached_urls) - valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} + log.info(f"Loaded {len(urls)} event(s) from cache") - valid_count = cached_count = len(valid_urls) - - urls.update(valid_urls) - - log.info(f"Loaded {cached_count} event(s) from cache") + return log.info('Scraping from "https://streamcenter.xyz"') - if events := await get_events(cached_urls.keys()): - log.info(f"Processing {len(events)} new URL(s)") - - now = Time.clean(Time.now()) + if events := await get_events(): + log.info(f"Processing {len(events)} URL(s)") for i, ev in enumerate(events, start=1): handler = partial( @@ -128,7 +121,11 @@ async def scrape() -> None: log=log, ) - sport, event = ev["sport"], ev["event"] + sport, event, ts = ( + ev["sport"], + ev["event"], + ev["timestamp"], + ) key = f"[{sport}] {event} ({TAG})" @@ -138,7 +135,7 @@ async def scrape() -> None: "url": url, "logo": logo, "base": "https://streamcenter.xyz", - "timestamp": now.timestamp(), + "timestamp": ts, "id": tvg_id or "Live.Event.us", "link": link, } @@ -146,13 +143,11 @@ async def scrape() -> None: cached_urls[key] = entry if url: - valid_count += 1 - urls[key] = entry - log.info(f"Collected and cached {valid_count - cached_count} new event(s)") + log.info(f"Collected and cached {len(urls)} event(s)") else: - log.info("No new events found") + log.info("No events found") CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/streamtpnew.py b/M3U8/scrapers/streamtpnew.py index e6dfae86..f5da675f 100644 --- a/M3U8/scrapers/streamtpnew.py +++ b/M3U8/scrapers/streamtpnew.py @@ -50,7 +50,7 @@ async def process_event(url: str, url_num: int) -> str | None: return m3u8.split("ip=")[0] -async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: +async def get_events() -> list[dict[str, str]]: events = [] if not (api_req := await network.request(API_URL, log=log)): @@ -70,9 +70,6 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: if (sport := event.get("category")) and sport == "Other": sport = "Live Event" - if f"[{sport}] {name} ({TAG})" in cached_keys: - continue - events.append( { "sport": sport, @@ -85,20 +82,17 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: async def scrape() -> None: - cached_urls = CACHE_FILE.load() + if cached_urls := CACHE_FILE.load(): + urls.update(cached_urls) - valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} + log.info(f"Loaded {len(urls)} event(s) from cache") - valid_count = cached_count = len(valid_urls) - - urls.update(valid_urls) - - log.info(f"Loaded {cached_count} event(s) from cache") + return log.info('Scraping from "https://streamtpnew.com"') - if events := await get_events(cached_urls.keys()): - log.info(f"Processing {len(events)} new URL(s)") + if events := await get_events(): + log.info(f"Processing {len(events)} URL(s)") now = Time.clean(Time.now()) @@ -134,13 +128,11 @@ async def scrape() -> None: cached_urls[key] = entry if url: - valid_count += 1 - urls[key] = entry - log.info(f"Collected and cached {valid_count - cached_count} new event(s)") + log.info(f"Collected and cached {len(urls)} event(s)") else: - log.info("No new events found") + log.info("No events found") CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/tvapp.py b/M3U8/scrapers/tvapp.py index e541991a..f89a6b82 100644 --- a/M3U8/scrapers/tvapp.py +++ b/M3U8/scrapers/tvapp.py @@ -74,8 +74,8 @@ async def get_events() -> list[dict[str, str]]: async def scrape() -> None: - if cached := CACHE_FILE.load(): - urls.update(cached) + if cached_urls := CACHE_FILE.load(): + urls.update(cached_urls) log.info(f"Loaded {len(urls)} event(s) from cache") @@ -84,7 +84,7 @@ async def scrape() -> None: log.info(f'Scraping from "{BASE_URL}"') if events := await get_events(): - log.info(f"Processing {len(events)} new URL(s)") + log.info(f"Processing {len(events)} URL(s)") now = Time.clean(Time.now()) @@ -102,24 +102,29 @@ async def scrape() -> None: log=log, ) + sport, event = ev["sport"], ev["event"] + + key = f"[{sport}] {event} ({TAG})" + + tvg_id, logo = leagues.get_tvg_info(sport, event) + + entry = { + "url": url, + "logo": logo, + "base": BASE_URL, + "timestamp": now.timestamp(), + "id": tvg_id or "Live.Event.us", + "link": link, + } + + cached_urls[key] = entry + if url: - sport, event = ev["sport"], ev["event"] - - key = f"[{sport}] {event} ({TAG})" - - tvg_id, logo = leagues.get_tvg_info(sport, event) - - entry = { - "url": url, - "logo": logo, - "base": BASE_URL, - "timestamp": now.timestamp(), - "id": tvg_id or "Live.Event.us", - "link": link, - } - urls[key] = entry - log.info(f"Collected and cached {len(urls)} new event(s)") + log.info(f"Collected and cached {len(urls)} event(s)") - CACHE_FILE.write(urls) + else: + log.info("No events found") + + CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/webcast.py b/M3U8/scrapers/webcast.py index 95d6f63f..df5ef690 100644 --- a/M3U8/scrapers/webcast.py +++ b/M3U8/scrapers/webcast.py @@ -91,7 +91,7 @@ async def process_event( return data.get("url") -async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: +async def get_events() -> list[dict[str, str]]: tasks = [network.request(url, log=log) for url in BASE_URLS.values()] results = await asyncio.gather(*tasks) @@ -120,15 +120,10 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: if not (href := vs_node.attributes.get("href")): continue - event = fix_event(event_name) - - if f"[{sport}] {event} ({TAG})" in cached_keys: - continue - events.append( { "sport": sport, - "event": event, + "event": fix_event(event_name), "link": href, } ) @@ -137,20 +132,17 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: async def scrape() -> None: - cached_urls = CACHE_FILE.load() + if cached_urls := CACHE_FILE.load(): + urls.update(cached_urls) - valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} + log.info(f"Loaded {len(urls)} event(s) from cache") - valid_count = cached_count = len(valid_urls) - - urls.update(valid_urls) - - log.info(f"Loaded {cached_count} event(s) from cache") + return log.info(f'Scraping from "{' & '.join(BASE_URLS.values())}"') - if events := await get_events(cached_urls.keys()): - log.info(f"Processing {len(events)} new URL(s)") + if events := await get_events(): + log.info(f"Processing {len(events)} URL(s)") now = Time.clean(Time.now()) @@ -187,13 +179,11 @@ async def scrape() -> None: cached_urls[key] = entry if url: - valid_count += 1 - urls[key] = entry - log.info(f"Collected and cached {valid_count - cached_count} new event(s)") + log.info(f"Collected and cached {len(urls)} event(s)") else: - log.info("No new events found") + log.info("No events found") CACHE_FILE.write(cached_urls)