diff --git a/M3U8/scrapers/streambtw.py b/M3U8/scrapers/streambtw.py
index bbe5714..1f4f04f 100644
--- a/M3U8/scrapers/streambtw.py
+++ b/M3U8/scrapers/streambtw.py
@@ -15,7 +15,7 @@ TAG = "STRMBTW"
 
 CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=3_600)
 
-BASE_URL = "https://hiteasport.info/"
+BASE_URLS = ["https://hiteasport.info/", "https://streambtw.com/"]
 
 
 def fix_league(s: str) -> str:
@@ -45,10 +45,10 @@ async def process_event(url: str, url_num: int) -> str | None:
     return stream_link
 
 
-async def get_events() -> list[dict[str, str]]:
+async def get_events(url: str) -> list[dict[str, str]]:
     events = []
 
-    if not (html_data := await network.request(BASE_URL, log=log)):
+    if not (html_data := await network.request(url, log=log)):
         return events
 
     soup = HTMLParser(html_data.content)
@@ -72,7 +72,7 @@ async def get_events() -> list[dict[str, str]]:
             {
                 "sport": fix_league(league),
                 "event": name,
-                "link": urljoin(BASE_URL, href),
+                "link": urljoin(url, href),
             }
         )
 
@@ -87,9 +87,16 @@ async def scrape() -> None:
         return
 
-    log.info(f'Scraping from "{BASE_URL}"')
+    if not (base_url := await network.get_base(BASE_URLS)):
+        log.warning("No working StreamBTW mirrors")
 
-    events = await get_events()
+        CACHE_FILE.write(urls)
+
+        return
+
+    log.info(f'Scraping from "{base_url}"')
+
+    events = await get_events(base_url)
 
     log.info(f"Processing {len(events)} new URL(s)")
diff --git a/M3U8/scrapers/webcast.py b/M3U8/scrapers/webcast.py
index 439bbf5..cc3f7f5 100644
--- a/M3U8/scrapers/webcast.py
+++ b/M3U8/scrapers/webcast.py
@@ -1,3 +1,4 @@
+import asyncio
 from functools import partial
 
 from playwright.async_api import async_playwright
@@ -15,23 +16,27 @@ CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
 
 HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=86_400)
 
-BASE_URL = "https://slapstreams.com"
+BASE_URLS = {"NFL": "https://nflwebcast.com", "NHL": "https://slapstreams.com"}
 
 
 def fix_event(s: str) -> str:
     return " vs ".join(s.split("@"))
 
 
-async def refresh_html_cache() -> dict[str, dict[str, str | float]]:
+async def refresh_html_cache(url: str) -> dict[str, dict[str, str | float]]:
     events = {}
 
-    if not (html_data := await network.request(BASE_URL, log=log)):
+    if not (html_data := await network.request(url, log=log)):
         return events
 
     now = Time.clean(Time.now())
 
     soup = HTMLParser(html_data.content)
 
+    title = soup.css_first("title").text(strip=True)
+
+    sport = "NFL" if "NFL" in title else "NHL"
+
     date_text = now.strftime("%B %d, %Y")
 
     if date_row := soup.css_first("tr.mdatetitle"):
@@ -61,10 +66,10 @@ async def refresh_html_cache() -> dict[str, dict[str, str | float]]:
         event = fix_event(event_name)
 
-        key = f"[NHL] {event} ({TAG})"
+        key = f"[{sport}] {event} ({TAG})"
 
         events[key] = {
-            "sport": "NHL",
+            "sport": sport,
             "event": event,
             "link": href,
             "event_ts": event_dt.timestamp(),
@@ -80,7 +85,11 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     if not (events := HTML_CACHE.load()):
         log.info("Refreshing HTML cache")
 
-        events = await refresh_html_cache()
+        tasks = [refresh_html_cache(url) for url in BASE_URLS.values()]
+
+        results = await asyncio.gather(*tasks)
+
+        events = {k: v for data in results for k, v in data.items()}
 
         HTML_CACHE.write(events)
 
@@ -110,7 +119,7 @@ async def scrape() -> None:
     log.info(f"Loaded {cached_count} event(s) from cache")
 
-    log.info(f'Scraping from "{BASE_URL}"')
+    log.info(f'Scraping from "{' & '.join(BASE_URLS.values())}"')
 
     events = await get_events(cached_urls.keys())
 
@@ -152,7 +161,7 @@ async def scrape() -> None:
         entry = {
             "url": url,
             "logo": logo,
-            "base": BASE_URL,
+            "base": BASE_URLS[sport],
             "timestamp": ts,
             "id": tvg_id or "Live.Event.us",
             "link": link,