diff --git a/M3U8/fetch.py b/M3U8/fetch.py index 74d38f7a..6ffde1d4 100644 --- a/M3U8/fetch.py +++ b/M3U8/fetch.py @@ -23,6 +23,7 @@ from scrapers import ( tvapp, watchfooty, webcast, + xstreameast, ) from scrapers.utils import get_logger, network @@ -78,6 +79,7 @@ async def main() -> None: asyncio.create_task(totalsportek.scrape()), asyncio.create_task(tvapp.scrape()), asyncio.create_task(webcast.scrape()), + asyncio.create_task(xstreameast.scrape()), ] await asyncio.gather(*(pw_tasks + httpx_tasks)) @@ -112,6 +114,7 @@ async def main() -> None: | tvapp.urls | watchfooty.urls | webcast.urls + | xstreameast.urls ) live_events: list[str] = [] diff --git a/M3U8/scrapers/livetvsx.py b/M3U8/scrapers/livetvsx.py index 18fd04fa..e72602ca 100644 --- a/M3U8/scrapers/livetvsx.py +++ b/M3U8/scrapers/livetvsx.py @@ -12,7 +12,7 @@ urls: dict[str, dict[str, str | float]] = {} TAG = "LTVSX" -CACHE_FILE = Cache(TAG, exp=10_800) +CACHE_FILE = Cache(TAG, exp=3_600) BASE_URL = "https://livetv.sx" diff --git a/M3U8/scrapers/totalsportek.py b/M3U8/scrapers/totalsportek.py index 895bfcd2..7a85b54d 100644 --- a/M3U8/scrapers/totalsportek.py +++ b/M3U8/scrapers/totalsportek.py @@ -11,12 +11,11 @@ log = get_logger(__name__) urls: dict[str, dict[str, str | float]] = {} +TAG = "TOTALSPRTK" + CACHE_FILE = Cache("TSPRTK", exp=28_800) -BASES = { - "TSPRTK1": "https://live.totalsportek.fyi", - "TSPRTK3": "https://live3.totalsportek.fyi", -} +BASE_URL = "https://live3.totalsportek.fyi" def fix_txt(s: str) -> str: @@ -25,43 +24,36 @@ def fix_txt(s: str) -> str: return s.upper() if s.islower() else s -async def process_ts1(ifr_src: str, url_num: int) -> str | None: - if not (ifr_src_data := await network.request(ifr_src, log=log)): - log.info(f"URL {url_num}) Failed to load iframe source.") +async def process_event(url: str, url_num: int) -> str | None: + if not (event_data := await network.request(url, log=log)): + log.warning(f"URL {url_num}) Failed to load url.") return - valid_m3u8 = 
re.compile(r'(var|const)\s+(\w+)\s*=\s*"([^"]*)"', re.I) + soup_1 = HTMLParser(event_data.content) - if not (match := valid_m3u8.search(ifr_src_data.text)): - log.warning(f"URL {url_num}) No Clappr source found.") + iframe_1 = soup_1.css_first("iframe") + + if not iframe_1 or not (iframe_1_src := iframe_1.attributes.get("src")): + log.warning(f"URL {url_num}) No iframe element found. (IFR1)") return - if len(encoded := match[2]) < 20: - encoded = match[3] - - log.info(f"URL {url_num}) Captured M3U8") - - return bytes.fromhex(encoded).decode("utf-8") - - -async def process_ts3(ifr_src: str, url_num: int) -> str | None: - if not (ifr_1_src_data := await network.request(ifr_src, log=log)): + if not (iframe_1_src_data := await network.request(iframe_1_src, log=log)): log.warning(f"URL {url_num}) Failed to load iframe source. (IFR1)") return - soup_2 = HTMLParser(ifr_1_src_data.content) + soup_2 = HTMLParser(iframe_1_src_data.content) - ifr_2 = soup_2.css_first("iframe[width='100%']") + iframe_2 = soup_2.css_first("iframe") - if not ifr_2 or not (ifr_2_src := ifr_2.attributes.get("src")): + if not iframe_2 or not (iframe_2_src := iframe_2.attributes.get("src")): log.warning(f"URL {url_num}) No iframe element found. (IFR2)") return if not ( - ifr_2_src_data := await network.request( - ifr_2_src, - headers={"Referer": ifr_src}, + iframe_2_src_data := await network.request( + iframe_2_src, log=log, + headers={"Referer": iframe_1_src}, ) ): log.warning(f"URL {url_num}) Failed to load iframe source. 
(IFR2)") @@ -69,7 +61,7 @@ async def process_ts3(ifr_src: str, url_num: int) -> str | None: valid_m3u8 = re.compile(r'currentStreamUrl\s+=\s+"([^"]*)"', re.I) - if not (match := valid_m3u8.search(ifr_2_src_data.text)): + if not (match := valid_m3u8.search(iframe_2_src_data.text)): log.warning(f"URL {url_num}) No Clappr source found.") return @@ -78,77 +70,52 @@ async def process_ts3(ifr_src: str, url_num: int) -> str | None: return json.loads(f'"{match[1]}"') -async def process_event(url: str, url_num: int, tag: str) -> str | None: - if not (event_data := await network.request(url, log=log)): - log.warning(f"URL {url_num}) Failed to load url.") - return - - soup = HTMLParser(event_data.content) - - iframe = soup.css_first("iframe") - - if not iframe or not (iframe_src := iframe.attributes.get("src")): - log.warning(f"URL {url_num}) No valid iframe source found.") - return - - return ( - await process_ts1(iframe_src, url_num) - if tag == "TSPRTK1" - else await process_ts3(iframe_src, url_num) - ) - - async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: events = [] - if not (html_data := await network.request(BASES["TSPRTK1"], log=log)): + if not (html_data := await network.request(BASE_URL, log=log)): return events soup = HTMLParser(html_data.content) sport = "Live Event" - for tag, url in BASES.items(): - for node in soup.css("a"): - if not node.attributes.get("class"): - continue + for node in soup.css("a"): + if not node.attributes.get("class"): + continue - if (parent := node.parent) and "my-1" in parent.attributes.get("class", ""): - if span := node.css_first("span"): - sport = span.text(strip=True) + if (parent := node.parent) and "my-1" in parent.attributes.get("class", ""): + if span := node.css_first("span"): + sport = span.text(strip=True) - sport = fix_txt(sport) + sport = fix_txt(sport) - if not (teams := [t.text(strip=True) for t in node.css(".col-7 .col-12")]): - continue + if not (teams := [t.text(strip=True) for t in node.css(".col-7 
.col-12")]): + continue - if not (href := node.attributes.get("href")): - continue + if not (href := node.attributes.get("href")): + continue - href = urlparse(href).path if href.startswith("http") else href + href = urlparse(href).path if href.startswith("http") else href - if not (time_node := node.css_first(".col-3 span")): - continue + if not (time_node := node.css_first(".col-3 span")): + continue - if time_node.text(strip=True).lower() not in [ - "matchstarted", - "1minfrom now", - ]: - continue + if time_node.text(strip=True).lower() != "matchstarted": + continue - event_name = fix_txt(" vs ".join(teams)) + event_name = fix_txt(" vs ".join(teams)) - if f"[{sport}] {event_name} ({tag})" in cached_keys: - continue + if f"[{sport}] {event_name} ({TAG})" in cached_keys: + continue - events.append( - { - "sport": sport, - "event": event_name, - "tag": tag, - "link": urljoin(url, href), - } - ) + events.append( + { + "sport": sport, + "event": event_name, + "link": urljoin(f"{html_data.url}", href), + } + ) return events @@ -164,7 +131,7 @@ async def scrape() -> None: log.info(f"Loaded {cached_count} event(s) from cache") - log.info('Scraping from "https://live.totalsportek.fyi"') + log.info(f'Scraping from "{BASE_URL}"') if events := await get_events(cached_urls.keys()): log.info(f"Processing {len(events)} new URL(s)") @@ -176,7 +143,6 @@ async def scrape() -> None: process_event, url=(link := ev["link"]), url_num=i, - tag=(tag := ev["tag"]), ) url = await network.safe_process( @@ -188,7 +154,7 @@ async def scrape() -> None: sport, event = ev["sport"], ev["event"] - key = f"[{sport}] {event} ({tag})" + key = f"[{sport}] {event} ({TAG})" tvg_id, logo = leagues.get_tvg_info(sport, event) diff --git a/M3U8/scrapers/xstreameast.py b/M3U8/scrapers/xstreameast.py new file mode 100644 index 00000000..abbc0986 --- /dev/null +++ b/M3U8/scrapers/xstreameast.py @@ -0,0 +1,179 @@ +import asyncio +import re +from functools import partial +from urllib.parse import urljoin + 
from selectolax.parser import HTMLParser

from .utils import Cache, Time, get_logger, leagues, network

log = get_logger(__name__)

# Entries that resolved to a stream URL, keyed by "[sport] event (TAG)".
# Filled by scrape() and read by callers afterwards.
urls: dict[str, dict[str, str | float]] = {}

TAG = "XSTRMEST"

# NOTE(review): `exp` is presumably a lifetime in seconds (3 hours) - confirm
# against the Cache helper.
CACHE_FILE = Cache(TAG, exp=10_800)

BASE_URL = "https://xstreameast.com"

# Category listing pages to scan; commented-out names are disabled categories.
SPORT_URLS = [
    urljoin(BASE_URL, f"categories/{sport}/")
    for sport in [
        # "mlb",
        # "mma",
        "nba",
        # "nfl",
        # "nhl",
        # "soccer",
        "wwe",
    ]
]

# Sport label used when a category page has no usable header.
DEFAULT_SPORT = "Live Event"

# First `var`/`const` string assignment in the iframe document.
# Compiled once at import time instead of on every process_event() call.
M3U8_VAR_PATTERN = re.compile(r'(var|const)\s+(\w+)\s*=\s*"([^"]*)"', re.I)


async def process_event(url: str, url_num: int) -> tuple[str | None, str | None]:
    """Resolve an event page to its stream URL.

    Fetches the event page, follows the first ``<iframe>`` on it and
    hex-decodes the first ``var``/``const`` string assignment found in the
    iframe document.

    Args:
        url: Event page URL.
        url_num: Position of the event in the current run; used only in log
            messages.

    Returns:
        ``(stream_url, iframe_url)`` on success, ``(None, None)`` on any
        failure (each failure is logged as a warning).
    """
    nones = None, None

    if not (html_data := await network.request(url, log=log)):
        log.warning(f"URL {url_num}) Failed to load url.")
        return nones

    iframe = HTMLParser(html_data.content).css_first("iframe")

    if not iframe or not (iframe_src := iframe.attributes.get("src")):
        log.warning(f"URL {url_num}) No iframe element found.")
        return nones

    # A placeholder iframe carries no stream.
    if iframe_src == "about:blank":
        log.warning(f"URL {url_num}) No iframe element found.")
        return nones

    # Resolve relative and protocol-relative sources against the event page;
    # absolute sources pass through unchanged.
    iframe_src = urljoin(url, iframe_src)

    if not (iframe_src_data := await network.request(iframe_src, log=log)):
        log.warning(f"URL {url_num}) Failed to load iframe source.")
        return nones

    if not (match := M3U8_VAR_PATTERN.search(iframe_src_data.text)):
        log.warning(f"URL {url_num}) No Clappr source found.")
        return nones

    # Use the identifier itself as the payload only when it is long
    # (>= 20 chars); otherwise take the quoted value.
    # NOTE(review): presumably some pages carry the hex blob in the variable
    # name - confirm against live pages.
    if len(encoded := match[2]) < 20:
        encoded = match[3]

    try:
        # bytes.fromhex raises ValueError on non-hex input and
        # UnicodeDecodeError is a ValueError subclass, so one handler
        # covers both decode steps.
        stream_url = bytes.fromhex(encoded).decode("utf-8")
    except ValueError:
        log.warning(f"URL {url_num}) Failed to decode M3U8 source.")
        return nones

    log.info(f"URL {url_num}) Captured M3U8")

    return stream_url, iframe_src


async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
    """Collect live events from every category page that is not cached yet.

    Args:
        cached_keys: Cache keys of the form ``"[sport] event (TAG)"``; events
            whose key is already present are skipped.

    Returns:
        One dict per new live event with the keys ``sport``, ``event`` and
        ``link`` (absolute event page URL). Empty when no page could be
        loaded or nothing is live.
    """
    results = await asyncio.gather(
        *(network.request(url, log=log) for url in SPORT_URLS)
    )

    events: list[dict[str, str]] = []

    for html in results:
        if not html:
            continue

        soup = HTMLParser(html.content)

        # Reset per page so a page without a header does not inherit the
        # sport label of the previously parsed category.
        sport = DEFAULT_SPORT

        if sport_header := soup.css_first("h1.text-3xl"):
            header = sport_header.text(strip=True)

            sport = header.split("Streams")[0].strip() or DEFAULT_SPORT

        for card in soup.css("article.game-card"):
            if not (team_elem := card.css_first("h2.text-xl.font-semibold")):
                continue

            if not (link_elem := card.css_first("a.stream-button")) or not (
                href := link_elem.attributes.get("href")
            ):
                continue

            # Only cards carrying the green "LIVE" badge are kept.
            if (
                not (live_badge := card.css_first("span.bg-green-600"))
                or live_badge.text(strip=True) != "LIVE"
            ):
                continue

            event_name = team_elem.text(strip=True)

            if f"[{sport}] {event_name} ({TAG})" in cached_keys:
                continue

            events.append(
                {
                    "sport": sport,
                    "event": event_name,
                    # Absolute hrefs pass through; relative ones are
                    # resolved against the site root.
                    "link": urljoin(BASE_URL, href),
                }
            )

    return events


async def scrape() -> None:
    """Refresh ``urls`` from the cache plus newly discovered live events.

    Loads the cache, publishes every cached entry that has a URL into
    ``urls``, resolves each new event through ``process_event`` and writes
    the updated cache back. Events that fail to resolve are still cached
    (with ``url`` set to ``None``) but are not published.
    """
    cached_urls = CACHE_FILE.load()

    valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}

    valid_count = cached_count = len(valid_urls)

    urls.update(valid_urls)

    log.info(f"Loaded {cached_count} event(s) from cache")

    log.info(f'Scraping from "{BASE_URL}"')

    if events := await get_events(cached_urls.keys()):
        log.info(f"Processing {len(events)} new URL(s)")

        # Same timestamp for every entry created in this run.
        timestamp = Time.clean(Time.now()).timestamp()

        for i, ev in enumerate(events, start=1):
            handler = partial(
                process_event,
                url=(link := ev["link"]),
                url_num=i,
            )

            result = await network.safe_process(
                handler,
                url_num=i,
                semaphore=network.HTTP_S,
                log=log,
            )

            # NOTE(review): safe_process is not visible here; it presumably
            # returns None instead of the handler's tuple when the handler
            # times out or raises. Guard the unpacking so one failed event
            # cannot abort the run before the cache is written.
            url, iframe = result or (None, None)

            sport, event = ev["sport"], ev["event"]

            key = f"[{sport}] {event} ({TAG})"

            tvg_id, logo = leagues.get_tvg_info(sport, event)

            entry = {
                "url": url,
                "logo": logo,
                "base": iframe,
                "timestamp": timestamp,
                "id": tvg_id or "Live.Event.us",
                "link": link,
            }

            cached_urls[key] = entry

            if url:
                valid_count += 1

                urls[key] = entry

        log.info(f"Collected and cached {valid_count - cached_count} new event(s)")

    else:
        log.info("No new events found")

    CACHE_FILE.write(cached_urls)