diff --git a/M3U8/fetch.py b/M3U8/fetch.py index c555ee21..d332d533 100644 --- a/M3U8/fetch.py +++ b/M3U8/fetch.py @@ -12,7 +12,6 @@ from scrapers import ( istreameast, mainportal, ovogoal, - pawa, roxie, shark, streamcenter, @@ -23,7 +22,6 @@ from scrapers import ( tvapp, watchfooty, webcast, - xstreameast, ) from scrapers.utils import get_logger, network @@ -71,7 +69,6 @@ async def main() -> None: asyncio.create_task(istreameast.scrape()), asyncio.create_task(mainportal.scrape()), asyncio.create_task(ovogoal.scrape()), - # asyncio.create_task(pawa.scrape()), asyncio.create_task(shark.scrape()), asyncio.create_task(streamcenter.scrape()), asyncio.create_task(streamsgate.scrape()), @@ -79,7 +76,6 @@ async def main() -> None: asyncio.create_task(totalsportek.scrape()), asyncio.create_task(tvapp.scrape()), asyncio.create_task(webcast.scrape()), - # asyncio.create_task(xstreameast.scrape()), ] await asyncio.gather(*(pw_tasks + httpx_tasks)) @@ -103,7 +99,6 @@ async def main() -> None: | istreameast.urls | mainportal.urls | ovogoal.urls - | pawa.urls | roxie.urls | shark.urls | streamcenter.urls @@ -114,7 +109,6 @@ async def main() -> None: | tvapp.urls | watchfooty.urls | webcast.urls - | xstreameast.urls ) live_events: list[str] = [] diff --git a/M3U8/scrapers/pawa.py b/M3U8/scrapers/pawa.py deleted file mode 100644 index 5af3d64c..00000000 --- a/M3U8/scrapers/pawa.py +++ /dev/null @@ -1,142 +0,0 @@ -import base64 -import re -from functools import partial - -import feedparser -from selectolax.parser import HTMLParser - -from .utils import Cache, Time, get_logger, leagues, network - -log = get_logger(__name__) - -urls: dict[str, dict[str, str | float]] = {} - -TAG = "PAWA" - -CACHE_FILE = Cache(TAG, exp=19_800) - -BASE_URL = "https://pawastreams.net/feed/" - - -async def process_event(url: str, url_num: int) -> str | None: - if not (event_data := await network.request(url, log=log)): - log.warning(f"URL {url_num}) Failed to load url.") - - return - - soup = HTMLParser(event_data.content) - - if not (iframe := soup.css_first("iframe")): - log.warning(f"URL {url_num}) No iframe element found.") - - return - - if not (iframe_src := iframe.attributes.get("src")): - log.warning(f"URL {url_num}) No iframe source found.") - - return - - if not (iframe_src_data := await network.request(iframe_src, log=log)): - log.warning(f"URL {url_num}) Failed to load iframe source.") - - return - - pattern = re.compile(r"source:\s*window\.atob\(\s*'([^']+)'\s*\)", re.I) - - if not (match := pattern.search(iframe_src_data.text)): - log.warning(f"URL {url_num}) No Clappr source found.") - - return - - log.info(f"URL {url_num}) Captured M3U8") - - m3u = base64.b64decode(match[1]).decode("utf-8") - - return m3u.split("&remote")[0] - - -async def get_events() -> list[dict[str, str]]: - events = [] - - if not (html_data := await network.request(BASE_URL, log=log)): - return events - - feed = feedparser.parse(html_data.content) - - sport = "Live Event" - - for entry in feed.entries: - if not (link := entry.get("link")): - continue - - if not (title := entry.get("title")): - continue - - title = title.replace(" v ", " vs ") - - events.append( - { - "sport": sport, - "event": title, - "link": link, - } - ) - - return events - - -async def scrape() -> None: - if cached_urls := CACHE_FILE.load(): - urls.update({k: v for k, v in cached_urls.items() if v["url"]}) - - log.info(f"Loaded {len(urls)} event(s) from cache") - - return - - log.info(f'Scraping from "{BASE_URL}"') - - if events := await get_events(): - log.info(f"Processing {len(events)} URL(s)") - - now = Time.clean(Time.now()) - - for i, ev in enumerate(events, start=1): - handler = partial( - process_event, - url=(link := ev["link"]), - url_num=i, - ) - - url = await network.safe_process( - handler, - url_num=i, - semaphore=network.HTTP_S, - log=log, - ) - - sport, event = ev["sport"], ev["event"] - - key = f"[{sport}] {event} ({TAG})" - - tvg_id, logo = leagues.get_tvg_info(sport, event) - - entry = { - "url": url, - "logo": logo, - "base": link, - "timestamp": now.timestamp(), - "id": tvg_id or "Live.Event.us", - "link": link, - } - - cached_urls[key] = entry - - if url: - urls[key] = entry - - log.info(f"Collected and cached {len(urls)} event(s)") - - else: - log.info("No events found") - - CACHE_FILE.write(cached_urls) diff --git a/M3U8/scrapers/streamhub.py b/M3U8/scrapers/streamhub.py index 5476b93d..3a975d6e 100644 --- a/M3U8/scrapers/streamhub.py +++ b/M3U8/scrapers/streamhub.py @@ -97,18 +97,9 @@ async def process_event( pass if captured: - if "smarthard.click" not in (m3u8 := captured[0]).lower(): - log.warning(f"URL {url_num}) Invalid M3U8 found.") - - return - log.info(f"URL {url_num}) Captured M3U8") - return m3u8 - - log.warning(f"URL {url_num}) No M3U8 captured after waiting.") - - return + return captured[0] except Exception as e: log.warning(f"URL {url_num}) {e}") @@ -224,7 +215,7 @@ async def scrape(browser: Browser) -> None: entry = { "url": url, "logo": logo, - "base": "https://hardsmart.click", + "base": "http://streamobs.click/", "timestamp": ts, "id": tvg_id or "Live.Event.us", "link": link, diff --git a/M3U8/scrapers/totalsportek.py b/M3U8/scrapers/totalsportek.py index 64754d80..16612d98 100644 --- a/M3U8/scrapers/totalsportek.py +++ b/M3U8/scrapers/totalsportek.py @@ -15,7 +15,10 @@ TAG = "TSPRTK" CACHE_FILE = Cache(TAG, exp=19_800) -BASE_URL = "https://live3.totalsportek.fyi" +BASES = { + "TSPRTK1": "https://live.totalsportek.fyi", + "TSPRTK3": "https://live3.totalsportek.fyi", +} def fix_txt(s: str) -> str: @@ -24,36 +27,43 @@ def fix_txt(s: str) -> str: return s.upper() if s.islower() else s -async def process_event(url: str, url_num: int) -> str | None: - if not (event_data := await network.request(url, log=log)): - log.warning(f"URL {url_num}) Failed to load url.") +async def process_ts1(ifr_src: str, url_num: int) -> str | None: + if not (ifr_src_data := await network.request(ifr_src, log=log)): + log.info(f"URL {url_num}) Failed to load iframe source.") return - soup_1 = HTMLParser(event_data.content) + valid_m3u8 = re.compile(r'(var|const)\s+(\w+)\s+=\s+"([^"]*)"', re.I) - iframe_1 = soup_1.css_first("iframe") - - if not iframe_1 or not (iframe_1_src := iframe_1.attributes.get("src")): - log.warning(f"URL {url_num}) No iframe element found. (IFR1)") + if not (match := valid_m3u8.search(ifr_src_data.text)): + log.warning(f"URL {url_num}) No Clappr source found.") return - if not (iframe_1_src_data := await network.request(iframe_1_src, log=log)): + if len(encoded := match[2]) < 20: + encoded = match[3] + + log.info(f"URL {url_num}) Captured M3U8") + + return bytes.fromhex(encoded).decode("utf-8") + + +async def process_ts3(ifr_src: str, url_num: int) -> str | None: + if not (ifr_1_src_data := await network.request(ifr_src, log=log)): log.warning(f"URL {url_num}) Failed to load iframe source. (IFR1)") return - soup_2 = HTMLParser(iframe_1_src_data.content) + soup = HTMLParser(ifr_1_src_data.content) - iframe_2 = soup_2.css_first("iframe") + ifr_2 = soup.css_first("iframe") - if not iframe_2 or not (iframe_2_src := iframe_2.attributes.get("src")): + if not ifr_2 or not (ifr_2_src := ifr_2.attributes.get("src")): log.warning(f"URL {url_num}) No iframe element found. (IFR2)") return if not ( - iframe_2_src_data := await network.request( - iframe_2_src, + ifr_2_src_data := await network.request( + ifr_2_src, + headers={"Referer": ifr_src}, log=log, - headers={"Referer": iframe_1_src}, ) ): log.warning(f"URL {url_num}) Failed to load iframe source. (IFR2)") @@ -61,7 +71,7 @@ async def process_event(url: str, url_num: int) -> str | None: valid_m3u8 = re.compile(r'currentStreamUrl\s+=\s+"([^"]*)"', re.I) - if not (match := valid_m3u8.search(iframe_2_src_data.text)): + if not (match := valid_m3u8.search(ifr_2_src_data.text)): log.warning(f"URL {url_num}) No Clappr source found.") return @@ -70,52 +80,86 @@ async def process_event(url: str, url_num: int) -> str | None: return json.loads(f'"{match[1]}"') +async def process_event( + url: str, + url_num: int, + tag: str, +) -> tuple[str | None, str | None]: + + nones = None, None + + if not (event_data := await network.request(url, log=log)): + log.warning(f"URL {url_num}) Failed to load url.") + return nones + + soup = HTMLParser(event_data.content) + + iframe = soup.css_first("iframe") + + if not iframe or not (iframe_src := iframe.attributes.get("src")): + log.warning(f"URL {url_num}) No valid iframe source found.") + return nones + + m3u8 = ( + await process_ts1(iframe_src, url_num) + if tag == "TSPRTK1" + else await process_ts3(iframe_src, url_num) + ) + + return (m3u8, iframe_src) if m3u8 else nones + + async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: events = [] - if not (html_data := await network.request(BASE_URL, log=log)): + if not (html_data := await network.request(BASES["TSPRTK1"], log=log)): return events soup = HTMLParser(html_data.content) sport = "Live Event" - for node in soup.css("a"): - if not node.attributes.get("class"): - continue + for tag, url in BASES.items(): + for node in soup.css("a"): + if not node.attributes.get("class"): + continue - if (parent := node.parent) and "my-1" in parent.attributes.get("class", ""): - if span := node.css_first("span"): - sport = span.text(strip=True) + if (parent := node.parent) and "my-1" in parent.attributes.get("class", ""): + if span := node.css_first("span"): + sport = span.text(strip=True) - sport = fix_txt(sport) + sport = fix_txt(sport) - if not (teams := [t.text(strip=True) for t in node.css(".col-7 .col-12")]): - continue + if not (teams := [t.text(strip=True) for t in node.css(".col-7 .col-12")]): + continue - if not (href := node.attributes.get("href")): - continue + if not (href := node.attributes.get("href")): + continue - href = urlparse(href).path if href.startswith("http") else href + href = urlparse(href).path if href.startswith("http") else href - # if not (time_node := node.css_first(".col-3 span")): - # continue + # if not (time_node := node.css_first(".col-3 span")): + # continue - # if time_node.text(strip=True).lower() != "matchstarted": - # continue + # if time_node.text(strip=True).lower() not in [ + # "matchstarted", + # "1minfrom now", + # ]: + # continue - event_name = fix_txt(" vs ".join(teams)) + event_name = fix_txt(" vs ".join(teams)) - if f"[{sport}] {event_name} ({TAG})" in cached_keys: - continue + if f"[{sport}] {event_name} ({tag})" in cached_keys: + continue - events.append( - { - "sport": sport, - "event": event_name, - "link": urljoin(f"{html_data.url}", href), - } - ) + events.append( + { + "sport": sport, + "event": event_name, + "tag": tag, + "link": urljoin(url, href), + } + ) return events @@ -131,7 +175,7 @@ async def scrape() -> None: log.info(f"Loaded {cached_count} event(s) from cache") - log.info(f'Scraping from "{BASE_URL}"') + log.info('Scraping from "https://live.totalsportek.fyi"') if events := await get_events(cached_urls.keys()): log.info(f"Processing {len(events)} new URL(s)") @@ -143,9 +187,10 @@ async def scrape() -> None: process_event, url=(link := ev["link"]), url_num=i, + tag=(tag := ev["tag"]), ) - url = await network.safe_process( + url, iframe = await network.safe_process( handler, url_num=i, semaphore=network.HTTP_S, @@ -154,14 +199,14 @@ async def scrape() -> None: sport, event = ev["sport"], ev["event"] - key = f"[{sport}] {event} ({TAG})" + key = f"[{sport}] {event} ({tag})" tvg_id, logo = leagues.get_tvg_info(sport, event) entry = { "url": url, "logo": logo, - "base": link, + "base": iframe, "timestamp": now.timestamp(), "id": tvg_id or "Live.Event.us", "link": link, diff --git a/M3U8/scrapers/watchfooty.py b/M3U8/scrapers/watchfooty.py index 1ad3bbd1..7291c343 100644 --- a/M3U8/scrapers/watchfooty.py +++ b/M3U8/scrapers/watchfooty.py @@ -164,9 +164,6 @@ async def process_event( log.info(f"URL {url_num}) Captured M3U8") return captured[0], iframe_url - log.warning(f"URL {url_num}) No M3U8 captured after waiting.") - return nones - except Exception as e: log.warning(f"URL {url_num}) {e}") return nones diff --git a/M3U8/scrapers/xstreameast.py b/M3U8/scrapers/xstreameast.py deleted file mode 100644 index ddb72fac..00000000 --- a/M3U8/scrapers/xstreameast.py +++ /dev/null @@ -1,179 +0,0 @@ -import asyncio -import re -from functools import partial -from urllib.parse import urljoin - -from selectolax.parser import HTMLParser - -from .utils import Cache, Time, get_logger, leagues, network - -log = get_logger(__name__) - -urls: dict[str, dict[str, str | float]] = {} - -TAG = "XSTRMEAST" - -CACHE_FILE = Cache(TAG, exp=10_800) - -BASE_URL = "https://xstreameast.com" - -SPORT_URLS = [ - urljoin(BASE_URL, f"categories/{sport}/") - for sport in [ - # "mlb", - # "mma", - "nba", - # "nfl", - # "nhl", - "soccer", - # "wwe", - ] -] - - -async def process_event(url: str, url_num: int) -> tuple[str | None, str | None]: - nones = None, None - - if not (html_data := await network.request(url, log=log)): - log.warning(f"URL {url_num}) Failed to load url.") - return nones - - soup = HTMLParser(html_data.content) - - iframe = soup.css_first("iframe") - - if not iframe or not (iframe_src := iframe.attributes.get("src")): - log.warning(f"URL {url_num}) No iframe element found.") - return nones - - elif iframe_src == "about:blank": - log.warning(f"URL {url_num}) No iframe element found.") - return nones - - if not (iframe_src_data := await network.request(iframe_src, log=log)): - log.warning(f"URL {url_num}) Failed to load iframe source.") - return nones - - valid_m3u8 = re.compile(r'(var|const)\s+(\w+)\s+=\s+"([^"]*)"', re.I) - - if not (match := valid_m3u8.search(iframe_src_data.text)): - log.warning(f"URL {url_num}) No Clappr source found.") - return nones - - if len(encoded := match[2]) < 20: - encoded = match[3] - - log.info(f"URL {url_num}) Captured M3U8") - - return bytes.fromhex(encoded).decode("utf-8"), iframe_src - - -async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: - tasks = [network.request(url, log=log) for url in SPORT_URLS] - - results = await asyncio.gather(*tasks) - - events = [] - - if not (soups := [HTMLParser(html.content) for html in results if html]): - return events - - sport = "Live Event" - - for soup in soups: - if sport_header := soup.css_first("h1.text-3xl"): - header = sport_header.text(strip=True) - - sport = header.split("Streams")[0].strip() - - for card in soup.css("article.game-card"): - if not (team_elem := card.css_first("h2.text-xl.font-semibold")): - continue - - if not (link_elem := card.css_first("a.stream-button")) or not ( - href := link_elem.attributes.get("href") - ): - continue - - if ( - not (live_badge := card.css_first("span.bg-green-600")) - or live_badge.text(strip=True) != "LIVE" - ): - continue - - event_name = team_elem.text(strip=True) - - if f"[{sport}] {event_name} ({TAG})" in cached_keys: - continue - - events.append( - { - "sport": sport, - "event": event_name, - "link": href, - } - ) - - return events - - -async def scrape() -> None: - cached_urls = CACHE_FILE.load() - - valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} - - valid_count = cached_count = len(valid_urls) - - urls.update(valid_urls) - - log.info(f"Loaded {cached_count} event(s) from cache") - - log.info(f'Scraping from "{BASE_URL}"') - - if events := await get_events(cached_urls.keys()): - log.info(f"Processing {len(events)} new URL(s)") - - now = Time.clean(Time.now()) - - for i, ev in enumerate(events, start=1): - handler = partial( - process_event, - url=(link := ev["link"]), - url_num=i, - ) - - url, iframe = await network.safe_process( - handler, - url_num=i, - semaphore=network.HTTP_S, - log=log, - ) - - sport, event = ev["sport"], ev["event"] - - key = f"[{sport}] {event} ({TAG})" - - tvg_id, logo = leagues.get_tvg_info(sport, event) - - entry = { - "url": url, - "logo": logo, - "base": iframe, - "timestamp": now.timestamp(), - "id": tvg_id or "Live.Event.us", - "link": link, - } - - cached_urls[key] = entry - - if url: - valid_count += 1 - - urls[key] = entry - - log.info(f"Collected and cached {valid_count - cached_count} new event(s)") - - else: - log.info("No new events found") - - CACHE_FILE.write(cached_urls)