diff --git a/M3U8/fetch.py b/M3U8/fetch.py
index 50a0d49..73ebd40 100644
--- a/M3U8/fetch.py
+++ b/M3U8/fetch.py
@@ -4,6 +4,7 @@ import re
 from pathlib import Path
 
 from scrapers import (
+    fawa,
     fstv,
     lotus,
     pixel,
@@ -43,6 +44,7 @@ async def main() -> None:
     base_m3u8, tvg_chno = load_base()
 
     tasks = [
+        asyncio.create_task(fawa.scrape(network.client)),
         asyncio.create_task(fstv.scrape(network.client)),
         asyncio.create_task(lotus.scrape(network.client)),
         asyncio.create_task(pixel.scrape(network.client)),
@@ -59,7 +61,8 @@ async def main() -> None:
     await asyncio.gather(*tasks)
 
     additions = (
-        fstv.urls
+        fawa.urls
+        | fstv.urls
         | lotus.urls
         | pixel.urls
         | ppv.urls
diff --git a/M3U8/scrapers/fawa.py b/M3U8/scrapers/fawa.py
new file mode 100644
index 0000000..047ba6f
--- /dev/null
+++ b/M3U8/scrapers/fawa.py
@@ -0,0 +1,160 @@
+import re
+from functools import partial
+from pathlib import Path
+from urllib.parse import quote, urljoin
+
+import httpx
+from selectolax.parser import HTMLParser
+
+from .utils import Cache, Time, get_logger, leagues, network
+
+log = get_logger(__name__)
+
+urls: dict[str, dict[str, str]] = {}
+
+BASE_URL = "http://www.fawanews.sc"
+
+CACHE_FILE = Cache(Path(__file__).parent / "caches" / "fawa.json", exp=10_800)
+
+
+async def process_event(
+    client: httpx.AsyncClient,
+    url: str,
+    url_num: int,
+) -> str | None:
+
+    try:
+        r = await client.get(url)
+        r.raise_for_status()
+    except Exception as e:
+        log.error(f'URL {url_num}) Failed to fetch "{url}": {e}')
+        return
+
+    valid_m3u8 = re.compile(
+        r'var\s+(\w+)\s*=\s*\[["\']?(https?:\/\/[^"\'\s>]+\.m3u8(?:\?[^"\'\s>]*)?)["\']\]?',
+        re.IGNORECASE,
+    )
+
+    if match := valid_m3u8.search(r.text):
+        log.info(f"URL {url_num}) Captured M3U8")
+        return match[2]
+
+    log.info(f"URL {url_num}) No M3U8 found")
+
+
+async def get_events(
+    client: httpx.AsyncClient, cached_hrefs: set[str]
+) -> list[dict[str, str]]:
+    try:
+        r = await client.get(BASE_URL)
+        r.raise_for_status()
+    except Exception as e:
+        log.error(f'Failed to fetch "{BASE_URL}": {e}')
+
+        return []
+
+    soup = HTMLParser(r.text)
+
+    now = Time.clean(Time.now())
+    start_ts = now.delta(minutes=-30)
+    end_ts = now.delta(minutes=30)
+    pattern = re.compile(r"\d{1,2}:\d{1,2}")
+
+    events = []
+
+    for item in soup.css(".user-item"):
+        text = item.css_first(".user-item__name")
+        subtext = item.css_first(".user-item__playing")
+        link = item.css_first("a[href]")
+
+        if not (href := link.attributes.get("href")):
+            continue
+
+        href = quote(href)
+
+        if cached_hrefs & {href}:
+            continue
+
+        if not (text and subtext):
+            continue
+
+        event_name, details = text.text(strip=True), subtext.text(strip=True)
+
+        if not (match := pattern.search(details)):
+            continue
+
+        sport = pattern.split(details)[0].strip()
+
+        event_time = Time.from_str(f"{now.date()} {match[0]} CET", "%Y-%m-%d %H:%M")
+
+        if not start_ts <= event_time <= end_ts:
+            continue
+
+        events.append(
+            {
+                "sport": sport,
+                "event": event_name,
+                "link": urljoin("http://www.fawanews.sc", href),
+                "href": href,
+            }
+        )
+
+    return events
+
+
+async def scrape(client: httpx.AsyncClient) -> None:
+    cached_urls = CACHE_FILE.load()
+    cached_hrefs = {entry["href"] for entry in cached_urls.values()}
+    cached_count = len(cached_urls)
+    urls.update(cached_urls)
+
+    log.info(f"Loaded {cached_count} event(s) from cache")
+
+    log.info(f'Scraping from "{BASE_URL}"')
+
+    events = await get_events(client, cached_hrefs)
+
+    log.info(f"Processing {len(events)} new URL(s)")
+
+    if events:
+        now = Time.now().timestamp()
+
+        for i, ev in enumerate(events, start=1):
+            handler = partial(
+                process_event,
+                client=client,
+                url=ev["link"],
+                url_num=i,
+            )
+
+            url = await network.safe_process(
+                handler,
+                url_num=i,
+                log=log,
+                timeout=10,
+            )
+
+            if url:
+                sport, event = ev["sport"], ev["event"]
+
+                key = f"[{sport}] {event} (FAWA)"
+
+                tvg_id, logo = leagues.get_tvg_info(sport, event)
+
+                entry = {
+                    "url": url,
+                    "logo": logo,
+                    "base": BASE_URL,
+                    "timestamp": now,
+                    "id": tvg_id or "Live.Event.us",
+                    "href": ev["href"],
+                }
+
+                urls[key] = cached_urls[key] = entry
+
+    if new_count := len(cached_urls) - cached_count:
+        log.info(f"Collected and cached {new_count} new event(s)")
+    else:
+        log.info("No new events found")
+
+    CACHE_FILE.write(cached_urls)
diff --git a/M3U8/scrapers/old/ace.py b/M3U8/scrapers/old/ace.py
deleted file mode 100644
index ae7e7c1..0000000
--- a/M3U8/scrapers/old/ace.py
+++ /dev/null
@@ -1,127 +0,0 @@
-import asyncio
-import re
-from urllib.parse import urljoin
-
-import httpx
-from selectolax.parser import HTMLParser, Node
-
-from .utils import get_base, get_logger, leagues
-
-log = get_logger(__name__)
-
-urls: dict[str, dict[str, str]] = {}
-
-MIRRORS = ["https://aceztrims.pages.dev/", "https://acestrlms.pages.dev/"]
-
-
-def is_valid_href(a: Node) -> bool:
-    href = a.attributes.get("href", "")
-    return href.startswith("/") and href != "/news/"
-
-
-async def get_schedule(client: httpx.AsyncClient, base_url: str) -> list[dict]:
-    log.info(f'Scraping from "{base_url}"')
-
-    try:
-        r = await client.get(base_url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{base_url}": {e}')
-        return []
-
-    html = re.sub(r"<!--.*?-->", "", r.text, flags=re.DOTALL)
-
-    tree = HTMLParser(html)
-
-    events = []
-
-    for a in filter(is_valid_href, tree.css("a[href]")):
-        href = a.attributes.get("href", "")
-
-        title_text = a.text(strip=True)
-
-        after_time = (
-            title_text.split("//", 1)[1].strip() if "//" in title_text else title_text
-        )
-
-        if " - " in after_time:
-            sport, event_name = (x.strip() for x in after_time.split(" - ", 1))
-        else:
-            sport, event_name = "", after_time
-
-        events.append(
-            {"sport": sport, "event": event_name, "href": urljoin(base_url, href)}
-        )
-
-    return events
-
-
-async def get_m3u8_links(client: httpx.AsyncClient, url: str) -> list[str]:
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{url}": {e}')
-        return []
-
-    html = re.sub(r"<!--.*?-->", "", r.text, flags=re.DOTALL)
-
-    soup = HTMLParser(html)
-
-    m3u8_links = []
-
-    for btn in soup.css("button[onclick]"):
-        onclick = btn.attributes.get("onclick", "")
-
-        if match := re.search(r"src\s*=\s*['\"](.*?)['\"]", onclick):
-            link = match[1]
-
-            if ".m3u8" in link:
-                m3u8_links.append(link)
-
-    if iframe := soup.css_first("iframe#iframe"):
-        src = iframe.attributes.get("src", "")
-
-        if ".m3u8" in src and src not in m3u8_links:
-            m3u8_links.insert(
-                0,
-                src.split("cors.ricohspaces.app/")[-1],
-            )
-
-    return m3u8_links
-
-
-async def scrape(client: httpx.AsyncClient) -> None:
-    if not (base_url := await get_base(client, MIRRORS)):
-        log.warning("No working ace mirrors")
-        return
-
-    schedule = await get_schedule(client, base_url)
-
-    tasks = [get_m3u8_links(client, item["href"]) for item in schedule]
-
-    results = await asyncio.gather(*tasks)
-
-    for item, m3u8_urls in zip(schedule, results):
-        if not m3u8_urls:
-            continue
-
-        for i, link in enumerate(m3u8_urls, start=1):
-            sport, event = item["sport"], item["event"]
-
-            key = f"[{sport}] {event} (S{i})"
-
-            tvg_id, logo = leagues.info(sport)
-
-            entry = {
-                "url": link,
-                "logo": logo,
-                "id": tvg_id,
-            }
-
-            urls[key] = entry
-
-    log.info(f"Collected {len(urls)} events")
-
-
-# need to update
diff --git a/M3U8/scrapers/utils/config.py b/M3U8/scrapers/utils/config.py
index 434963c..05eaf59 100644
--- a/M3U8/scrapers/utils/config.py
+++ b/M3U8/scrapers/utils/config.py
@@ -8,6 +8,7 @@ import pytz
 
 class Time(datetime):
     ZONES = {
+        "CET": pytz.timezone("Europe/Berlin"),
        "ET": pytz.timezone("America/New_York"),
        "PST": pytz.timezone("America/Los_Angeles"),
        "UTC": timezone.utc,
@@ -53,7 +54,7 @@
         fmt: str | None = None,
     ) -> "Time":
 
-        pattern = re.compile(r"\b(ET|UTC|EST|EDT|PST)\b")
+        pattern = re.compile(rf"\b({'|'.join(cls.ZONES.keys())})\b")
 
         match = pattern.search(s)