commit 00000d922c
parent c090e6d0be
doms9, 2025-10-10 17:14:32 -04:00

6 changed files with 417 additions and 7 deletions

@@ -2,7 +2,7 @@
 import asyncio
 from pathlib import Path

-from scrapers import fstv, livetvsx, ppv, streambtw, streameast, streamed, tvpass
+from scrapers import fstv, livetvsx, ppv, streambtw, streamed, strmd, tvpass
 from scrapers.utils import get_logger, network

 log = get_logger(__name__)
@@ -27,11 +27,11 @@ async def main() -> None:
     tasks = [
         asyncio.create_task(fstv.scrape(network.client)),
-        # asyncio.create_task(livetvsx.scrape(network.client)),
+        asyncio.create_task(livetvsx.scrape(network.client)),
         asyncio.create_task(ppv.scrape(network.client)),
         asyncio.create_task(streambtw.scrape(network.client)),
-        # asyncio.create_task(streameast.scrape(network.client)),
         asyncio.create_task(streamed.scrape(network.client)),
+        asyncio.create_task(strmd.scrape(network.client)),
         asyncio.create_task(tvpass.scrape(network.client)),
     ]
@@ -42,8 +42,8 @@ async def main() -> None:
         | livetvsx.urls
         | ppv.urls
         | streambtw.urls
-        | streameast.urls
         | streamed.urls
+        | strmd.urls
         | tvpass.urls
     )

M3U8/scrapers/old/ace.py (new file, 127 lines)

@@ -0,0 +1,127 @@
import asyncio
import re
from urllib.parse import urljoin

import httpx
from selectolax.parser import HTMLParser, Node

from .utils import get_base, get_logger, leagues

log = get_logger(__name__)

urls: dict[str, dict[str, str]] = {}

MIRRORS = ["https://aceztrims.pages.dev/", "https://acestrlms.pages.dev/"]


def is_valid_href(a: Node) -> bool:
    href = a.attributes.get("href", "")
    return href.startswith("/") and href != "/news/"


async def get_schedule(client: httpx.AsyncClient, base_url: str) -> list[dict]:
    log.info(f'Scraping from "{base_url}"')

    try:
        r = await client.get(base_url)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{base_url}": {e}')
        return []

    html = re.sub(r"<!--.*?-->", "", r.text, flags=re.DOTALL)
    tree = HTMLParser(html)

    events = []
    for a in filter(is_valid_href, tree.css("a[href]")):
        href = a.attributes.get("href", "")
        title_text = a.text(strip=True)

        after_time = (
            title_text.split("//", 1)[1].strip() if "//" in title_text else title_text
        )

        if " - " in after_time:
            sport, event_name = (x.strip() for x in after_time.split(" - ", 1))
        else:
            sport, event_name = "", after_time

        events.append(
            {"sport": sport, "event": event_name, "href": urljoin(base_url, href)}
        )

    return events


async def get_m3u8_links(client: httpx.AsyncClient, url: str) -> list[str]:
    try:
        r = await client.get(url)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{url}": {e}')
        return []

    html = re.sub(r"<!--.*?-->", "", r.text, flags=re.DOTALL)
    soup = HTMLParser(html)

    m3u8_links = []
    for btn in soup.css("button[onclick]"):
        onclick = btn.attributes.get("onclick", "")
        if match := re.search(r"src\s*=\s*['\"](.*?)['\"]", onclick):
            link = match[1]
            if ".m3u8" in link:
                m3u8_links.append(link)

    if iframe := soup.css_first("iframe#iframe"):
        src = iframe.attributes.get("src", "")
        if ".m3u8" in src and src not in m3u8_links:
            m3u8_links.insert(
                0,
                src.split("cors.ricohspaces.app/")[-1],
            )

    return m3u8_links


async def scrape(client: httpx.AsyncClient) -> None:
    if not (base_url := await get_base(client, MIRRORS)):
        log.warning("No working ace mirrors")
        return

    schedule = await get_schedule(client, base_url)

    tasks = [get_m3u8_links(client, item["href"]) for item in schedule]
    results = await asyncio.gather(*tasks)

    for item, m3u8_urls in zip(schedule, results):
        if not m3u8_urls:
            continue

        for i, link in enumerate(m3u8_urls, start=1):
            sport, event = item["sport"], item["event"]
            key = f"[{sport}] {event} (S{i})"
            tvg_id, logo = leagues.info(sport)

            entry = {
                "url": link,
                "logo": logo,
                "id": tvg_id,
            }

            urls[key] = entry

    log.info(f"Collected {len(urls)} events")

# need to update
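
For quick local testing, a minimal driver sketch for the new ace scraper. It assumes M3U8/scrapers/old/ is importable as the scrapers.old package and that its .utils helpers resolve, which is not shown in this commit; the client options are illustrative.

# Hypothetical driver, illustration only: run ace.scrape() on its own
# and print the collected playlist entries.
import asyncio

import httpx

from scrapers.old import ace  # assumed import path for M3U8/scrapers/old/ace.py


async def main() -> None:
    async with httpx.AsyncClient(follow_redirects=True, timeout=15) as client:
        await ace.scrape(client)

    for key, entry in ace.urls.items():
        print(key, entry["url"])


if __name__ == "__main__":
    asyncio.run(main())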


@@ -216,3 +216,6 @@ async def scrape(client: httpx.AsyncClient) -> None:
         log.info("No new events found")

     CACHE_FILE.write(cached_urls)
+
+
+# cloudflare bot protection added


@@ -110,7 +110,7 @@ async def refresh_html_cache(client: httpx.AsyncClient, url: str) -> dict[str, s
         if not (m := re.search(r"openPlayerPopup\(\s*(\d+)\s*\)", onclick)):
             continue

-        key = f"[{sport}] {event} (STRMD)"
+        key = f"[{sport}] {event} (STRMED)"

         events[key] = {
             "sport": sport,

M3U8/scrapers/strmd.py (new file, 272 lines)

@@ -0,0 +1,272 @@
import asyncio
import re
from functools import partial
from pathlib import Path
from typing import Any
from urllib.parse import urljoin

import httpx
from playwright.async_api import BrowserContext, async_playwright

from .utils import Cache, Time, get_logger, leagues, network

log = get_logger(__name__)

urls: dict[str, dict[str, str | float]] = {}

API_FILE = Cache(Path(__file__).parent / "caches" / "strmd_api.json", exp=28_800)
CACHE_FILE = Cache(Path(__file__).parent / "caches" / "strmd.json", exp=10_800)

MIRRORS = ["https://streamed.pk", "https://streami.su", "https://streamed.st"]


def validate_category(s: str) -> str:
    if "-" in s:
        return " ".join([i.capitalize() for i in s.split("-")])
    elif s == "fight":
        return "Fight (UFC/Boxing)"

    return s.capitalize()


def get_tvg(sport: str, event: str) -> str:
    match sport:
        case "American Football":
            if leagues.is_valid(event, "NFL"):
                return "NFL.Dummy.us"
            else:
                return "NCAA.Sports.Dummy.us"

        case "Basketball":
            if leagues.is_valid(event, "NBA"):
                return "NBA.Basketball.Dummy.us"
            elif leagues.is_valid(event, "WNBA"):
                return "WNBA.dummy.us"
            # NCAA
            else:
                return "Basketball.Dummy.us"

        case "Hockey":
            return "NHL.Hockey.Dummy.us"

        case _:
            return leagues.info(sport)[0]
async def refresh_api_cache(
    client: httpx.AsyncClient, url: str
) -> list[dict[str, Any]]:
    log.info("Refreshing API cache")

    try:
        r = await client.get(url)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{url}"\n{e}')
        return []  # empty list, matching the declared return type

    data = r.json()
    data[0]["timestamp"] = Time.now().timestamp()

    return data
async def process_event(
    url: str,
    url_num: int,
    context: BrowserContext,
) -> str | None:
    page = await context.new_page()

    captured: list[str] = []
    got_one = asyncio.Event()

    handler = partial(network.capture_req, captured=captured, got_one=got_one)
    page.on("request", handler)

    try:
        await page.goto(
            url,
            wait_until="domcontentloaded",
            timeout=15_000,
        )

        wait_task = asyncio.create_task(got_one.wait())

        try:
            await asyncio.wait_for(wait_task, timeout=10)
        except asyncio.TimeoutError:
            log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
            return
        finally:
            if not wait_task.done():
                wait_task.cancel()
                try:
                    await wait_task
                except asyncio.CancelledError:
                    pass

        if captured:
            log.info(f"URL {url_num}) Captured M3U8")
            return captured[-1]

        log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
        return

    except Exception as e:
        log.warning(f"URL {url_num}) Exception while processing: {e}")
        return

    finally:
        page.remove_listener("request", handler)
        await page.close()
async def get_events(
    client: httpx.AsyncClient,
    base_url: str,
    cached_keys: set[str],
) -> list[dict[str, str]]:
    if not (api_data := API_FILE.load(per_entry=False, index=True)):
        api_data = await refresh_api_cache(
            client, urljoin(base_url, "api/matches/all-today")
        )
        API_FILE.write(api_data)

    events: list[dict[str, str]] = []

    now = Time.clean(Time.now())
    start_dt = now.delta(minutes=-30)
    end_dt = now.delta(minutes=30)

    pattern = re.compile(r"[\n\r]+|\s{2,}")

    for event in api_data:
        category = event["category"]
        if category == "other":
            continue

        sport = validate_category(category)

        parts = pattern.split(event["title"].strip())
        name = " | ".join(p.strip() for p in parts if p.strip())

        logo = urljoin(base_url, poster) if (poster := event.get("poster")) else None

        key = f"[{sport}] {name} (STRMD)"
        if cached_keys & {key}:
            continue

        if not (ts := event["date"]):
            continue

        start_ts = int(str(ts)[:-3])
        event_dt = Time.from_ts(start_ts)

        if not start_dt <= event_dt <= end_dt:
            continue

        source: list[dict[str, str]] = event["sources"]

        for s in source:
            source_type = s.get("source")
            stream_id = s.get("id")

            if not source_type:
                continue

            if not stream_id:
                continue

            events.append(
                {
                    "sport": sport,
                    "event": name,
                    "link": f"https://embedsports.top/embed/{source_type}/{stream_id}/1",
                    "logo": logo,
                    "timestamp": event_dt.timestamp(),
                }
            )

    return events
async def scrape(client: httpx.AsyncClient) -> None:
    cached_urls = CACHE_FILE.load()
    cached_count = len(cached_urls)

    urls.update(cached_urls)
    log.info(f"Loaded {cached_count} event(s) from cache")

    if not (base_url := await network.get_base(MIRRORS)):
log.warning("No working PPV mirrors")
        CACHE_FILE.write(cached_urls)
        return

    log.info(f'Scraping from "{base_url}"')

    events = await get_events(
        client,
        base_url,
        set(cached_urls.keys()),
    )

    log.info(f"Processing {len(events)} new URL(s)")

    async with async_playwright() as p:
        browser, context = await network.browser(p, "brave")

        for i, ev in enumerate(events, start=1):
            url = await network.safe_process(
                lambda: process_event(
                    ev["link"],
                    url_num=i,
                    context=context,
                ),
                url_num=i,
                log=log,
            )

            if url:
                sport, event, logo, ts = (
                    ev["sport"],
                    ev["event"],
                    ev["logo"],
                    ev["timestamp"],
                )

                key = f"[{sport}] {event} (STRMD)"

                entry = {
                    "url": url,
                    "logo": logo or leagues.info(sport)[1],
                    "base": "https://embedsports.top/",
                    "timestamp": ts,
                    "id": get_tvg(sport, event) or "Live.Event.us",
                }

                urls[key] = cached_urls[key] = entry

        await browser.close()

    if new_count := len(cached_urls) - cached_count:
        log.info(f"Collected and cached {new_count} new event(s)")
    else:
        log.info("No new events found")

    CACHE_FILE.write(cached_urls)
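
For orientation, a sketch of the shape of one entry this scraper caches in caches/strmd.json. The key format and field names come from scrape() above; every value below is made up.

# Illustrative values only; field names mirror the `entry` dict built in scrape().
example_key = "[Basketball] Team A | Team B (STRMD)"

example_entry = {
    "url": "https://example.invalid/live/index.m3u8",  # M3U8 captured by process_event()
    "logo": "https://example.invalid/poster.webp",     # API poster, or leagues.info() fallback
    "base": "https://embedsports.top/",                # referer base used for playback
    "timestamp": 1760130000.0,                         # event start, epoch seconds
    "id": "NBA.Basketball.Dummy.us",                   # tvg-id from get_tvg()
}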


@@ -17,7 +17,11 @@ class Cache:
         return self.now_ts - dt_ts < self.exp

-    def load(self, per_entry: bool = True) -> dict[str, dict[str, str | float]]:
+    def load(
+        self,
+        per_entry: bool = True,
+        index: bool = False,
+    ) -> dict[str, dict[str, str | float]]:
         try:
             data: dict = json.loads(self.file.read_text(encoding="utf-8"))
         except (FileNotFoundError, json.JSONDecodeError):
@@ -26,7 +30,11 @@ class Cache:
         if per_entry:
             return {k: v for k, v in data.items() if self.is_fresh(v)}

-        ts: float | int = data.get("timestamp", 31496400)
+        if index:
+            ts: float | int = data[0].get("timestamp", 31496400)
+        else:
+            ts: float | int = data.get("timestamp", 31496400)

         dt_ts = Time.clean(Time.from_ts(ts)).timestamp()
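
A small usage sketch of the new index flag, mirroring how strmd.py consumes it. The matches API returns a JSON list, and refresh_api_cache() stamps the freshness timestamp on data[0], so index=True tells load() to read it from there instead of a top-level "timestamp" key. The import path for Cache is assumed; the 28_800 s expiry matches the value defined in strmd.py.

# Sketch only: reading the list-shaped API cache with the new flags.
from pathlib import Path

from scrapers.utils import Cache  # assumed import path for the Cache class above

API_FILE = Cache(Path("M3U8/scrapers/caches/strmd_api.json"), exp=28_800)

# per_entry=False skips per-event freshness filtering; index=True looks for
# the timestamp at data[0]["timestamp"] because the payload is a list.
api_data = API_FILE.load(per_entry=False, index=True)

if not api_data:
    print("API cache missing or expired; caller should refresh it")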