iptv/M3U8/scrapers/ppv.py

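"""Scraper for PPV live-event streams.

Fetches the event list from a mirror's /api/streams endpoint, then drives a
Playwright browser to each live event page and captures the M3U8 request it
makes. Captured URLs are cached in caches/ppv.json so already-captured events
are skipped.
"""
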
import asyncio
from datetime import datetime, timedelta
from functools import partial
from pathlib import Path
from urllib.parse import urljoin

import httpx
from playwright.async_api import async_playwright

from .utils import (
    TZ,
    capture_req,
    get_base,
    get_logger,
    league_info,
    load_cache,
    new_browser,
    now,
    safe_process_event,
    write_cache,
)

log = get_logger(__name__)

urls: dict[str, dict[str, str | float]] = {}

API_FILE = Path(__file__).parent / "caches" / "ppv_api.json"
CACHE_FILE = Path(__file__).parent / "caches" / "ppv.json"

MIRRORS = [
    "https://ppvs.su",
    "https://ppv.to",
    "https://ppv.wtf",
    "https://ppv.land",
    "https://freeppv.fun",
]

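# NFL team names as they appear in event titles; is_nfl() uses them to pick the
# NFL dummy tvg-id (rather than the NCAA one) for the "American Football" category.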
NFL_TEAMS = {
    "Arizona Cardinals",
    "Atlanta Falcons",
    "Baltimore Ravens",
    "Buffalo Bills",
    "Carolina Panthers",
    "Chicago Bears",
    "Cincinnati Bengals",
    "Cleveland Browns",
    "Dallas Cowboys",
    "Denver Broncos",
    "Detroit Lions",
    "Green Bay Packers",
    "Houston Texans",
    "Indianapolis Colts",
    "Jacksonville Jaguars",
    "Kansas City Chiefs",
    "Las Vegas Raiders",
    "Los Angeles Chargers",
    "Los Angeles Rams",
    "Miami Dolphins",
    "Minnesota Vikings",
    "New England Patriots",
    "New Orleans Saints",
    "New York Giants",
    "New York Jets",
    "Philadelphia Eagles",
    "Pittsburgh Steelers",
    "San Francisco 49ers",
    "Seattle Seahawks",
    "Tampa Bay Buccaneers",
    "Tennessee Titans",
    "Washington Commanders",
}


def is_nfl(event: str) -> bool:
    try:
        t1, t2 = event.split(" vs. ")
        return t1 in NFL_TEAMS or t2 in NFL_TEAMS
    except ValueError:
        return event.lower() == "nfl redzone"


async def refresh_api_cache(
    client: httpx.AsyncClient, url: str
) -> dict[str, dict[str, str]]:
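    """Fetch the streams index from the mirror's API; return {} on request failure."""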
    log.info("Refreshing API cache")
    try:
        r = await client.get(url)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{url}"\n{e}')
        return {}
    return r.json()


async def process_event(url: str, url_num: int) -> str | None:
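    """Open an event page in a headless browser and return the captured M3U8 URL.

    Returns None if no M3U8 request is seen before the timeout or on any error.
    """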
    async with async_playwright() as p:
        browser, context = await new_browser(p)
        page = await context.new_page()

        captured: list[str] = []
        got_one = asyncio.Event()
        handler = partial(capture_req, captured=captured, got_one=got_one)

        page.on("request", handler)

        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=15_000)

            wait_task = asyncio.create_task(got_one.wait())
            try:
                await asyncio.wait_for(wait_task, timeout=10)
            except asyncio.TimeoutError:
                log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
                return
            finally:
                if not wait_task.done():
                    wait_task.cancel()
                    try:
                        await wait_task
                    except asyncio.CancelledError:
                        pass

            if captured:
                log.info(f"URL {url_num}) Captured M3U8")
                return captured[-1]

            log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
            return
        except Exception as e:
            log.warning(f"URL {url_num}) Exception while processing: {e}")
            return
        finally:
            page.remove_listener("request", handler)
            await page.close()
            await browser.close()


async def get_events(
    client: httpx.AsyncClient,
    base_url: str,
    cached_keys: set[str],
) -> list[dict[str, str]]:
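    """Return currently-live events from the streams API that are not already cached."""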
    events: list[dict[str, str]] = []

    if not (
        api_data := load_cache(
            API_FILE,
            exp=86400,
            nearest_hr=True,
            per_entry=False,
        )
    ):
        api_data = await refresh_api_cache(client, urljoin(base_url, "api/streams"))
        write_cache(API_FILE, api_data)

    for stream_group in api_data.get("streams", []):  # may be {} if the refresh failed
        sport = stream_group["category"]
        if sport == "24/7 Streams":
            continue

        for event in stream_group["streams"]:
            name, start_ts, end_ts, logo, uri_name = (
                event["name"],
                event["starts_at"],
                event["ends_at"],
                event["poster"],
                event["uri_name"],
            )

            key = f"[{sport}] {name} (PPV)"

            if key in cached_keys:
                continue

            # Treat an event as live from 30 minutes before its start
            # until 30 minutes after its scheduled end.
            start_dt = datetime.fromtimestamp(start_ts, tz=TZ) - timedelta(minutes=30)
            end_dt = datetime.fromtimestamp(end_ts, tz=TZ) + timedelta(minutes=30)

            if not start_dt <= now < end_dt:
                continue

            events.append(
                {
                    "sport": sport,
                    "event": name,
                    "link": urljoin(base_url, f"live/{uri_name}"),
                    "logo": logo,
                }
            )

    return events


async def scrape(client: httpx.AsyncClient) -> None:
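    """Scrape new PPV events, capture their M3U8 URLs, and update the cache."""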
    cached_urls = load_cache(CACHE_FILE, exp=10_800)
    cached_count = len(cached_urls)
    urls.update(cached_urls)

    log.info(f"Collected {cached_count} event(s) from cache")

    if not (base_url := await get_base(client, MIRRORS)):
        log.warning("No working PPV mirrors")
        write_cache(CACHE_FILE, cached_urls)
        return

    log.info(f'Scraping from "{base_url}"')

    events = await get_events(
        client,
        base_url,
        set(cached_urls.keys()),
    )

    log.info(f"Processing {len(events)} new URL(s)")

    for i, ev in enumerate(events, start=1):
        url = await safe_process_event(
            lambda: process_event(ev["link"], url_num=i),
            url_num=i,
            log=log,
        )
        if url:
            sport, event = ev["sport"], ev["event"]

            # College football shares the "American Football" category,
            # so split NFL matchups from everything else.
            if sport == "American Football":
                tvg_id = "NFL.Dummy.us" if is_nfl(event) else "NCAA.Sports.Dummy.us"
            else:
                tvg_id = league_info(sport)[0]

            key = f"[{sport}] {event} (PPV)"
            entry = {
                "url": url,
                "logo": ev["logo"],
                "base": base_url,
                "timestamp": now.timestamp(),
                "id": tvg_id or "Live.Event.us",
            }
            urls[key] = cached_urls[key] = entry

    if new_count := len(cached_urls) - cached_count:
        log.info(f"Collected and cached {new_count} new event(s)")
    else:
        log.info("No new events found")

    write_cache(CACHE_FILE, cached_urls)


# works if no cloudflare bot detection
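
# Usage sketch (assumption: how this module is driven depends on the wider
# project; the import path and client settings below are illustrative only):
#
#     import asyncio
#     import httpx
#
#     from scrapers.ppv import scrape  # adjust to the actual package path
#
#     async def main() -> None:
#         async with httpx.AsyncClient() as client:
#             await scrape(client)
#
#     asyncio.run(main())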