doms9 2025-09-04 19:53:27 -04:00
parent cb9d5637fc
commit 00000d905f
7 changed files with 224 additions and 93 deletions

View file

@@ -3,7 +3,7 @@ import asyncio
 from pathlib import Path
 import httpx
-from scrape import ace, fstv, livetvsx, ppv, tvpass
+from scrape import livetvsx, ppv, streambtw, tvpass
 from scrape.utils import get_logger
 log = get_logger(__name__)
@@ -40,10 +40,9 @@ async def vanilla_fetch() -> tuple[list[str], int]:
 async def main() -> None:
     tasks = [
-        # asyncio.create_task(ace.main(client)),
-        # asyncio.create_task(fstv.main(client)),
         asyncio.create_task(livetvsx.main(CLIENT)),
         asyncio.create_task(ppv.main(CLIENT)),
+        asyncio.create_task(streambtw.main(CLIENT)),
         asyncio.create_task(tvpass.main(CLIENT)),
         vanilla_fetch(),
     ]
@@ -52,7 +51,7 @@ async def main() -> None:
     base_m3u8, tvg_chno = results[-1]
-    additions = ace.urls | fstv.urls | livetvsx.urls | ppv.urls | tvpass.urls
+    additions = livetvsx.urls | ppv.urls | streambtw.urls | tvpass.urls
     lines = [
         f'#EXTINF:-1 tvg-chno="{chnl_num}" tvg-id="(N/A)" tvg-name="{event}" tvg-logo="{info["logo"]}" group-title="Live Events",{event}\n{info["url"]}'
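For reference, a minimal standalone sketch (not part of this commit; the event data and starting tvg-chno are hypothetical) of how one merged `additions` entry renders into the playlist line built above:

# Hypothetical example: render one "Live Events" entry the same way main()
# builds `lines` from the merged scraper dicts.
additions = {
    "[NFL] Jets vs. Bills": {
        "url": "https://example.com/stream/master.m3u8",
        "logo": "https://example.com/logos/nfl.png",
        "timestamp": 1757030007.0,
    }
}
chnl_num = 1000  # hypothetical starting channel number
for event, info in additions.items():
    print(
        f'#EXTINF:-1 tvg-chno="{chnl_num}" tvg-id="(N/A)" tvg-name="{event}" '
        f'tvg-logo="{info["logo"]}" group-title="Live Events",{event}\n{info["url"]}'
    )
    chnl_num += 1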

View file

@@ -4,12 +4,21 @@ import json
 import ssl
 import xml.etree.ElementTree as ET
 from datetime import datetime, timedelta
+from functools import partial
 from pathlib import Path
 import httpx
-from playwright.async_api import Request, async_playwright
+from playwright.async_api import async_playwright
-from .utils import LOGOS, TZ, get_logger, now, safe_process_event
+from .utils import (
+    LOGOS,
+    TZ,
+    capture_req,
+    get_logger,
+    load_ts_cache,
+    now,
+    safe_process_event,
+)
 log = get_logger(__name__)
@@ -62,21 +71,6 @@ async def get_cert(client: httpx.AsyncClient) -> ssl.SSLContext:
     return ssl.create_default_context(cafile=CERT_FILE)
-def load_cache() -> dict[str, dict[str, str | float]]:
-    try:
-        data: dict[str, dict[str, str | float]] = json.loads(
-            CACHE_FILE.read_text(encoding="utf-8")
-        )
-        return {
-            k: v
-            for k, v in data.items()
-            if now.timestamp() - data[k].get("timestamp", 0) < 14400  # 4 hours
-        }
-    except (FileNotFoundError, json.JSONDecodeError):
-        return {}
 async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO:
     buffer = io.BytesIO()
@@ -104,44 +98,38 @@ async def process_event(url: str, url_num: int) -> str | None:
         context = await browser.new_context(
             ignore_https_errors=True  # website doesn't send valid certs
         )
-        ev_page = await context.new_page()
+        page = await context.new_page()
         captured: list[str] = []
         got_one = asyncio.Event()
-        def capture_req(req: Request) -> None:
-            if (
-                ".m3u8" in req.url
-                and "amazonaws" not in req.url
-                and "knitcdn" not in req.url
-            ):
-                captured.append(req.url)
-                got_one.set()
+        handler = partial(capture_req, captured=captured, got_one=got_one)
         popup = None
         try:
-            await ev_page.goto(
+            await page.goto(
                 url,
                 wait_until="domcontentloaded",
                 timeout=10_000,
             )
-            btn = await ev_page.query_selector(".lnkhdr > tbody > tr > td:nth-child(2)")
+            btn = await page.query_selector(".lnkhdr > tbody > tr > td:nth-child(2)")
             if btn:
                 try:
                     await btn.click()
-                    await ev_page.wait_for_timeout(500)
+                    await page.wait_for_timeout(500)
                 except Exception as e:
                     log.debug(f"URL {url_num}) Failed to click Browser Links tab: {e}")
                     return
             else:
                 log.warning(f"URL {url_num}) Browser Links tab not found")
-            link_img = await ev_page.query_selector(
+            link_img = await page.query_selector(
                 "tr:nth-child(2) > td:nth-child(1) td:nth-child(6) img"
             )
@@ -149,10 +137,10 @@ async def process_event(url: str, url_num: int) -> str | None:
                 log.warning(f"URL {url_num}) No browser link to click.")
                 return
-            ev_page.on("request", capture_req)
+            page.on("request", handler)
             try:
-                async with ev_page.expect_popup(timeout=5_000) as popup_info:
+                async with page.expect_popup(timeout=5_000) as popup_info:
                     try:
                         await link_img.click()
                     except Exception as e:
@ -162,22 +150,20 @@ async def process_event(url: str, url_num: int) -> str | None:
popup = await popup_info.value popup = await popup_info.value
popup.on("request", capture_req) popup.on("request", handler)
except Exception: except Exception:
try: try:
await link_img.click() await link_img.click()
except Exception as e: except Exception as e:
log.debug(f"URL {url_num}) Fallback click failed: {e}") log.debug(f"URL {url_num}) Fallback click failed: {e}")
return
wait_task = asyncio.create_task(got_one.wait()) wait_task = asyncio.create_task(got_one.wait())
try: try:
await asyncio.wait_for(wait_task, timeout=1.5e1) await asyncio.wait_for(wait_task, timeout=1.5e1)
except asyncio.TimeoutError: except asyncio.TimeoutError:
log.warning(f"URL {url_num}) Timed out waiting for m3u8.") log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
return return
finally: finally:
@@ -189,32 +175,32 @@ async def process_event(url: str, url_num: int) -> str | None:
                     except asyncio.CancelledError:
                         pass
-            ev_page.remove_listener("request", capture_req)
+            page.remove_listener("request", handler)
             if popup:
-                popup.remove_listener("request", capture_req)
+                popup.remove_listener("request", handler)
                 await popup.close()
-            await ev_page.close()
+            await page.close()
             if captured:
                 log.info(f"URL {url_num}) Captured M3U8")
                 return captured[-1]
-            log.warning(f"URL {url_num}) No m3u8 captured in popup or inline playback.")
+            log.warning(f"URL {url_num}) No M3U8 captured in popup or inline playback.")
             return
         except Exception:
             try:
-                ev_page.remove_listener("request", capture_req)
+                page.remove_listener("request", handler)
                 if popup:
-                    popup.remove_listener("request", capture_req)
+                    popup.remove_listener("request", handler)
                     await popup.close()
-                await ev_page.close()
+                await page.close()
             except Exception:
                 pass
@@ -283,7 +269,7 @@ async def main(client: httpx.AsyncClient) -> None:
     cert = await get_cert(client)
-    cached_urls = load_cache()
+    cached_urls = load_ts_cache(CACHE_FILE, 14400)
     cached_count = len(cached_urls)
     log.info(f"Collected {cached_count} event(s) from cache")

View file

@@ -4,13 +4,22 @@ import asyncio
 import json
 import re
 from datetime import datetime, timedelta
+from functools import partial
 from pathlib import Path
 from urllib.parse import urljoin
 import httpx
-from playwright.async_api import Request, async_playwright
+from playwright.async_api import async_playwright
-from .utils import TZ, get_base, get_logger, now, safe_process_event
+from .utils import (
+    TZ,
+    capture_req,
+    get_base,
+    get_logger,
+    load_ts_cache,
+    now,
+    safe_process_event,
+)
 log = get_logger(__name__)
@@ -20,7 +29,13 @@ API_FILE = Path(__file__).parent / "caches" / "ppv_api.json"
 CACHE_FILE = Path(__file__).parent / "caches" / "ppv.json"
-MIRRORS = ["https://ppv.to", "https://ppvs.su"]
+MIRRORS = [
+    "https://ppvs.su",
+    "https://ppv.to",
+    "https://ppv.wtf",
+    "https://ppv.land",
+    "https://freeppv.fun",
+]
 async def refresh_api_cache(client: httpx.AsyncClient, url: str) -> dict:
@@ -36,21 +51,6 @@ async def refresh_api_cache(client: httpx.AsyncClient, url: str) -> dict:
     return r.json()
-def load_cache() -> dict[str, dict[str, str | float]]:
-    try:
-        data: dict[str, dict[str, str | float]] = json.loads(
-            CACHE_FILE.read_text(encoding="utf-8")
-        )
-        return {
-            k: v
-            for k, v in data.items()
-            if now.timestamp() - data[k].get("timestamp", 0) < 14400  # 4 hours
-        }
-    except (FileNotFoundError, json.JSONDecodeError):
-        return {}
 def load_api_cache() -> dict[str, dict[str, str | str]]:
     try:
         data: dict = json.loads(API_FILE.read_text(encoding="utf-8"))
@@ -74,16 +74,9 @@ async def process_event(url: str, url_num: int) -> str | None:
         got_one = asyncio.Event()
-        def capture_req(req: Request) -> None:
-            if (
-                ".m3u8" in req.url
-                and "amazonaws" not in req.url
-                and "knitcdn" not in req.url
-            ):
-                captured.append(req.url)
-                got_one.set()
+        handler = partial(capture_req, captured=captured, got_one=got_one)
-        page.on("request", capture_req)
+        page.on("request", handler)
         try:
             await page.goto(url, wait_until="domcontentloaded", timeout=15_000)
@@ -93,8 +86,8 @@ async def process_event(url: str, url_num: int) -> str | None:
             try:
                 await asyncio.wait_for(wait_task, timeout=10)
             except asyncio.TimeoutError:
-                log.warning(f"URL {url_num}) Timed out waiting for m3u8.")
-                return None
+                log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
+                return
             finally:
                 if not wait_task.done():
@@ -110,15 +103,15 @@ async def process_event(url: str, url_num: int) -> str | None:
                 return captured[-1]
-            log.warning(f"URL {url_num}) No m3u8 captured after waiting.")
-            return None
+            log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
+            return
         except Exception as e:
             log.warning(f"URL {url_num}) Exception while processing: {e}")
-            return None
+            return
         finally:
-            page.remove_listener("request", capture_req)
+            page.remove_listener("request", handler)
             await page.close()
             await browser.close()
@@ -127,7 +120,7 @@ async def get_events(
     client: httpx.AsyncClient,
     api_url: str,
     cached_keys: set[str],
-) -> dict[str, dict[str, str | str]]:
+) -> list[dict[str, str]]:
     events: list[dict[str, str]] = []
@@ -186,7 +179,7 @@ async def main(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{base_url}"')
-    cached_urls = load_cache()
+    cached_urls = load_ts_cache(CACHE_FILE, 14400)
     cached_count = len(cached_urls)
     log.info(f"Collected {cached_count} event(s) from cache")

M3U8/scrape/streambtw.py (new file, 112 lines)
View file

@@ -0,0 +1,112 @@
import json
import re
from pathlib import Path
from urllib.parse import urljoin

import httpx
from selectolax.parser import HTMLParser

from .utils import get_logger, load_ts_cache, now, safe_process_event

log = get_logger(__name__)

urls: dict[str, dict[str, str | float]] = {}

BASE_URL = "https://streambtw.com/"

CACHE_FILE = Path(__file__).parent / "caches" / "streambtw.json"


async def process_event(
    client: httpx.AsyncClient,
    url: str,
    url_num: int,
) -> str | None:
    try:
        r = await client.get(url)
        r.raise_for_status()
    except Exception as e:
        log.error(f'URL {url_num}) Failed to fetch "{url}"\n{e}')
        return

    valid_m3u8 = re.compile(
        r'var\s+randomM3u8\s*=\s*[\'"]([^\'"]+)[\'"]',
        re.IGNORECASE,
    )

    if match := valid_m3u8.search(r.text):
        log.info(f"URL {url_num}) Captured M3U8")
        return match[1]

    log.info(f"URL {url_num}) No M3U8 found")


async def get_events(client: httpx.AsyncClient) -> list[dict[str, str]]:
    try:
        r = await client.get(BASE_URL)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{BASE_URL}"\n{e}')
        return []

    soup = HTMLParser(r.text)

    events = []

    for card in soup.css("div.container div.card"):
        img = card.css_first("img.league-logo")
        logo = img.attrs.get("src") if img else None

        sport = card.css_first("h5.card-title").text(strip=True)
        name = card.css_first("p.card-text").text(strip=True)

        link = card.css_first("a.btn.btn-primary")

        if href := link.attrs.get("href"):
            events.append(
                {
                    "sport": sport,
                    "event": name,
                    "link": urljoin(BASE_URL, href),
                    "logo": logo,
                }
            )

    return events


async def main(client: httpx.AsyncClient) -> None:
    if cached := load_ts_cache(CACHE_FILE, 86400):  # find out when site updates
        urls.update(cached)
        log.info(f"Collected {len(urls)} event(s) from cache")
        return

    log.info(f'Scraping from "{BASE_URL}"')

    events = await get_events(client)

    log.info(f"Processing {len(events)} new URLs")

    for i, ev in enumerate(events, start=1):
        url = await safe_process_event(
            lambda: process_event(client, url=ev["link"], url_num=i),
            url_num=i,
            log=log,
        )

        if url:
            entry = {
                "url": url,
                "logo": ev["logo"],
                "timestamp": now.timestamp(),
            }

            urls[f"[{ev['sport']}] {ev['event']}"] = entry

    log.info(f"Collected {len(urls)} event(s)")

    CACHE_FILE.write_text(json.dumps(urls, indent=2), encoding="utf-8")
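For context, a sketch of what caches/streambtw.json might hold after a run (event name and URLs are hypothetical); load_ts_cache keeps an entry only while now.timestamp() minus its "timestamp" stays under the expiry passed in (86400 s here):

# Hypothetical cache contents: keys are f"[{sport}] {event}", and the
# per-entry "timestamp" is what load_ts_cache compares against the expiry.
{
    "[Soccer] Arsenal vs. Chelsea": {
        "url": "https://example.com/hls/stream.m3u8",
        "logo": "https://example.com/img/soccer.png",
        "timestamp": 1757030007.0,
    }
}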

View file

@@ -18,17 +18,18 @@ CACHE_FILE = Path(__file__).parent / "caches" / "tvpass.json"
 def load_cache() -> dict[str, str]:
     try:
         data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
-        return {} if 8 <= now.hour <= 12 else data
+        return {} if now.hour <= 12 else data
     except (FileNotFoundError, json.JSONDecodeError):
         return {}
-async def fetch_m3u8(client: httpx.AsyncClient) -> list[str] | None:
+async def fetch_m3u8(client: httpx.AsyncClient) -> list[str]:
     try:
         r = await client.get(BASE_URL)
         r.raise_for_status()
     except Exception as e:
         log.error(f'Failed to fetch "{BASE_URL}"\n{e}')
+        return []
     return r.text.splitlines()
@@ -41,10 +42,7 @@ async def main(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')
-    if not (data := await fetch_m3u8(client)):
-        return
-    for i, line in enumerate(data):
+    for i, line in enumerate(data := await fetch_m3u8(client)):
         if line.startswith("#EXTINF"):
             tvg_id_match = re.search(r'tvg-id="([^"]*)"', line)
             tvg_name_match = re.search(r'tvg-name="([^"]*)"', line)
@@ -69,7 +67,6 @@ async def main(client: httpx.AsyncClient) -> None:
             ),
         }
-    if urls:
-        CACHE_FILE.write_text(json.dumps(urls, indent=2), encoding="utf-8")
+    CACHE_FILE.write_text(json.dumps(urls, indent=2), encoding="utf-8")
     log.info(f"Cached {len(urls)} event(s)")

View file

@@ -1,10 +1,21 @@
-from .config import LOGOS, TZ, get_base, get_logger, now, safe_process_event
+from .config import (
+    LOGOS,
+    TZ,
+    capture_req,
+    get_base,
+    get_logger,
+    load_ts_cache,
+    now,
+    safe_process_event,
+)
 __all__ = [
     "LOGOS",
     "TZ",
+    "capture_req",
     "get_base",
     "get_logger",
+    "load_ts_cache",
     "now",
     "safe_process_event",
 ]

View file

@@ -1,11 +1,14 @@
 import asyncio
+import json
 import logging
+import re
 from datetime import datetime
 from pathlib import Path
 from typing import Any
 import httpx
 import pytz
+from playwright.async_api import Request
 TZ = pytz.timezone("America/New_York")
@@ -65,6 +68,24 @@ def get_logger(name: str | None = None) -> logging.Logger:
     return logger
+def load_ts_cache(
+    file: Path,
+    cache_exp: int | float,
+) -> dict[str, dict[str, str | float]]:
+    try:
+        data: dict[str, dict[str, str | float]] = json.loads(
+            file.read_text(encoding="utf-8")
+        )
+        return {
+            k: v
+            for k, v in data.items()
+            if now.timestamp() - v.get("timestamp", 0) < cache_exp
+        }
+    except (FileNotFoundError, json.JSONDecodeError):
+        return {}
 async def safe_process_event(
     fn,
     url_num: int,
@@ -107,3 +128,15 @@ async def get_base(client: httpx.AsyncClient, mirrors: list[str]) -> str:
     results = await asyncio.gather(*tasks)
     return [url for url, ok in zip(mirrors, results) if ok][0]
+def capture_req(
+    req: Request,
+    captured: list[str],
+    got_one: asyncio.Event,
+) -> None:
+    valid_m3u8 = re.compile(r"^(?!.*(amazonaws|knitcdn)).*\.m3u8")
+    if valid_m3u8.search(req.url):
+        captured.append(req.url)
+        got_one.set()
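To show how these shared helpers are meant to be wired together, here is a minimal standalone sketch mirroring the livetvsx/ppv call sites above: bind the per-call state into capture_req with functools.partial, register it as a Playwright request handler, and wait on got_one. The function name grab_m3u8, the page URL, and the timeouts are hypothetical; the import path assumes the scrape.utils package shown in this commit.

import asyncio
from functools import partial

from playwright.async_api import async_playwright

from scrape.utils import capture_req  # shared M3U8 request filter


async def grab_m3u8(url: str) -> str | None:
    captured: list[str] = []
    got_one = asyncio.Event()
    # Bind the per-call state so the shared handler can append matches here.
    handler = partial(capture_req, captured=captured, got_one=got_one)

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        page.on("request", handler)  # capture_req filters *.m3u8 requests
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=15_000)
            try:
                # Wait until the handler sees a matching request, or give up.
                await asyncio.wait_for(got_one.wait(), timeout=10)
            except asyncio.TimeoutError:
                return None
            return captured[-1] if captured else None
        finally:
            page.remove_listener("request", handler)
            await page.close()
            await browser.close()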