doms9 2025-09-03 15:00:17 -04:00
parent dece752803
commit 00000d9440
12 changed files with 371 additions and 133 deletions


@@ -10,12 +10,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v5
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cache venv
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: .venv
key: shared-venv-${{ runner.os }}-${{ hashFiles('uv.lock') }}


@@ -10,7 +10,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v5
uses: actions/checkout@v4
with:
fetch-depth: 0


@@ -18,13 +18,13 @@ jobs:
- name: Checkout
if: steps.check_time.outputs.run == 'true'
uses: actions/checkout@v5
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cache venv
if: steps.check_time.outputs.run == 'true'
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: .venv
key: shared-venv-${{ runner.os }}-${{ hashFiles('uv.lock') }}
@@ -32,7 +32,7 @@ jobs:
shared-venv-${{ runner.os }}-
- name: Cache cert
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: M3U8/scrape/utils/cached-ca.pem
key: cert-cache-${{ runner.os }}-${{ hashFiles('M3U8/scrape/utils/cached-ca.pem') }}
@@ -57,7 +57,7 @@ jobs:
- name: Cache Playwright browsers
id: cache-pw
if: steps.check_time.outputs.run == 'true'
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/ms-playwright
key: ${{ runner.os }}-playwright


@@ -3,10 +3,10 @@ import asyncio
from pathlib import Path
import httpx
from scrape import ace, fstv, livetvsx, tvpass
from scrape.utils import logger
from scrape import ace, fstv, livetvsx, ppv, tvpass
from scrape.utils import get_logger
log = logger.get_logger(__name__)
log = get_logger(__name__)
BASE_URL = "https://s.id/ePwXT"
@@ -40,9 +40,10 @@ async def vanilla_fetch() -> tuple[list[str], int]:
async def main() -> None:
tasks = [
# ace.main(client),
# fstv.main(client),
# asyncio.create_task(ace.main(client)),
# asyncio.create_task(fstv.main(client)),
asyncio.create_task(livetvsx.main(CLIENT)),
asyncio.create_task(ppv.main(CLIENT)),
asyncio.create_task(tvpass.main(CLIENT)),
vanilla_fetch(),
]
@@ -51,7 +52,7 @@ async def main() -> None:
base_m3u8, tvg_chno = results[-1]
additions = ace.urls | fstv.urls | livetvsx.urls | tvpass.urls
additions = ace.urls | fstv.urls | livetvsx.urls | ppv.urls | tvpass.urls
lines = [
f'#EXTINF:-1 tvg-chno="{chnl_num}" tvg-id="(N/A)" tvg-name="{event}" tvg-logo="{info["logo"]}" group-title="Live Events",{event}\n{info["url"]}'
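For illustration, a minimal standalone sketch of how the merged additions dict feeds these #EXTINF lines (the hunk above truncates the surrounding list comprehension). The sample event, the URLs, and the channel numbering below are invented; only the field names and line format mirror the diff:

    # Hypothetical data: each scraper module exposes a `urls` dict of {event: {"url", "logo"}}.
    additions = {
        "[NFL] Example Game": {
            "url": "https://example.com/stream.m3u8",
            "logo": "https://i.gyazo.com/fb4956d7a2fe54a1bac54cd81e1b3f11.png",
        },
    }

    # In the real script the numbering would continue from tvg_chno returned by vanilla_fetch().
    lines = [
        f'#EXTINF:-1 tvg-chno="{chnl_num}" tvg-id="(N/A)" tvg-name="{event}" '
        f'tvg-logo="{info["logo"]}" group-title="Live Events",{event}\n{info["url"]}'
        for chnl_num, (event, info) in enumerate(additions.items(), start=1)
    ]
    print("\n".join(lines))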


@@ -5,9 +5,7 @@ from urllib.parse import urljoin
import httpx
from selectolax.parser import HTMLParser, Node
from .fstv import get_base
from .tvpass import logos
from .utils.logger import get_logger
from .utils import LOGOS, get_base, get_logger
log = get_logger(__name__)
@@ -116,7 +114,7 @@ async def main(client: httpx.AsyncClient) -> None:
urls[f"[{sport}] {event} (S{i})"] = {
"url": link,
"logo": logos.get(
"logo": LOGOS.get(
sport,
"https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
),


@@ -4,8 +4,7 @@ from urllib.parse import urljoin
import httpx
from selectolax.parser import HTMLParser
from .tvpass import logos
from .utils.logger import get_logger
from .utils import LOGOS, get_base, get_logger
log = get_logger(__name__)
@@ -19,23 +18,6 @@ MIRRORS = [
]
async def check_status(client: httpx.AsyncClient, url: str) -> bool:
try:
r = await client.get(url)
r.raise_for_status()
except Exception:
return False
return r.status_code == 200
async def get_base(client: httpx.AsyncClient, mirrors: list[str]) -> str:
tasks = [check_status(client, link) for link in mirrors]
results = await asyncio.gather(*tasks)
return [url for url, ok in zip(mirrors, results) if ok][0]
async def get_hrefs(client: httpx.AsyncClient, base_url: str) -> list[tuple[str, str]]:
log.info(f'Scraping from "{base_url}"')
@@ -125,7 +107,7 @@ async def main(client: httpx.AsyncClient) -> None:
urls[key] = {
"url": link,
"logo": logos.get(
"logo": LOGOS.get(
event,
"https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
),


@@ -5,20 +5,16 @@ import ssl
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any
import httpx
from playwright.async_api import Request, async_playwright
from .tvpass import TZ, logos
from .utils.logger import get_logger
from .utils import LOGOS, TZ, get_logger, safe_process_event
log = get_logger(__name__)
urls: dict[str, dict[str, str]] = {}
tvp_sports = set(logos.keys())
BASE_URL = "https://cdn.livetv861.me/rss/upcoming_en.xml"
CERT_BUNDL_URLS = [
@@ -32,23 +28,7 @@ CERT_FILE = Path(__file__).parent / "utils" / "cached-ca.pem"
CACHE_FILE = Path(__file__).parent / "caches" / "livetvsx.json"
async def safe_process_event(fn, url_num: int, timeout=20) -> Any | None:
task = asyncio.create_task(fn())
try:
return await asyncio.wait_for(task, timeout=timeout)
except asyncio.TimeoutError:
log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event")
task.cancel()
try:
await task
except asyncio.CancelledError:
pass
except Exception as e:
log.debug(f"URL {url_num}) Ignore exception after timeout: {e}")
exist_sprts = set(LOGOS.keys())
async def write_to_cert(client: httpx.AsyncClient, url: str, cert: Path) -> None:
@@ -86,15 +66,13 @@ async def get_cert(client: httpx.AsyncClient) -> ssl.SSLContext:
def load_cache() -> dict[str, dict[str, str | str]]:
try:
data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
data: dict = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
now = datetime.now(TZ).timestamp()
return {
k: v
for k, v in data.items()
if now - v.get("timestamp", 0) < timedelta(hours=4).total_seconds()
}
age: float = now - data.get("timestamp", 0)
return {k: v for k, v in data.items() if age < 14400} # 4 hours
except (FileNotFoundError, json.JSONDecodeError):
return {}
@@ -163,15 +141,17 @@ async def parse_feed(
elem.clear()
continue
elif not tvp_sports & {sport, event}:
events.append(
{
"sport": sport,
"event": event,
"title": title,
"link": link,
}
)
if not exist_sprts & {sport, event}:
continue
events.append(
{
"sport": sport,
"event": event,
"title": title,
"link": link,
}
)
elem.clear()
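The replacement guard keeps an event only when its sport or its event name matches a key in the shared LOGOS mapping. A small sketch of that set-intersection check; the sample sport/event pairs below are made up:

    exist_sprts = {"MLB", "NBA", "NCAAF", "NCAAB", "NFL", "NHL", "WNBA"}  # i.e. set(LOGOS.keys())

    for sport, event in [("NFL", "Example Game"), ("Darts", "Example Final")]:
        if not exist_sprts & {sport, event}:
            continue  # neither the sport nor the event name is a known LOGOS key
        print(f"kept: [{sport}] {event}")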
@@ -288,7 +268,7 @@ async def process_event(url: str, url_num: int) -> str | None:
log.warning(f"URL {url_num}) No m3u8 captured in popup or inline playback.")
return
except Exception as e:
except Exception:
try:
ev_page.remove_listener("request", capture_req)
@@ -310,10 +290,9 @@ async def main(client: httpx.AsyncClient) -> None:
cert = await get_cert(client)
cached_urls = load_cache()
cached_keys = set(cached_urls.keys())
cached_count = len(cached_urls)
events = await parse_feed(BASE_URL, cert, cached_keys)
events = await parse_feed(BASE_URL, cert, set(cached_urls.keys()))
log.info(f"Processing {len(events)} URLs")
@@ -328,13 +307,15 @@ async def main(client: httpx.AsyncClient) -> None:
key = f"[{sport}: {event}] {title}"
url = await safe_process_event(
lambda: process_event(link, url_num=num), url_num=num
lambda: process_event(link, url_num=num),
url_num=num,
log=log,
)
if url:
entry = {
"url": url,
"logo": logos.get(
"logo": LOGOS.get(
sport,
"https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
),
@@ -349,4 +330,4 @@ async def main(client: httpx.AsyncClient) -> None:
log.info(f"Cached {cached_count} event(s)")
log.info(f"Collected {new_count} new event(s)")
log.info(f"Collected {new_count} event(s)")

M3U8/scrape/ppv.py (new file, 218 lines)

@@ -0,0 +1,218 @@
#!/usr/bin/env python3
import asyncio
import json
import re
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import httpx
from playwright.async_api import Request, async_playwright

from .utils import TZ, get_base, get_logger, safe_process_event

log = get_logger(__name__)

urls: dict[str, dict[str, str]] = {}

API_FILE = Path(__file__).parent / "caches" / "ppv_api.json"
CACHE_FILE = Path(__file__).parent / "caches" / "ppv.json"
MIRRORS = ["https://ppv.to", "https://ppvs.su"]


async def refresh_api_cache(client: httpx.AsyncClient, url: str) -> dict:
    log.info("Refreshing API cache")

    try:
        r = await client.get(url)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{url}"\n{e}')
        return {}

    return r.json()
def load_cache() -> dict[str, dict[str, str | str]]:
    try:
        return json.loads(CACHE_FILE.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError):
        return {}


def load_api_cache() -> dict[str, dict[str, str | str]]:
    try:
        data: dict = json.loads(API_FILE.read_text(encoding="utf-8"))
        age: float = datetime.now(TZ).timestamp() - data.get("timestamp", 0)
        return data if age < 86400 else {}  # 24 hours
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
async def process_event(url: str, url_num: int) -> str | None:
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        captured: list[str] = []
        got_one = asyncio.Event()

        def capture_req(req: Request) -> None:
            if (
                ".m3u8" in req.url
                and "amazonaws" not in req.url
                and "knitcdn" not in req.url
                and not captured
            ):
                captured.append(req.url)
                got_one.set()

        page.on("request", capture_req)

        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=10_000)

            wait_task = asyncio.create_task(got_one.wait())
            try:
                await asyncio.wait_for(wait_task, timeout=10)
            except asyncio.TimeoutError:
                log.warning(f"URL {url_num}) Timed out waiting for m3u8.")
                return None
            finally:
                if not wait_task.done():
                    wait_task.cancel()
                    try:
                        await wait_task
                    except asyncio.CancelledError:
                        pass

            if captured:
                log.info(f"URL {url_num}) Captured M3U8")
                return captured[-1]

            log.warning(f"URL {url_num}) No m3u8 captured after waiting.")
            return None
        except Exception as e:
            log.warning(f"URL {url_num}) Exception while processing: {e}")
            return None
        finally:
            page.remove_listener("request", capture_req)
            await page.close()
            await browser.close()
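A rough usage sketch for process_event, assuming Playwright's Firefox build is installed; the event URL below is a placeholder rather than a real /live/{uri_name} link from get_events:

    import asyncio

    # Placeholder URL; a real call would use a /live/{uri_name} link built in get_events().
    m3u8 = asyncio.run(process_event("https://example.com/live/example-event", url_num=1))
    print(m3u8 or "no stream captured")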
async def get_events(
    client: httpx.AsyncClient,
    api_url: str,
    cached_keys: list[str],
) -> dict[str, dict[str, str | str]]:
    events = []
    base_url = re.match(r"(https?://.+?)/", api_url)[1]

    if not (api_data := load_api_cache()):
        api_data = await refresh_api_cache(client, api_url)
        API_FILE.write_text(json.dumps(api_data, indent=2), encoding="utf-8")

    for stream_group in api_data["streams"]:
        sport = stream_group["category"]

        if sport == "24/7 Streams":
            continue

        for event in stream_group["streams"]:
            name, start_ts, end_ts, logo, uri_name = (
                event["name"],
                event["starts_at"],
                event["ends_at"],
                event.get(
                    "poster",
                    "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
                ),
                event["uri_name"],
            )

            key = f"[{sport}] {name}"

            if key in cached_keys:
                continue

            start_dt = datetime.fromtimestamp(start_ts, tz=TZ)
            end_dt = datetime.fromtimestamp(end_ts, tz=TZ)

            if not start_dt <= datetime.now(TZ) < end_dt:
                continue

            events.append(
                {
                    "sport": sport,
                    "event": name,
                    "link": urljoin(base_url, f"/live/{uri_name}"),
                    "logo": logo,
                }
            )

    return events
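For readability, an illustrative sketch of the API payload shape that get_events appears to expect, inferred only from the keys read above; every value here is made up:

    example_api_data = {
        "timestamp": 1756900000,  # load_api_cache reads a top-level "timestamp" to judge freshness
        "streams": [
            {
                "category": "Football",
                "streams": [
                    {
                        "name": "Example Match",
                        "starts_at": 1756920000,  # unix timestamps
                        "ends_at": 1756930000,
                        "poster": "https://example.com/poster.png",
                        "uri_name": "example-match",
                    },
                ],
            },
        ],
    }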
async def main(client: httpx.AsyncClient) -> None:
    if not (base_url := await get_base(client, MIRRORS)):
        log.warning("No working PPV mirrors")
        return

    log.info(f'Scraping from "{base_url}"')

    cached_urls = load_cache()
    cached_count = len(cached_urls)

    events = await get_events(
        client,
        urljoin(base_url, "/api/streams"),
        set(cached_urls.keys()),
    )

    log.info(f"Processing {len(events)} URLs")

    for num, ev in enumerate(events, start=1):
        url = await safe_process_event(
            lambda: process_event(ev["link"], url_num=num),
            url_num=num,
            log=log,
        )

        if url:
            entry = {
                "url": url,
                "logo": ev["logo"],
            }
            key = f"[{ev['sport']}] {ev['event']}"
            urls[key] = cached_urls[key] = entry

    CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")

    new_count = len(cached_urls) - cached_count

    log.info(f"Cached {cached_count} event(s)")
    log.info(f"Collected {new_count} event(s)")


# works if no cloudflare bot detection


@@ -4,9 +4,8 @@ from datetime import datetime
from pathlib import Path
import httpx
import pytz
from .utils.logger import get_logger
from .utils import LOGOS, TZ, get_logger
log = get_logger(__name__)
@@ -16,18 +15,6 @@ BASE_URL = "https://tvpass.org/playlist/m3u"
CACHE_FILE = Path(__file__).parent / "caches" / "tvpass.json"
logos = {
"MLB": "https://i.gyazo.com/0fe7865ef2f06c9507791b24f04dbca8.png",
"NBA": "https://i.gyazo.com/773c23570f095a5d549c23b9401d83f4.png",
"NCAAF": "https://i.gyazo.com/ca63b40c86e757436de9d34d369b24f8.png",
"NCAAB": "https://i.gyazo.com/ca63b40c86e757436de9d34d369b24f8.png",
"NFL": "https://i.gyazo.com/fb4956d7a2fe54a1bac54cd81e1b3f11.png",
"NHL": "https://i.gyazo.com/526607d4e886d5ed1fecca4bff3115e2.png",
"WNBA": "https://i.gyazo.com/02d665a5704118d195dbcd5fa20d5462.png",
}
TZ = pytz.timezone("America/New_York")
def load_cache() -> dict[str, str]:
try:
@@ -78,7 +65,7 @@ async def main(client: httpx.AsyncClient) -> None:
if url.endswith("/hd"):
urls[f"[{sport}] {tvg_name}"] = {
"url": f"http://origin.thetvapp.to/hls/{url.split('/')[-2]}/mono.m3u8",
"logo": logos.get(
"logo": LOGOS.get(
sport,
"https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
),

M3U8/scrape/utils/__init__.py (new file, 3 lines)

@@ -0,0 +1,3 @@
from .config import LOGOS, TZ, get_base, get_logger, safe_process_event
__all__ = ["LOGOS", "TZ", "get_base", "get_logger", "safe_process_event"]
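With this re-export in place, the scrapers and the playlist builder can pull the shared helpers from one namespace, e.g. (a minimal sketch):

    from scrape.utils import LOGOS, TZ, get_logger

    log = get_logger(__name__)
    log.info(f"{len(LOGOS)} sport logos available, timezone {TZ}")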

M3U8/scrape/utils/config.py (new file, 106 lines)

@@ -0,0 +1,106 @@
import asyncio
import logging
from pathlib import Path
from typing import Any

import httpx
import pytz

TZ = pytz.timezone("America/New_York")

LOGOS = {
    "MLB": "https://i.gyazo.com/0fe7865ef2f06c9507791b24f04dbca8.png",
    "NBA": "https://i.gyazo.com/773c23570f095a5d549c23b9401d83f4.png",
    "NCAAF": "https://i.gyazo.com/ca63b40c86e757436de9d34d369b24f8.png",
    "NCAAB": "https://i.gyazo.com/ca63b40c86e757436de9d34d369b24f8.png",
    "NFL": "https://i.gyazo.com/fb4956d7a2fe54a1bac54cd81e1b3f11.png",
    "NHL": "https://i.gyazo.com/526607d4e886d5ed1fecca4bff3115e2.png",
    "WNBA": "https://i.gyazo.com/02d665a5704118d195dbcd5fa20d5462.png",
}

LOG_FMT = (
    "[%(asctime)s] "
    "%(levelname)-8s "
    "[%(name)s] "
    "%(message)-70s "
    "(%(filename)s:%(lineno)d)"
)

COLORS = {
    "DEBUG": "\033[37m",
    "INFO": "\033[32m",
    "WARNING": "\033[33m",
    "ERROR": "\033[31m",
    "CRITICAL": "\033[41m",
    "reset": "\033[0m",
}


class ColorFormatter(logging.Formatter):
    def format(self, record) -> str:
        color = COLORS.get(record.levelname, "")
        levelname = record.levelname
        record.levelname = f"{color}{levelname}{COLORS['reset']}"

        formatted = super().format(record)
        record.levelname = levelname

        return formatted
def get_logger(name: str | None = None) -> logging.Logger:
    if not name:
        name = Path(__file__).stem

    logger = logging.getLogger(name)

    if not logger.hasHandlers():
        handler = logging.StreamHandler()
        formatter = ColorFormatter(LOG_FMT, datefmt="%Y-%m-%d | %H:%M:%S")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)

    return logger
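Compared to the deleted logger module, the format string now also carries [%(name)s]. A quick sketch of using it, assuming this config module is importable; output is colorized when attached to a terminal:

    log = get_logger("demo")
    log.info("startup complete")       # [demo] appears via the new [%(name)s] field
    log.warning("mirror unreachable")  # level name rendered in yellow by ColorFormatter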
async def safe_process_event(
    fn,
    url_num: int,
    timeout=20,
    log: logging.Logger | None = None,
) -> Any | None:
    if not log:
        log = logging.getLogger(__name__)

    task = asyncio.create_task(fn())

    try:
        return await asyncio.wait_for(task, timeout=timeout)
    except asyncio.TimeoutError:
        log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event")
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
        except Exception as e:
            log.debug(f"URL {url_num}) Ignore exception after timeout: {e}")
async def check_status(client: httpx.AsyncClient, url: str) -> bool:
    try:
        r = await client.get(url)
        r.raise_for_status()
    except Exception:
        return False

    return r.status_code == 200


async def get_base(client: httpx.AsyncClient, mirrors: list[str]) -> str:
    tasks = [check_status(client, link) for link in mirrors]
    results = await asyncio.gather(*tasks)

    return [url for url, ok in zip(mirrors, results) if ok][0]
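A rough sketch of mirror selection with these helpers; the mirror URLs are placeholders:

    import asyncio

    import httpx

    MIRRORS = ["https://example.org", "https://example.com"]  # placeholder mirrors


    async def pick_mirror() -> None:
        async with httpx.AsyncClient(timeout=10) as client:
            base = await get_base(client, MIRRORS)  # first mirror whose GET returns HTTP 200
            print(base)


    asyncio.run(pick_mirror())

Note that get_base indexes [0] into the list of healthy mirrors, so it raises IndexError when every mirror fails check_status rather than returning an empty value.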

M3U8/scrape/utils/logger.py (deleted file, 38 lines)

@@ -1,38 +0,0 @@
import logging

log_format = "[%(asctime)s] %(levelname)-8s %(message)-70s %(filename)s:%(lineno)d"

colors = {
    "DEBUG": "\033[37m",
    "INFO": "\033[32m",
    "WARNING": "\033[33m",
    "ERROR": "\033[31m",
    "CRITICAL": "\033[41m",
    "reset": "\033[0m",
}


class ColorFormatter(logging.Formatter):
    def format(self, record) -> str:
        color = colors.get(record.levelname, "")
        record.levelname = f"{color}{record.levelname}{colors['reset']}"

        return super().format(record)


def get_logger(name: str = __name__) -> logging.Logger:
    logger = logging.getLogger(name)

    if not logger.hasHandlers():
        handler = logging.StreamHandler()
        formatter = ColorFormatter(log_format, datefmt="%Y-%m-%d | %H:%M:%S")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)

    return logger