doms9 2025-09-11 14:55:53 -04:00
parent 8796e2dfc6
commit 00000d9ef1
7 changed files with 318 additions and 44 deletions


@@ -3,8 +3,8 @@ import asyncio
 from pathlib import Path
 
 import httpx
 
-from scrapers import livetvsx, ppv, streambtw, tvpass
-from scrapers.utils import get_logger
+from scrapers import livetvsx, ppv, streambtw, streameast, tvpass
+from scrapers.utils import UA, get_logger
 
 log = get_logger(__name__)
@@ -15,9 +15,7 @@ M3U8_FILE = Path(__file__).parent / "TV.m3u8"
 
 CLIENT = httpx.AsyncClient(
     timeout=5,
     follow_redirects=True,
-    headers={
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0"
-    },
+    headers={"User-Agent": UA},
 )
 
@@ -41,6 +39,7 @@ async def main() -> None:
         asyncio.create_task(livetvsx.main(CLIENT)),
         asyncio.create_task(ppv.main(CLIENT)),
         asyncio.create_task(streambtw.main(CLIENT)),
+        asyncio.create_task(streameast.main(CLIENT)),
         asyncio.create_task(tvpass.main(CLIENT)),
         vanilla_fetch(),
     ]
@@ -49,7 +48,9 @@ async def main() -> None:
 
     base_m3u8, tvg_chno = results[-1]
 
-    additions = livetvsx.urls | ppv.urls | streambtw.urls | tvpass.urls
+    additions = (
+        livetvsx.urls | ppv.urls | streambtw.urls | streameast.urls | tvpass.urls
+    )
 
     lines = [
         f'#EXTINF:-1 tvg-chno="{chnl_num}" tvg-id="(N/A)" tvg-name="{event}" tvg-logo="{info["logo"]}" group-title="Live Events",{event}\n{info["url"]}'


@@ -18,7 +18,7 @@ MIRRORS = [
 ]
 
 
-async def get_hrefs(client: httpx.AsyncClient, base_url: str) -> list[tuple[str, str]]:
+async def get_events(client: httpx.AsyncClient, base_url: str) -> list[tuple[str, str]]:
     log.info(f'Scraping from "{base_url}"')
 
     try:
@@ -55,7 +55,7 @@ async def get_hrefs(client: httpx.AsyncClient, base_url: str) -> list[tuple[str, str]]:
     return events.items()
 
 
-async def fetch_m3u8(client: httpx.AsyncClient, url: str) -> tuple[str, list[str]]:
+async def process_events(client: httpx.AsyncClient, url: str) -> tuple[str, list[str]]:
     try:
         r = await client.get(url)
         r.raise_for_status()
@@ -89,9 +89,9 @@ async def main(client: httpx.AsyncClient) -> None:
         log.warning("No working FSTV mirrors")
         return
 
-    events = await get_hrefs(client, base_url)
-    tasks = [fetch_m3u8(client, href) for _, href in events if href]
+    events = await get_events(client, base_url)
+    tasks = [process_events(client, href) for _, href in events if href]
     results = await asyncio.gather(*tasks)
 
     for (event, _), (match_name, m3u8_urls) in zip(events, results):


@@ -14,6 +14,7 @@ from .utils import (
     LOGOS,
     TZ,
     capture_req,
+    firefox,
     get_logger,
     load_cache,
     now,
@@ -96,11 +97,7 @@ async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO | None:
 
 
 async def process_event(url: str, url_num: int) -> str | None:
     async with async_playwright() as p:
-        browser = await p.firefox.launch(headless=True)
-        context = await browser.new_context(
-            ignore_https_errors=True  # website doesn't send valid certs
-        )
+        browser, context = await firefox(p, ignore_https_errors=True)
 
         page = await context.new_page()
@@ -217,7 +214,9 @@ async def get_events(
 ) -> list[dict[str, str]]:
     events: list[dict[str, str]] = []
 
-    window_start, window_end = now - timedelta(hours=1), now + timedelta(minutes=30)
+    start_dt = now - timedelta(minutes=30)
+    end_dt = now + timedelta(minutes=30)
 
     if buffer := await fetch_xml_stream(url, ssl_ctx):
         pub_date_format = "%a, %d %b %Y %H:%M:%S %z"
@@ -236,30 +235,33 @@ async def get_events(
                 elem.clear()
                 continue
 
-            if window_start <= dt <= window_end:
-                sport, event = (
-                    (
-                        desc.split(".")[0].strip(),
-                        " ".join(p.strip() for p in desc.split(".")[1:]),
-                    )
-                    if desc
-                    else ("", "")
-                )
+            if not start_dt <= dt <= end_dt:
+                elem.clear()
+                continue
+
+            sport, event = (
+                (
+                    desc.split(".")[0].strip(),
+                    " ".join(p.strip() for p in desc.split(".")[1:]),
+                )
+                if desc
+                else ("", "")
+            )
 
-                key = f"[{sport}: {event}] {title}"
+            key = f"[{sport}: {event}] {title}"
 
-                if key in cached_keys:
-                    elem.clear()
-                    continue
+            if key in cached_keys:
+                elem.clear()
+                continue
 
-                events.append(
-                    {
-                        "sport": sport,
-                        "event": event,
-                        "title": title,
-                        "link": link,
-                    }
-                )
+            events.append(
+                {
+                    "sport": sport,
+                    "event": event,
+                    "title": title,
+                    "link": link,
+                }
+            )
 
             elem.clear()
@@ -312,8 +314,8 @@ async def main(client: httpx.AsyncClient) -> None:
             urls[key] = cached_urls[key] = entry
 
     if new_count := len(cached_urls) - cached_count:
-        CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")
         log.info(f"Collected and cached {new_count} new event(s)")
     else:
         log.info("No new events found")
+
+    CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")


@@ -14,6 +14,7 @@ from playwright.async_api import async_playwright
 from .utils import (
     TZ,
     capture_req,
+    firefox,
     get_base,
     get_logger,
     load_cache,
@@ -55,9 +56,7 @@ async def refresh_api_cache(
 async def process_event(url: str, url_num: int) -> str | None:
     async with async_playwright() as p:
-        browser = await p.firefox.launch(headless=True)
-        context = await browser.new_context()
+        browser, context = await firefox(p)
 
         page = await context.new_page()
@@ -210,11 +209,11 @@ async def main(client: httpx.AsyncClient) -> None:
             urls[key] = cached_urls[key] = entry
 
     if new_count := len(cached_urls) - cached_count:
-        CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")
         log.info(f"Collected and cached {new_count} new event(s)")
     else:
         log.info("No new events found")
+
+    CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")
 
 
 # works if no cloudflare bot detection

M3U8/scrapers/streameast.py (new file, 206 lines)

@@ -0,0 +1,206 @@
import asyncio
import json
from datetime import datetime, timedelta
from functools import partial
from pathlib import Path
from urllib.parse import urljoin

import httpx
from playwright.async_api import async_playwright
from selectolax.parser import HTMLParser

from .utils import (
    LOGOS,
    TZ,
    capture_req,
    firefox,
    get_base,
    get_logger,
    load_cache,
    now,
    safe_process_event,
)

log = get_logger(__name__)

urls: dict[str, dict[str, str | float]] = {}

CACHE_FILE = Path(__file__).parent / "caches" / "streameast.json"

MIRRORS = [
    "https://streameast.ga",
    "https://streameast.tw",
    "https://streameast.ph",
    "https://streameast.sg",
    "https://streameast.ch",
    "https://streameast.ec",
    "https://streameast.fi",
    "https://streameast.ms",
    "https://streameast.ps",
    "https://streameast.cf",
    "https://streameast.sk",
    "https://thestreameast.co",
    "https://thestreameast.fun",
    "https://thestreameast.ru",
    "https://thestreameast.su",
]

LOGOS["CFB"] = LOGOS["NCAAF"]
LOGOS["CBB"] = LOGOS["NCAAB"]


async def process_event(url: str, url_num: int) -> str | None:
    async with async_playwright() as p:
        browser, context = await firefox(p)

        page = await context.new_page()

        captured: list[str] = []
        got_one = asyncio.Event()

        handler = partial(capture_req, captured=captured, got_one=got_one)
        page.on("request", handler)

        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=15_000)

            wait_task = asyncio.create_task(got_one.wait())

            try:
                await asyncio.wait_for(wait_task, timeout=10)
            except asyncio.TimeoutError:
                log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
                return
            finally:
                if not wait_task.done():
                    wait_task.cancel()
                    try:
                        await wait_task
                    except asyncio.CancelledError:
                        pass

            if captured:
                log.info(f"URL {url_num}) Captured M3U8")
                return captured[-1]

            log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
            return
        except Exception as e:
            log.warning(f"URL {url_num}) Exception while processing: {e}")
            return
        finally:
            page.remove_listener("request", handler)
            await page.close()
            await browser.close()


async def get_events(
    client: httpx.AsyncClient,
    url: str,
    cached_keys: list[str],
) -> list[dict[str, str]]:
    try:
        r = await client.get(url)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{url}"\n{e}')
        return []

    soup = HTMLParser(r.text)

    events = []

    start_dt = now - timedelta(minutes=30)
    end_dt = now + timedelta(minutes=30)

    for li in soup.css("li.f1-podium--item"):
        a = li.css_first("a.f1-podium--link")
        if not a:
            continue

        href = urljoin(url, a.attributes.get("href", ""))

        sport = a.css_first(".MacBaslikKat").text(strip=True)
        name = a.css_first(".MacIsimleri").text(strip=True)

        time_span = a.css_first(".f1-podium--time")
        time_text = time_span.text(strip=True)
        timestamp = int(time_span.attributes.get("data-zaman"))

        key = f"[{sport}] {name}"

        if key in cached_keys:
            continue

        event_dt = datetime.fromtimestamp(timestamp, TZ)

        if time_text == "LIVE" or (start_dt <= event_dt < end_dt):
            events.append(
                {
                    "sport": sport,
                    "event": name,
                    "link": href,
                    "logo": LOGOS.get(
                        sport,
                        "https://i.gyazo.com/ec27417a9644ae517196494afa72d2b9.png",
                    ),
                }
            )

    return events


async def main(client: httpx.AsyncClient) -> None:
    cached_urls = load_cache(CACHE_FILE, exp=14400)
    cached_count = len(cached_urls)

    urls.update(cached_urls)
    log.info(f"Collected {cached_count} event(s) from cache")

    if not (base_url := await get_base(client, MIRRORS)):
        log.warning("No working StreamEast mirrors")
        return

    log.info(f'Scraping from "{base_url}"')

    events = await get_events(
        client,
        base_url,
        set(cached_urls.keys()),
    )

    log.info(f"Processing {len(events)} new URL(s)")

    for i, ev in enumerate(events, start=1):
        url = await safe_process_event(
            lambda: process_event(ev["link"], url_num=i),
            url_num=i,
            log=log,
        )

        if url:
            entry = {
                "url": url,
                "logo": ev["logo"],
                "timestamp": now.timestamp(),
            }

            key = f"[{ev['sport']}] {ev['event']}"
            urls[key] = cached_urls[key] = entry

    if new_count := len(cached_urls) - cached_count:
        log.info(f"Collected and cached {new_count} new event(s)")
    else:
        log.info("No new events found")

    CACHE_FILE.write_text(json.dumps(cached_urls, indent=2), encoding="utf-8")
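
As an aside, the CSS selectors in get_events() imply roughly the following markup shape. This is an illustrative sketch only: the class names and the data-zaman attribute are taken from the code above, while the sample event values are invented.

# Illustrative only: sample markup guessed from the selectors used in get_events();
# the event data below is made up.
from selectolax.parser import HTMLParser

SAMPLE = """
<li class="f1-podium--item">
  <a class="f1-podium--link" href="/event/example">
    <span class="MacBaslikKat">NFL</span>
    <span class="MacIsimleri">Team A - Team B</span>
    <span class="f1-podium--time" data-zaman="1757620800">20:00</span>
  </a>
</li>
"""

soup = HTMLParser(SAMPLE)
for li in soup.css("li.f1-podium--item"):
    a = li.css_first("a.f1-podium--link")
    print(a.css_first(".MacBaslikKat").text(strip=True))                  # sport category
    print(a.css_first(".MacIsimleri").text(strip=True))                   # event name
    print(a.css_first(".f1-podium--time").attributes.get("data-zaman"))   # unix start time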


@@ -1,7 +1,9 @@
 from .config import (
     LOGOS,
     TZ,
+    UA,
     capture_req,
+    firefox,
     get_base,
     get_logger,
     load_cache,
@@ -12,7 +14,9 @@ from .config import (
 __all__ = [
     "LOGOS",
     "TZ",
+    "UA",
     "capture_req",
+    "firefox",
     "get_base",
     "get_logger",
     "load_cache",


@@ -9,12 +9,18 @@ from typing import Any
 import httpx
 import pytz
-from playwright.async_api import Request
+from playwright.async_api import Request, Playwright, Browser, BrowserContext
 
 TZ = pytz.timezone("America/New_York")
 now = datetime.now(TZ)
 
+UA = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0"
+)
+
 LOGOS = {
     "MLB": "https://i.gyazo.com/0fe7865ef2f06c9507791b24f04dbca8.png",
     "NBA": "https://i.gyazo.com/773c23570f095a5d549c23b9401d83f4.png",
@@ -162,3 +168,59 @@ def capture_req(
     if valid_m3u8.search(req.url):
         captured.append(req.url)
         got_one.set()
+
+
+async def firefox(
+    playwright: Playwright, ignore_https_errors: bool = False
+) -> tuple[Browser, BrowserContext]:
+    browser = await playwright.firefox.launch(headless=True)
+
+    context = await browser.new_context(
+        user_agent=UA,
+        viewport={"width": 1366, "height": 768},
+        device_scale_factor=1,
+        locale="en-US",
+        timezone_id="America/New_York",
+        color_scheme="dark",
+        permissions=["geolocation"],
+        extra_http_headers={
+            "Accept-Language": "en-US,en;q=0.9",
+            "Upgrade-Insecure-Requests": "1",
+        },
+        ignore_https_errors=ignore_https_errors,
+    )
+
+    await context.add_init_script(
+        """
+        Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
+
+        Object.defineProperty(navigator, 'languages', {
+            get: () => ['en-US', 'en']
+        });
+
+        Object.defineProperty(navigator, 'plugins', {
+            get: () => [1, 2, 3, 4]
+        });
+
+        const elementDescriptor = Object.getOwnPropertyDescriptor(HTMLElement.prototype, 'offsetHeight');
+
+        Object.defineProperty(HTMLDivElement.prototype, 'offsetHeight', {
+            ...elementDescriptor,
+            get: function() {
+                if (this.id === 'modernizr') { return 24; }
+                return elementDescriptor.get.apply(this);
+            }
+        });
+
+        Object.defineProperty(window.screen, 'width', { get: () => 1366 });
+        Object.defineProperty(window.screen, 'height', { get: () => 768 });
+
+        const getParameter = WebGLRenderingContext.prototype.getParameter;
+        WebGLRenderingContext.prototype.getParameter = function (param) {
+            if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL
+            if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL
+            return getParameter.apply(this, [param]);
+        };
+        """
+    )
+
+    return browser, context
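
As an aside, here is a minimal usage sketch of the new firefox() helper together with capture_req(), mirroring the process_event() functions in this commit; the target URL is a placeholder, and error handling is reduced to the essentials.

# Rough usage sketch of firefox() and capture_req() from scrapers.utils,
# modeled on the process_event() functions above; the URL is a placeholder.
import asyncio
from functools import partial

from playwright.async_api import async_playwright

from scrapers.utils import capture_req, firefox


async def first_m3u8(url: str) -> str | None:
    async with async_playwright() as p:
        browser, context = await firefox(p)  # hardened context with the shared UA
        page = await context.new_page()

        captured: list[str] = []
        got_one = asyncio.Event()
        page.on("request", partial(capture_req, captured=captured, got_one=got_one))

        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=15_000)
            await asyncio.wait_for(got_one.wait(), timeout=10)
        except asyncio.TimeoutError:
            pass  # no stream request seen within the window
        finally:
            await browser.close()

        return captured[-1] if captured else None


# asyncio.run(first_m3u8("https://example.com/some-event"))  # placeholder URL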