- edit scraping for livetvsx.py
- edit caching for streamsgate.py
- edit caching for streamhub.py
- misc edits.
This commit is contained in:
doms9 2026-04-21 17:48:21 -04:00
parent fce1e8f6a9
commit 00000d9fe2
6 changed files with 152 additions and 249 deletions

View file

@ -1,7 +1,6 @@
import asyncio import re
from functools import partial from functools import partial
from playwright.async_api import Browser, Page, TimeoutError
from selectolax.parser import HTMLParser from selectolax.parser import HTMLParser
from .utils import Cache, Time, get_logger, leagues, network from .utils import Cache, Time, get_logger, leagues, network
@ -17,83 +16,61 @@ CACHE_FILE = Cache(TAG, exp=10_800)
BASE_URL = "https://livetv.sx/export/webmasters.php" BASE_URL = "https://livetv.sx/export/webmasters.php"
async def process_event( async def process_event(url: str, url_num: int) -> tuple[str | None, str | None]:
url: str, nones = None, None
url_num: int,
page: Page,
) -> str | None:
captured: list[str] = [] if not (ev_data_1 := await network.request(url, log=log)):
log.warning(f"URL {url_num}) Failed to load url. (EVD1)")
return nones
got_one = asyncio.Event() soup_1 = HTMLParser(ev_data_1.content)
handler = partial( for a_elem in soup_1.css("a"):
network.capture_req, if not (src_title := a_elem.attributes.get("title")) or (
captured=captured, "aliez" not in src_title.lower()
got_one=got_one, ):
) continue
page.on("request", handler) href = a_elem.attributes["href"]
try:
resp = await page.goto(
url,
wait_until="domcontentloaded",
timeout=10_000,
)
if not resp or resp.status != 200:
log.warning(
f"URL {url_num}) Status Code: {resp.status if resp else 'None'}"
)
return
try:
event_a = page.locator('a[title*="Aliez"]').first
href = await event_a.get_attribute("href", timeout=1_250)
except TimeoutError:
log.warning(f"URL {url_num}) No valid sources found.")
return
event_url = href if href.startswith("http") else f"https:{href}" event_url = href if href.startswith("http") else f"https:{href}"
break
await page.goto( else:
event_url, log.warning(f"URL {url_num}) No valid sources found.")
wait_until="domcontentloaded", return nones
timeout=5_000,
)
wait_task = asyncio.create_task(got_one.wait()) if not (ev_data_2 := await network.request(event_url, log=log)):
log.warning(f"URL {url_num}) Failed to load url. (EVD2)")
return nones
try: soup_2 = HTMLParser(ev_data_2.content)
await asyncio.wait_for(wait_task, timeout=6)
except asyncio.TimeoutError:
log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
return
finally: ifr_1 = soup_2.css_first("tr > td > iframe")
if not wait_task.done():
wait_task.cancel()
try: if not ifr_1 or not (ifr_1_src := ifr_1.attributes.get("src")):
await wait_task log.warning(f"URL {url_num}) No iframe element found.")
except asyncio.CancelledError: return nones
pass
if captured: ifr_1_src = "".join(
log.info(f"URL {url_num}) Captured M3U8") (ifr_1_src if ifr_1_src.startswith("http") else f"https:{ifr_1_src}").split()
return captured[0] )
log.warning(f"URL {url_num}) No M3U8 captured after waiting.") if not (ev_data_3 := await network.request(ifr_1_src, log=log)):
return log.warning(f"URL {url_num}) Failed to load url. (EVD3)")
return nones
except Exception as e: pattern = re.compile(r'pl\.init\((\'|\")([^"]*)(\'|\")\)', re.I)
log.warning(f"URL {url_num}) {e}")
return
finally: if not (match := pattern.search(ev_data_3.text)):
page.remove_listener("request", handler) log.warning(f"URL {url_num}) No M3U8 source found.")
return nones
log.info(f"URL {url_num}) Captured M3U8")
m3u: str = match[2] if match[2].startswith("http") else f"https:{match[2]}"
return m3u, ifr_1_src
async def get_events(cached_keys: list[str]) -> list[dict[str, str]]: async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
@ -102,6 +79,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
php_data = await network.unvd_client.get(BASE_URL, params={"lang": "en"}) php_data = await network.unvd_client.get(BASE_URL, params={"lang": "en"})
if php_data.status_code != 200: if php_data.status_code != 200:
log.warning("Failed to get php data.")
return events return events
soup = HTMLParser(php_data.content) soup = HTMLParser(php_data.content)
@ -143,7 +121,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
return events return events
async def scrape(browser: Browser) -> None: async def scrape() -> None:
cached_urls = CACHE_FILE.load() cached_urls = CACHE_FILE.load()
valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
@ -161,49 +139,45 @@ async def scrape(browser: Browser) -> None:
now = Time.clean(Time.now()) now = Time.clean(Time.now())
async with network.event_context(browser, ignore_https=True) as context: for i, ev in enumerate(events, start=1):
for i, ev in enumerate(events, start=1): handler = partial(
async with network.event_page(context) as page: process_event,
handler = partial( url=(link := ev["link"]),
process_event, url_num=i,
url=(link := ev["link"]), )
url_num=i,
page=page,
)
url = await network.safe_process( url, iframe = await network.safe_process(
handler, handler,
url_num=i, url_num=i,
semaphore=network.PW_S, semaphore=network.HTTP_S,
log=log, log=log,
timeout=20, )
)
sport, league, event = ( sport, league, event = (
ev["sport"], ev["sport"],
ev["league"], ev["league"],
ev["event"], ev["event"],
) )
key = f"[{sport} - {league}] {event} ({TAG})" key = f"[{sport} - {league}] {event} ({TAG})"
tvg_id, logo = leagues.get_tvg_info(sport, event) tvg_id, logo = leagues.get_tvg_info(sport, event)
entry = { entry = {
"url": url, "url": url,
"logo": logo, "logo": logo,
"base": "https://livetv.sx/enx/", "base": iframe,
"timestamp": now.timestamp(), "timestamp": now.timestamp(),
"id": tvg_id or "Live.Event.us", "id": tvg_id or "Live.Event.us",
"link": link, "link": link,
} }
cached_urls[key] = entry cached_urls[key] = entry
if url: if url:
valid_count += 1 valid_count += 1
urls[key] = entry urls[key] = entry
log.info(f"Collected and cached {valid_count - cached_count} new event(s)") log.info(f"Collected and cached {valid_count - cached_count} new event(s)")

View file

@ -163,7 +163,7 @@ async def scrape() -> None:
url = await network.safe_process( url = await network.safe_process(
handler, handler,
url_num=i, url_num=i,
semaphore=network.PW_S, semaphore=network.HTTP_S,
log=log, log=log,
) )

View file

@ -117,7 +117,7 @@ async def scrape() -> None:
url = await network.safe_process( url = await network.safe_process(
handler, handler,
url_num=i, url_num=i,
semaphore=network.PW_S, semaphore=network.HTTP_S,
log=log, log=log,
) )

View file

@ -13,9 +13,7 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "STRMHUB" TAG = "STRMHUB"
CACHE_FILE = Cache(TAG, exp=10_800) CACHE_FILE = Cache(TAG, exp=28_800)
HTML_FILE = Cache(f"{TAG}-html", exp=19_800)
BASE_URL = "https://livesports4u.net" BASE_URL = "https://livesports4u.net"
@ -116,116 +114,78 @@ async def process_event(
page.remove_listener("request", handler) page.remove_listener("request", handler)
async def refresh_html_cache( async def get_events() -> list[dict[str, str]]:
date: str, now = Time.clean(Time.now())
sport_id: str,
ts: float,
) -> dict[str, dict[str, str | float]]:
events = {} tasks = [
network.request(
if not (
html_data := await network.request(
urljoin(BASE_URL, f"events/{date}"), urljoin(BASE_URL, f"events/{date}"),
params={"sport_id": sport_id}, params={"sport_id": sport_id},
log=log, log=log,
) )
): for date in [now.date(), now.delta(days=1).date()]
for sport_id in SPORT_ENDPOINTS
]
results = await asyncio.gather(*tasks)
events = []
if not (soups := [HTMLParser(html.content) for html in results if html]):
return events return events
soup = HTMLParser(html_data.content) for soup in soups:
for section in soup.css(".events-section"):
for section in soup.css(".events-section"): if not (sport_node := section.css_first(".section-titlte")):
if not (sport_node := section.css_first(".section-titlte")):
continue
sport = sport_node.text(strip=True)
for event in section.css(".section-event"):
event_name = "Live Event"
if teams := event.css_first(".event-competitors"):
home, away = teams.text(strip=True).split("vs.")
event_name = f"{away} vs {home}"
if not (event_button := event.css_first(".event-button a")) or not (
href := event_button.attributes.get("href")
):
continue continue
event_date = event.css_first(".event-countdown").attributes.get( sport = sport_node.text(strip=True)
"data-start"
)
event_dt = Time.from_str(event_date, timezone="UTC") for event in section.css(".section-event"):
event_name = "Live Event"
key = f"[{sport}] {event_name} ({TAG})" if teams := event.css_first(".event-competitors"):
home, away = teams.text(strip=True).split("vs.")
events[key] = { event_name = f"{away} vs {home}"
"sport": sport,
"event": event_name, if not (event_button := event.css_first(".event-button a")) or not (
"link": href, href := event_button.attributes.get("href")
"event_ts": event_dt.timestamp(), ):
"timestamp": ts, continue
}
event_date = event.css_first(".event-countdown").attributes.get(
"data-start"
)
event_dt = Time.from_str(event_date, timezone="UTC")
if event_dt.date() != now.date():
continue
events.append(
{
"sport": sport,
"event": event_name,
"link": href,
"timestamp": now.timestamp(),
}
)
return events return events
async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
now = Time.clean(Time.now())
if not (events := HTML_FILE.load()):
log.info("Refreshing HTML cache")
tasks = [
refresh_html_cache(
date,
sport_id,
now.timestamp(),
)
for date in [now.date(), now.delta(days=1).date()]
for sport_id in SPORT_ENDPOINTS
]
results = await asyncio.gather(*tasks)
events = {k: v for data in results for k, v in data.items()}
HTML_FILE.write(events)
live = []
start_ts = now.delta(minutes=-30).timestamp()
end_ts = now.delta(minutes=30).timestamp()
for k, v in events.items():
if k in cached_keys:
continue
if not start_ts <= v["event_ts"] <= end_ts:
continue
live.append(v)
return live
async def scrape(browser: Browser) -> None: async def scrape(browser: Browser) -> None:
cached_urls = CACHE_FILE.load() if cached_urls := CACHE_FILE.load():
urls.update({k: v for k, v in cached_urls.items() if v["url"]})
valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} log.info(f"Loaded {len(urls)} event(s) from cache")
valid_count = cached_count = len(valid_urls) return
urls.update(valid_urls)
log.info(f"Loaded {cached_count} event(s) from cache")
log.info(f'Scraping from "{BASE_URL}"') log.info(f'Scraping from "{BASE_URL}"')
if events := await get_events(cached_urls.keys()): if events := await get_events():
log.info(f"Processing {len(events)} new URL(s)") log.info(f"Processing {len(events)} new URL(s)")
async with network.event_context(browser) as context: async with network.event_context(browser) as context:
@ -249,7 +209,7 @@ async def scrape(browser: Browser) -> None:
sport, event, ts = ( sport, event, ts = (
ev["sport"], ev["sport"],
ev["event"], ev["event"],
ev["event_ts"], ev["timestamp"],
) )
key = f"[{sport}] {event} ({TAG})" key = f"[{sport}] {event} ({TAG})"
@ -268,13 +228,11 @@ async def scrape(browser: Browser) -> None:
cached_urls[key] = entry cached_urls[key] = entry
if url: if url:
valid_count += 1
entry["url"] = url.split("?st")[0] entry["url"] = url.split("?st")[0]
urls[key] = entry urls[key] = entry
log.info(f"Collected and cached {valid_count - cached_count} new event(s)") log.info(f"Collected and cached {len(urls)} new event(s)")
else: else:
log.info("No new events found") log.info("No new events found")

View file

@ -2,7 +2,6 @@ import asyncio
import re import re
from functools import partial from functools import partial
from itertools import chain from itertools import chain
from typing import Any
from urllib.parse import urljoin from urllib.parse import urljoin
from selectolax.parser import HTMLParser from selectolax.parser import HTMLParser
@ -15,9 +14,7 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "STRMSGATE" TAG = "STRMSGATE"
CACHE_FILE = Cache(TAG, exp=10_800) CACHE_FILE = Cache(TAG, exp=28_800)
API_FILE = Cache(f"{TAG}-api", exp=19_800)
BASE_URL = "https://streamsgates.io" BASE_URL = "https://streamsgates.io"
@ -85,36 +82,17 @@ async def process_event(url: str, url_num: int) -> tuple[str | None, str | None]
return match[3], ifr_src return match[3], ifr_src
async def refresh_api_cache(now_ts: float) -> list[dict[str, Any]]: async def get_events() -> list[dict[str, str]]:
now = Time.clean(Time.now())
tasks = [network.request(url, log=log) for url in SPORT_URLS] tasks = [network.request(url, log=log) for url in SPORT_URLS]
results = await asyncio.gather(*tasks) results = await asyncio.gather(*tasks)
if not (data := [*chain.from_iterable(r.json() for r in results if r)]):
return [{"timestamp": now_ts}]
for ev in data:
ev["ts"] = ev.pop("timestamp")
data[-1]["timestamp"] = now_ts
return data
async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
now = Time.clean(Time.now())
if not (api_data := API_FILE.load(per_entry=False, index=-1)):
log.info("Refreshing API cache")
api_data = await refresh_api_cache(now.timestamp())
API_FILE.write(api_data)
events = [] events = []
start_dt = now.delta(hours=-2.5) if not (api_data := [*chain.from_iterable(r.json() for r in results if r)]):
end_dt = now.delta(minutes=30) return events
for stream_group in api_data: for stream_group in api_data:
date = stream_group.get("time") date = stream_group.get("time")
@ -123,34 +101,30 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
t1, t2 = stream_group.get("away"), stream_group.get("home") t1, t2 = stream_group.get("away"), stream_group.get("home")
if not (t1 and t2):
continue
event = get_event(t1, t2)
if not (date and sport): if not (date and sport):
continue continue
if f"[{sport}] {event} ({TAG})" in cached_keys:
continue
event_dt = Time.from_str(date, timezone="UTC") event_dt = Time.from_str(date, timezone="UTC")
if not start_dt <= event_dt <= end_dt: if event_dt.date() != now.date():
continue continue
if not (streams := stream_group.get("streams")): if not (streams := stream_group.get("streams")) or not (
url := streams[0].get("url")
):
continue continue
if not (url := streams[0].get("url")): if not (t1 and t2):
continue continue
event = get_event(t1, t2)
events.append( events.append(
{ {
"sport": sport, "sport": sport,
"event": event, "event": event,
"link": url, "link": url,
"timestamp": event_dt.timestamp(), "timestamp": now.timestamp(),
} }
) )
@ -158,19 +132,16 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
async def scrape() -> None: async def scrape() -> None:
cached_urls = CACHE_FILE.load() if cached_urls := CACHE_FILE.load():
urls.update({k: v for k, v in cached_urls.items() if v["url"]})
valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} log.info(f"Loaded {len(urls)} event(s) from cache")
valid_count = cached_count = len(valid_urls) return
urls.update(valid_urls)
log.info(f"Loaded {cached_count} event(s) from cache")
log.info(f'Scraping from "{BASE_URL}"') log.info(f'Scraping from "{BASE_URL}"')
if events := await get_events(cached_urls.keys()): if events := await get_events():
log.info(f"Processing {len(events)} new URL(s)") log.info(f"Processing {len(events)} new URL(s)")
for i, ev in enumerate(events, start=1): for i, ev in enumerate(events, start=1):
@ -183,7 +154,7 @@ async def scrape() -> None:
url, iframe = await network.safe_process( url, iframe = await network.safe_process(
handler, handler,
url_num=i, url_num=i,
semaphore=network.PW_S, semaphore=network.HTTP_S,
log=log, log=log,
) )
@ -209,11 +180,11 @@ async def scrape() -> None:
cached_urls[key] = entry cached_urls[key] = entry
if url: if url:
valid_count += 1 entry["url"] = url.split("?st")[0]
urls[key] = entry urls[key] = entry
log.info(f"Collected and cached {valid_count - cached_count} new event(s)") log.info(f"Collected and cached {len(urls)} new event(s)")
else: else:
log.info("No new events found") log.info("No new events found")

View file

@ -157,7 +157,7 @@ async def scrape() -> None:
url = await network.safe_process( url = await network.safe_process(
handler, handler,
url_num=i, url_num=i,
semaphore=network.PW_S, semaphore=network.HTTP_S,
log=log, log=log,
) )