fix streambtw.py scraping
This commit is contained in:
doms9 2026-02-08 14:55:52 -05:00
parent 443b8465d4
commit 00000d9f85
2 changed files with 40 additions and 47 deletions

View file

@@ -1,9 +1,7 @@
import base64 import base64
import json
import re import re
from functools import partial from functools import partial
from urllib.parse import urljoin
from selectolax.parser import HTMLParser
from .utils import Cache, Time, get_logger, leagues, network from .utils import Cache, Time, get_logger, leagues, network
@@ -15,6 +13,8 @@ TAG = "STRMBTW"
CACHE_FILE = Cache(TAG, exp=3_600) CACHE_FILE = Cache(TAG, exp=3_600)
API_FILE = Cache(f"{TAG}-api", exp=19_800)
BASE_URL = "https://hiteasport.info" BASE_URL = "https://hiteasport.info"
@@ -32,7 +32,6 @@ async def process_event(url: str, url_num: int) -> str | None:
if not (match := valid_m3u8.search(html_data.text)): if not (match := valid_m3u8.search(html_data.text)):
log.info(f"URL {url_num}) No M3U8 found") log.info(f"URL {url_num}) No M3U8 found")
return return
stream_link: str = match[2] stream_link: str = match[2]
@@ -46,56 +45,49 @@ async def process_event(url: str, url_num: int) -> str | None:
async def get_events() -> list[dict[str, str]]:
    """Build the list of events advertised by the site's JSON API.

    The API payload is taken from ``API_FILE`` when a cached copy loads.
    Otherwise it is requested from ``public/api.php`` under ``BASE_URL``
    and written back to the cache, stamped with the current time, whether
    or not the request produced usable data.

    Returns:
        One ``{"sport", "event", "link"}`` dict per usable API item. The
        list is empty when the payload's ``updated_at`` date differs from
        today's date, or when the payload has no groups.
    """
    now = Time.clean(Time.now())

    api_data = API_FILE.load(per_entry=False)

    if not api_data:
        log.info("Refreshing API cache")

        # Placeholder written as-is when the request fails or the response
        # is not a JSON object.
        api_data = {"timestamp": now.timestamp()}

        if r := await network.request(
            urljoin(BASE_URL, "public/api.php"),
            log=log,
            params={"action": "get"},
        ):
            payload = r.json()

            # Only a JSON object can carry "groups"/"updated_at"; anything
            # else would break the key lookups below.
            if isinstance(payload, dict):
                payload["timestamp"] = now.timestamp()
                api_data = payload

        API_FILE.write(api_data)

    events: list[dict[str, str]] = []

    # Ignore a payload whose "updated_at" date is not today's date.
    # NOTE(review): the comparison assumes Time.from_str(..., timezone="UTC")
    # and Time.now() yield comparable dates — confirm against utils.
    if last_update := api_data.get("updated_at"):
        last_update_dt = Time.from_str(last_update, timezone="UTC")

        if last_update_dt.date() != now.date():
            return events

    # `or []` also covers an explicit `"groups": null` in the payload.
    for info in api_data.get("groups") or []:
        items = info.get("items")

        if not items:
            continue

        # A missing or empty group title falls back to a generic label.
        league = fix_league(info.get("title") or "Live Event")

        for event in items:
            event_name = event.get("title")
            link = event.get("url")

            # Skip malformed entries instead of letting one bad item raise
            # KeyError and discard every other event.
            if not (event_name and link):
                continue

            events.append(
                {
                    "sport": league,
                    "event": event_name,
                    "link": link,
                }
            )

    return events

View file

@@ -101,6 +101,7 @@ class Time(datetime):
"%Y-%m-%d %H:%M %p", "%Y-%m-%d %H:%M %p",
"%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S",
"%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%SZ",
"%Y-%m-%dT%H:%M:%S%z",
"%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%S.%fZ",
"%Y/%m/%d %H:%M", "%Y/%m/%d %H:%M",
"%Y/%m/%d %H:%M:%S", "%Y/%m/%d %H:%M:%S",