misc. edits
doms9 2025-12-18 03:04:11 -05:00
parent a8ead389ea
commit 00000d9079
17 changed files with 273 additions and 552 deletions

View file

@@ -48,22 +48,22 @@ async def main() -> None:
     base_m3u8, tvg_chno = load_base()

     tasks = [
-        asyncio.create_task(fawa.scrape(network.client)),
-        asyncio.create_task(istreameast.scrape(network.client)),
-        asyncio.create_task(lotus.scrape(network.client)),
+        asyncio.create_task(fawa.scrape()),
+        asyncio.create_task(istreameast.scrape()),
+        asyncio.create_task(lotus.scrape()),
         asyncio.create_task(pixel.scrape()),
-        asyncio.create_task(ppv.scrape(network.client)),
-        asyncio.create_task(roxie.scrape(network.client)),
-        asyncio.create_task(shark.scrape(network.client)),
-        asyncio.create_task(sport9.scrape(network.client)),
-        asyncio.create_task(streamcenter.scrape(network.client)),
-        asyncio.create_task(streamfree.scrape(network.client)),
-        asyncio.create_task(streamhub.scrape(network.client)),
-        asyncio.create_task(streamsgate.scrape(network.client)),
-        asyncio.create_task(strmd.scrape(network.client)),
-        asyncio.create_task(tvpass.scrape(network.client)),
-        asyncio.create_task(watchfooty.scrape(network.client)),
-        asyncio.create_task(webcast.scrape(network.client)),
+        asyncio.create_task(ppv.scrape()),
+        asyncio.create_task(roxie.scrape()),
+        asyncio.create_task(shark.scrape()),
+        asyncio.create_task(sport9.scrape()),
+        asyncio.create_task(streamcenter.scrape()),
+        asyncio.create_task(streamfree.scrape()),
+        asyncio.create_task(streamhub.scrape()),
+        asyncio.create_task(streamsgate.scrape()),
+        asyncio.create_task(strmd.scrape()),
+        asyncio.create_task(tvpass.scrape()),
+        asyncio.create_task(watchfooty.scrape()),
+        asyncio.create_task(webcast.scrape()),
     ]

     await asyncio.gather(*tasks)

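The change above drops the shared httpx client argument from every scraper's scrape() coroutine; each module now reaches the shared client through the network helper exported by .utils. A minimal sketch of the new calling convention, using an illustrative module name, cache file, and URL that are not part of this commit:

# hypothetical scraper module following the new convention
from .utils import Cache, get_logger, network

log = get_logger(__name__)
urls: dict[str, dict] = {}
CACHE_FILE = Cache("example.json", exp=3_600)  # illustrative cache file
BASE_URL = "https://example.com/"              # illustrative URL


async def scrape() -> None:
    # no AsyncClient parameter anymore; network.request wraps the shared client
    if not (r := await network.request(BASE_URL, log=log)):
        return

    ...  # parse r.text / r.json(), fill `urls`, then CACHE_FILE.write(urls)

main() then simply awaits each scraper's scrape() with no arguments, as in the task list above.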
View file

@@ -2,7 +2,6 @@ import re
 from functools import partial
 from urllib.parse import quote, urljoin

-import httpx
 from selectolax.parser import HTMLParser

 from .utils import Cache, Time, get_logger, leagues, network
@@ -18,17 +17,9 @@ CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
 BASE_URL = "http://www.fawanews.sc/"


-async def process_event(
-    client: httpx.AsyncClient,
-    url: str,
-    url_num: int,
-) -> str | None:
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'URL {url_num}) Failed to fetch "{url}": {e}')
+async def process_event(url: str, url_num: int) -> str | None:
+    if not (html_data := await network.request(url, log=log)):
+        log.info(f"URL {url_num}) Failed to load url.")
         return

     valid_m3u8 = re.compile(
@@ -36,7 +27,7 @@ async def process_event(
         re.IGNORECASE,
     )

-    if not (match := valid_m3u8.search(r.text)):
+    if not (match := valid_m3u8.search(html_data.text)):
         log.info(f"URL {url_num}) No M3U8 found")
         return
@@ -44,25 +35,17 @@ async def process_event(
     return match[2]


-async def get_events(
-    client: httpx.AsyncClient,
-    cached_hrefs: set[str],
-) -> list[dict[str, str]]:
-    try:
-        r = await client.get(BASE_URL)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{BASE_URL}": {e}')
-        return []
+async def get_events(cached_hrefs: set[str]) -> list[dict[str, str]]:
+    events = []
+
+    if not (html_data := await network.request(BASE_URL, log=log)):
+        return events

-    soup = HTMLParser(r.content)
+    soup = HTMLParser(html_data.content)
     valid_event = re.compile(r"\d{1,2}:\d{1,2}")
     clean_event = re.compile(r"\s+-+\s+\w{1,4}")
-    events = []

     for item in soup.css(".user-item"):
         text = item.css_first(".user-item__name")
         subtext = item.css_first(".user-item__playing")
@@ -98,7 +81,7 @@ async def get_events(
     return events


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_hrefs = {entry["href"] for entry in cached_urls.values()}
     cached_count = len(cached_urls)
@@ -108,7 +91,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')

-    events = await get_events(client, cached_hrefs)
+    events = await get_events(cached_hrefs)

     log.info(f"Processing {len(events)} new URL(s)")
@@ -118,7 +101,6 @@ async def scrape(client: httpx.AsyncClient) -> None:
     for i, ev in enumerate(events, start=1):
         handler = partial(
             process_event,
-            client=client,
             url=ev["link"],
             url_num=i,
         )

View file

@@ -1,10 +1,9 @@
 import base64
 import re

-import httpx
 from selectolax.parser import HTMLParser

-from .utils import Cache, Time, get_logger, leagues
+from .utils import Cache, Time, get_logger, leagues, network

 log = get_logger(__name__)
@@ -17,31 +16,14 @@ CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=3_600)
 BASE_URL = "https://istreameast.app"


-async def get_html_data(client: httpx.AsyncClient, url: str) -> str:
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{url}": {e}')
-        return b""
-
-    return r.text
-
-
-async def process_event(
-    client: httpx.AsyncClient,
-    url: str,
-    url_num: int,
-) -> str | None:
+async def process_event(url: str, url_num: int) -> str | None:
     pattern = re.compile(r"source:\s*window\.atob\(\s*'([^']+)'\s*\)", re.IGNORECASE)

-    if not (event_data := await get_html_data(client, url)):
-        log.warning(f"URL {url_num}) Failed to load event url.")
+    if not (event_data := await network.request(url, log=log)):
+        log.info(f"URL {url_num}) Failed to load url.")
         return

-    soup = HTMLParser(event_data)
+    soup = HTMLParser(event_data.content)

     if not (iframe := soup.css_first("iframe#wp_player")):
         log.warning(f"URL {url_num}) No iframe element found.")
@@ -51,11 +33,11 @@ async def process_event(
         log.warning(f"URL {url_num}) No iframe source found.")
         return

-    if not (iframe_src_data := await get_html_data(client, iframe_src)):
-        log.warning(f"URL {url_num}) Failed to load iframe source.")
+    if not (iframe_src_data := await network.request(iframe_src, log=log)):
+        log.info(f"URL {url_num}) Failed to load iframe source.")
         return

-    if not (match := pattern.search(iframe_src_data)):
+    if not (match := pattern.search(iframe_src_data.text)):
         log.warning(f"URL {url_num}) No Clappr source found.")
         return
@@ -63,16 +45,15 @@ async def process_event(
     return base64.b64decode(match[1]).decode("utf-8")


-async def get_events(
-    client: httpx.AsyncClient, cached_keys: set[str]
-) -> list[dict[str, str]]:
+async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
+    events = []
+
+    if not (html_data := await network.request(BASE_URL, log=log)):
+        return events
+
     pattern = re.compile(r"^(?:LIVE|\d+\s+(minutes?)\b)", re.IGNORECASE)
-    html_data = await get_html_data(client, BASE_URL)
-    soup = HTMLParser(html_data)
-    events = []
+    soup = HTMLParser(html_data.content)

     for link in soup.css("li.f1-podium--item > a.f1-podium--link"):
         li_item = link.parent
@@ -90,6 +71,9 @@ async def get_events(
         if inner_span := driver_elem.css_first("span.d-md-inline"):
             event_name = inner_span.text(strip=True)

+        if f"[{sport}] {event_name} ({TAG})" in cached_keys:
+            continue
+
         if not (href := link.attributes.get("href")):
             continue
@@ -101,11 +85,6 @@ async def get_events(
         if not pattern.search(time_text):
             continue

-        key = f"[{sport}] {event_name} ({TAG})"
-
-        if cached_keys & {key}:
-            continue
-
         events.append(
             {
                 "sport": sport,
@@ -117,7 +96,7 @@ async def get_events(
     return events


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
     urls.update(cached_urls)
@@ -126,7 +105,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')

-    events = await get_events(client, set(cached_urls.keys()))
+    events = await get_events(cached_urls.keys())

     log.info(f"Processing {len(events)} new URL(s)")
@@ -134,11 +113,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     now = Time.clean(Time.now()).timestamp()

     for i, ev in enumerate(events, start=1):
-        if url := await process_event(
-            client,
-            ev["link"],
-            i,
-        ):
+        if url := await process_event(ev["link"], i):
             sport, event, link = (
                 ev["sport"],
                 ev["event"],

View file

@@ -1,6 +1,5 @@
 from functools import partial

-import httpx
 from playwright.async_api import async_playwright

 from .utils import Cache, Time, get_logger, leagues, network
@@ -22,40 +21,16 @@ def fix_league(s: str) -> str:
     return " ".join(x.capitalize() for x in s.split()) if len(s) > 5 else s.upper()


-async def refresh_api_cache(
-    client: httpx.AsyncClient,
-    url: str,
-    now_ts: float,
-) -> dict[str, dict[str, str]]:
-    log.info("Refreshing API cache")
-
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{url}": {e}')
-        return {}
-
-    if not (data := r.json()):
-        return {}
-
-    data["timestamp"] = now_ts
-    return data
-
-
-async def get_events(
-    client: httpx.AsyncClient, cached_keys: set[str]
-) -> list[dict[str, str]]:
-    now = Time.now()
+async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
+    now = Time.clean(Time.now())

     if not (api_data := API_CACHE.load(per_entry=False)):
-        api_data = await refresh_api_cache(
-            client,
-            BASE_URL,
-            now.timestamp(),
-        )
+        api_data = {}
+
+        if r := await network.request(BASE_URL, log=log):
+            api_data: dict = r.json()
+
+        api_data["timestamp"] = now.timestamp()
         API_CACHE.write(api_data)
@@ -68,9 +43,14 @@ async def get_events(
             continue

         for event in info["items"]:
-            event_league = event["league"]
-
-            if event_league == "channel tv":
+            if (event_league := event["league"]) == "channel tv":
+                continue
+
+            sport = fix_league(event_league)
+            event_name = event["title"]
+
+            if f"[{sport}] {event_name} ({TAG})" in cached_keys:
                 continue

             event_streams: list[dict[str, str]] = event["streams"]
@@ -78,26 +58,19 @@ async def get_events(
             if not (event_link := event_streams[0].get("link")):
                 continue

-            sport = fix_league(event_league)
-            event_name = event["title"]
-
-            key = f"[{sport}] {event_name} ({TAG})"
-
-            if cached_keys & {key}:
-                continue
-
             events.append(
                 {
                     "sport": sport,
                     "event": event_name,
                     "link": event_link,
+                    "timestamp": now.timestamp(),
                 }
             )

     return events


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
     urls.update(cached_urls)
@@ -106,13 +79,11 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')

-    events = await get_events(client, set(cached_urls.keys()))
+    events = await get_events(cached_urls.keys())

     log.info(f"Processing {len(events)} new URL(s)")

     if events:
-        now = Time.clean(Time.now()).timestamp()
-
         async with async_playwright() as p:
             browser, context = await network.browser(p)
@@ -132,10 +103,11 @@ async def scrape(client: httpx.AsyncClient) -> None:
                 )

                 if url:
-                    sport, event, link = (
+                    sport, event, link, ts = (
                         ev["sport"],
                         ev["event"],
                         ev["link"],
+                        ev["timestamp"],
                     )

                     tvg_id, logo = leagues.get_tvg_info(sport, event)
@@ -146,7 +118,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
                         "url": url,
                         "logo": logo,
                         "base": "https://vividmosaica.com/",
-                        "timestamp": now,
+                        "timestamp": ts,
                         "id": tvg_id or "Live.Event.us",
                         "link": link,
                     }

View file

@@ -1,6 +1,5 @@
 from functools import partial

-import httpx
 from playwright.async_api import async_playwright

 from .utils import Cache, Time, get_logger, leagues, network
@@ -28,35 +27,17 @@ BASE_MIRRORS = [
 ]


-async def refresh_api_cache(
-    client: httpx.AsyncClient,
-    url: str,
-) -> dict[str, dict[str, str]]:
-    log.info("Refreshing API cache")
-
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{url}": {e}')
-        return {}
-
-    return r.json()
-
-
-async def get_events(
-    client: httpx.AsyncClient,
-    api_url: str,
-    cached_keys: set[str],
-) -> list[dict[str, str]]:
+async def get_events(api_url: str, cached_keys: list[str]) -> list[dict[str, str]]:
+    events = []
+
     if not (api_data := API_FILE.load(per_entry=False)):
-        api_data = await refresh_api_cache(client, api_url)
+        api_data = {}
+
+        if r := await network.request(api_url, log=log):
+            api_data: dict = r.json()
+
         API_FILE.write(api_data)

-    events = []
     now = Time.clean(Time.now())
     start_dt = now.delta(minutes=-30)
     end_dt = now.delta(minutes=30)
@@ -76,9 +57,7 @@ async def get_events(
         if not (name and start_ts and iframe):
             continue

-        key = f"[{sport}] {name} ({TAG})"
-
-        if cached_keys & {key}:
+        if f"[{sport}] {name} ({TAG})" in cached_keys:
             continue

         event_dt = Time.from_ts(start_ts)
@@ -99,7 +78,7 @@ async def get_events(
     return events


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
     urls.update(cached_urls)
@@ -117,11 +96,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{base_url}"')

-    events = await get_events(
-        client,
-        api_url,
-        set(cached_urls.keys()),
-    )
+    events = await get_events(api_url, cached_urls.keys())

     log.info(f"Processing {len(events)} new URL(s)")

View file

@@ -3,7 +3,6 @@ import re
 from functools import partial
 from urllib.parse import urljoin

-import httpx
 from selectolax.parser import HTMLParser

 from .utils import Cache, Time, get_logger, leagues, network
@@ -31,17 +30,8 @@ SPORT_ENDPOINTS = {
 }


-async def process_event(
-    client: httpx.AsyncClient,
-    url: str,
-    url_num: int,
-) -> str | None:
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'URL {url_num}) Failed to fetch "{url}": {e}')
+async def process_event(url: str, url_num: int) -> str | None:
+    if not (html_data := await network.request(url, log=log)):
         return

     valid_m3u8 = re.compile(
@@ -49,7 +39,7 @@ async def process_event(
         re.IGNORECASE,
     )

-    if not (match := valid_m3u8.search(r.text)):
+    if not (match := valid_m3u8.search(html_data.text)):
         log.info(f"URL {url_num}) No M3U8 found")
         return
@@ -57,31 +47,19 @@ async def process_event(
     return match[1]


-async def get_html_data(client: httpx.AsyncClient, url: str) -> bytes:
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{url}": {e}')
-        return b""
-
-    return r.content
-
-
 async def refresh_html_cache(
-    client: httpx.AsyncClient,
     url: str,
     sport: str,
     now_ts: float,
 ) -> dict[str, dict[str, str | float]]:
-    html_data = await get_html_data(client, url)
-    soup = HTMLParser(html_data)
     events = {}

+    if not (html_data := await network.request(url, log=log)):
+        return events
+
+    soup = HTMLParser(html_data.content)
+
     for row in soup.css("table#eventsTable tbody tr"):
         if not (a_tag := row.css_first("td a")):
             continue
@@ -113,9 +91,7 @@ async def refresh_html_cache(
     return events


-async def get_events(
-    client: httpx.AsyncClient, cached_keys: set[str]
-) -> list[dict[str, str]]:
+async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     now = Time.clean(Time.now())

     if not (events := HTML_CACHE.load()):
@@ -125,7 +101,6 @@ async def get_events(
         tasks = [
             refresh_html_cache(
-                client,
                 url,
                 sport,
                 now.timestamp(),
@@ -145,7 +120,7 @@ async def get_events(
     end_ts = now.delta(minutes=30).timestamp()

     for k, v in events.items():
-        if cached_keys & {k}:
+        if k in cached_keys:
             continue

         if not start_ts <= v["event_ts"] <= end_ts:
@@ -156,7 +131,7 @@ async def get_events(
     return live


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
     urls.update(cached_urls)
@@ -165,7 +140,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')

-    events = await get_events(client, set(cached_urls.keys()))
+    events = await get_events(cached_urls.keys())

     log.info(f"Processing {len(events)} new URL(s)")
@@ -173,7 +148,6 @@ async def scrape(client: httpx.AsyncClient) -> None:
     for i, ev in enumerate(events, start=1):
         handler = partial(
             process_event,
-            client=client,
             url=ev["link"],
             url_num=i,
         )

View file

@@ -1,7 +1,6 @@
 import re
 from functools import partial

-import httpx
 from selectolax.parser import HTMLParser

 from .utils import Cache, Time, get_logger, leagues, network
@@ -19,49 +18,32 @@ HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=19_800)
 BASE_URL = "https://sharkstreams.net"


-async def process_event(
-    client: httpx.AsyncClient,
-    url: str,
-    url_num: int,
-) -> str | None:
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'URL {url_num}) Failed to fetch "{url}": {e}')
+async def process_event(url: str, url_num: int) -> str | None:
+    if not (r := await network.request(url, log=log)):
+        log.info(f"URL {url_num}) Failed to load url.")
         return

     data: dict[str, list[str]] = r.json()

-    if not data.get("urls"):
+    if not (urls := data.get("urls")):
         log.info(f"URL {url_num}) No M3U8 found")
         return

     log.info(f"URL {url_num}) Captured M3U8")
-    return data["urls"][0]
+    return urls[0]


-async def refresh_html_cache(
-    client: httpx.AsyncClient, now_ts: float
-) -> dict[str, dict[str, str | float]]:
+async def refresh_html_cache(now_ts: float) -> dict[str, dict[str, str | float]]:
     log.info("Refreshing HTML cache")
+    events = {}

-    try:
-        r = await client.get(BASE_URL)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{BASE_URL}": {e}')
-        return {}
+    if not (html_data := await network.request(BASE_URL, log=log)):
+        return events

     pattern = re.compile(r"openEmbed\('([^']+)'\)", re.IGNORECASE)
-    soup = HTMLParser(r.content)
-    events = {}
+    soup = HTMLParser(html_data.content)

     for row in soup.css(".row"):
         date_node = row.css_first(".ch-date")
@@ -98,14 +80,11 @@ async def refresh_html_cache(
     return events


-async def get_events(
-    client: httpx.AsyncClient,
-    cached_keys: set[str],
-) -> list[dict[str, str]]:
+async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     now = Time.clean(Time.now())

     if not (events := HTML_CACHE.load()):
-        events = await refresh_html_cache(client, now.timestamp())
+        events = await refresh_html_cache(now.timestamp())
         HTML_CACHE.write(events)
@@ -115,7 +94,7 @@ async def get_events(
     end_ts = now.delta(minutes=10).timestamp()

     for k, v in events.items():
-        if cached_keys & {k}:
+        if k in cached_keys:
             continue

         if not start_ts <= v["event_ts"] <= end_ts:
@@ -126,7 +105,7 @@ async def get_events(
     return live


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
     urls.update(cached_urls)
@@ -135,7 +114,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')

-    events = await get_events(client, set(cached_urls.keys()))
+    events = await get_events(cached_urls.keys())

     log.info(f"Processing {len(events)} new URL(s)")
@@ -143,7 +122,6 @@ async def scrape(client: httpx.AsyncClient) -> None:
     for i, ev in enumerate(events, start=1):
         handler = partial(
             process_event,
-            client=client,
             url=ev["link"],
             url_num=i,
         )

View file

@@ -2,7 +2,6 @@ import asyncio
 from functools import partial
 from urllib.parse import urljoin

-import httpx
 from playwright.async_api import async_playwright
 from selectolax.parser import HTMLParser
@@ -16,34 +15,18 @@ TAG = "SPORT9"
 CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=3_600)

-BASE_URL = "https://sport9.ru"
+BASE_URL = "https://sport9.ru/"


-async def get_html_data(
-    client: httpx.AsyncClient,
-    url: str,
-    date: str,
-) -> bytes:
-    try:
-        r = await client.get(url, params={"date": date})
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{r.url}": {e}')
-        return b""
-
-    return r.content
-
-
-async def get_events(
-    client: httpx.AsyncClient,
-    cached_keys: set[str],
-) -> list[dict[str, str]]:
+async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     now = Time.now()

     tasks = [
-        get_html_data(client, BASE_URL, str(d.date()))
+        network.request(
+            BASE_URL,
+            log=log,
+            params={"date": f"{d.date()}"},
+        )
         for d in [
             now.delta(days=-1),
             now,
@@ -53,10 +36,11 @@ async def get_events(
     results = await asyncio.gather(*tasks)

-    soups = [HTMLParser(html) for html in results]
     events = []

+    if not (soups := [HTMLParser(html.content) for html in results if html]):
+        return events
+
     for soup in soups:
         for card in soup.css("a.match-card"):
             live_badge = card.css_first(".live-badge")
@@ -85,12 +69,10 @@ async def get_events(
             else:
                 continue

-            if not (href := card.attributes.get("href")):
+            if f"[{sport}] {event} ({TAG})" in cached_keys:
                 continue

-            key = f"[{sport}] {event} ({TAG})"
-
-            if cached_keys & {key}:
+            if not (href := card.attributes.get("href")):
                 continue

             events.append(
@@ -104,7 +86,7 @@ async def get_events(
     return events


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
     urls.update(cached_urls)
@@ -113,7 +95,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')

-    events = await get_events(client, set(cached_urls.keys()))
+    events = await get_events(cached_urls.keys())

     log.info(f"Processing {len(events)} new URL(s)")

View file

@@ -1,6 +1,5 @@
 from functools import partial

-import httpx
 from playwright.async_api import async_playwright

 from .utils import Cache, Time, get_logger, leagues, network
@@ -33,35 +32,20 @@ CATEGORIES = {
 }


-async def refresh_api_cache(
-    client: httpx.AsyncClient, now_ts: float
-) -> list[dict[str, str | int]]:
-    log.info("Refreshing API cache")
-
-    try:
-        r = await client.get(BASE_URL, params={"pageNumber": 1, "pageSize": 500})
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{r.url}": {e}')
-        return []
-
-    if not (data := r.json()):
-        return []
-
-    data[-1]["timestamp"] = now_ts
-    return data
-
-
-async def get_events(
-    client: httpx.AsyncClient,
-    cached_keys: set[str],
-) -> list[dict[str, str]]:
+async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     now = Time.clean(Time.now())

     if not (api_data := API_FILE.load(per_entry=False, index=-1)):
-        api_data = await refresh_api_cache(client, now.timestamp())
+        api_data = []
+
+        if r := await network.request(
+            BASE_URL,
+            log=log,
+            params={"pageNumber": 1, "pageSize": 500},
+        ):
+            api_data: list[dict] = r.json()
+            api_data[-1]["timestamp"] = now.timestamp()
+
         API_FILE.write(api_data)
@@ -82,17 +66,15 @@ async def get_events(
         if not (name and category_id and iframe and event_time):
             continue

-        event_dt = Time.from_str(event_time, timezone="CET")
-
-        if not start_dt <= event_dt <= end_dt:
-            continue
-
         if not (sport := CATEGORIES.get(category_id)):
             continue

-        key = f"[{sport}] {name} ({TAG})"
+        if f"[{sport}] {name} ({TAG})" in cached_keys:
+            continue

-        if cached_keys & {key}:
+        event_dt = Time.from_str(event_time, timezone="CET")
+
+        if not start_dt <= event_dt <= end_dt:
             continue

         events.append(
@@ -107,7 +89,7 @@ async def get_events(
     return events


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
     urls.update(cached_urls)
@@ -116,7 +98,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info('Scraping from "https://streamcenter.xyz"')

-    events = await get_events(client, set(cached_urls.keys()))
+    events = await get_events(cached_urls.keys())

     log.info(f"Processing {len(events)} new URL(s)")

View file

@@ -1,7 +1,5 @@
 from urllib.parse import urljoin

-import httpx
-
 from .utils import Cache, Time, get_logger, leagues, network

 log = get_logger(__name__)
@@ -15,24 +13,20 @@ CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=19_800)
 BASE_URL = "https://streamfree.to/"


-async def refresh_api_cache(client: httpx.AsyncClient) -> dict[str, dict[str, list]]:
-    try:
-        r = await client.get(urljoin(BASE_URL, "streams"))
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{r.url}": {e}')
-        return {}
-
-    return r.json()
-
-
-async def get_events(client: httpx.AsyncClient) -> dict[str, dict[str, str | float]]:
-    api_data = await refresh_api_cache(client)
+async def get_events() -> dict[str, dict[str, str | float]]:
     events = {}
-    now = Time.clean(Time.now()).timestamp()
+
+    if not (
+        r := await network.request(
+            urljoin(BASE_URL, "streams"),
+            log=log,
+        )
+    ):
+        return events
+
+    api_data: dict = r.json()
+    now = Time.clean(Time.now())

     for streams in api_data.get("streams", {}).values():
         if not streams:
@@ -66,14 +60,14 @@ async def get_events(client: httpx.AsyncClient) -> dict[str, dict[str, str | float]]:
                 ),
                 "logo": logo or pic,
                 "base": BASE_URL,
-                "timestamp": now,
+                "timestamp": now.timestamp(),
                 "id": tvg_id or "Live.Event.us",
             }

     return events


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     if cached := CACHE_FILE.load():
         urls.update(cached)
         log.info(f"Loaded {len(urls)} event(s) from cache")
@@ -81,9 +75,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')

-    events = await get_events(client)
-
-    urls.update(events)
+    urls.update(await get_events())
     CACHE_FILE.write(urls)

View file

@@ -2,7 +2,6 @@ import asyncio
 from functools import partial
 from urllib.parse import urljoin

-import httpx
 from playwright.async_api import async_playwright
 from selectolax.parser import HTMLParser
@@ -36,40 +35,24 @@ CATEGORIES = {
 }


-async def get_html_data(
-    client: httpx.AsyncClient,
-    date: str,
-    sport_id: str,
-) -> bytes:
-    try:
-        r = await client.get(
-            urljoin(BASE_URL, f"events/{date}"),
-            params={"sport_id": sport_id},
-        )
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{r.url}": {e}')
-        return b""
-
-    return r.content
-
-
 async def refresh_html_cache(
-    client: httpx.AsyncClient,
     date: str,
     sport_id: str,
     ts: float,
 ) -> dict[str, dict[str, str | float]]:
-    html_data = await get_html_data(client, date, sport_id)
-    soup = HTMLParser(html_data)
     events = {}

+    if not (
+        html_data := await network.request(
+            urljoin(BASE_URL, f"events/{date}"),
+            log=log,
+            params={"sport_id": sport_id},
+        )
+    ):
+        return events
+
+    soup = HTMLParser(html_data.content)
+
     for section in soup.css(".events-section"):
         if not (sport_node := section.css_first(".section-titlte")):
             continue
@@ -111,25 +94,19 @@ async def refresh_html_cache(
     return events


-async def get_events(
-    client: httpx.AsyncClient,
-    cached_keys: set[str],
-) -> list[dict[str, str]]:
+async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     now = Time.clean(Time.now())

     if not (events := HTML_CACHE.load()):
         log.info("Refreshing HTML cache")

-        dates = [now.date(), now.delta(days=1).date()]
-
         tasks = [
             refresh_html_cache(
-                client,
                 date,
                 sport_id,
                 now.timestamp(),
             )
-            for date in dates
+            for date in [now.date(), now.delta(days=1).date()]
             for sport_id in CATEGORIES.values()
         ]
@@ -145,7 +122,7 @@ async def get_events(
     end_ts = now.delta(minutes=5).timestamp()

     for k, v in events.items():
-        if cached_keys & {k}:
+        if k in cached_keys:
             continue

         if not start_ts <= v["event_ts"] <= end_ts:
@@ -156,7 +133,7 @@ async def get_events(
     return live


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
     urls.update(cached_urls)
@@ -165,7 +142,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')

-    events = await get_events(client, set(cached_urls.keys()))
+    events = await get_events(cached_urls.keys())

     log.info(f"Processing {len(events)} new URL(s)")

View file

@@ -4,7 +4,6 @@ from itertools import chain
 from typing import Any
 from urllib.parse import urljoin

-import httpx
 from playwright.async_api import async_playwright

 from .utils import Cache, Time, get_logger, leagues, network
@@ -46,32 +45,20 @@ def get_event(t1: str, t2: str) -> str:
     return f"{t1.strip()} vs {t2.strip()}"


-async def get_api_data(client: httpx.AsyncClient, url: str) -> list[dict[str, Any]]:
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{url}": {e}')
-        return []
-
-    return r.json()
-
-
-async def refresh_api_cache(
-    client: httpx.AsyncClient,
-    now_ts: float,
-) -> list[dict[str, Any]]:
+async def refresh_api_cache(now_ts: float) -> list[dict[str, Any]]:
     log.info("Refreshing API cache")

     tasks = [
-        get_api_data(client, urljoin(BASE_URL, f"data/{sport}.json"))
+        network.request(
+            urljoin(BASE_URL, f"data/{sport}.json"),
+            log=log,
+        )
         for sport in SPORT_ENDPOINTS
     ]

     results = await asyncio.gather(*tasks)

-    if not (data := list(chain(*results))):
+    if not (data := list(chain.from_iterable(r.json() for r in results if r))):
         return []

     for ev in data:
@@ -82,13 +69,11 @@ async def refresh_api_cache(
     return data


-async def get_events(
-    client: httpx.AsyncClient, cached_keys: set[str]
-) -> list[dict[str, str]]:
+async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     now = Time.clean(Time.now())

     if not (api_data := API_FILE.load(per_entry=False, index=-1)):
-        api_data = await refresh_api_cache(client, now.timestamp())
+        api_data = await refresh_api_cache(now.timestamp())
         API_FILE.write(api_data)
@@ -104,27 +89,28 @@ async def get_events(
         t1, t2 = stream_group.get("away"), stream_group.get("home")
+        event = get_event(t1, t2)

         if not (event_ts and sport):
             continue

+        if f"[{sport}] {event} ({TAG})" in cached_keys:
+            continue
+
+        if "F1 Abu Dhabi" in event:  # api bug
+            continue
+
         event_dt = Time.from_ts(event_ts)

         if not start_dt <= event_dt <= end_dt:
             continue

-        event = get_event(t1, t2)
-
         if not (streams := stream_group.get("streams")):
             continue

         if not (url := streams[0].get("url")):
             continue

-        key = f"[{sport}] {event} ({TAG})"
-
-        if cached_keys & {key}:
-            continue
-
         events.append(
             {
                 "sport": sport,
@@ -137,7 +123,7 @@ async def get_events(
     return events


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
     urls.update(cached_urls)
@@ -146,7 +132,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{base_url}"')

-    events = await get_events(client, set(cached_urls.keys()))
+    events = await get_events(cached_urls.keys())

     log.info(f"Processing {len(events)} new URL(s)")

View file

@@ -1,9 +1,7 @@
 import re
 from functools import partial
-from typing import Any
 from urllib.parse import urljoin

-import httpx
 from playwright.async_api import async_playwright

 from .utils import Cache, Time, get_logger, leagues, network
@@ -35,52 +33,28 @@ def fix_sport(s: str) -> str:
     return s.capitalize() if len(s) >= 4 else s.upper()


-async def refresh_api_cache(
-    client: httpx.AsyncClient,
-    url: str,
-    now_ts: float,
-) -> list[dict[str, Any]]:
-    log.info("Refreshing API cache")
-
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{url}": {e}')
-        return []
-
-    if not (data := r.json()):
-        return []
-
-    data[-1]["timestamp"] = now_ts
-    return data
-
-
-async def get_events(
-    client: httpx.AsyncClient,
-    url: str,
-    cached_keys: set[str],
-) -> list[dict[str, str]]:
+async def get_events(url: str, cached_keys: list[str]) -> list[dict[str, str]]:
     now = Time.clean(Time.now())

     if not (api_data := API_FILE.load(per_entry=False, index=-1)):
-        api_data = await refresh_api_cache(
-            client,
+        api_data = []
+
+        if r := await network.request(
             urljoin(url, "api/matches/all-today"),
-            now.timestamp(),
-        )
+            log=log,
+        ):
+            api_data: list[dict] = r.json()
+            api_data[-1]["timestamp"] = now.timestamp()
+
         API_FILE.write(api_data)

     events = []
+    pattern = re.compile(r"[\n\r]+|\s{2,}")
     start_dt = now.delta(minutes=-30)
     end_dt = now.delta(minutes=30)
-    pattern = re.compile(r"[\n\r]+|\s{2,}")

     for event in api_data:
         if (category := event.get("category")) == "other":
@@ -99,13 +73,12 @@ async def get_events(
         sport = fix_sport(category)
         parts = pattern.split(event["title"].strip())
         name = " | ".join(p.strip() for p in parts if p.strip())
         logo = urljoin(url, poster) if (poster := event.get("poster")) else None

-        key = f"[{sport}] {name} ({TAG})"
-
-        if cached_keys & {key}:
+        if f"[{sport}] {name} ({TAG})" in cached_keys:
             continue

         sources: list[dict[str, str]] = event["sources"]
@@ -113,7 +86,8 @@ async def get_events(
         if not sources:
             continue

-        skip_types = {"alpha", "bravo"}
+        skip_types = ["alpha", "bravo"]
         valid_sources = [d for d in sources if d.get("source") not in skip_types]

         if not valid_sources:
@@ -122,6 +96,7 @@ async def get_events(
         srce = valid_sources[0]
         source_type = srce.get("source")
         stream_id = srce.get("id")

         if not (source_type and stream_id):
@@ -140,7 +115,7 @@ async def get_events(
     return events


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
     urls.update(cached_urls)
@@ -154,11 +129,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{base_url}"')

-    events = await get_events(
-        client,
-        base_url,
-        set(cached_urls.keys()),
-    )
+    events = await get_events(base_url, cached_urls.keys())

     log.info(f"Processing {len(events)} new URL(s)")

View file

@@ -1,8 +1,6 @@
 import re

-import httpx
-
-from .utils import Cache, Time, get_logger, leagues
+from .utils import Cache, Time, get_logger, leagues, network

 log = get_logger(__name__)
@@ -15,24 +13,15 @@ CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=86_400)
 BASE_URL = "https://tvpass.org/playlist/m3u"


-async def get_data(client: httpx.AsyncClient) -> list[str]:
-    try:
-        r = await client.get(BASE_URL)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{BASE_URL}": {e}')
-        return []
-
-    return r.text.splitlines()
-
-
-async def get_events(client: httpx.AsyncClient) -> dict[str, dict[str, str | float]]:
-    now = Time.clean(Time.now()).timestamp()
+async def get_events() -> dict[str, dict[str, str | float]]:
     events = {}

-    data = await get_data(client)
+    if not (r := await network.request(BASE_URL, log=log)):
+        return events
+
+    now = Time.clean(Time.now())
+    data = r.text.splitlines()

     for i, line in enumerate(data, start=1):
         if line.startswith("#EXTINF"):
@@ -59,13 +48,13 @@ async def get_events(client: httpx.AsyncClient) -> dict[str, dict[str, str | float]]:
                 "logo": logo,
                 "id": tvg_id or "Live.Event.us",
                 "base": "https://tvpass.org",
-                "timestamp": now,
+                "timestamp": now.timestamp(),
             }

     return events


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     if cached := CACHE_FILE.load():
         urls.update(cached)
         log.info(f"Loaded {len(urls)} event(s) from cache")
@@ -73,9 +62,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')

-    events = await get_events(client)
-
-    urls.update(events)
+    urls.update(await get_events())
     CACHE_FILE.write(urls)

View file

@@ -51,26 +51,35 @@ class Network:
             else urljoin(base, f"{tag}/{path}")
         )

-    async def check_status(self, url: str) -> bool:
+    async def request(
+        self,
+        url: str,
+        log: logging.Logger | None = None,
+        **kwargs,
+    ) -> httpx.Response | None:
+        log = log or self._logger
+
         try:
-            r = await self.client.get(url, timeout=5)
+            r = await self.client.get(url, **kwargs)
             r.raise_for_status()
-            return r.status_code == 200
-        except (httpx.HTTPError, httpx.TimeoutException) as e:
-            self._logger.debug(f"Status check failed for {url}: {e}")
-            return False
+        except Exception as e:
+            log.error(f'Failed to fetch "{url}": {e}\n{kwargs = }')
+            return ""
+
+        return r

     async def get_base(self, mirrors: list[str]) -> str | None:
         random.shuffle(mirrors)

-        tasks = [self.check_status(link) for link in mirrors]
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-
-        working_mirrors = [
-            mirror for mirror, success in zip(mirrors, results) if success
-        ]
-
-        return working_mirrors[0] if working_mirrors else None
+        for mirror in mirrors:
+            if not (r := await self.request(mirror)):
+                continue
+
+            elif r.status_code != 200:
+                continue
+
+            return mirror

     @staticmethod
     async def safe_process(
@@ -80,8 +89,7 @@ class Network:
         log: logging.Logger | None = None,
     ) -> T | None:
-        if not log:
-            log = logging.getLogger(__name__)
+        log = log or get_logger("network")

         task = asyncio.create_task(fn())
@@ -133,6 +141,8 @@ class Network:
         log: logging.Logger | None = None,
     ) -> str | None:
+        log = log or self._logger
+
         page = await context.new_page()
         captured: list[str] = []

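The new Network.request helper above returns the httpx.Response on success and, on any failure, logs the error and returns a falsy value; extra keyword arguments are passed straight through to the shared client's get(). A short usage sketch under that assumption; the function name, URL, and params here are illustrative, not from the codebase:

# illustrative call site for the consolidated helper
async def fetch_schedule() -> dict:
    r = await network.request(
        "https://example.com/api/schedule",  # placeholder URL
        log=log,
        params={"date": "2025-12-18"},  # forwarded to httpx's client.get()
    )
    return r.json() if r else {}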
View file

@@ -5,7 +5,6 @@ from itertools import chain
 from typing import Any
 from urllib.parse import urljoin

-import httpx
 from playwright.async_api import BrowserContext, async_playwright

 from .utils import Cache, Time, get_logger, leagues, network
@@ -42,37 +41,27 @@ SPORT_ENDPOINTS = [
 ]


-async def get_api_data(client: httpx.AsyncClient, url: str) -> list[dict[str, Any]]:
-    try:
-        r = await client.get(url, timeout=5)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{url}": {e}')
-        return []
-
-    return r.json()
-
-
-async def refresh_api_cache(
-    client: httpx.AsyncClient, url: str
-) -> list[dict[str, Any]]:
+async def refresh_api_cache(url: str, now_ts: float) -> list[dict[str, Any]]:
     log.info("Refreshing API cache")

     tasks = [
-        get_api_data(client, urljoin(url, f"api/v1/matches/{sport}"))
+        network.request(
+            urljoin(url, f"api/v1/matches/{sport}"),
+            log=log,
+            timeout=5,
+        )
         for sport in SPORT_ENDPOINTS
     ]

     results = await asyncio.gather(*tasks)

-    if not (data := list(chain(*results))):
+    if not (data := list(chain.from_iterable(r.json() for r in results if r))):
        return []

     for ev in data:
         ev["ts"] = ev.pop("timestamp")

-    data[-1]["timestamp"] = Time.clean(Time.now()).timestamp()
+    data[-1]["timestamp"] = now_ts
     return data
@@ -163,33 +152,40 @@ async def process_event(
 async def get_events(
-    client: httpx.AsyncClient,
-    api_url: str,
     base_url: str,
-    cached_keys: set[str],
+    api_url: str,
+    cached_keys: list[str],
 ) -> list[dict[str, str]]:
+    now = Time.clean(Time.now())
+
     if not (api_data := API_FILE.load(per_entry=False, index=-1)):
-        api_data = await refresh_api_cache(client, api_url)
+        api_data = await refresh_api_cache(api_url, now.timestamp())
         API_FILE.write(api_data)

     events = []
-    now = Time.clean(Time.now())
+    pattern = re.compile(r"\-+|\(")
     start_dt = now.delta(minutes=-30)
     end_dt = now.delta(minutes=5)
-    pattern = re.compile(r"\-+|\(")

     for event in api_data:
         match_id = event.get("matchId")
         name = event.get("title")
         league = event.get("league")

         if not (match_id and name and league):
             continue

+        sport = pattern.split(league, 1)[0].strip()
+
+        if f"[{sport}] {name} ({TAG})" in cached_keys:
+            continue
+
         if not (ts := event.get("ts")):
             continue
@@ -200,15 +196,8 @@ async def get_events(
         if not start_dt <= event_dt <= end_dt:
             continue

-        sport = pattern.split(league, 1)[0].strip()
         logo = urljoin(api_url, poster) if (poster := event.get("poster")) else None

-        key = f"[{sport}] {name} ({TAG})"
-
-        if cached_keys & {key}:
-            continue
-
         events.append(
             {
                 "sport": sport,
@@ -222,7 +211,7 @@ async def get_events(
     return events


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
     valid_count = cached_count = len(valid_urls)
@@ -242,10 +231,9 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{base_url}"')

     events = await get_events(
-        client,
-        api_url,
         base_url,
-        set(cached_urls.keys()),
+        api_url,
+        cached_urls.keys(),
     )

     log.info(f"Processing {len(events)} new URL(s)")

View file

@@ -1,7 +1,6 @@
 import asyncio
 from functools import partial

-import httpx
 from playwright.async_api import async_playwright
 from selectolax.parser import HTMLParser
@@ -24,22 +23,15 @@ def fix_event(s: str) -> str:
     return " vs ".join(s.split("@"))


-async def refresh_html_cache(
-    client: httpx.AsyncClient, url: str
-) -> dict[str, dict[str, str | float]]:
-    try:
-        r = await client.get(url)
-        r.raise_for_status()
-    except Exception as e:
-        log.error(f'Failed to fetch "{url}": {e}')
-        return {}
+async def refresh_html_cache(url: str) -> dict[str, dict[str, str | float]]:
+    events = {}
+
+    if not (html_data := await network.request(url, log=log)):
+        return events

     now = Time.clean(Time.now())
-    soup = HTMLParser(r.content)
-    events = {}
+    soup = HTMLParser(html_data.content)

     title = soup.css_first("title").text(strip=True)
@@ -87,15 +79,13 @@ async def refresh_html_cache(
     return events


-async def get_events(
-    client: httpx.AsyncClient, cached_keys: set[str]
-) -> list[dict[str, str]]:
+async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     now = Time.clean(Time.now())

     if not (events := HTML_CACHE.load()):
         log.info("Refreshing HTML cache")

-        tasks = [refresh_html_cache(client, url) for url in BASE_URLS.values()]
+        tasks = [refresh_html_cache(url) for url in BASE_URLS.values()]
         results = await asyncio.gather(*tasks)
@@ -109,7 +99,7 @@ async def get_events(
     end_ts = now.delta(minutes=30).timestamp()

     for k, v in events.items():
-        if cached_keys & {k}:
+        if k in cached_keys:
             continue

         if not start_ts <= v["event_ts"] <= end_ts:
@@ -120,7 +110,7 @@ async def get_events(
     return live


-async def scrape(client: httpx.AsyncClient) -> None:
+async def scrape() -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
     urls.update(cached_urls)
@@ -129,7 +119,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{' & '.join(BASE_URLS.values())}"')

-    events = await get_events(client, set(cached_urls.keys()))
+    events = await get_events(cached_urls.keys())

     log.info(f"Processing {len(events)} new URL(s)")