Author: doms9
Date: 2025-11-14 19:27:54 -05:00
Parent: 3cad8619e7
Commit: 00000d9842
3 changed files with 285 additions and 2 deletions


@@ -14,6 +14,7 @@ from scrapers import (
     streamfree,
     strmd,
     tvpass,
+    volo,
     watchfooty,
 )
 from scrapers.utils import get_logger, network
@@ -53,6 +54,7 @@ async def main() -> None:
         asyncio.create_task(streamfree.scrape(network.client)),
         asyncio.create_task(strmd.scrape(network.client)),
         asyncio.create_task(tvpass.scrape(network.client)),
+        asyncio.create_task(volo.scrape(network.client)),
         asyncio.create_task(watchfooty.scrape(network.client)),
     ]
@@ -69,6 +71,7 @@ async def main() -> None:
         | strmd.urls
         | streamfree.urls
         | tvpass.urls
+        | volo.urls
         | watchfooty.urls
     )
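
The three hunks above wire the new scraper into the existing pipeline: the module is imported, its scrape() coroutine is scheduled alongside the other scrapers, and its urls mapping is folded into the merged dict. A minimal sketch of that module contract follows; the gather call, the name of the merged mapping, and the entry point sit outside these hunks and are assumptions here.

import asyncio

from scrapers import volo, watchfooty
from scrapers.utils import network


async def main() -> None:
    tasks = [
        asyncio.create_task(volo.scrape(network.client)),
        asyncio.create_task(watchfooty.scrape(network.client)),
    ]
    await asyncio.gather(*tasks)  # assumed: the full task list is awaited before merging

    # Each scraper exposes a module-level `urls` dict; main() merges them with `|`.
    merged = volo.urls | watchfooty.urls
    print(f"{len(merged)} event URL(s) collected")


if __name__ == "__main__":
    asyncio.run(main())  # assumed entry point; not shown in these hunks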


@@ -1,6 +1,6 @@
 import json
 import re
-from datetime import datetime, timedelta, timezone
+from datetime import date, datetime, timedelta, timezone
 from pathlib import Path
 
 import pytz
@@ -52,6 +52,25 @@ class Time(datetime):
         dt = dt.astimezone(cls.TZ)
         return cls.fromtimestamp(dt.timestamp(), tz=cls.TZ)
 
+    @classmethod
+    def from_only_time(cls, s: str, d: date, timezone: str) -> "Time":
+        hour, minute = map(int, s.split(":"))
+
+        dt = datetime(
+            2000,
+            1,
+            1,
+            hour,
+            minute,
+            tzinfo=cls.ZONES.get(timezone, cls.TZ),
+        )
+
+        dt = dt.astimezone(cls.TZ)
+        dt = datetime.combine(d, dt.timetz())
+
+        return cls.fromtimestamp(dt.timestamp(), tz=cls.TZ)
+
     @classmethod
     def from_str(
         cls,
@@ -140,7 +159,7 @@ class Leagues:
         league: str,
     ) -> bool:
-        pattern = re.compile(r"\s+(?:-|vs\.?|at)\s+", flags=re.IGNORECASE)
+        pattern = re.compile(r"\s+(?:-|vs\.?|at|@)\s+", flags=re.IGNORECASE)
 
         if pattern.search(event):
             t1, t2 = re.split(pattern, event)
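
The new Time.from_only_time classmethod turns a bare HH:MM string plus a calendar date and a timezone key into a timezone-aware Time, and the widened split pattern now also treats "@" as a team separator. A small usage sketch, assuming Time is importable as scrapers.utils.Time (volo.py imports it from .utils) and that "UTC" is a valid key in Time.ZONES, as the new scraper uses it:

from datetime import date
import re

from scrapers.utils import Time

# "19:30" UTC on the schedule date, converted into the project's local zone (Time.TZ).
start = Time.from_only_time("19:30", date(2025, 11, 14), "UTC")
print(start.isoformat())

# The updated pattern splits on "@" in addition to "-", "vs", "vs." and "at".
pattern = re.compile(r"\s+(?:-|vs\.?|at|@)\s+", flags=re.IGNORECASE)
print(re.split(pattern, "Los Angeles Lakers @ Boston Celtics"))
# -> ['Los Angeles Lakers', 'Boston Celtics']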

M3U8/scrapers/volo.py (new file, 261 lines added)

import asyncio
from functools import partial
from urllib.parse import urljoin

import httpx
from playwright.async_api import BrowserContext, async_playwright
from selectolax.parser import HTMLParser

from .utils import Cache, Time, get_logger, leagues, network

log = get_logger(__name__)

urls: dict[str, dict[str, str | float]] = {}

CACHE_FILE = Cache("volo.json", exp=10_800)
HTML_CACHE = Cache("volo-html.json", exp=86_400)

BASE_URL = "http://volokit2.com/sport/"

# Maps volokit sport slugs to the sport labels used in event keys.
valid_sports = {
    "boxing": "Boxing",
    "college-football": "CFB",
    "mens-college-basketball": "CBB",
    "mlb": "MLB",
    "mls": "Soccer",
    "nba": "NBA",
    "nfl": "NFL",
    "nhl": "NHL",
    "race": "Racing",
    "ufc": "UFC",
    "wnba": "WNBA",
}


def fix_event(s: str) -> str:
    return " ".join(x.capitalize() for x in s.split())


async def process_event(
    url: str,
    url_num: int,
    context: BrowserContext,
    timeout: int | float = 10,
) -> str | None:
    page = await context.new_page()

    # network.capture_req appends captured request URLs to `captured`
    # and sets `got_one` once something has been captured.
    captured: list[str] = []
    got_one = asyncio.Event()

    handler = partial(
        network.capture_req,
        captured=captured,
        got_one=got_one,
    )

    page.on("request", handler)

    try:
        await page.goto(
            url,
            wait_until="domcontentloaded",
            timeout=10_000,
        )

        wait_task = asyncio.create_task(got_one.wait())

        try:
            # Load the player iframe's source directly, start playback,
            # then wait for the first captured M3U8 request.
            iframe = page.locator("iframe").first
            src = await iframe.get_attribute("src")

            await page.goto(
                src,
                wait_until="domcontentloaded",
                timeout=10_000,
            )

            await page.click("#volokit_player")

            await asyncio.wait_for(wait_task, timeout=timeout)

        except asyncio.TimeoutError:
            log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
            return

        finally:
            if not wait_task.done():
                wait_task.cancel()
                try:
                    await wait_task
                except asyncio.CancelledError:
                    pass

        if captured:
            log.info(f"URL {url_num}) Captured M3U8")
            return captured[0]

        log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
        return

    except Exception as e:
        log.warning(f"URL {url_num}) Exception while processing: {e}")
        return

    finally:
        page.remove_listener("request", handler)
        await page.close()


async def refresh_html_cache(
    client: httpx.AsyncClient,
    url: str,
    sport: str,
) -> dict[str, str | float]:
    try:
        r = await client.get(url)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{url}": {e}')
        return {}

    soup = HTMLParser(r.text)
    now = Time.clean(Time.now())

    events = {}

    # Each schedule card carries the event name, its start time, and a link.
    for card in soup.css("#events .table .vevent.theevent"):
        name = card.css_first(".teamtd.event").text(strip=True)
        time = card.css_first(".time").text(strip=True)

        if not (href := card.css_first("a").attributes.get("href")):
            continue

        event_sport = valid_sports[sport]
        event_name = fix_event(name)
        event_dt = Time.from_only_time(time, now.date(), "UTC")

        key = f"[{event_sport}] {event_name} (VOLO)"

        events[key] = {
            "sport": event_sport,
            "event": event_name,
            "link": href,
            "event_ts": event_dt.timestamp(),
            "timestamp": now.timestamp(),
        }

    return events


async def get_events(
    client: httpx.AsyncClient,
    sport_urls: dict[str, str],
    cached_keys: set[str],
) -> list[dict[str, str]]:
    now = Time.clean(Time.now())

    # Rebuild the per-sport schedule cache if it has expired.
    if not (events := HTML_CACHE.load()):
        tasks = [
            refresh_html_cache(
                client,
                url,
                sport,
            )
            for sport, url in sport_urls.items()
        ]

        results = await asyncio.gather(*tasks)

        events = {k: v for data in results for k, v in data.items()}

        HTML_CACHE.write(events)

    # Keep events starting within ±30 minutes of now that are not already cached.
    live = []

    start_ts = now.delta(minutes=-30).timestamp()
    end_ts = now.delta(minutes=30).timestamp()

    for k, v in events.items():
        if cached_keys & {k}:
            continue

        if not start_ts <= v["event_ts"] <= end_ts:
            continue

        live.append({**v})

    return live


async def scrape(client: httpx.AsyncClient) -> None:
    cached_urls = CACHE_FILE.load()
    cached_count = len(cached_urls)

    urls.update(cached_urls)

    log.info(f"Loaded {cached_count} event(s) from cache")
    log.info(f'Scraping from "{BASE_URL}"')

    sport_urls = {
        sport: urljoin(BASE_URL, sport.lower()) for sport in valid_sports.keys()
    }

    events = await get_events(
        client,
        sport_urls,
        set(cached_urls.keys()),
    )

    log.info(f"Processing {len(events)} new URL(s)")

    if events:
        # Resolve each live event's M3U8 through a real browser session.
        async with async_playwright() as p:
            browser, context = await network.browser(p, browser="brave")

            for i, ev in enumerate(events, start=1):
                handler = partial(
                    process_event,
                    url=ev["link"],
                    url_num=i,
                    context=context,
                )

                url = await network.safe_process(
                    handler,
                    url_num=i,
                    log=log,
                )

                if url:
                    sport, event, ts = ev["sport"], ev["event"], ev["event_ts"]

                    tvg_id, logo = leagues.get_tvg_info(sport, event)

                    key = f"[{sport}] {event} (VOLO)"

                    entry = {
                        "url": url,
                        "logo": logo,
                        "base": "http://volokit2.com",
                        "timestamp": ts,
                        "id": tvg_id or "Live.Event.us",
                    }

                    urls[key] = cached_urls[key] = entry

            await browser.close()

    if new_count := len(cached_urls) - cached_count:
        log.info(f"Collected and cached {new_count} new event(s)")
    else:
        log.info("No new events found")

    CACHE_FILE.write(cached_urls)
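
Because volo.py follows the same contract as the other scraper modules (an async scrape(client) that fills the module-level urls dict), it can be exercised on its own. A minimal sketch, assuming a plain httpx.AsyncClient is an acceptable stand-in for network.client:

import asyncio

import httpx

from scrapers import volo


async def run() -> None:
    # A bare AsyncClient stands in for network.client (which may carry its own
    # headers/timeouts) purely for a quick manual test.
    async with httpx.AsyncClient(timeout=30) as client:
        await volo.scrape(client)

    for key, entry in volo.urls.items():
        print(key, "->", entry["url"])


asyncio.run(run())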