doms9 2025-10-11 18:43:57 -04:00
parent 703c55bee4
commit 00000d9199
5 changed files with 328 additions and 30 deletions

View file

@@ -2,7 +2,7 @@
import asyncio
from pathlib import Path

-from scrapers import fstv, livetvsx, streambtw, streamed, strmd, tvpass
+from scrapers import fstv, livetvsx, streambtw, streamed, strmd, tvpass, watchfooty
from scrapers.utils import get_logger, network

log = get_logger(__name__)
@@ -32,6 +32,7 @@ async def main() -> None:
        asyncio.create_task(streamed.scrape(network.client)),
        asyncio.create_task(strmd.scrape(network.client)),
        asyncio.create_task(tvpass.scrape(network.client)),
+        asyncio.create_task(watchfooty.scrape(network.client)),
    ]

    await asyncio.gather(*tasks)
@@ -43,6 +44,7 @@ async def main() -> None:
        | streamed.urls
        | strmd.urls
        | tvpass.urls
+        | watchfooty.urls
    )

    live_events = []
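
For readers new to the repo layout: every scraper module is assumed to expose an async `scrape(client)` coroutine plus a module-level `urls` dict that `scrape()` fills in, which is why registering watchfooty only takes the two added lines above. A minimal, self-contained sketch of that pattern (the module subset and the bare `httpx.AsyncClient` are illustrative; the real code routes the client through `network.client`):

```python
import asyncio

import httpx

from scrapers import strmd, watchfooty  # illustrative subset of the real import list


async def main() -> None:
    async with httpx.AsyncClient() as client:
        # every scraper runs concurrently and populates its own module-level `urls`
        await asyncio.gather(
            strmd.scrape(client),
            watchfooty.scrape(client),
        )

        # merge the per-module dicts, mirroring the `|` chain in the hunk above
        all_urls = strmd.urls | watchfooty.urls
        print(f"collected {len(all_urls)} event URL(s)")


if __name__ == "__main__":
    asyncio.run(main())
```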

View file

@@ -247,3 +247,6 @@ async def scrape(client: httpx.AsyncClient) -> None:
        log.info("No new events found")

    CACHE_FILE.write(cached_urls)
+
+
+# same provider as strmd.py

View file

@@ -31,34 +31,6 @@ def validate_category(s: str) -> str:
    return s.capitalize()

-def get_tvg_info(sport: str, event: str) -> tuple[str | None, str]:
-    match sport:
-        case "American Football":
-            if leagues.is_valid(event, "NFL"):
-                return leagues.info("NFL")
-            else:
-                return leagues.info("NCAA")
-        case "Basketball":
-            if leagues.is_valid(event, "NBA"):
-                return leagues.info("NBA")
-            elif leagues.is_valid(event, "WNBA"):
-                return leagues.info("WNBA")
-            # NCAA
-            else:
-                return leagues.info("Basketball")
-        case "Hockey":
-            return leagues.info("NHL")
-        case _:
-            return leagues.info(sport)

async def refresh_api_cache(
    client: httpx.AsyncClient, url: str
) -> list[dict[str, Any]]:
@@ -139,6 +111,7 @@ async def get_events(
    base_url: str,
    cached_keys: set[str],
) -> list[dict[str, str]]:
    if not (api_data := API_FILE.load(per_entry=False, index=True)):
        api_data = await refresh_api_cache(
            client,
@@ -257,7 +230,7 @@ async def scrape(client: httpx.AsyncClient) -> None:
                key = f"[{sport}] {event} (STRMD)"

-                tvg_id, pic = get_tvg_info(sport, event)
+                tvg_id, pic = leagues.get_tvg_info(sport, event)

                entry = {
                    "url": url,

View file

@@ -50,6 +50,7 @@ class Time(datetime):
    formats = [
        "%Y-%m-%d %H:%M",
        "%Y-%m-%d %H:%M:%S",
+        "%Y-%m-%dT%H:%M:%S.%fZ",
        "%a, %d %b %Y %H:%M:%S %z",
    ]
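
The added format covers ISO-8601 timestamps of the form `2025-10-11T18:30:00.000Z`, which is presumably what the WatchFooty API returns. A quick standalone check with plain `datetime.strptime`, which `Time` presumably wraps when walking this format list:

```python
from datetime import datetime

# The trailing "Z" is matched as a literal character, so the result is a
# naive datetime; any UTC handling has to happen in the surrounding Time helpers.
dt = datetime.strptime("2025-10-11T18:30:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ")
print(dt)         # 2025-10-11 18:30:00
print(dt.tzinfo)  # None
```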
@@ -108,6 +109,33 @@ class Leagues:
        return event.lower() == "nfl redzone" if league == "NFL" else False

+    def get_tvg_info(self, sport: str, event: str) -> tuple[str | None, str]:
+        match sport:
+            case "American Football":
+                return (
+                    self.info("NFL")
+                    if self.is_valid(event, "NFL")
+                    else self.info("NCAA")
+                )
+
+            case "Basketball":
+                if self.is_valid(event, "NBA"):
+                    return self.info("NBA")
+                elif self.is_valid(event, "WNBA"):
+                    return self.info("WNBA")
+                # NCAA
+                else:
+                    return self.info("Basketball")
+
+            case "Hockey":
+                return self.info("NHL")
+
+            case _:
+                return self.info(sport)

    @property
    def league_names(self) -> list[str]:
        return self.data["teams"].keys()
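
Call sites now go through the shared `leagues` helper instead of a per-scraper copy of the function. A small usage sketch mirroring the strmd.py and watchfooty.py call sites (the event string is made up for illustration):

```python
from scrapers.utils import leagues  # same import path strmd.py already uses

# get_tvg_info() maps a (sport, event) pair to a league and returns
# (tvg_id, artwork_url); tvg_id can be None, so callers fall back to a default.
tvg_id, pic = leagues.get_tvg_info("American Football", "Chiefs vs Bills")
entry_id = tvg_id or "Live.Event.us"
```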

M3U8/scrapers/watchfooty.py (new file, +292 lines)
View file

@@ -0,0 +1,292 @@
import asyncio
import re
from functools import partial
from pathlib import Path
from typing import Any
from urllib.parse import urljoin

import httpx
from playwright.async_api import BrowserContext, async_playwright

from .utils import Cache, Time, get_logger, leagues, network

log = get_logger(__name__)

urls: dict[str, dict[str, str | float]] = {}

API_FILE = Cache(Path(__file__).parent / "caches" / "watchfty_api.json", exp=28_800)
CACHE_FILE = Cache(Path(__file__).parent / "caches" / "watchfty.json", exp=10_800)

MIRRORS = [
    "https://www.watchfooty.cc",
    "https://www.watchfooty.vip",
    "https://www.watchfooty.live",
]

SPORT_ENDPOINTS = [
    "football",
    "american-football",
    "hockey",
    "basketball",
    "baseball",
    "racing",
    "fighting",
    "golf",
]

async def get_api_data(
    client: httpx.AsyncClient,
    url: str,
) -> list[dict[str, Any]]:
    try:
        r = await client.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        log.error(f'Failed to fetch "{url}": {e}')
        return []

    return r.json()

async def refresh_api_cache(
    client: httpx.AsyncClient,
    url: str,
) -> list[dict[str, Any]]:
    log.info("Refreshing API cache")

    tasks = [
        get_api_data(
            client,
            urljoin(url, f"api/v1/matches/{sport}"),
        )
        for sport in SPORT_ENDPOINTS
    ]

    results = await asyncio.gather(*tasks)

    data = [event for sublist in results if sublist for event in sublist]

    data[0]["timestamp"] = Time.now().timestamp()

    return data

async def process_event(
    url: str,
    url_num: int,
    context: BrowserContext,
) -> str | None:
    page = await context.new_page()

    captured: list[str] = []
    got_one = asyncio.Event()

    handler = partial(network.capture_req, captured=captured, got_one=got_one)

    page.on("request", handler)

    try:
        await page.goto(
            url,
            wait_until="domcontentloaded",
            timeout=15_000,
        )

        try:
            header = await page.wait_for_selector(
                "text=/Stream Links/i",
                timeout=5_000,
            )
            text = await header.inner_text()
        except TimeoutError:
            return

        match = re.search(r"\((\d+)\)", text)

        if not match or int(match[1]) == 0:
            return

        try:
            frame_el = await page.wait_for_selector(
                "iframe[src*='embed.best-sports.stream']",
                timeout=4_000,
            )
        except TimeoutError:
            log.warning("No iframe found — exiting early.")
            return

        iframe = await frame_el.content_frame()

        if not iframe:
            return

        try:
            await iframe.wait_for_selector(
                "button.vds-play-button",
                state="visible",
                timeout=4_000,
            )
            await iframe.click("button.vds-play-button")
        except TimeoutError:
            log.warning("Play button not found inside iframe.")

        wait_task = asyncio.create_task(got_one.wait())

        try:
            await asyncio.wait_for(wait_task, timeout=10)
        except asyncio.TimeoutError:
            log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
            return
        finally:
            if not wait_task.done():
                wait_task.cancel()
                try:
                    await wait_task
                except asyncio.CancelledError:
                    pass

        if captured:
            log.info(f"URL {url_num}) Captured M3U8")
            return captured[-1]

        log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
        return

    except Exception as e:
        log.warning(f"URL {url_num}) Exception while processing: {e}")
        return

    finally:
        page.remove_listener("request", handler)
        await page.close()
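
process_event wires up `network.capture_req` via `partial(...)` as a Playwright request listener, but that helper is not part of this diff. A plausible shape, inferred only from how it is used here (shared `captured` list, `got_one` event), would look something like the sketch below; treat it as an assumption, not the actual implementation:

```python
from asyncio import Event

from playwright.async_api import Request


def capture_req(request: Request, *, captured: list[str], got_one: Event) -> None:
    # assumed behavior: record any .m3u8 request and wake the waiting coroutine
    if ".m3u8" in request.url:
        captured.append(request.url)
        got_one.set()
```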

async def get_events(
    client: httpx.AsyncClient,
    base_url: str,
    cached_keys: set[str],
) -> list[dict[str, str]]:
    if not (api_data := API_FILE.load(per_entry=False, index=True)):
        api_data = await refresh_api_cache(client, base_url)
        API_FILE.write(api_data)

    events: list[dict[str, str]] = []

    now = Time.clean(Time.now())
    start_dt = now.delta(minutes=-30)
    end_dt = now.delta(minutes=30)

    pattern = re.compile(r"\-+|\(")

    for event in api_data:
        match_id = event["matchId"]
        name = event["title"]
        league = event["league"]

        if not (date := event.get("date")):
            continue

        event_dt = Time.from_str(date)

        if not start_dt <= event_dt <= end_dt:
            continue

        sport = pattern.split(league, 1)[0].strip()

        logo = urljoin(base_url, poster) if (poster := event.get("poster")) else None

        key = f"[{sport}] {name} (WFTY)"

        if cached_keys & {key}:
            continue

        events.append(
            {
                "sport": sport,
                "event": name,
                "link": f"https://www.watchfooty.live/en/stream/{match_id}",
                "logo": logo,
                "timestamp": event_dt.timestamp(),
            }
        )

    return events

async def scrape(client: httpx.AsyncClient) -> None:
    cached_urls = CACHE_FILE.load()
    cached_count = len(cached_urls)

    urls.update(cached_urls)

    log.info(f"Loaded {cached_count} event(s) from cache")

    if not (base_url := await network.get_base(MIRRORS)):
        log.warning("No working WatchFooty mirrors")
        CACHE_FILE.write(cached_urls)
        return

    log.info(f'Scraping from "{base_url}"')

    events = await get_events(
        client,
        base_url,
        set(cached_urls.keys()),
    )

    log.info(f"Processing {len(events)} new URL(s)")

    # breakpoint()

    async with async_playwright() as p:
        browser, context = await network.browser(p)

        for i, ev in enumerate(events, start=1):
            url = await network.safe_process(
                lambda: process_event(
                    ev["link"],
                    url_num=i,
                    context=context,
                ),
                url_num=i,
                log=log,
            )

            if url:
                sport, event, logo, ts = (
                    ev["sport"],
                    ev["event"],
                    ev["logo"],
                    ev["timestamp"],
                )

                key = f"[{sport}] {event} (WFTY)"

                tvg_id, pic = leagues.get_tvg_info(sport, event)

                entry = {
                    "url": url,
                    "logo": logo or pic,
                    "base": base_url,
                    "timestamp": ts,
                    "id": tvg_id or "Live.Event.us",
                }

                urls[key] = cached_urls[key] = entry

        await browser.close()

    if new_count := len(cached_urls) - cached_count:
        log.info(f"Collected and cached {new_count} new event(s)")
    else:
        log.info("No new events found")

    CACHE_FILE.write(cached_urls)