- edit scraping for streamhub.py
- edit scraping for streamsgate.py
- misc edits.
This commit is contained in:
doms9 2026-04-05 17:26:17 -04:00
parent 55c8b7cfd3
commit 00000d9788
3 changed files with 171 additions and 102 deletions

View file

@@ -1,8 +1,8 @@
import asyncio import asyncio
import re
from functools import partial from functools import partial
from urllib.parse import urljoin from urllib.parse import urljoin, urlparse
from playwright.async_api import Browser
from selectolax.parser import HTMLParser from selectolax.parser import HTMLParser
from .utils import Cache, Time, get_logger, leagues, network from .utils import Cache, Time, get_logger, leagues, network
@ -34,6 +34,61 @@ SPORT_ENDPOINTS = [
] ]
async def process_event(url: str, url_num: int) -> str | None:
    """Resolve a streamhub event page to its M3U8 stream URL.

    Follows two iframe hops: the event page's ``iframe#playerIframe`` src
    yields an id used to build an ``embed1/<id>.php`` player URL on
    ``BASE_URL``; that page's ``<center> iframe`` embeds the final player,
    whose script contains the ``src: '...'`` M3U8 reference.

    Args:
        url: Event page URL to scrape.
        url_num: 1-based index of the URL, used only to prefix log lines.

    Returns:
        The captured M3U8 URL string, or ``None`` when any step fails
        (request failure, missing iframe, or no source match).
    """
    if not (event_data := await network.request(url, log=log)):
        log.warning(f"URL {url_num}) Failed to load url.")
        return

    soup_1 = HTMLParser(event_data.content)
    ifr_1 = soup_1.css_first("iframe#playerIframe")

    if not ifr_1 or not (src := ifr_1.attributes.get("src")):
        log.warning(f"URL {url_num}) No iframe element found.")
        return

    # The iframe src's last path segment looks like "<id>_<suffix>"; only the
    # id part is needed to build the embed1 player URL on the site's own host.
    parsed = urlparse(src)
    ifr_1_src = urljoin(
        BASE_URL,
        f"embed1/{parsed.path.split('/')[-1].split('_')[0]}.php",
    )

    if not (
        ifr_1_src_data := await network.request(
            ifr_1_src,
            headers={"Referer": url},
            log=log,
        )
    ):
        log.warning(f"URL {url_num}) Failed to load iframe source. (IFR1)")
        return

    soup_2 = HTMLParser(ifr_1_src_data.content)
    ifr_2 = soup_2.css_first("center iframe")

    if not ifr_2 or not (ifr_2_src := ifr_2.attributes.get("src")):
        log.warning(f"URL {url_num}) Unable to locate iframe. (IFR2)")
        return

    # Protocol-relative srcs ("//host/...") need an explicit scheme.
    ifr_2_src = f"https:{ifr_2_src}" if ifr_2_src.startswith("//") else ifr_2_src

    if not (ifr_2_src_data := await network.request(ifr_2_src, log=log)):
        log.warning(f"URL {url_num}) Failed to load iframe source.")
        return

    valid_m3u8 = re.compile(r"src:\s+(\'|\")([^\']+)(\'|\")", re.I)

    if not (match := valid_m3u8.search(ifr_2_src_data.text)):
        log.warning(f"URL {url_num}) No source found.")
        return

    log.info(f"URL {url_num}) Captured M3U8")
    # match[2] is the quoted URL itself, so this is a single str — the
    # original annotation (tuple[str | None, str | None]) was incorrect.
    return match[2]
async def refresh_html_cache( async def refresh_html_cache(
date: str, date: str,
sport_id: str, sport_id: str,
@ -115,8 +170,8 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
live = [] live = []
start_ts = now.delta(hours=-1).timestamp() start_ts = now.delta(minutes=-30).timestamp()
end_ts = now.delta(minutes=1).timestamp() end_ts = now.delta(minutes=30).timestamp()
for k, v in events.items(): for k, v in events.items():
if k in cached_keys: if k in cached_keys:
@ -130,7 +185,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
return live return live
async def scrape(browser: Browser) -> None: async def scrape() -> None:
cached_urls = CACHE_FILE.load() cached_urls = CACHE_FILE.load()
valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
@ -146,16 +201,12 @@ async def scrape(browser: Browser) -> None:
if events := await get_events(cached_urls.keys()): if events := await get_events(cached_urls.keys()):
log.info(f"Processing {len(events)} new URL(s)") log.info(f"Processing {len(events)} new URL(s)")
async with network.event_context(browser, stealth=False) as context:
for i, ev in enumerate(events, start=1): for i, ev in enumerate(events, start=1):
async with network.event_page(context) as page:
handler = partial( handler = partial(
network.process_event, process_event,
url=(link := ev["link"]), url=(link := ev["link"]),
url_num=i, url_num=i,
page=page,
timeout=5,
log=log,
) )
url = await network.safe_process( url = await network.safe_process(
@ -178,10 +229,11 @@ async def scrape(browser: Browser) -> None:
entry = { entry = {
"url": url, "url": url,
"logo": logo, "logo": logo,
"base": "https://storytrench.net/", "base": "https://hardsmart.click",
"timestamp": ts, "timestamp": ts,
"id": tvg_id or "Live.Event.us", "id": tvg_id or "Live.Event.us",
"link": link, "link": link,
"UA": "curl/8.19.0",
} }
cached_urls[key] = entry cached_urls[key] = entry
@ -189,8 +241,6 @@ async def scrape(browser: Browser) -> None:
if url: if url:
valid_count += 1 valid_count += 1
entry["url"] = url.split("?")[0]
urls[key] = entry urls[key] = entry
log.info(f"Collected and cached {valid_count - cached_count} new event(s)") log.info(f"Collected and cached {valid_count - cached_count} new event(s)")

View file

@@ -1,10 +1,11 @@
import asyncio import asyncio
import re
from functools import partial from functools import partial
from itertools import chain from itertools import chain
from typing import Any from typing import Any
from urllib.parse import urljoin from urllib.parse import urljoin
from playwright.async_api import Browser from selectolax.parser import HTMLParser
from .utils import Cache, Time, get_logger, leagues, network from .utils import Cache, Time, get_logger, leagues, network
@ -46,6 +47,42 @@ def get_event(t1: str, t2: str) -> str:
return f"{t1.strip()} vs {t2.strip()}" return f"{t1.strip()} vs {t2.strip()}"
async def process_event(url: str, url_num: int) -> str | None:
    """Resolve a streamsgate event page to its M3U8 stream URL.

    Loads the event page, follows its first ``<iframe>`` (adding an https
    scheme to protocol-relative srcs), and extracts the ``file: '...'``
    M3U8 reference from the embedded player's script.

    Args:
        url: Event page URL to scrape.
        url_num: 1-based index of the URL, used only to prefix log lines.

    Returns:
        The captured M3U8 URL string, or ``None`` when any step fails
        (request failure, missing iframe, or no source match).
    """
    if not (event_data := await network.request(url, log=log)):
        log.warning(f"URL {url_num}) Failed to load url.")
        return

    soup_1 = HTMLParser(event_data.content)
    ifr = soup_1.css_first("iframe")

    if not ifr or not (src := ifr.attributes.get("src")):
        log.warning(f"URL {url_num}) No iframe element found.")
        return

    # Protocol-relative srcs ("//host/...") need an explicit scheme.
    ifr_src = f"https:{src}" if src.startswith("//") else src

    if not (
        ifr_src_data := await network.request(
            ifr_src,
            headers={"Referer": url},
            log=log,
        )
    ):
        log.warning(f"URL {url_num}) Failed to load iframe source. (IFR1)")
        return

    valid_m3u8 = re.compile(r"file:\s+(\'|\")([^\"]*)(\'|\")", re.I)

    if not (match := valid_m3u8.search(ifr_src_data.text)):
        log.warning(f"URL {url_num}) No source found.")
        return

    log.info(f"URL {url_num}) Captured M3U8")
    # match[2] is the quoted URL itself, so this is a single str — the
    # original annotation (tuple[str | None, str | None]) was incorrect.
    return match[2]
async def refresh_api_cache(now_ts: float) -> list[dict[str, Any]]: async def refresh_api_cache(now_ts: float) -> list[dict[str, Any]]:
tasks = [network.request(url, log=log) for url in SPORT_URLS] tasks = [network.request(url, log=log) for url in SPORT_URLS]
@ -74,8 +111,8 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
events = [] events = []
start_dt = now.delta(hours=-1) start_dt = now.delta(minutes=-30)
end_dt = now.delta(minutes=5) end_dt = now.delta(minutes=30)
for stream_group in api_data: for stream_group in api_data:
date = stream_group.get("time") date = stream_group.get("time")
@ -118,7 +155,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
return events return events
async def scrape(browser: Browser) -> None: async def scrape() -> None:
cached_urls = CACHE_FILE.load() cached_urls = CACHE_FILE.load()
valid_urls = {k: v for k, v in cached_urls.items() if v["url"]} valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
@ -134,15 +171,11 @@ async def scrape(browser: Browser) -> None:
if events := await get_events(cached_urls.keys()): if events := await get_events(cached_urls.keys()):
log.info(f"Processing {len(events)} new URL(s)") log.info(f"Processing {len(events)} new URL(s)")
async with network.event_context(browser, stealth=False) as context:
for i, ev in enumerate(events, start=1): for i, ev in enumerate(events, start=1):
async with network.event_page(context) as page:
handler = partial( handler = partial(
network.process_event, process_event,
url=(link := ev["link"]), url=(link := ev["link"]),
url_num=i, url_num=i,
page=page,
log=log,
) )
url = await network.safe_process( url = await network.safe_process(
@ -165,7 +198,7 @@ async def scrape(browser: Browser) -> None:
entry = { entry = {
"url": url, "url": url,
"logo": logo, "logo": logo,
"base": "https://instreams.click/", "base": "https://streamfree.click",
"timestamp": ts, "timestamp": ts,
"id": tvg_id or "Live.Event.us", "id": tvg_id or "Live.Event.us",
"link": link, "link": link,
@ -176,8 +209,6 @@ async def scrape(browser: Browser) -> None:
if url: if url:
valid_count += 1 valid_count += 1
entry["url"] = url.split("&e")[0]
urls[key] = entry urls[key] = entry
log.info(f"Collected and cached {valid_count - cached_count} new event(s)") log.info(f"Collected and cached {valid_count - cached_count} new event(s)")

View file

@@ -27,36 +27,26 @@ def fix_txt(s: str) -> str:
async def process_event(url: str, url_num: int) -> str | None: async def process_event(url: str, url_num: int) -> str | None:
if not (event_data := await network.request(url, log=log)): if not (event_data := await network.request(url, log=log)):
log.warning(f"URL {url_num}) Failed to load url.") log.warning(f"URL {url_num}) Failed to load url.")
return return
soup_1 = HTMLParser(event_data.content) soup_1 = HTMLParser(event_data.content)
if not (iframe_1 := soup_1.css_first("iframe")): iframe_1 = soup_1.css_first("iframe")
if not iframe_1 or not (iframe_1_src := iframe_1.attributes.get("src")):
log.warning(f"URL {url_num}) No iframe element found. (IFR1)") log.warning(f"URL {url_num}) No iframe element found. (IFR1)")
return
if not (iframe_1_src := iframe_1.attributes.get("src")):
log.warning(f"URL {url_num}) No iframe source found. (IFR1)")
return return
if not (iframe_1_src_data := await network.request(iframe_1_src, log=log)): if not (iframe_1_src_data := await network.request(iframe_1_src, log=log)):
log.warning(f"URL {url_num}) Failed to load iframe source. (IFR1)") log.warning(f"URL {url_num}) Failed to load iframe source. (IFR1)")
return return
soup_2 = HTMLParser(iframe_1_src_data.content) soup_2 = HTMLParser(iframe_1_src_data.content)
if not (iframe_2 := soup_2.css_first("iframe")): iframe_2 = soup_2.css_first("iframe")
if not iframe_2 or not (iframe_2_src := iframe_2.attributes.get("src")):
log.warning(f"URL {url_num}) No iframe element found. (IFR2)") log.warning(f"URL {url_num}) No iframe element found. (IFR2)")
return
if not (iframe_2_src := iframe_2.attributes.get("src")):
log.warning(f"URL {url_num}) No iframe source found. (IFR2)")
return return
if not ( if not (
@ -67,14 +57,12 @@ async def process_event(url: str, url_num: int) -> str | None:
) )
): ):
log.warning(f"URL {url_num}) Failed to load iframe source. (IFR2)") log.warning(f"URL {url_num}) Failed to load iframe source. (IFR2)")
return return
valid_m3u8 = re.compile(r'currentStreamUrl\s+=\s+"([^"]*)"', re.I) valid_m3u8 = re.compile(r'currentStreamUrl\s+=\s+"([^"]*)"', re.I)
if not (match := valid_m3u8.search(iframe_2_src_data.text)): if not (match := valid_m3u8.search(iframe_2_src_data.text)):
log.warning(f"URL {url_num}) No Clappr source found. (IFR2)") log.warning(f"URL {url_num}) No Clappr source found.")
return return
log.info(f"URL {url_num}) Captured M3U8") log.info(f"URL {url_num}) Captured M3U8")