From 00000d920aa169bde4f8c0a622e5bd65dbb7c2dd Mon Sep 17 00:00:00 2001
From: doms9 <96013514+doms9@users.noreply.github.com>
Date: Tue, 23 Dec 2025 03:17:48 -0500
Subject: [PATCH] add semaphores to scrapers

(maybe) fix hanging on watchfooty

misc. edits
---
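Notes: every scraper now hands a shared asyncio.Semaphore into
network.safe_process(), capping concurrency per transport: at most 10
httpx-based handlers (HTTP_S) and 3 Playwright-based handlers (PW_S) run
at once. A minimal sketch of the gating pattern, with a simplified
signature (an illustration of the idea, not the full implementation
below):

    import asyncio

    HTTP_S = asyncio.Semaphore(10)  # cap concurrent httpx handlers
    PW_S = asyncio.Semaphore(3)     # cap concurrent Playwright pages

    async def safe_process(fn, semaphore, timeout=10):
        # hold a slot for the handler's whole lifetime so at most N run
        # at once; wait_for still bounds each handler's runtime
        async with semaphore:
            try:
                return await asyncio.wait_for(fn(), timeout=timeout)
            except asyncio.TimeoutError:
                return None  # skip the event; the slot frees on exit
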
 EPG/fetch.py                   |  2 +-
 M3U8/fetch.py                  |  2 +-
 M3U8/scrapers/embedhd.py       |  1 +
 M3U8/scrapers/fawa.py          |  2 +-
 M3U8/scrapers/istreameast.py   |  2 +-
 M3U8/scrapers/pixel.py         | 54 +++++++++++++++++++---------------
 M3U8/scrapers/ppv.py           |  1 +
 M3U8/scrapers/roxie.py         |  1 +
 M3U8/scrapers/shark.py         |  1 +
 M3U8/scrapers/sport9.py        |  1 +
 M3U8/scrapers/streambtw.py     |  6 ++--
 M3U8/scrapers/streamcenter.py  |  1 +
 M3U8/scrapers/streamfree.py    |  9 +++++-
 M3U8/scrapers/streamhub.py     |  1 +
 M3U8/scrapers/streamsgate.py   |  1 +
 M3U8/scrapers/strmd.py         |  1 +
 M3U8/scrapers/tvpass.py        |  9 +++++-
 M3U8/scrapers/utils/webwork.py | 47 +++++++++++++++++------------
 M3U8/scrapers/watchfooty.py    | 33 ++++++++------------
 M3U8/scrapers/webcast.py       |  1 +
 20 files changed, 103 insertions(+), 73 deletions(-)

diff --git a/EPG/fetch.py b/EPG/fetch.py
index 5d7983b..fbd916e 100644
--- a/EPG/fetch.py
+++ b/EPG/fetch.py
@@ -22,7 +22,7 @@ epg_urls = [
 ]
 
 client = httpx.AsyncClient(
-    timeout=5,
+    timeout=httpx.Timeout(5.0),
     follow_redirects=True,
     http2=True,
     headers={
diff --git a/M3U8/fetch.py b/M3U8/fetch.py
index 75de2ff..135eb2b 100644
--- a/M3U8/fetch.py
+++ b/M3U8/fetch.py
@@ -64,7 +64,7 @@ async def main() -> None:
         asyncio.create_task(streamsgate.scrape()),
         asyncio.create_task(strmd.scrape()),
         asyncio.create_task(tvpass.scrape()),
-        # asyncio.create_task(watchfooty.scrape()),
+        asyncio.create_task(watchfooty.scrape()),
         asyncio.create_task(webcast.scrape()),
     ]
 
diff --git a/M3U8/scrapers/embedhd.py b/M3U8/scrapers/embedhd.py
index 5ff3cf8..30c7622 100644
--- a/M3U8/scrapers/embedhd.py
+++ b/M3U8/scrapers/embedhd.py
@@ -101,6 +101,7 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.PW_S,
             log=log,
         )
 
diff --git a/M3U8/scrapers/fawa.py b/M3U8/scrapers/fawa.py
index 8d61668..aec9edd 100644
--- a/M3U8/scrapers/fawa.py
+++ b/M3U8/scrapers/fawa.py
@@ -114,8 +114,8 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.HTTP_S,
             log=log,
-            timeout=10,
         )
 
         if url:
diff --git a/M3U8/scrapers/istreameast.py b/M3U8/scrapers/istreameast.py
index c4ce5b7..d8a33c3 100644
--- a/M3U8/scrapers/istreameast.py
+++ b/M3U8/scrapers/istreameast.py
@@ -131,8 +131,8 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.HTTP_S,
             log=log,
-            timeout=10,
         )
 
         if url:
diff --git a/M3U8/scrapers/pixel.py b/M3U8/scrapers/pixel.py
index de9ab0a..14a2b73 100644
--- a/M3U8/scrapers/pixel.py
+++ b/M3U8/scrapers/pixel.py
@@ -1,6 +1,7 @@
 import json
+from functools import partial
 
-from playwright.async_api import async_playwright
+from playwright.async_api import BrowserContext, async_playwright
 
 from .utils import Cache, Time, get_logger, leagues, network
 
@@ -15,36 +16,29 @@ CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=19_800)
 BASE_URL = "https://pixelsport.tv/backend/livetv/events"
 
 
-async def get_api_data() -> dict[str, list[dict, str, str]]:
-    async with async_playwright() as p:
-        try:
-            browser, context = await network.browser(p)
+async def get_api_data(context: BrowserContext) -> dict[str, list[dict, str, str]]:
+    try:
+        page = await context.new_page()
 
-            page = await context.new_page()
+        await page.goto(
+            BASE_URL,
+            wait_until="domcontentloaded",
+            timeout=10_000,
+        )
 
-            await page.goto(
-                BASE_URL,
-                wait_until="domcontentloaded",
-                timeout=10_000,
-            )
+        raw_json = await page.locator("pre").inner_text(timeout=5_000)
+    except Exception as e:
+        log.error(f'Failed to fetch "{BASE_URL}": {e}')
 
-            raw_json = await page.locator("pre").inner_text(timeout=5_000)
-
-        except Exception as e:
-            log.error(f'Failed to fetch "{BASE_URL}": {e}')
-
-            return {}
-
-        finally:
-            await browser.close()
+        return {}
 
     return json.loads(raw_json)
 
 
-async def get_events() -> dict[str, dict[str, str | float]]:
+async def get_events(context: BrowserContext) -> dict[str, dict[str, str | float]]:
     now = Time.clean(Time.now())
 
-    api_data = await get_api_data()
+    api_data = await get_api_data(context)
 
     events = {}
 
@@ -91,9 +85,21 @@ async def scrape() -> None:
 
     log.info(f'Scraping from "{BASE_URL}"')
 
-    events = await get_events()
+    async with async_playwright() as p:
+        browser, context = await network.browser(p)
 
-    urls.update(events)
+        handler = partial(get_events, context=context)
+
+        events = await network.safe_process(
+            handler,
+            url_num=1,
+            semaphore=network.PW_S,
+            log=log,
+        )
+
+        await browser.close()
+
+    urls.update(events or {})
 
     CACHE_FILE.write(urls)
 
diff --git a/M3U8/scrapers/ppv.py b/M3U8/scrapers/ppv.py
index d4ad414..5bc31b4 100644
--- a/M3U8/scrapers/ppv.py
+++ b/M3U8/scrapers/ppv.py
@@ -123,6 +123,7 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.PW_S,
             log=log,
         )
 
diff --git a/M3U8/scrapers/roxie.py b/M3U8/scrapers/roxie.py
index 868c2e8..d04f848 100644
--- a/M3U8/scrapers/roxie.py
+++ b/M3U8/scrapers/roxie.py
@@ -159,6 +159,7 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.HTTP_S,
             log=log,
         )
 
diff --git a/M3U8/scrapers/shark.py b/M3U8/scrapers/shark.py
index 26297f2..bc42cc3 100644
--- a/M3U8/scrapers/shark.py
+++ b/M3U8/scrapers/shark.py
@@ -137,6 +137,7 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.HTTP_S,
             log=log,
         )
 
diff --git a/M3U8/scrapers/sport9.py b/M3U8/scrapers/sport9.py
index f3b7ab6..f795d45 100644
--- a/M3U8/scrapers/sport9.py
+++ b/M3U8/scrapers/sport9.py
@@ -121,6 +121,7 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.PW_S,
             log=log,
         )
 
diff --git a/M3U8/scrapers/streambtw.py b/M3U8/scrapers/streambtw.py
index e6029b4..78cc3e5 100644
--- a/M3U8/scrapers/streambtw.py
+++ b/M3U8/scrapers/streambtw.py
@@ -66,9 +66,7 @@ async def get_events() -> list[dict[str, str]]:
         ):
             continue
 
-        league = league_elem.text(strip=True)
-
-        name = event_elem.text(strip=True)
+        league, name = league_elem.text(strip=True), event_elem.text(strip=True)
 
         events.append(
             {
@@ -108,8 +106,8 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.HTTP_S,
             log=log,
-            timeout=10,
         )
 
         if url:
diff --git a/M3U8/scrapers/streamcenter.py b/M3U8/scrapers/streamcenter.py
index 86719db..acf3622 100644
--- a/M3U8/scrapers/streamcenter.py
+++ b/M3U8/scrapers/streamcenter.py
@@ -119,6 +119,7 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.PW_S,
             log=log,
         )
 
diff --git a/M3U8/scrapers/streamfree.py b/M3U8/scrapers/streamfree.py
index f065ccd..050fd1c 100644
--- a/M3U8/scrapers/streamfree.py
+++ b/M3U8/scrapers/streamfree.py
@@ -77,7 +77,14 @@ async def scrape() -> None:
 
     log.info(f'Scraping from "{BASE_URL}"')
 
-    urls.update(await get_events())
+    events = await network.safe_process(
+        get_events,
+        url_num=1,
+        semaphore=network.HTTP_S,
+        log=log,
+    )
+
+    urls.update(events or {})
 
     CACHE_FILE.write(urls)
 
diff --git a/M3U8/scrapers/streamhub.py b/M3U8/scrapers/streamhub.py
index ad051ad..9eda928 100644
--- a/M3U8/scrapers/streamhub.py
+++ b/M3U8/scrapers/streamhub.py
@@ -164,6 +164,7 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.PW_S,
             log=log,
         )
 
diff --git a/M3U8/scrapers/streamsgate.py b/M3U8/scrapers/streamsgate.py
index 66fe4e8..1a9ee6e 100644
--- a/M3U8/scrapers/streamsgate.py
+++ b/M3U8/scrapers/streamsgate.py
@@ -151,6 +151,7 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.PW_S,
             log=log,
         )
 
diff --git a/M3U8/scrapers/strmd.py b/M3U8/scrapers/strmd.py
index a882152..21b5b44 100644
--- a/M3U8/scrapers/strmd.py
+++ b/M3U8/scrapers/strmd.py
@@ -151,6 +151,7 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.PW_S,
             log=log,
         )
 
diff --git a/M3U8/scrapers/tvpass.py b/M3U8/scrapers/tvpass.py
index 560734c..f99db2e 100644
--- a/M3U8/scrapers/tvpass.py
+++ b/M3U8/scrapers/tvpass.py
@@ -66,7 +66,14 @@ async def scrape() -> None:
 
     log.info(f'Scraping from "{BASE_URL}"')
 
-    urls.update(await get_events())
+    events = await network.safe_process(
+        get_events,
+        url_num=1,
+        semaphore=network.HTTP_S,
+        log=log,
+    )
+
+    urls.update(events or {})
 
     CACHE_FILE.write(urls)
 
diff --git a/M3U8/scrapers/utils/webwork.py b/M3U8/scrapers/utils/webwork.py
index 942e0ef..097a02b 100644
--- a/M3U8/scrapers/utils/webwork.py
+++ b/M3U8/scrapers/utils/webwork.py
@@ -24,11 +24,15 @@ class Network:
         "Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0"
     )
 
+    HTTP_S = asyncio.Semaphore(10)
+
+    PW_S = asyncio.Semaphore(3)
+
     proxy_base = "https://stream.nvrmind.xyz"
 
     def __init__(self) -> None:
         self.client = httpx.AsyncClient(
-            timeout=5,
+            timeout=httpx.Timeout(5.0),
             follow_redirects=True,
             headers={"User-Agent": Network.UA},
             http2=True,
@@ -85,34 +89,39 @@ class Network:
     async def safe_process(
         fn: Callable[[], Awaitable[T]],
         url_num: int,
-        timeout: int | float = 15,
+        semaphore: asyncio.Semaphore,
+        timeout: int | float = 10,
         log: logging.Logger | None = None,
     ) -> T | None:
         log = log or logger
 
-        task = asyncio.create_task(fn())
+        async with semaphore:
+            task = asyncio.create_task(fn())
 
-        try:
-            return await asyncio.wait_for(task, timeout=timeout)
-        except asyncio.TimeoutError:
-            log.warning(f"URL {url_num}) Timed out after {timeout}s, skipping event")
+            try:
+                return await asyncio.wait_for(task, timeout=timeout)
+            except asyncio.TimeoutError:
+                log.warning(
+                    f"URL {url_num}) Timed out after {timeout}s, skipping event"
+                )
 
-            task.cancel()
+                task.cancel()
 
-            try:
-                await task
-            except asyncio.CancelledError:
-                pass
+                try:
+                    await task
+                except asyncio.CancelledError:
+                    pass
 
-            except Exception as e:
-                log.debug(f"URL {url_num}) Ignore exception after timeout: {e}")
+                except Exception as e:
+                    log.debug(f"URL {url_num}) Ignore exception after timeout: {e}")
 
-            return
-        except Exception as e:
-            log.error(f"URL {url_num}) Unexpected error: {e}")
+                return
 
-            return
+            except Exception as e:
+                log.error(f"URL {url_num}) Unexpected error: {e}")
+
+            return
 
     @staticmethod
     def capture_req(
diff --git a/M3U8/scrapers/watchfooty.py b/M3U8/scrapers/watchfooty.py
index 3b90307..4367ac8 100644
--- a/M3U8/scrapers/watchfooty.py
+++ b/M3U8/scrapers/watchfooty.py
@@ -78,12 +78,12 @@ async def process_event(
 
     pattern = re.compile(r"\((\d+)\)")
 
-    page = await context.new_page()
-
     captured: list[str] = []
 
     got_one = asyncio.Event()
 
+    page = await context.new_page()
+
     handler = partial(
         network.capture_req,
         captured=captured,
@@ -102,10 +102,7 @@ async def process_event(
     await page.wait_for_timeout(2_000)
 
     try:
-        header = await page.wait_for_selector(
-            "text=/Stream Links/i",
-            timeout=5_000,
-        )
+        header = await page.wait_for_selector("text=/Stream Links/i", timeout=5_000)
 
         text = await header.inner_text()
     except TimeoutError:
@@ -120,8 +117,7 @@ async def process_event(
 
     try:
         first_available = await page.wait_for_selector(
-            'a[href*="/stream/"]',
-            timeout=3_000,
+            'a[href*="/stream/"]', timeout=3_000
         )
     except TimeoutError:
         log.warning(f"URL {url_num}) No available stream links.")
@@ -133,22 +129,18 @@ async def process_event(
 
         return None, None
 
+    embed = re.sub(
+        pattern=r"^.*\/stream",
+        repl="https://spiderembed.top/embed",
+        string=href,
+    )
+
     await page.goto(
-        href,
+        embed,
         wait_until="domcontentloaded",
         timeout=5_000,
     )
 
-    if not (iframe := await page.query_selector("iframe")):
-        log.warning(f"URL {url_num}) No iframe found.")
-
-        return None, None
-
-    if not (iframe_src := await iframe.get_attribute("src")):
-        log.warning(f"URL {url_num}) No iframe source found.")
-
-        return None, None
-
     wait_task = asyncio.create_task(got_one.wait())
 
     try:
@@ -170,7 +162,7 @@ async def process_event(
     if captured:
         log.info(f"URL {url_num}) Captured M3U8")
 
-        return captured[-1], iframe_src
+        return captured[0], embed
 
     log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
 
@@ -282,6 +274,7 @@ async def scrape() -> None:
         url, iframe = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.PW_S,
             log=log,
         )
 
diff --git a/M3U8/scrapers/webcast.py b/M3U8/scrapers/webcast.py
index 1b9ee12..8a5623b 100644
--- a/M3U8/scrapers/webcast.py
+++ b/M3U8/scrapers/webcast.py
@@ -141,6 +141,7 @@ async def scrape() -> None:
         url = await network.safe_process(
             handler,
             url_num=i,
+            semaphore=network.PW_S,
             log=log,
         )