# iptv/M3U8/scrapers/utils/webwork.py
import asyncio
import logging
import random
import re
from collections.abc import AsyncGenerator, Awaitable, Callable
from contextlib import asynccontextmanager
from functools import cache, partial
from pathlib import Path
from typing import TypeVar
from urllib.parse import urlparse

import httpx
from playwright.async_api import (
    Browser,
    BrowserContext,
    Page,
    Playwright,
    Request,
    Route,
)

from .logger import get_logger

logger = get_logger(__name__)

T = TypeVar("T")
class Network:
    """Shared async networking helpers: an httpx client for plain fetches and
    Playwright utilities for pages that only reveal their M3U8 URL at runtime.

    All failure modes are logged and reported as ``None`` return values rather
    than raised, so callers can fan out over many URLs without try/except.
    """

    # Desktop Edge user agent sent with every request so endpoints serve us
    # the same content they would serve a real browser.
    UA = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0"
    )

    # Class-wide concurrency caps: cheap HTTP fetches vs. heavier Playwright work.
    HTTP_S = asyncio.Semaphore(10)
    PW_S = asyncio.Semaphore(3)

    # Compiled once (hoisted out of capture_req, which runs per network request):
    # matches any *.m3u8 URL whose full text does not mention a known-bad CDN.
    _M3U8_RE = re.compile(
        rf"^(?!.*({'|'.join(re.escape(i) for i in ('amazonaws', 'knitcdn', 'jwpltx'))})).*\.m3u8",
        re.I,
    )

    def __init__(self) -> None:
        # One shared client: HTTP/2, redirects followed, 5s total timeout.
        self.client = httpx.AsyncClient(
            timeout=httpx.Timeout(5.0),
            follow_redirects=True,
            headers={"User-Agent": Network.UA},
            http2=True,
        )

    async def request(
        self,
        url: str,
        log: logging.Logger | None = None,
        **kwargs,
    ) -> httpx.Response | None:
        """GET *url* and return the response, or ``None`` on any HTTP error.

        Extra keyword arguments are passed through to ``AsyncClient.get``.
        Non-2xx statuses are treated as errors via ``raise_for_status``.
        """
        log = log or logger
        try:
            r = await self.client.get(url, **kwargs)
            r.raise_for_status()
            return r
        # TimeoutException is already an HTTPError subclass; kept for clarity.
        except (httpx.HTTPError, httpx.TimeoutException) as e:
            log.error(f'Failed to fetch "{url}": {e}')
            # BUGFIX: previously returned "" here, violating the declared
            # `httpx.Response | None` return type; callers truth-test the result.
            return None

    async def get_base(self, mirrors: list[str]) -> str | None:
        """Return the first mirror answering 200, trying them in random order.

        Returns ``None`` when no mirror responds successfully.
        """
        # BUGFIX: iterate a shuffled *copy* — random.shuffle mutated the
        # caller's list in place.
        for mirror in random.sample(mirrors, k=len(mirrors)):
            r = await self.request(mirror)
            if r is not None and r.status_code == 200:
                return mirror
        return None

    @staticmethod
    async def safe_process(
        fn: Callable[[], Awaitable[T]],
        url_num: int,
        semaphore: asyncio.Semaphore,
        timeout: int | float = 30,
        log: logging.Logger | None = None,
    ) -> T | None:
        """Run *fn* under *semaphore* with a hard *timeout* in seconds.

        Returns fn's result, or ``None`` on timeout or any other exception
        (logged against *url_num* for traceability, never raised).
        """
        log = log or logger
        async with semaphore:
            task = asyncio.create_task(fn())
            try:
                return await asyncio.wait_for(task, timeout=timeout)
            except asyncio.TimeoutError:
                log.warning(
                    f"URL {url_num}) Timed out after {timeout}s, skipping event"
                )
                # Cancel and drain the task so its cancellation (or any late
                # exception) is fully consumed before we return.
                task.cancel()
                try:
                    await task
                except asyncio.CancelledError:
                    pass
                except Exception as e:
                    log.warning(f"URL {url_num}) Ignore exception after timeout: {e}")
                return None
            except Exception as e:
                log.error(f"URL {url_num}) Unexpected error: {e}")
                return None

    # BUGFIX (both loaders below): decorator order was `@cache` outside
    # `@staticmethod`, which only works by accident on Python >= 3.10 where
    # staticmethod objects became callable. `@staticmethod` outermost with
    # `@cache` on the plain function is the documented stacking.
    @staticmethod
    @cache
    def stealth_js() -> str:
        """Read and memoize the stealth init script shipped next to this module."""
        return (Path(__file__).parent / "stealth.js").read_text(encoding="utf-8")

    @staticmethod
    @cache
    def blocked_domains() -> list[str]:
        """Read and memoize the blocked-domain list shipped next to this module.

        NOTE(review): lines are treated as bare domain names, not full
        EasyList filter syntax — confirm easylist.txt matches that format.
        """
        # BUGFIX: skip blank lines — an empty "domain" would otherwise make
        # to_block() match hosts it should not.
        return [
            domain
            for raw in (Path(__file__).parent / "easylist.txt")
            .read_text(encoding="utf-8")
            .splitlines()
            if (domain := raw.strip())
        ]

    @staticmethod
    def to_block(request: Request) -> bool:
        """True when the request's host is, or is a subdomain of, a blocked domain."""
        hostname = (urlparse(request.url).hostname or "").lower()
        return any(
            hostname == domain or hostname.endswith(f".{domain}")
            for domain in Network.blocked_domains()
        )

    @staticmethod
    async def _adblock(route: Route) -> None:
        """Playwright route handler: abort blocked script/image/media/xhr requests."""
        request = route.request
        # Only these resource types are worth filtering; pass the rest through.
        if request.resource_type not in ("script", "image", "media", "xhr"):
            await route.continue_()
            return
        if Network.to_block(request):
            await route.abort()
        else:
            await route.continue_()

    @staticmethod
    @asynccontextmanager
    async def event_context(
        browser: Browser,
        stealth: bool = True,
        ignore_https: bool = False,
    ) -> AsyncGenerator[BrowserContext, None]:
        """Yield a browser context, optionally hardened against bot detection.

        With ``stealth=True`` the context gets a realistic fingerprint, the
        stealth init script, and the ad-block route. The context is always
        closed on exit.
        """
        context: BrowserContext | None = None
        try:
            if stealth:
                context = await browser.new_context(
                    user_agent=Network.UA,
                    ignore_https_errors=ignore_https,
                    viewport={"width": 1366, "height": 768},
                    device_scale_factor=1,
                    locale="en-US",
                    timezone_id="America/New_York",
                    color_scheme="dark",
                    extra_http_headers={
                        "Accept-Language": "en-US,en;q=0.9",
                        "Upgrade-Insecure-Requests": "1",
                    },
                )
                await context.add_init_script(script=Network.stealth_js())
                await context.route("**/*", Network._adblock)
            else:
                # BUGFIX: ignore_https was silently dropped on this path.
                context = await browser.new_context(ignore_https_errors=ignore_https)
            yield context
        finally:
            if context:
                await context.close()

    @staticmethod
    @asynccontextmanager
    async def event_page(context: BrowserContext) -> AsyncGenerator[Page, None]:
        """Yield a fresh page in *context*, always closing it on exit."""
        page = await context.new_page()
        try:
            yield page
        finally:
            await page.close()

    @staticmethod
    async def browser(playwright: Playwright, external: bool = False) -> Browser:
        """Launch headless Firefox, or attach to an external Chromium over CDP."""
        if external:
            return await playwright.chromium.connect_over_cdp("http://localhost:9222")
        return await playwright.firefox.launch(headless=True)

    @staticmethod
    def capture_req(
        req: Request,
        captured: list[str],
        got_one: asyncio.Event,
    ) -> None:
        """Request listener: record M3U8 URLs not served by known-bad CDNs.

        Appends matches to *captured* and sets *got_one* to wake the waiter.
        """
        if Network._M3U8_RE.search(req.url):
            captured.append(req.url)
            got_one.set()

    async def process_event(
        self,
        url: str,
        url_num: int,
        page: Page,
        timeout: int | float = 10,
        log: logging.Logger | None = None,
    ) -> str | None:
        """Load *url* in *page* and sniff the first matching M3U8 request URL.

        Waits up to *timeout* seconds after navigation for a capture; returns
        the captured URL or ``None`` (all failure modes are logged, not raised).
        """
        log = log or logger
        captured: list[str] = []
        got_one = asyncio.Event()
        handler = partial(
            self.capture_req,
            captured=captured,
            got_one=got_one,
        )
        page.on("request", handler)
        try:
            await page.goto(
                url,
                wait_until="domcontentloaded",
                timeout=6_000,  # milliseconds (Playwright navigation timeout)
            )
            wait_task = asyncio.create_task(got_one.wait())
            try:
                await asyncio.wait_for(wait_task, timeout=timeout)
            except asyncio.TimeoutError:
                log.warning(f"URL {url_num}) Timed out waiting for M3U8.")
                return None
            finally:
                # wait_for normally cancels on timeout; this covers other exits.
                if not wait_task.done():
                    wait_task.cancel()
                    try:
                        await wait_task
                    except asyncio.CancelledError:
                        pass
            if captured:
                log.info(f"URL {url_num}) Captured M3U8")
                return captured[0]
            log.warning(f"URL {url_num}) No M3U8 captured after waiting.")
            return None
        except Exception as e:
            log.warning(f"URL {url_num}) {e}")
            return None
        finally:
            page.remove_listener("request", handler)
# Module-level singleton shared by all scrapers; the module's only export.
network = Network()
__all__ = ["network"]