- add tflix.py
- remove strmd.py
- modify playwright browser/context usage
- misc. edits
This commit is contained in:
doms9 2026-01-23 23:44:59 -05:00
parent 1aa60a8ce1
commit 00000d9638
24 changed files with 481 additions and 462 deletions

View file

@ -3,6 +3,7 @@ import asyncio
import re
from pathlib import Path
from playwright.async_api import async_playwright
from scrapers import (
cdnlivetv,
embedhd,
@ -19,7 +20,7 @@ from scrapers import (
streamfree,
streamhub,
streamsgate,
strmd,
tflix,
totalsportek,
tvpass,
watchfooty,
@ -53,31 +54,46 @@ async def main() -> None:
base_m3u8, tvg_chno = load_base()
tasks = [
asyncio.create_task(cdnlivetv.scrape()),
asyncio.create_task(embedhd.scrape()),
asyncio.create_task(fawa.scrape()),
asyncio.create_task(istreameast.scrape()),
asyncio.create_task(pawa.scrape()),
asyncio.create_task(pixel.scrape()),
asyncio.create_task(ppv.scrape()),
asyncio.create_task(roxie.scrape()),
asyncio.create_task(shark.scrape()),
asyncio.create_task(sport9.scrape()),
asyncio.create_task(streambtw.scrape()),
asyncio.create_task(streamcenter.scrape()),
asyncio.create_task(streamfree.scrape()),
asyncio.create_task(streamhub.scrape()),
asyncio.create_task(streamsgate.scrape()),
# asyncio.create_task(strmd.scrape()),
asyncio.create_task(totalsportek.scrape()),
asyncio.create_task(tvpass.scrape()),
asyncio.create_task(webcast.scrape()),
]
async with async_playwright() as p:
try:
hdl_brwsr = await network.browser(p)
await asyncio.gather(*tasks)
xtrnl_brwsr = await network.browser(p, external=True)
await watchfooty.scrape()
pw_tasks = [
asyncio.create_task(cdnlivetv.scrape(hdl_brwsr)),
asyncio.create_task(embedhd.scrape(hdl_brwsr)),
asyncio.create_task(pixel.scrape(hdl_brwsr)),
asyncio.create_task(ppv.scrape(xtrnl_brwsr)),
asyncio.create_task(sport9.scrape(xtrnl_brwsr)),
asyncio.create_task(streamcenter.scrape(xtrnl_brwsr)),
asyncio.create_task(streamhub.scrape(xtrnl_brwsr)),
asyncio.create_task(streamsgate.scrape(xtrnl_brwsr)),
asyncio.create_task(tflix.scrape(xtrnl_brwsr)),
asyncio.create_task(webcast.scrape(hdl_brwsr)),
asyncio.create_task(watchfooty.scrape(xtrnl_brwsr)),
]
httpx_tasks = [
asyncio.create_task(fawa.scrape()),
asyncio.create_task(istreameast.scrape()),
asyncio.create_task(pawa.scrape()),
asyncio.create_task(roxie.scrape()),
asyncio.create_task(shark.scrape()),
asyncio.create_task(streambtw.scrape()),
asyncio.create_task(streamfree.scrape()),
asyncio.create_task(totalsportek.scrape()),
asyncio.create_task(tvpass.scrape()),
]
await asyncio.gather(*(pw_tasks + httpx_tasks))
finally:
await hdl_brwsr.close()
await xtrnl_brwsr.close()
await network.client.aclose()
additions = (
cdnlivetv.urls
@ -95,7 +111,7 @@ async def main() -> None:
| streamfree.urls
| streamhub.urls
| streamsgate.urls
| strmd.urls
| tflix.urls
| totalsportek.urls
| tvpass.urls
| watchfooty.urls
@ -153,8 +169,3 @@ async def main() -> None:
if __name__ == "__main__":
asyncio.run(main())
try:
asyncio.run(network.client.aclose())
except Exception:
pass

View file

@ -1,6 +1,6 @@
from functools import partial
from playwright.async_api import async_playwright
from playwright.async_api import BrowserContext
from .utils import Cache, Time, get_logger, leagues, network
@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "CDNTV"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800)
API_FILE = Cache(f"{TAG}-api", exp=19_800)
API_URL = "https://api.cdn-live.tv/api/v1/events/sports"
@ -85,7 +85,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
return events
async def scrape() -> None:
async def scrape(browser: BrowserContext) -> None:
cached_urls = CACHE_FILE.load()
cached_count = len(cached_urls)
@ -101,16 +101,14 @@ async def scrape() -> None:
log.info(f"Processing {len(events)} new URL(s)")
if events:
async with async_playwright() as p:
browser, context = await network.browser(p)
try:
for i, ev in enumerate(events, start=1):
async with network.event_context(browser) as context:
for i, ev in enumerate(events, start=1):
async with network.event_page(context) as page:
handler = partial(
network.process_event,
url=ev["link"],
url_num=i,
context=context,
page=page,
log=log,
)
@ -144,9 +142,6 @@ async def scrape() -> None:
urls[key] = cached_urls[key] = entry
finally:
await browser.close()
if new_count := len(cached_urls) - cached_count:
log.info(f"Collected and cached {new_count} new event(s)")

View file

@ -1,6 +1,6 @@
from functools import partial
from playwright.async_api import async_playwright
from playwright.async_api import BrowserContext
from .utils import Cache, Time, get_logger, leagues, network
@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "EMBEDHD"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=5_400)
CACHE_FILE = Cache(TAG, exp=5_400)
API_CACHE = Cache(f"{TAG.lower()}-api.json", exp=28_800)
API_CACHE = Cache(f"{TAG}-api", exp=28_800)
BASE_URL = "https://embedhd.org/api-event.php"
@ -75,7 +75,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
return events
async def scrape() -> None:
async def scrape(browser: BrowserContext) -> None:
cached_urls = CACHE_FILE.load()
cached_count = len(cached_urls)
@ -91,16 +91,14 @@ async def scrape() -> None:
log.info(f"Processing {len(events)} new URL(s)")
if events:
async with async_playwright() as p:
browser, context = await network.browser(p)
try:
for i, ev in enumerate(events, start=1):
async with network.event_context(browser) as context:
for i, ev in enumerate(events, start=1):
async with network.event_page(context) as page:
handler = partial(
network.process_event,
url=ev["link"],
url_num=i,
context=context,
page=page,
log=log,
)
@ -134,9 +132,6 @@ async def scrape() -> None:
urls[key] = cached_urls[key] = entry
finally:
await browser.close()
if new_count := len(cached_urls) - cached_count:
log.info(f"Collected and cached {new_count} new event(s)")

View file

@ -12,7 +12,7 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "FAWA"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
BASE_URL = "http://www.fawanews.sc/"

View file

@ -12,7 +12,7 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "iSTRMEAST"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
BASE_URL = "https://istreameast.app"

View file

@ -13,7 +13,7 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "PAWA"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
BASE_URL = "https://pawastreams.net/feed"

View file

@ -1,7 +1,7 @@
import json
from functools import partial
from playwright.async_api import BrowserContext, async_playwright
from playwright.async_api import BrowserContext, Page
from .utils import Cache, Time, get_logger, leagues, network
@ -11,15 +11,13 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "PIXEL"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=19_800)
CACHE_FILE = Cache(TAG, exp=19_800)
BASE_URL = "https://pixelsport.tv/backend/livetv/events"
async def get_api_data(context: BrowserContext) -> dict[str, list[dict, str, str]]:
async def get_api_data(page: Page) -> dict[str, list[dict, str, str]]:
try:
page = await context.new_page()
await page.goto(
BASE_URL,
wait_until="domcontentloaded",
@ -35,10 +33,10 @@ async def get_api_data(context: BrowserContext) -> dict[str, list[dict, str, str
return json.loads(raw_json)
async def get_events(context: BrowserContext) -> dict[str, dict[str, str | float]]:
async def get_events(page: Page) -> dict[str, dict[str, str | float]]:
now = Time.clean(Time.now())
api_data = await get_api_data(context)
api_data = await get_api_data(page)
events = {}
@ -75,7 +73,7 @@ async def get_events(context: BrowserContext) -> dict[str, dict[str, str | float
return events
async def scrape() -> None:
async def scrape(browser: BrowserContext) -> None:
if cached := CACHE_FILE.load():
urls.update(cached)
@ -85,11 +83,9 @@ async def scrape() -> None:
log.info(f'Scraping from "{BASE_URL}"')
async with async_playwright() as p:
browser, context = await network.browser(p)
try:
handler = partial(get_events, context=context)
async with network.event_context(browser) as context:
async with network.event_page(context) as page:
handler = partial(get_events, page=page)
events = await network.safe_process(
handler,
@ -98,9 +94,6 @@ async def scrape() -> None:
log=log,
)
finally:
await browser.close()
urls.update(events or {})
CACHE_FILE.write(urls)

View file

@ -1,6 +1,6 @@
from functools import partial
from playwright.async_api import async_playwright
from playwright.async_api import BrowserContext
from .utils import Cache, Time, get_logger, leagues, network
@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "PPV"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800)
API_FILE = Cache(f"{TAG}-api", exp=19_800)
MIRRORS = [
"https://old.ppv.to/api/streams",
@ -78,7 +78,7 @@ async def get_events(url: str, cached_keys: list[str]) -> list[dict[str, str]]:
return events
async def scrape() -> None:
async def scrape(browser: BrowserContext) -> None:
cached_urls = CACHE_FILE.load()
cached_count = len(cached_urls)
@ -101,16 +101,14 @@ async def scrape() -> None:
log.info(f"Processing {len(events)} new URL(s)")
if events:
async with async_playwright() as p:
browser, context = await network.browser(p, browser="external")
try:
for i, ev in enumerate(events, start=1):
async with network.event_context(browser) as context:
for i, ev in enumerate(events, start=1):
async with network.event_page(context) as page:
handler = partial(
network.process_event,
url=ev["link"],
url_num=i,
context=context,
page=page,
timeout=6,
log=log,
)
@ -146,9 +144,6 @@ async def scrape() -> None:
urls[key] = cached_urls[key] = entry
finally:
await browser.close()
if new_count := len(cached_urls) - cached_count:
log.info(f"Collected and cached {new_count} new event(s)")

View file

@ -13,9 +13,9 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "ROXIE"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=19_800)
HTML_CACHE = Cache(f"{TAG}-html", exp=19_800)
BASE_URL = "https://roxiestreams.live"

View file

@ -11,9 +11,9 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "SHARK"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=19_800)
HTML_CACHE = Cache(f"{TAG}-html", exp=19_800)
BASE_URL = "https://sharkstreams.net"

View file

@ -2,7 +2,7 @@ import asyncio
from functools import partial
from urllib.parse import urljoin
from playwright.async_api import async_playwright
from playwright.async_api import BrowserContext
from selectolax.parser import HTMLParser
from .utils import Cache, Time, get_logger, leagues, network
@ -13,7 +13,7 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "SPORT9"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=5_400)
CACHE_FILE = Cache(TAG, exp=5_400)
BASE_URL = "https://sport9.ru/"
@ -88,7 +88,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
return events
async def scrape() -> None:
async def scrape(browser: BrowserContext) -> None:
cached_urls = CACHE_FILE.load()
cached_count = len(cached_urls)
@ -106,16 +106,14 @@ async def scrape() -> None:
if events:
now = Time.clean(Time.now()).timestamp()
async with async_playwright() as p:
browser, context = await network.browser(p, browser="external")
try:
for i, ev in enumerate(events, start=1):
async with network.event_context(browser) as context:
for i, ev in enumerate(events, start=1):
async with network.event_page(context) as page:
handler = partial(
network.process_event,
url=ev["link"],
url_num=i,
context=context,
page=page,
log=log,
)
@ -148,9 +146,6 @@ async def scrape() -> None:
urls[key] = cached_urls[key] = entry
finally:
await browser.close()
if new_count := len(cached_urls) - cached_count:
log.info(f"Collected and cached {new_count} new event(s)")

View file

@ -13,7 +13,7 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "STRMBTW"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=3_600)
CACHE_FILE = Cache(TAG, exp=3_600)
BASE_URLS = ["https://hiteasport.info/", "https://streambtw.com/"]

View file

@ -1,6 +1,6 @@
from functools import partial
from playwright.async_api import async_playwright
from playwright.async_api import BrowserContext
from .utils import Cache, Time, get_logger, leagues, network
@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "STRMCNTR"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
API_FILE = Cache(f"{TAG.lower()}-api.json", exp=28_800)
API_FILE = Cache(f"{TAG}-api", exp=28_800)
BASE_URL = "https://backend.streamcenter.live/api/Parties"
@ -90,7 +90,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
return events
async def scrape() -> None:
async def scrape(browser: BrowserContext) -> None:
cached_urls = CACHE_FILE.load()
cached_count = len(cached_urls)
@ -106,16 +106,14 @@ async def scrape() -> None:
log.info(f"Processing {len(events)} new URL(s)")
if events:
async with async_playwright() as p:
browser, context = await network.browser(p, browser="external")
try:
for i, ev in enumerate(events, start=1):
async with network.event_context(browser) as context:
for i, ev in enumerate(events, start=1):
async with network.event_page(context) as page:
handler = partial(
network.process_event,
url=ev["link"],
url_num=i,
context=context,
page=page,
log=log,
)
@ -149,9 +147,6 @@ async def scrape() -> None:
urls[key] = cached_urls[key] = entry
finally:
await browser.close()
if new_count := len(cached_urls) - cached_count:
log.info(f"Collected and cached {new_count} new event(s)")

View file

@ -8,7 +8,7 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "STRMFREE"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=19_800)
CACHE_FILE = Cache(TAG, exp=19_800)
BASE_URL = "https://streamfree.to/"

View file

@ -2,7 +2,7 @@ import asyncio
from functools import partial
from urllib.parse import urljoin
from playwright.async_api import async_playwright
from playwright.async_api import BrowserContext
from selectolax.parser import HTMLParser
from .utils import Cache, Time, get_logger, leagues, network
@ -13,9 +13,9 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "STRMHUB"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=28_800)
HTML_CACHE = Cache(f"{TAG}-html", exp=28_800)
BASE_URL = "https://streamhub.pro/"
@ -132,7 +132,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
return live
async def scrape() -> None:
async def scrape(browser: BrowserContext) -> None:
cached_urls = CACHE_FILE.load()
valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
@ -150,16 +150,14 @@ async def scrape() -> None:
log.info(f"Processing {len(events)} new URL(s)")
if events:
async with async_playwright() as p:
browser, context = await network.browser(p, browser="external")
try:
for i, ev in enumerate(events, start=1):
async with network.event_context(browser) as context:
for i, ev in enumerate(events, start=1):
async with network.event_page(context) as page:
handler = partial(
network.process_event,
url=ev["link"],
url_num=i,
context=context,
page=page,
timeout=5,
log=log,
)
@ -199,9 +197,6 @@ async def scrape() -> None:
urls[key] = entry
finally:
await browser.close()
if new_count := valid_count - cached_count:
log.info(f"Collected and cached {new_count} new event(s)")

View file

@ -4,7 +4,7 @@ from itertools import chain
from typing import Any
from urllib.parse import urljoin
from playwright.async_api import async_playwright
from playwright.async_api import BrowserContext
from .utils import Cache, Time, get_logger, leagues, network
@ -14,9 +14,9 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "STRMSGATE"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800)
API_FILE = Cache(f"{TAG}-api", exp=19_800)
BASE_URL = "https://streamingon.org"
@ -120,7 +120,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
return events
async def scrape() -> None:
async def scrape(browser: BrowserContext) -> None:
cached_urls = CACHE_FILE.load()
cached_count = len(cached_urls)
@ -136,16 +136,14 @@ async def scrape() -> None:
log.info(f"Processing {len(events)} new URL(s)")
if events:
async with async_playwright() as p:
browser, context = await network.browser(p, browser="external")
try:
for i, ev in enumerate(events, start=1):
async with network.event_context(browser) as context:
for i, ev in enumerate(events, start=1):
async with network.event_page(context) as page:
handler = partial(
network.process_event,
url=ev["link"],
url_num=i,
context=context,
page=page,
log=log,
)
@ -179,9 +177,6 @@ async def scrape() -> None:
urls[key] = cached_urls[key] = entry
finally:
await browser.close()
if new_count := len(cached_urls) - cached_count:
log.info(f"Collected and cached {new_count} new event(s)")

View file

@ -1,196 +0,0 @@
import re
from functools import partial
from urllib.parse import urljoin
from playwright.async_api import async_playwright
from .utils import Cache, Time, get_logger, leagues, network
log = get_logger(__name__)
urls: dict[str, dict[str, str | float]] = {}
TAG = "STRMD"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
API_FILE = Cache(f"{TAG.lower()}-api.json", exp=28_800)
MIRRORS = [
"https://streami.su",
# "https://streamed.st",
"https://streamed.pk",
]
def fix_sport(s: str) -> str:
    """Normalize an API sport slug into a display name.

    Hyphenated slugs become space-separated title-cased words,
    the special slug ``"fight"`` gets an explicit label, and any
    other slug is capitalized when it has 4+ characters or fully
    upper-cased when shorter (short slugs are acronyms, e.g. "nba").
    """
    if "-" in s:
        words = [part.capitalize() for part in s.split("-")]
        return " ".join(words)
    if s == "fight":
        return "Fight (UFC/Boxing)"
    if len(s) >= 4:
        return s.capitalize()
    return s.upper()
async def get_events(url: str, cached_keys: list[str]) -> list[dict[str, str]]:
    """Return events from the STRMD "all-today" API that start within ±30 minutes.

    Loads the raw API payload from the on-disk cache when still fresh,
    otherwise fetches it from ``url`` and re-caches it. Events already
    present in ``cached_keys`` are skipped.
    """
    now = Time.clean(Time.now())
    if not (api_data := API_FILE.load(per_entry=False, index=-1)):
        log.info("Refreshing API cache")
        # Fallback sentinel so the cache file always records a refresh time
        # even when the request below fails.
        # NOTE(review): if the request fails, this single-key dict is iterated
        # below and `event["date"]` would raise KeyError — confirm intended.
        api_data = [{"timestamp": now.timestamp()}]
        if r := await network.request(
            urljoin(url, "api/matches/all-today"),
            log=log,
        ):
            api_data: list[dict] = r.json()
            # Stamp the refresh time onto the last entry so the next
            # API_FILE.load(index=-1) can judge freshness.
            api_data[-1]["timestamp"] = now.timestamp()
        API_FILE.write(api_data)
    events = []
    # Collapse newlines and runs of whitespace inside event titles.
    pattern = re.compile(r"[\n\r]+|\s{2,}")
    # Only keep events starting within a ±30-minute window around now.
    start_dt = now.delta(minutes=-30)
    end_dt = now.delta(minutes=30)
    for event in api_data:
        if (category := event.get("category")) == "other":
            continue
        if not (ts := event["date"]):
            continue
        # Timestamps appear to be epoch-milliseconds; dropping the last
        # three digits converts to seconds — TODO confirm against API.
        start_ts = float(f"{ts}"[:-3])
        event_dt = Time.from_ts(start_ts)
        if not start_dt <= event_dt <= end_dt:
            continue
        sport = fix_sport(category)
        parts = pattern.split(event["title"].strip())
        name = " | ".join(p.strip() for p in parts if p.strip())
        logo = urljoin(url, poster) if (poster := event.get("poster")) else None
        # Skip events already collected in a previous run.
        if f"[{sport}] {name} ({TAG})" in cached_keys:
            continue
        sources: list[dict[str, str]] = event["sources"]
        if not sources:
            continue
        # These source providers are deliberately excluded.
        skip_types = ["alpha", "bravo"]
        valid_sources = [d for d in sources if d.get("source") not in skip_types]
        if not valid_sources:
            continue
        srce = valid_sources[0]
        source_type = srce.get("source")
        stream_id = srce.get("id")
        if not (source_type and stream_id):
            continue
        events.append(
            {
                "sport": sport,
                "event": name,
                "link": f"https://embedsports.top/embed/{source_type}/{stream_id}/1",
                "logo": logo,
                "timestamp": event_dt.timestamp(),
            }
        )
    return events
async def scrape() -> None:
    """Scrape STRMD events: load cache, fetch new events, resolve stream URLs.

    Resolves each event page through Playwright (external CDP browser),
    stores successful captures in both the module-level ``urls`` dict and
    the on-disk cache, and always rewrites the cache file on exit.
    """
    cached_urls = CACHE_FILE.load()
    cached_count = len(cached_urls)
    urls.update(cached_urls)
    log.info(f"Loaded {cached_count} event(s) from cache")
    # Pick the first reachable mirror; bail out (but persist cache) if none work.
    if not (base_url := await network.get_base(MIRRORS)):
        log.warning("No working STRMD mirrors")
        CACHE_FILE.write(cached_urls)
        return
    log.info(f'Scraping from "{base_url}"')
    events = await get_events(base_url, cached_urls.keys())
    log.info(f"Processing {len(events)} new URL(s)")
    if events:
        async with async_playwright() as p:
            browser, context = await network.browser(p, browser="external")
            try:
                for i, ev in enumerate(events, start=1):
                    handler = partial(
                        network.process_event,
                        url=ev["link"],
                        url_num=i,
                        context=context,
                        log=log,
                    )
                    # safe_process wraps the handler with the shared Playwright
                    # semaphore and error handling; returns None on failure.
                    url = await network.safe_process(
                        handler,
                        url_num=i,
                        semaphore=network.PW_S,
                        log=log,
                    )
                    if url:
                        sport, event, logo, ts, link = (
                            ev["sport"],
                            ev["event"],
                            ev["logo"],
                            ev["timestamp"],
                            ev["link"],
                        )
                        key = f"[{sport}] {event} ({TAG})"
                        tvg_id, pic = leagues.get_tvg_info(sport, event)
                        entry = {
                            "url": url,
                            # Prefer the event's own poster; fall back to league art.
                            "logo": logo or pic,
                            "base": "https://embedsports.top/",
                            "timestamp": ts,
                            "id": tvg_id or "Live.Event.us",
                            "link": link,
                        }
                        urls[key] = cached_urls[key] = entry
            finally:
                await browser.close()
        if new_count := len(cached_urls) - cached_count:
            log.info(f"Collected and cached {new_count} new event(s)")
    else:
        log.info("No new events found")
    CACHE_FILE.write(cached_urls)

234
M3U8/scrapers/tflix.py Normal file
View file

@ -0,0 +1,234 @@
import asyncio
from functools import partial
from urllib.parse import urljoin
import feedparser
from playwright.async_api import BrowserContext, Error, Page, TimeoutError
from .utils import Cache, Time, get_logger, leagues, network
log = get_logger(__name__)
urls: dict[str, dict[str, str | float]] = {}
TAG = "TFLIX"
CACHE_FILE = Cache(TAG, exp=28_800)
BASE_URL = "https://tv.tflix.app/"
SPORT_ENDPOINTS = ["football", "nba", "nfl", "nhl"]
async def process_event(
    url: str,
    url_num: int,
    page: Page,
) -> tuple[str | None, str | None]:
    """Resolve one TFLIX event page to its M3U8 stream URL.

    Navigates to the event page, locates (or switches to) the TFLIX
    player iframe, follows it, and reads the play button's ``data-url``
    to build the final stream URL.

    Returns:
        ``(m3u8_url, iframe_src)`` on success, ``(None, None)`` on any failure.
    """
    try:
        await page.goto(
            url,
            wait_until="domcontentloaded",
            timeout=15_000,
        )
        try:
            iframe = await page.wait_for_selector(
                "iframe.metaframe.rptss",
                timeout=3_500,
            )
        except TimeoutError:
            log.warning(f"URL {url_num}) No iframe element.")
            return None, None
        # If the default iframe already points at the stream host, use it as-is;
        # otherwise click the "TFLIX HD - iOS" player option and wait for the
        # iframe src to change.
        if (old_src := await iframe.get_attribute("src")) and old_src.startswith(
            "https://kloxmkhs.site/stream"
        ):
            new_src = old_src
        else:
            try:
                option = await page.wait_for_selector(
                    'li.dooplay_player_option >> span.title:has-text("TFLIX HD - iOS")',
                    timeout=3_000,
                )
                await option.scroll_into_view_if_needed()
                # Click via JS to bypass overlay/interception issues.
                await option.evaluate("el => el.click()")
                await page.wait_for_function(
                    """
                    (oldSrc) => {
                        const iframe = document.querySelector('iframe.metaframe.rptss');
                        return iframe && iframe.src && iframe.src !== oldSrc;
                    };
                    """,
                    arg=old_src,
                    timeout=5_000,
                )
                iframe_2 = await page.wait_for_selector("iframe.metaframe.rptss")
                if not iframe_2 or not (new_src := await iframe_2.get_attribute("src")):
                    log.warning(f"URL {url_num}) No iframe source.")
                    return None, None
            except TimeoutError:
                log.warning(f"URL {url_num}) No valid TFLIX source.")
                return None, None
        # Follow the iframe URL directly, presenting the event page as referer.
        try:
            await page.goto(
                new_src,
                wait_until="domcontentloaded",
                timeout=10_000,
                referer=url,
            )
        except Error:
            log.warning(
                f"URL {url_num}) HTTP 403/404 error while redirecting to iframe source."
            )
            return None, None
        try:
            play_btn = await page.wait_for_selector(
                'button[data-url][onclick*="startPlcb"]',
                timeout=5_000,
            )
        except TimeoutError:
            log.warning(f"URL {url_num}) No play button found.")
            return None, None
        if not (data_url := await play_btn.get_attribute("data-url")):
            log.warning(f"URL {url_num}) No PBID found.")
            return None, None
        log.info(f"URL {url_num}) Captured M3U8")
        return (
            f"https://kloxmkhs.site/stream/stream.m3u8?id={data_url}&format=.m3u8",
            new_src,
        )
    # Broad catch-all: any unexpected Playwright/navigation error downgrades
    # this event to a miss instead of aborting the whole scrape.
    except Exception as e:
        log.warning(f"URL {url_num}) Exception while processing: {e}")
        return None, None
async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
    """Fetch the per-sport RSS feeds and return events not yet in the cache.

    Each returned item has ``sport``, ``event``, and ``link`` keys.
    """
    responses = await asyncio.gather(
        *(
            network.request(urljoin(BASE_URL, f"genre/{sport}/feed"), log=log)
            for sport in SPORT_ENDPOINTS
        )
    )
    events: list[dict[str, str]] = []
    parsed = [feedparser.parse(resp.content) for resp in responses if resp]
    if not parsed:
        return events
    for feed in parsed:
        # Feed titles look like "<Sport> Archives ..."; keep only the sport part.
        feed_title: str = feed["feed"]["title"]
        sport = feed_title.split("Archives")[0].strip()
        for item in feed.entries:
            link = item.get("link")
            name = item.get("title")
            if not link or not name:
                continue
            if f"[{sport}] {name} ({TAG})" in cached_keys:
                continue
            events.append({"sport": sport, "event": name, "link": link})
    return events
async def scrape(browser: BrowserContext) -> None:
    """Scrape TFLIX events using a shared browser handle.

    Loads cached events (keeping only those with a resolved URL in the
    module-level ``urls`` dict), fetches new events from the RSS feeds,
    resolves each through ``process_event`` in its own page, and rewrites
    the on-disk cache on exit.
    """
    cached_urls = CACHE_FILE.load()
    # Only entries that previously resolved to a stream URL are re-exposed.
    valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
    valid_count = cached_count = len(cached_urls)
    urls.update(valid_urls)
    log.info(f"Loaded {cached_count} event(s) from cache")
    log.info(f'Scraping from "{BASE_URL}"')
    events = await get_events(cached_urls.keys())
    log.info(f"Processing {len(events)} new URL(s)")
    if events:
        now = Time.clean(Time.now()).timestamp()
        # One context for the whole batch; a fresh page per event.
        async with network.event_context(browser) as context:
            for i, ev in enumerate(events, start=1):
                async with network.event_page(context) as page:
                    handler = partial(
                        process_event,
                        url=ev["link"],
                        url_num=i,
                        page=page,
                    )
                    # safe_process applies the shared Playwright semaphore and
                    # timeout; yields (None, None) on failure.
                    url, iframe = await network.safe_process(
                        handler,
                        url_num=i,
                        semaphore=network.PW_S,
                        log=log,
                        timeout=20,
                    )
                sport, event, link = (
                    ev["sport"],
                    ev["event"],
                    ev["link"],
                )
                key = f"[{sport}] {event} ({TAG})"
                tvg_id, logo = leagues.get_tvg_info(sport, event)
                entry = {
                    "url": url,
                    "logo": logo,
                    "base": iframe,
                    "timestamp": now,
                    "id": tvg_id or "Live.Event.us",
                    "link": link,
                }
                # Failures are cached too (url=None) so they are not retried
                # until the cache entry expires; only successes are published.
                cached_urls[key] = entry
                if url:
                    valid_count += 1
                    urls[key] = entry
        if new_count := valid_count - cached_count:
            log.info(f"Collected and cached {new_count} new event(s)")
    else:
        log.info("No new events found")
    CACHE_FILE.write(cached_urls)

View file

@ -12,7 +12,7 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "TOTALSPRTK"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=28_800)
CACHE_FILE = Cache(TAG, exp=28_800)
MIRRORS = [
{

View file

@ -8,7 +8,7 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "TVPASS"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=86_400)
CACHE_FILE = Cache(TAG, exp=86_400)
BASE_URL = "https://tvpass.org/playlist/m3u"

View file

@ -7,8 +7,8 @@ from .config import Time
class Cache:
now_ts: float = Time.now().timestamp()
def __init__(self, file: str, exp: int | float) -> None:
self.file = Path(__file__).parent.parent / "caches" / file
def __init__(self, filename: str, exp: int | float) -> None:
self.file = Path(__file__).parent.parent / "caches" / f"{filename.lower()}.json"
self.exp = exp

View file

@ -3,12 +3,13 @@ import logging
import random
import re
from collections.abc import Awaitable, Callable
from contextlib import asynccontextmanager
from functools import partial
from typing import TypeVar
from typing import AsyncGenerator, TypeVar
from urllib.parse import urlencode, urljoin
import httpx
from playwright.async_api import Browser, BrowserContext, Playwright, Request
from playwright.async_api import Browser, BrowserContext, Page, Playwright, Request
from .logger import get_logger
@ -123,6 +124,112 @@ class Network:
return
@staticmethod
@asynccontextmanager
async def event_context(
    browser: Browser,
    stealth: bool = True,
) -> AsyncGenerator[BrowserContext, None]:
    """Yield a fresh browser context, closing it on exit.

    With ``stealth=True`` (default) the context is configured with a fixed
    UA/viewport/locale fingerprint and an init script that masks common
    automation signals; with ``stealth=False`` a plain default context is
    yielded.

    Fix: the previous version always created the configured context first
    and, in the non-stealth branch, created a *second* bare context and
    rebound the variable — leaking the first context (it was never closed).
    Now exactly one context is created per invocation.
    """
    context: BrowserContext | None = None
    try:
        if stealth:
            context = await browser.new_context(
                user_agent=Network.UA,
                viewport={"width": 1366, "height": 768},
                device_scale_factor=1,
                locale="en-US",
                timezone_id="America/New_York",
                color_scheme="dark",
                permissions=["geolocation"],
                extra_http_headers={
                    "Accept-Language": "en-US,en;q=0.9",
                    "Upgrade-Insecure-Requests": "1",
                },
            )
            # Mask the most common headless-automation fingerprints before
            # any page script runs.
            await context.add_init_script("""
                Object.defineProperty(navigator, "webdriver", { get: () => undefined });
                Object.defineProperty(navigator, "languages", {
                    get: () => ["en-US", "en"],
                });
                Object.defineProperty(navigator, "plugins", {
                    get: () => [1, 2, 3, 4],
                });
                const elementDescriptor = Object.getOwnPropertyDescriptor(
                    HTMLElement.prototype,
                    "offsetHeight"
                );
                Object.defineProperty(HTMLDivElement.prototype, "offsetHeight", {
                    ...elementDescriptor,
                    get: function () {
                        if (this.id === "modernizr") {
                            return 24;
                        }
                        return elementDescriptor.get.apply(this);
                    },
                });
                Object.defineProperty(window.screen, "width", { get: () => 1366 });
                Object.defineProperty(window.screen, "height", { get: () => 768 });
                const getParameter = WebGLRenderingContext.prototype.getParameter;
                WebGLRenderingContext.prototype.getParameter = function (param) {
                    if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL
                    if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL
                    return getParameter.apply(this, [param]);
                };
                const observer = new MutationObserver((mutations) => {
                    mutations.forEach((mutation) => {
                        mutation.addedNodes.forEach((node) => {
                            if (node.tagName === "IFRAME" && node.hasAttribute("sandbox")) {
                                node.removeAttribute("sandbox");
                            }
                        });
                    });
                });
                observer.observe(document.documentElement, { childList: true, subtree: true });
            """)
        else:
            context = await browser.new_context()
        yield context
    finally:
        # Close the context even when the body raises; guard against a
        # failure during creation (context still None).
        if context:
            await context.close()
@staticmethod
@asynccontextmanager
async def event_page(context: BrowserContext) -> AsyncGenerator[Page, None]:
    """Open a new page in *context*, yield it, and always close it on exit."""
    opened = await context.new_page()
    try:
        yield opened
    finally:
        await opened.close()
@staticmethod
async def browser(playwright: Playwright, external: bool = False) -> Browser:
    """Return a browser handle.

    When ``external`` is true, attach to an already-running Chromium over
    CDP on localhost:9222; otherwise launch a headless Firefox instance.
    """
    if external:
        return await playwright.chromium.connect_over_cdp("http://localhost:9222")
    return await playwright.firefox.launch(headless=True)
@staticmethod
def capture_req(
req: Request,
@ -147,15 +254,13 @@ class Network:
self,
url: str,
url_num: int,
context: BrowserContext,
page: Page,
timeout: int | float = 10,
log: logging.Logger | None = None,
) -> str | None:
log = log or logger
page = await context.new_page()
captured: list[str] = []
got_one = asyncio.Event()
@ -212,86 +317,6 @@ class Network:
await page.close()
@staticmethod
async def browser(
playwright: Playwright, browser: str = "internal"
) -> tuple[Browser, BrowserContext]:
if browser == "external":
brwsr = await playwright.chromium.connect_over_cdp("http://localhost:9222")
context = brwsr.contexts[0]
else:
brwsr = await playwright.firefox.launch(headless=True)
context = await brwsr.new_context(
user_agent=Network.UA,
ignore_https_errors=False,
viewport={"width": 1366, "height": 768},
device_scale_factor=1,
locale="en-US",
timezone_id="America/New_York",
color_scheme="dark",
permissions=["geolocation"],
extra_http_headers={
"Accept-Language": "en-US,en;q=0.9",
"Upgrade-Insecure-Requests": "1",
},
)
await context.add_init_script("""
Object.defineProperty(navigator, "webdriver", { get: () => undefined });
Object.defineProperty(navigator, "languages", {
get: () => ["en-US", "en"],
});
Object.defineProperty(navigator, "plugins", {
get: () => [1, 2, 3, 4],
});
const elementDescriptor = Object.getOwnPropertyDescriptor(
HTMLElement.prototype,
"offsetHeight"
);
Object.defineProperty(HTMLDivElement.prototype, "offsetHeight", {
...elementDescriptor,
get: function () {
if (this.id === "modernizr") {
return 24;
}
return elementDescriptor.get.apply(this);
},
});
Object.defineProperty(window.screen, "width", { get: () => 1366 });
Object.defineProperty(window.screen, "height", { get: () => 768 });
const getParameter = WebGLRenderingContext.prototype.getParameter;
WebGLRenderingContext.prototype.getParameter = function (param) {
if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL
if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL
return getParameter.apply(this, [param]);
};
const observer = new MutationObserver((mutations) => {
mutations.forEach((mutation) => {
mutation.addedNodes.forEach((node) => {
if (node.tagName === "IFRAME" && node.hasAttribute("sandbox")) {
node.removeAttribute("sandbox");
}
});
});
});
observer.observe(document.documentElement, { childList: true, subtree: true });
""")
return brwsr, context
network = Network()

View file

@ -5,7 +5,7 @@ from itertools import chain
from typing import Any
from urllib.parse import urljoin
from playwright.async_api import BrowserContext, async_playwright
from playwright.async_api import BrowserContext, Page, TimeoutError
from .utils import Cache, Time, get_logger, leagues, network
@ -15,9 +15,9 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "WATCHFTY"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800)
API_FILE = Cache(f"{TAG}-api.json", exp=19_800)
API_URL = "https://api.watchfooty.st"
@ -73,7 +73,7 @@ async def refresh_api_cache(now: Time) -> list[dict[str, Any]]:
async def process_event(
url: str,
url_num: int,
context: BrowserContext,
page: Page,
) -> tuple[str | None, str | None]:
pattern = re.compile(r"\((\d+)\)")
@ -82,8 +82,6 @@ async def process_event(
got_one = asyncio.Event()
page = await context.new_page()
handler = partial(
network.capture_req,
captured=captured,
@ -117,7 +115,8 @@ async def process_event(
try:
first_available = await page.wait_for_selector(
'a[href*="/stream/"]', timeout=3_000
'a[href*="/stream/"]',
timeout=3_000,
)
except TimeoutError:
log.warning(f"URL {url_num}) No available stream links.")
@ -176,8 +175,6 @@ async def process_event(
finally:
page.remove_listener("request", handler)
await page.close()
async def get_events(base_url: str, cached_keys: list[str]) -> list[dict[str, str]]:
now = Time.clean(Time.now())
@ -235,7 +232,7 @@ async def get_events(base_url: str, cached_keys: list[str]) -> list[dict[str, st
return events
async def scrape() -> None:
async def scrape(browser: BrowserContext) -> None:
cached_urls = CACHE_FILE.load()
valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
@ -260,16 +257,14 @@ async def scrape() -> None:
log.info(f"Processing {len(events)} new URL(s)")
if events:
async with async_playwright() as p:
browser, context = await network.browser(p, browser="external")
try:
for i, ev in enumerate(events, start=1):
async with network.event_context(browser) as context:
for i, ev in enumerate(events, start=1):
async with network.event_page(context) as page:
handler = partial(
process_event,
url=ev["link"],
url_num=i,
context=context,
page=page,
)
url, iframe = await network.safe_process(
@ -307,9 +302,6 @@ async def scrape() -> None:
urls[key] = entry
finally:
await browser.close()
if new_count := valid_count - cached_count:
log.info(f"Collected and cached {new_count} new event(s)")

View file

@ -1,7 +1,7 @@
import asyncio
from functools import partial
from playwright.async_api import async_playwright
from playwright.async_api import BrowserContext
from selectolax.parser import HTMLParser
from .utils import Cache, Time, get_logger, leagues, network
@ -12,9 +12,9 @@ urls: dict[str, dict[str, str | float]] = {}
TAG = "WEBCAST"
CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
CACHE_FILE = Cache(TAG, exp=10_800)
HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=86_400)
HTML_CACHE = Cache(f"{TAG}-html", exp=86_400)
BASE_URLS = {"NFL": "https://nflwebcast.com", "NHL": "https://slapstreams.com"}
@ -110,7 +110,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
return live
async def scrape() -> None:
async def scrape(browser: BrowserContext) -> None:
cached_urls = CACHE_FILE.load()
cached_count = len(cached_urls)
@ -126,16 +126,14 @@ async def scrape() -> None:
log.info(f"Processing {len(events)} new URL(s)")
if events:
async with async_playwright() as p:
browser, context = await network.browser(p)
try:
for i, ev in enumerate(events, start=1):
async with network.event_context(browser) as context:
for i, ev in enumerate(events, start=1):
async with network.event_page(context) as page:
handler = partial(
network.process_event,
url=ev["link"],
url_num=i,
context=context,
page=page,
log=log,
)
@ -169,9 +167,6 @@ async def scrape() -> None:
urls[key] = cached_urls[key] = entry
finally:
await browser.close()
if new_count := len(cached_urls) - cached_count:
log.info(f"Collected and cached {new_count} new event(s)")