- add tflix.py
- remove strmd.py
- modify playwright browser/context usage
- misc. edits
doms9 2026-01-23 23:44:59 -05:00
parent 1aa60a8ce1
commit 00000d9638
24 changed files with 481 additions and 462 deletions
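The heart of the change set: main() now owns the Playwright lifecycle. It opens one headless Firefox plus one Chromium attached over CDP, passes the two Browser objects into the scrapers, and each scraper opens short-lived contexts and pages through the new network.event_context / network.event_page helpers instead of launching its own browser per run. A minimal sketch of the new flow, with the scraper list trimmed to two modules and import paths assumed from the repo layout; the external browser must already be listening on localhost:9222:

import asyncio

from playwright.async_api import async_playwright

from scrapers import cdnlivetv, ppv  # two of the scraper modules below
from scrapers.utils import network   # import path assumed

async def main() -> None:
    async with async_playwright() as p:
        hdl_brwsr = await network.browser(p)                   # headless Firefox
        xtrnl_brwsr = await network.browser(p, external=True)  # CDP-attached Chromium

        try:
            # each scraper opens its own context/page per event
            await asyncio.gather(
                cdnlivetv.scrape(hdl_brwsr),
                ppv.scrape(xtrnl_brwsr),
            )
        finally:
            await hdl_brwsr.close()
            await xtrnl_brwsr.close()
            await network.client.aclose()  # shared httpx client

asyncio.run(main())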


@@ -3,6 +3,7 @@ import asyncio
 import re
 from pathlib import Path
+from playwright.async_api import async_playwright
 from scrapers import (
     cdnlivetv,
     embedhd,
@@ -19,7 +20,7 @@ from scrapers import (
     streamfree,
     streamhub,
     streamsgate,
-    strmd,
+    tflix,
     totalsportek,
     tvpass,
     watchfooty,
@@ -53,31 +54,46 @@ async def main() -> None:
     base_m3u8, tvg_chno = load_base()

-    tasks = [
-        asyncio.create_task(cdnlivetv.scrape()),
-        asyncio.create_task(embedhd.scrape()),
-        asyncio.create_task(fawa.scrape()),
-        asyncio.create_task(istreameast.scrape()),
-        asyncio.create_task(pawa.scrape()),
-        asyncio.create_task(pixel.scrape()),
-        asyncio.create_task(ppv.scrape()),
-        asyncio.create_task(roxie.scrape()),
-        asyncio.create_task(shark.scrape()),
-        asyncio.create_task(sport9.scrape()),
-        asyncio.create_task(streambtw.scrape()),
-        asyncio.create_task(streamcenter.scrape()),
-        asyncio.create_task(streamfree.scrape()),
-        asyncio.create_task(streamhub.scrape()),
-        asyncio.create_task(streamsgate.scrape()),
-        # asyncio.create_task(strmd.scrape()),
-        asyncio.create_task(totalsportek.scrape()),
-        asyncio.create_task(tvpass.scrape()),
-        asyncio.create_task(webcast.scrape()),
-    ]
-
-    await asyncio.gather(*tasks)
-    await watchfooty.scrape()
+    async with async_playwright() as p:
+        try:
+            hdl_brwsr = await network.browser(p)
+            xtrnl_brwsr = await network.browser(p, external=True)
+
+            pw_tasks = [
+                asyncio.create_task(cdnlivetv.scrape(hdl_brwsr)),
+                asyncio.create_task(embedhd.scrape(hdl_brwsr)),
+                asyncio.create_task(pixel.scrape(hdl_brwsr)),
+                asyncio.create_task(ppv.scrape(xtrnl_brwsr)),
+                asyncio.create_task(sport9.scrape(xtrnl_brwsr)),
+                asyncio.create_task(streamcenter.scrape(xtrnl_brwsr)),
+                asyncio.create_task(streamhub.scrape(xtrnl_brwsr)),
+                asyncio.create_task(streamsgate.scrape(xtrnl_brwsr)),
+                asyncio.create_task(tflix.scrape(xtrnl_brwsr)),
+                asyncio.create_task(webcast.scrape(hdl_brwsr)),
+                asyncio.create_task(watchfooty.scrape(xtrnl_brwsr)),
+            ]
+
+            httpx_tasks = [
+                asyncio.create_task(fawa.scrape()),
+                asyncio.create_task(istreameast.scrape()),
+                asyncio.create_task(pawa.scrape()),
+                asyncio.create_task(roxie.scrape()),
+                asyncio.create_task(shark.scrape()),
+                asyncio.create_task(streambtw.scrape()),
+                asyncio.create_task(streamfree.scrape()),
+                asyncio.create_task(totalsportek.scrape()),
+                asyncio.create_task(tvpass.scrape()),
+            ]
+
+            await asyncio.gather(*(pw_tasks + httpx_tasks))
+        finally:
+            await hdl_brwsr.close()
+            await xtrnl_brwsr.close()
+            await network.client.aclose()

     additions = (
         cdnlivetv.urls
@@ -95,7 +111,7 @@ async def main() -> None:
         | streamfree.urls
         | streamhub.urls
         | streamsgate.urls
-        | strmd.urls
+        | tflix.urls
         | totalsportek.urls
         | tvpass.urls
         | watchfooty.urls
@@ -153,8 +169,3 @@ async def main() -> None:

 if __name__ == "__main__":
     asyncio.run(main())
-
-    try:
-        asyncio.run(network.client.aclose())
-    except Exception:
-        pass


@@ -1,6 +1,6 @@
 from functools import partial

-from playwright.async_api import async_playwright
+from playwright.async_api import BrowserContext

 from .utils import Cache, Time, get_logger, leagues, network
@@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "CDNTV"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
-API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800)
+CACHE_FILE = Cache(TAG, exp=10_800)
+API_FILE = Cache(f"{TAG}-api", exp=19_800)

 API_URL = "https://api.cdn-live.tv/api/v1/events/sports"
@@ -85,7 +85,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     return events

-async def scrape() -> None:
+async def scrape(browser: BrowserContext) -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
@@ -101,16 +101,14 @@ async def scrape() -> None:
     log.info(f"Processing {len(events)} new URL(s)")

     if events:
-        async with async_playwright() as p:
-            browser, context = await network.browser(p)
-            try:
+        async with network.event_context(browser) as context:
             for i, ev in enumerate(events, start=1):
+                async with network.event_page(context) as page:
                     handler = partial(
                         network.process_event,
                         url=ev["link"],
                         url_num=i,
-                        context=context,
+                        page=page,
                         log=log,
                     )
@@ -144,9 +142,6 @@ async def scrape() -> None:
                     urls[key] = cached_urls[key] = entry
-            finally:
-                await browser.close()

     if new_count := len(cached_urls) - cached_count:
         log.info(f"Collected and cached {new_count} new event(s)")

@@ -1,6 +1,6 @@
 from functools import partial

-from playwright.async_api import async_playwright
+from playwright.async_api import BrowserContext

 from .utils import Cache, Time, get_logger, leagues, network
@@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "EMBEDHD"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=5_400)
-API_CACHE = Cache(f"{TAG.lower()}-api.json", exp=28_800)
+CACHE_FILE = Cache(TAG, exp=5_400)
+API_CACHE = Cache(f"{TAG}-api", exp=28_800)

 BASE_URL = "https://embedhd.org/api-event.php"
@@ -75,7 +75,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     return events

-async def scrape() -> None:
+async def scrape(browser: BrowserContext) -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
@@ -91,16 +91,14 @@ async def scrape() -> None:
     log.info(f"Processing {len(events)} new URL(s)")

     if events:
-        async with async_playwright() as p:
-            browser, context = await network.browser(p)
-            try:
+        async with network.event_context(browser) as context:
             for i, ev in enumerate(events, start=1):
+                async with network.event_page(context) as page:
                     handler = partial(
                         network.process_event,
                         url=ev["link"],
                         url_num=i,
-                        context=context,
+                        page=page,
                         log=log,
                     )
@@ -134,9 +132,6 @@ async def scrape() -> None:
                     urls[key] = cached_urls[key] = entry
-            finally:
-                await browser.close()

     if new_count := len(cached_urls) - cached_count:
         log.info(f"Collected and cached {new_count} new event(s)")


@@ -12,7 +12,7 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "FAWA"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
+CACHE_FILE = Cache(TAG, exp=10_800)

 BASE_URL = "http://www.fawanews.sc/"


@@ -12,7 +12,7 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "iSTRMEAST"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
+CACHE_FILE = Cache(TAG, exp=10_800)

 BASE_URL = "https://istreameast.app"


@@ -13,7 +13,7 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "PAWA"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
+CACHE_FILE = Cache(TAG, exp=10_800)

 BASE_URL = "https://pawastreams.net/feed"


@@ -1,7 +1,7 @@
 import json
 from functools import partial

-from playwright.async_api import BrowserContext, async_playwright
+from playwright.async_api import BrowserContext, Page

 from .utils import Cache, Time, get_logger, leagues, network
@@ -11,15 +11,13 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "PIXEL"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=19_800)
+CACHE_FILE = Cache(TAG, exp=19_800)

 BASE_URL = "https://pixelsport.tv/backend/livetv/events"

-async def get_api_data(context: BrowserContext) -> dict[str, list[dict, str, str]]:
+async def get_api_data(page: Page) -> dict[str, list[dict, str, str]]:
     try:
-        page = await context.new_page()
-
         await page.goto(
             BASE_URL,
             wait_until="domcontentloaded",
@@ -35,10 +33,10 @@ async def get_api_data(context: BrowserContext) -> dict[str, list[dict, str, str
     return json.loads(raw_json)

-async def get_events(context: BrowserContext) -> dict[str, dict[str, str | float]]:
+async def get_events(page: Page) -> dict[str, dict[str, str | float]]:
     now = Time.clean(Time.now())

-    api_data = await get_api_data(context)
+    api_data = await get_api_data(page)

     events = {}
@@ -75,7 +73,7 @@ async def get_events(context: BrowserContext) -> dict[str, dict[str, str | float
     return events

-async def scrape() -> None:
+async def scrape(browser: BrowserContext) -> None:
     if cached := CACHE_FILE.load():
         urls.update(cached)
@@ -85,11 +83,9 @@ async def scrape() -> None:
     log.info(f'Scraping from "{BASE_URL}"')

-    async with async_playwright() as p:
-        browser, context = await network.browser(p)
-
-        try:
-            handler = partial(get_events, context=context)
+    async with network.event_context(browser) as context:
+        async with network.event_page(context) as page:
+            handler = partial(get_events, page=page)

             events = await network.safe_process(
                 handler,
@@ -98,9 +94,6 @@ async def scrape() -> None:
                 log=log,
             )
-        finally:
-            await browser.close()

     urls.update(events or {})
     CACHE_FILE.write(urls)


@@ -1,6 +1,6 @@
 from functools import partial

-from playwright.async_api import async_playwright
+from playwright.async_api import BrowserContext

 from .utils import Cache, Time, get_logger, leagues, network
@@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "PPV"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
-API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800)
+CACHE_FILE = Cache(TAG, exp=10_800)
+API_FILE = Cache(f"{TAG}-api", exp=19_800)

 MIRRORS = [
     "https://old.ppv.to/api/streams",
@@ -78,7 +78,7 @@ async def get_events(url: str, cached_keys: list[str]) -> list[dict[str, str]]:
     return events

-async def scrape() -> None:
+async def scrape(browser: BrowserContext) -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
@@ -101,16 +101,14 @@ async def scrape() -> None:
     log.info(f"Processing {len(events)} new URL(s)")

     if events:
-        async with async_playwright() as p:
-            browser, context = await network.browser(p, browser="external")
-            try:
+        async with network.event_context(browser) as context:
             for i, ev in enumerate(events, start=1):
+                async with network.event_page(context) as page:
                     handler = partial(
                         network.process_event,
                         url=ev["link"],
                         url_num=i,
-                        context=context,
+                        page=page,
                         timeout=6,
                         log=log,
                     )
@@ -146,9 +144,6 @@ async def scrape() -> None:
                     urls[key] = cached_urls[key] = entry
-            finally:
-                await browser.close()

     if new_count := len(cached_urls) - cached_count:
         log.info(f"Collected and cached {new_count} new event(s)")


@@ -13,9 +13,9 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "ROXIE"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
-HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=19_800)
+CACHE_FILE = Cache(TAG, exp=10_800)
+HTML_CACHE = Cache(f"{TAG}-html", exp=19_800)

 BASE_URL = "https://roxiestreams.live"


@@ -11,9 +11,9 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "SHARK"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
-HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=19_800)
+CACHE_FILE = Cache(TAG, exp=10_800)
+HTML_CACHE = Cache(f"{TAG}-html", exp=19_800)

 BASE_URL = "https://sharkstreams.net"


@@ -2,7 +2,7 @@ import asyncio
 from functools import partial
 from urllib.parse import urljoin

-from playwright.async_api import async_playwright
+from playwright.async_api import BrowserContext
 from selectolax.parser import HTMLParser

 from .utils import Cache, Time, get_logger, leagues, network
@@ -13,7 +13,7 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "SPORT9"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=5_400)
+CACHE_FILE = Cache(TAG, exp=5_400)

 BASE_URL = "https://sport9.ru/"
@@ -88,7 +88,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     return events

-async def scrape() -> None:
+async def scrape(browser: BrowserContext) -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
@@ -106,16 +106,14 @@ async def scrape() -> None:
     if events:
         now = Time.clean(Time.now()).timestamp()

-        async with async_playwright() as p:
-            browser, context = await network.browser(p, browser="external")
-            try:
+        async with network.event_context(browser) as context:
             for i, ev in enumerate(events, start=1):
+                async with network.event_page(context) as page:
                     handler = partial(
                         network.process_event,
                         url=ev["link"],
                         url_num=i,
-                        context=context,
+                        page=page,
                         log=log,
                     )
@@ -148,9 +146,6 @@ async def scrape() -> None:
                     urls[key] = cached_urls[key] = entry
-            finally:
-                await browser.close()

     if new_count := len(cached_urls) - cached_count:
         log.info(f"Collected and cached {new_count} new event(s)")


@@ -13,7 +13,7 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "STRMBTW"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=3_600)
+CACHE_FILE = Cache(TAG, exp=3_600)

 BASE_URLS = ["https://hiteasport.info/", "https://streambtw.com/"]


@@ -1,6 +1,6 @@
 from functools import partial

-from playwright.async_api import async_playwright
+from playwright.async_api import BrowserContext

 from .utils import Cache, Time, get_logger, leagues, network
@@ -10,9 +10,9 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "STRMCNTR"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
-API_FILE = Cache(f"{TAG.lower()}-api.json", exp=28_800)
+CACHE_FILE = Cache(TAG, exp=10_800)
+API_FILE = Cache(f"{TAG}-api", exp=28_800)

 BASE_URL = "https://backend.streamcenter.live/api/Parties"
@@ -90,7 +90,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     return events

-async def scrape() -> None:
+async def scrape(browser: BrowserContext) -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
@@ -106,16 +106,14 @@ async def scrape() -> None:
     log.info(f"Processing {len(events)} new URL(s)")

     if events:
-        async with async_playwright() as p:
-            browser, context = await network.browser(p, browser="external")
-            try:
+        async with network.event_context(browser) as context:
             for i, ev in enumerate(events, start=1):
+                async with network.event_page(context) as page:
                     handler = partial(
                         network.process_event,
                         url=ev["link"],
                         url_num=i,
-                        context=context,
+                        page=page,
                         log=log,
                     )
@@ -149,9 +147,6 @@ async def scrape() -> None:
                     urls[key] = cached_urls[key] = entry
-            finally:
-                await browser.close()

     if new_count := len(cached_urls) - cached_count:
         log.info(f"Collected and cached {new_count} new event(s)")


@@ -8,7 +8,7 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "STRMFREE"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=19_800)
+CACHE_FILE = Cache(TAG, exp=19_800)

 BASE_URL = "https://streamfree.to/"


@@ -2,7 +2,7 @@ import asyncio
 from functools import partial
 from urllib.parse import urljoin

-from playwright.async_api import async_playwright
+from playwright.async_api import BrowserContext
 from selectolax.parser import HTMLParser

 from .utils import Cache, Time, get_logger, leagues, network
@@ -13,9 +13,9 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "STRMHUB"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
-HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=28_800)
+CACHE_FILE = Cache(TAG, exp=10_800)
+HTML_CACHE = Cache(f"{TAG}-html", exp=28_800)

 BASE_URL = "https://streamhub.pro/"
@@ -132,7 +132,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     return live

-async def scrape() -> None:
+async def scrape(browser: BrowserContext) -> None:
     cached_urls = CACHE_FILE.load()
     valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
@@ -150,16 +150,14 @@ async def scrape() -> None:
     log.info(f"Processing {len(events)} new URL(s)")

     if events:
-        async with async_playwright() as p:
-            browser, context = await network.browser(p, browser="external")
-            try:
+        async with network.event_context(browser) as context:
             for i, ev in enumerate(events, start=1):
+                async with network.event_page(context) as page:
                     handler = partial(
                         network.process_event,
                         url=ev["link"],
                         url_num=i,
-                        context=context,
+                        page=page,
                         timeout=5,
                         log=log,
                     )
@@ -199,9 +197,6 @@ async def scrape() -> None:
                     urls[key] = entry
-            finally:
-                await browser.close()

     if new_count := valid_count - cached_count:
         log.info(f"Collected and cached {new_count} new event(s)")


@@ -4,7 +4,7 @@ from itertools import chain
 from typing import Any
 from urllib.parse import urljoin

-from playwright.async_api import async_playwright
+from playwright.async_api import BrowserContext

 from .utils import Cache, Time, get_logger, leagues, network
@@ -14,9 +14,9 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "STRMSGATE"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
-API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800)
+CACHE_FILE = Cache(TAG, exp=10_800)
+API_FILE = Cache(f"{TAG}-api", exp=19_800)

 BASE_URL = "https://streamingon.org"
@@ -120,7 +120,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     return events

-async def scrape() -> None:
+async def scrape(browser: BrowserContext) -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
@@ -136,16 +136,14 @@ async def scrape() -> None:
     log.info(f"Processing {len(events)} new URL(s)")

     if events:
-        async with async_playwright() as p:
-            browser, context = await network.browser(p, browser="external")
-            try:
+        async with network.event_context(browser) as context:
             for i, ev in enumerate(events, start=1):
+                async with network.event_page(context) as page:
                     handler = partial(
                         network.process_event,
                         url=ev["link"],
                         url_num=i,
-                        context=context,
+                        page=page,
                         log=log,
                     )
@@ -179,9 +177,6 @@ async def scrape() -> None:
                     urls[key] = cached_urls[key] = entry
-            finally:
-                await browser.close()

     if new_count := len(cached_urls) - cached_count:
         log.info(f"Collected and cached {new_count} new event(s)")


@@ -1,196 +0,0 @@
-import re
-from functools import partial
-from urllib.parse import urljoin
-
-from playwright.async_api import async_playwright
-
-from .utils import Cache, Time, get_logger, leagues, network
-
-log = get_logger(__name__)
-
-urls: dict[str, dict[str, str | float]] = {}
-
-TAG = "STRMD"
-
-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
-API_FILE = Cache(f"{TAG.lower()}-api.json", exp=28_800)
-
-MIRRORS = [
-    "https://streami.su",
-    # "https://streamed.st",
-    "https://streamed.pk",
-]
-
-
-def fix_sport(s: str) -> str:
-    if "-" in s:
-        return " ".join(i.capitalize() for i in s.split("-"))
-
-    elif s == "fight":
-        return "Fight (UFC/Boxing)"
-
-    return s.capitalize() if len(s) >= 4 else s.upper()
-
-
-async def get_events(url: str, cached_keys: list[str]) -> list[dict[str, str]]:
-    now = Time.clean(Time.now())
-
-    if not (api_data := API_FILE.load(per_entry=False, index=-1)):
-        log.info("Refreshing API cache")
-
-        api_data = [{"timestamp": now.timestamp()}]
-
-        if r := await network.request(
-            urljoin(url, "api/matches/all-today"),
-            log=log,
-        ):
-            api_data: list[dict] = r.json()
-            api_data[-1]["timestamp"] = now.timestamp()
-
-        API_FILE.write(api_data)
-
-    events = []
-    pattern = re.compile(r"[\n\r]+|\s{2,}")
-
-    start_dt = now.delta(minutes=-30)
-    end_dt = now.delta(minutes=30)
-
-    for event in api_data:
-        if (category := event.get("category")) == "other":
-            continue
-
-        if not (ts := event["date"]):
-            continue
-
-        start_ts = float(f"{ts}"[:-3])
-        event_dt = Time.from_ts(start_ts)
-
-        if not start_dt <= event_dt <= end_dt:
-            continue
-
-        sport = fix_sport(category)
-
-        parts = pattern.split(event["title"].strip())
-        name = " | ".join(p.strip() for p in parts if p.strip())
-
-        logo = urljoin(url, poster) if (poster := event.get("poster")) else None
-
-        if f"[{sport}] {name} ({TAG})" in cached_keys:
-            continue
-
-        sources: list[dict[str, str]] = event["sources"]
-
-        if not sources:
-            continue
-
-        skip_types = ["alpha", "bravo"]
-        valid_sources = [d for d in sources if d.get("source") not in skip_types]
-
-        if not valid_sources:
-            continue
-
-        srce = valid_sources[0]
-
-        source_type = srce.get("source")
-        stream_id = srce.get("id")
-
-        if not (source_type and stream_id):
-            continue
-
-        events.append(
-            {
-                "sport": sport,
-                "event": name,
-                "link": f"https://embedsports.top/embed/{source_type}/{stream_id}/1",
-                "logo": logo,
-                "timestamp": event_dt.timestamp(),
-            }
-        )
-
-    return events
-
-
-async def scrape() -> None:
-    cached_urls = CACHE_FILE.load()
-    cached_count = len(cached_urls)
-
-    urls.update(cached_urls)
-
-    log.info(f"Loaded {cached_count} event(s) from cache")
-
-    if not (base_url := await network.get_base(MIRRORS)):
-        log.warning("No working STRMD mirrors")
-        CACHE_FILE.write(cached_urls)
-        return
-
-    log.info(f'Scraping from "{base_url}"')
-
-    events = await get_events(base_url, cached_urls.keys())
-
-    log.info(f"Processing {len(events)} new URL(s)")
-
-    if events:
-        async with async_playwright() as p:
-            browser, context = await network.browser(p, browser="external")
-
-            try:
-                for i, ev in enumerate(events, start=1):
-                    handler = partial(
-                        network.process_event,
-                        url=ev["link"],
-                        url_num=i,
-                        context=context,
-                        log=log,
-                    )
-
-                    url = await network.safe_process(
-                        handler,
-                        url_num=i,
-                        semaphore=network.PW_S,
-                        log=log,
-                    )
-
-                    if url:
-                        sport, event, logo, ts, link = (
-                            ev["sport"],
-                            ev["event"],
-                            ev["logo"],
-                            ev["timestamp"],
-                            ev["link"],
-                        )
-
-                        key = f"[{sport}] {event} ({TAG})"
-                        tvg_id, pic = leagues.get_tvg_info(sport, event)
-
-                        entry = {
-                            "url": url,
-                            "logo": logo or pic,
-                            "base": "https://embedsports.top/",
-                            "timestamp": ts,
-                            "id": tvg_id or "Live.Event.us",
-                            "link": link,
-                        }
-
-                        urls[key] = cached_urls[key] = entry
-
-            finally:
-                await browser.close()
-
-    if new_count := len(cached_urls) - cached_count:
-        log.info(f"Collected and cached {new_count} new event(s)")
-
-    else:
-        log.info("No new events found")
-
-    CACHE_FILE.write(cached_urls)

M3U8/scrapers/tflix.py (new file, 234 lines)

@@ -0,0 +1,234 @@
+import asyncio
+from functools import partial
+from urllib.parse import urljoin
+
+import feedparser
+from playwright.async_api import BrowserContext, Error, Page, TimeoutError
+
+from .utils import Cache, Time, get_logger, leagues, network
+
+log = get_logger(__name__)
+
+urls: dict[str, dict[str, str | float]] = {}
+
+TAG = "TFLIX"
+
+CACHE_FILE = Cache(TAG, exp=28_800)
+
+BASE_URL = "https://tv.tflix.app/"
+
+SPORT_ENDPOINTS = ["football", "nba", "nfl", "nhl"]
+
+
+async def process_event(
+    url: str,
+    url_num: int,
+    page: Page,
+) -> tuple[str | None, str | None]:
+    try:
+        await page.goto(
+            url,
+            wait_until="domcontentloaded",
+            timeout=15_000,
+        )
+
+        try:
+            iframe = await page.wait_for_selector(
+                "iframe.metaframe.rptss",
+                timeout=3_500,
+            )
+        except TimeoutError:
+            log.warning(f"URL {url_num}) No iframe element.")
+            return None, None
+
+        if (old_src := await iframe.get_attribute("src")) and old_src.startswith(
+            "https://kloxmkhs.site/stream"
+        ):
+            new_src = old_src
+        else:
+            try:
+                option = await page.wait_for_selector(
+                    'li.dooplay_player_option >> span.title:has-text("TFLIX HD - iOS")',
+                    timeout=3_000,
+                )
+
+                await option.scroll_into_view_if_needed()
+                await option.evaluate("el => el.click()")
+
+                await page.wait_for_function(
+                    """
+                    (oldSrc) => {
+                        const iframe = document.querySelector('iframe.metaframe.rptss');
+                        return iframe && iframe.src && iframe.src !== oldSrc;
+                    }
+                    """,
+                    arg=old_src,
+                    timeout=5_000,
+                )
+
+                iframe_2 = await page.wait_for_selector("iframe.metaframe.rptss")
+
+                if not iframe_2 or not (new_src := await iframe_2.get_attribute("src")):
+                    log.warning(f"URL {url_num}) No iframe source.")
+                    return None, None
+
+            except TimeoutError:
+                log.warning(f"URL {url_num}) No valid TFLIX source.")
+                return None, None
+
+        try:
+            await page.goto(
+                new_src,
+                wait_until="domcontentloaded",
+                timeout=10_000,
+                referer=url,
+            )
+        except Error:
+            log.warning(
+                f"URL {url_num}) HTTP 403/404 error while redirecting to iframe source."
+            )
+            return None, None
+
+        try:
+            play_btn = await page.wait_for_selector(
+                'button[data-url][onclick*="startPlcb"]',
+                timeout=5_000,
+            )
+        except TimeoutError:
+            log.warning(f"URL {url_num}) No play button found.")
+            return None, None
+
+        if not (data_url := await play_btn.get_attribute("data-url")):
+            log.warning(f"URL {url_num}) No PBID found.")
+            return None, None
+
+        log.info(f"URL {url_num}) Captured M3U8")
+
+        return (
+            f"https://kloxmkhs.site/stream/stream.m3u8?id={data_url}&format=.m3u8",
+            new_src,
+        )
+
+    except Exception as e:
+        log.warning(f"URL {url_num}) Exception while processing: {e}")
+        return None, None
+
+
+async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
+    tasks = [
+        network.request(urljoin(BASE_URL, f"genre/{sport}/feed"), log=log)
+        for sport in SPORT_ENDPOINTS
+    ]
+
+    results = await asyncio.gather(*tasks)
+
+    events = []
+
+    if not (feeds := [feedparser.parse(html.content) for html in results if html]):
+        return events
+
+    for feed in feeds:
+        title: str = feed["feed"]["title"]
+        sport = title.split("Archives")[0].strip()
+
+        for entry in feed.entries:
+            if not (link := entry.get("link")):
+                continue
+
+            if not (title := entry.get("title")):
+                continue
+
+            if f"[{sport}] {title} ({TAG})" in cached_keys:
+                continue
+
+            events.append(
+                {
+                    "sport": sport,
+                    "event": title,
+                    "link": link,
+                }
+            )
+
+    return events
+
+
+async def scrape(browser: BrowserContext) -> None:
+    cached_urls = CACHE_FILE.load()
+    valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
+
+    valid_count = cached_count = len(cached_urls)
+
+    urls.update(valid_urls)
+
+    log.info(f"Loaded {cached_count} event(s) from cache")
+    log.info(f'Scraping from "{BASE_URL}"')
+
+    events = await get_events(cached_urls.keys())
+
+    log.info(f"Processing {len(events)} new URL(s)")
+
+    if events:
+        now = Time.clean(Time.now()).timestamp()
+
+        async with network.event_context(browser) as context:
+            for i, ev in enumerate(events, start=1):
+                async with network.event_page(context) as page:
+                    handler = partial(
+                        process_event,
+                        url=ev["link"],
+                        url_num=i,
+                        page=page,
+                    )
+
+                    url, iframe = await network.safe_process(
+                        handler,
+                        url_num=i,
+                        semaphore=network.PW_S,
+                        log=log,
+                        timeout=20,
+                    )
+
+                    sport, event, link = (
+                        ev["sport"],
+                        ev["event"],
+                        ev["link"],
+                    )
+
+                    key = f"[{sport}] {event} ({TAG})"
+                    tvg_id, logo = leagues.get_tvg_info(sport, event)
+
+                    entry = {
+                        "url": url,
+                        "logo": logo,
+                        "base": iframe,
+                        "timestamp": now,
+                        "id": tvg_id or "Live.Event.us",
+                        "link": link,
+                    }
+
+                    cached_urls[key] = entry
+
+                    if url:
+                        valid_count += 1
+                        urls[key] = entry
+
+    if new_count := valid_count - cached_count:
+        log.info(f"Collected and cached {new_count} new event(s)")
+    else:
+        log.info("No new events found")
+
+    CACHE_FILE.write(cached_urls)
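tflix.scrape no longer drives Playwright itself; it expects one of the shared Browser objects created in main(). Run on its own it would look roughly like this (a sketch under the same assumptions: network.browser attaches to an external Chromium over CDP, and the import paths follow the repo layout):

import asyncio

from playwright.async_api import async_playwright

from scrapers import tflix
from scrapers.utils import network  # import path assumed

async def run() -> None:
    async with async_playwright() as p:
        browser = await network.browser(p, external=True)
        try:
            await tflix.scrape(browser)
            print(sorted(tflix.urls))  # keys look like "[NFL] Team A vs Team B (TFLIX)"
        finally:
            await browser.close()

asyncio.run(run())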


@@ -12,7 +12,7 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "TOTALSPRTK"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=28_800)
+CACHE_FILE = Cache(TAG, exp=28_800)

 MIRRORS = [
     {


@@ -8,7 +8,7 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "TVPASS"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=86_400)
+CACHE_FILE = Cache(TAG, exp=86_400)

 BASE_URL = "https://tvpass.org/playlist/m3u"


@@ -7,8 +7,8 @@ from .config import Time
 class Cache:
     now_ts: float = Time.now().timestamp()

-    def __init__(self, file: str, exp: int | float) -> None:
-        self.file = Path(__file__).parent.parent / "caches" / file
+    def __init__(self, filename: str, exp: int | float) -> None:
+        self.file = Path(__file__).parent.parent / "caches" / f"{filename.lower()}.json"
         self.exp = exp
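Call sites now pass a bare tag instead of a filename; Cache lowercases it and appends .json itself, which is why every Cache(f"{TAG.lower()}.json", ...) in the scrapers above collapses to Cache(TAG, ...). A quick illustration, assuming the constructor shown here:

cache = Cache("STRMHUB-html", exp=28_800)
# cache.file -> <package>/caches/strmhub-html.json
# cache.exp  -> 28800 seconds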


@@ -3,12 +3,13 @@ import logging
 import random
 import re
 from collections.abc import Awaitable, Callable
+from contextlib import asynccontextmanager
 from functools import partial
-from typing import TypeVar
+from typing import AsyncGenerator, TypeVar
 from urllib.parse import urlencode, urljoin

 import httpx
-from playwright.async_api import Browser, BrowserContext, Playwright, Request
+from playwright.async_api import Browser, BrowserContext, Page, Playwright, Request

 from .logger import get_logger
@@ -123,6 +124,112 @@ class Network:
             return

+    @staticmethod
+    @asynccontextmanager
+    async def event_context(
+        browser: Browser,
+        stealth: bool = True,
+    ) -> AsyncGenerator[BrowserContext, None]:
+        context: BrowserContext | None = None
+
+        try:
+            if stealth:
+                context = await browser.new_context(
+                    user_agent=Network.UA,
+                    viewport={"width": 1366, "height": 768},
+                    device_scale_factor=1,
+                    locale="en-US",
+                    timezone_id="America/New_York",
+                    color_scheme="dark",
+                    permissions=["geolocation"],
+                    extra_http_headers={
+                        "Accept-Language": "en-US,en;q=0.9",
+                        "Upgrade-Insecure-Requests": "1",
+                    },
+                )
+
+                await context.add_init_script("""
+                    Object.defineProperty(navigator, "webdriver", { get: () => undefined });
+
+                    Object.defineProperty(navigator, "languages", {
+                        get: () => ["en-US", "en"],
+                    });
+
+                    Object.defineProperty(navigator, "plugins", {
+                        get: () => [1, 2, 3, 4],
+                    });
+
+                    const elementDescriptor = Object.getOwnPropertyDescriptor(
+                        HTMLElement.prototype,
+                        "offsetHeight"
+                    );
+
+                    Object.defineProperty(HTMLDivElement.prototype, "offsetHeight", {
+                        ...elementDescriptor,
+                        get: function () {
+                            if (this.id === "modernizr") {
+                                return 24;
+                            }
+                            return elementDescriptor.get.apply(this);
+                        },
+                    });
+
+                    Object.defineProperty(window.screen, "width", { get: () => 1366 });
+                    Object.defineProperty(window.screen, "height", { get: () => 768 });
+
+                    const getParameter = WebGLRenderingContext.prototype.getParameter;
+                    WebGLRenderingContext.prototype.getParameter = function (param) {
+                        if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL
+                        if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL
+                        return getParameter.apply(this, [param]);
+                    };
+
+                    const observer = new MutationObserver((mutations) => {
+                        mutations.forEach((mutation) => {
+                            mutation.addedNodes.forEach((node) => {
+                                if (node.tagName === "IFRAME" && node.hasAttribute("sandbox")) {
+                                    node.removeAttribute("sandbox");
+                                }
+                            });
+                        });
+                    });
+
+                    observer.observe(document.documentElement, { childList: true, subtree: true });
+                """)
+            else:
+                context = await browser.new_context()
+
+            yield context
+
+        finally:
+            if context:
+                await context.close()
+
+    @staticmethod
+    @asynccontextmanager
+    async def event_page(context: BrowserContext) -> AsyncGenerator[Page, None]:
+        page = await context.new_page()
+
+        try:
+            yield page
+        finally:
+            await page.close()
+
+    @staticmethod
+    async def browser(playwright: Playwright, external: bool = False) -> Browser:
+        return (
+            await playwright.chromium.connect_over_cdp("http://localhost:9222")
+            if external
+            else await playwright.firefox.launch(headless=True)
+        )
+
     @staticmethod
     def capture_req(
         req: Request,
@@ -147,15 +254,13 @@ class Network:
         self,
         url: str,
         url_num: int,
-        context: BrowserContext,
+        page: Page,
         timeout: int | float = 10,
         log: logging.Logger | None = None,
     ) -> str | None:
         log = log or logger

-        page = await context.new_page()
-
         captured: list[str] = []
         got_one = asyncio.Event()
@@ -212,86 +317,6 @@ class Network:
            await page.close()

-    @staticmethod
-    async def browser(
-        playwright: Playwright, browser: str = "internal"
-    ) -> tuple[Browser, BrowserContext]:
-        if browser == "external":
-            brwsr = await playwright.chromium.connect_over_cdp("http://localhost:9222")
-            context = brwsr.contexts[0]
-        else:
-            brwsr = await playwright.firefox.launch(headless=True)
-            context = await brwsr.new_context(
-                user_agent=Network.UA,
-                ignore_https_errors=False,
-                viewport={"width": 1366, "height": 768},
-                device_scale_factor=1,
-                locale="en-US",
-                timezone_id="America/New_York",
-                color_scheme="dark",
-                permissions=["geolocation"],
-                extra_http_headers={
-                    "Accept-Language": "en-US,en;q=0.9",
-                    "Upgrade-Insecure-Requests": "1",
-                },
-            )
-
-            await context.add_init_script("""
-                Object.defineProperty(navigator, "webdriver", { get: () => undefined });
-
-                Object.defineProperty(navigator, "languages", {
-                    get: () => ["en-US", "en"],
-                });
-
-                Object.defineProperty(navigator, "plugins", {
-                    get: () => [1, 2, 3, 4],
-                });
-
-                const elementDescriptor = Object.getOwnPropertyDescriptor(
-                    HTMLElement.prototype,
-                    "offsetHeight"
-                );
-
-                Object.defineProperty(HTMLDivElement.prototype, "offsetHeight", {
-                    ...elementDescriptor,
-                    get: function () {
-                        if (this.id === "modernizr") {
-                            return 24;
-                        }
-                        return elementDescriptor.get.apply(this);
-                    },
-                });
-
-                Object.defineProperty(window.screen, "width", { get: () => 1366 });
-                Object.defineProperty(window.screen, "height", { get: () => 768 });
-
-                const getParameter = WebGLRenderingContext.prototype.getParameter;
-                WebGLRenderingContext.prototype.getParameter = function (param) {
-                    if (param === 37445) return "Intel Inc."; // UNMASKED_VENDOR_WEBGL
-                    if (param === 37446) return "Intel Iris OpenGL Engine"; // UNMASKED_RENDERER_WEBGL
-                    return getParameter.apply(this, [param]);
-                };
-
-                const observer = new MutationObserver((mutations) => {
-                    mutations.forEach((mutation) => {
-                        mutation.addedNodes.forEach((node) => {
-                            if (node.tagName === "IFRAME" && node.hasAttribute("sandbox")) {
-                                node.removeAttribute("sandbox");
-                            }
-                        });
-                    });
-                });
-
-                observer.observe(document.documentElement, { childList: true, subtree: true });
-            """)
-
-        return brwsr, context

 network = Network()
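Together these three helpers replace the old all-in-one Network.browser(): the Browser now lives for the whole run, while contexts and pages are scoped per scraper and per event. The per-event shape the scrapers above follow, sketched with a placeholder URL:

from playwright.async_api import Browser

async def scrape(browser: Browser) -> None:
    async with network.event_context(browser) as context:  # stealth context, auto-closed
        async with network.event_page(context) as page:    # page, auto-closed
            m3u8 = await network.process_event(
                url="https://example.com/event",  # placeholder
                url_num=1,
                page=page,
            )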


@@ -5,7 +5,7 @@ from itertools import chain
 from typing import Any
 from urllib.parse import urljoin

-from playwright.async_api import BrowserContext, async_playwright
+from playwright.async_api import BrowserContext, Page, TimeoutError

 from .utils import Cache, Time, get_logger, leagues, network
@@ -15,9 +15,9 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "WATCHFTY"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
-API_FILE = Cache(f"{TAG.lower()}-api.json", exp=19_800)
+CACHE_FILE = Cache(TAG, exp=10_800)
+API_FILE = Cache(f"{TAG}-api", exp=19_800)

 API_URL = "https://api.watchfooty.st"
@@ -73,7 +73,7 @@ async def refresh_api_cache(now: Time) -> list[dict[str, Any]]:
 async def process_event(
     url: str,
     url_num: int,
-    context: BrowserContext,
+    page: Page,
 ) -> tuple[str | None, str | None]:
     pattern = re.compile(r"\((\d+)\)")
@@ -82,8 +82,6 @@ async def process_event(
     got_one = asyncio.Event()

-    page = await context.new_page()
-
     handler = partial(
         network.capture_req,
         captured=captured,
@@ -117,7 +115,8 @@ async def process_event(
     try:
         first_available = await page.wait_for_selector(
-            'a[href*="/stream/"]', timeout=3_000
+            'a[href*="/stream/"]',
+            timeout=3_000,
         )
     except TimeoutError:
         log.warning(f"URL {url_num}) No available stream links.")
@@ -176,8 +175,6 @@ async def process_event(
     finally:
         page.remove_listener("request", handler)

-        await page.close()

 async def get_events(base_url: str, cached_keys: list[str]) -> list[dict[str, str]]:
     now = Time.clean(Time.now())
@@ -235,7 +232,7 @@ async def get_events(base_url: str, cached_keys: list[str]) -> list[dict[str, st
     return events

-async def scrape() -> None:
+async def scrape(browser: BrowserContext) -> None:
     cached_urls = CACHE_FILE.load()
     valid_urls = {k: v for k, v in cached_urls.items() if v["url"]}
@@ -260,16 +257,14 @@ async def scrape() -> None:
     log.info(f"Processing {len(events)} new URL(s)")

     if events:
-        async with async_playwright() as p:
-            browser, context = await network.browser(p, browser="external")
-            try:
+        async with network.event_context(browser) as context:
             for i, ev in enumerate(events, start=1):
+                async with network.event_page(context) as page:
                     handler = partial(
                         process_event,
                         url=ev["link"],
                         url_num=i,
-                        context=context,
+                        page=page,
                     )

                     url, iframe = await network.safe_process(
@@ -307,9 +302,6 @@ async def scrape() -> None:
                     urls[key] = entry
-            finally:
-                await browser.close()

     if new_count := valid_count - cached_count:
         log.info(f"Collected and cached {new_count} new event(s)")


@@ -1,7 +1,7 @@
 import asyncio
 from functools import partial

-from playwright.async_api import async_playwright
+from playwright.async_api import BrowserContext
 from selectolax.parser import HTMLParser

 from .utils import Cache, Time, get_logger, leagues, network
@@ -12,9 +12,9 @@ urls: dict[str, dict[str, str | float]] = {}
 TAG = "WEBCAST"

-CACHE_FILE = Cache(f"{TAG.lower()}.json", exp=10_800)
-HTML_CACHE = Cache(f"{TAG.lower()}-html.json", exp=86_400)
+CACHE_FILE = Cache(TAG, exp=10_800)
+HTML_CACHE = Cache(f"{TAG}-html", exp=86_400)

 BASE_URLS = {"NFL": "https://nflwebcast.com", "NHL": "https://slapstreams.com"}
@@ -110,7 +110,7 @@ async def get_events(cached_keys: list[str]) -> list[dict[str, str]]:
     return live

-async def scrape() -> None:
+async def scrape(browser: BrowserContext) -> None:
     cached_urls = CACHE_FILE.load()
     cached_count = len(cached_urls)
@@ -126,16 +126,14 @@ async def scrape() -> None:
     log.info(f"Processing {len(events)} new URL(s)")

     if events:
-        async with async_playwright() as p:
-            browser, context = await network.browser(p)
-            try:
+        async with network.event_context(browser) as context:
             for i, ev in enumerate(events, start=1):
+                async with network.event_page(context) as page:
                     handler = partial(
                         network.process_event,
                         url=ev["link"],
                         url_num=i,
-                        context=context,
+                        page=page,
                         log=log,
                     )
@@ -169,9 +167,6 @@ async def scrape() -> None:
                     urls[key] = cached_urls[key] = entry
-            finally:
-                await browser.close()

     if new_count := len(cached_urls) - cached_count:
         log.info(f"Collected and cached {new_count} new event(s)")