This commit is contained in:
parent 26eac184ee
commit 00000d96a5

3 changed files with 76 additions and 77 deletions
@@ -97,67 +97,6 @@ async def fetch_xml_stream(url: str, ssl_ctx: ssl.SSLContext) -> io.BytesIO:
         return io.BytesIO(b"")
 
 
-async def parse_feed(
-    url: str,
-    ssl_ctx: ssl.SSLContext,
-    cached_keys: set[str],
-) -> list[dict[str, str]]:
-
-    events: list[dict[str, str]] = []
-    pub_date_format = "%a, %d %b %Y %H:%M:%S %z"
-    now = datetime.now(TZ)
-
-    window_start, window_end = now - timedelta(hours=3), now + timedelta(hours=1)
-
-    buffer = await fetch_xml_stream(url, ssl_ctx)
-
-    for _, elem in ET.iterparse(buffer, events=("end",)):
-        if elem.tag == "item":
-            title = elem.findtext("title")
-            desc = elem.findtext("description")
-            pub_date = elem.findtext("pubDate")
-            link = elem.findtext("link")
-
-            try:
-                dt = datetime.strptime(pub_date, pub_date_format)
-                dt = dt.astimezone(TZ)
-            except Exception:
-                elem.clear()
-                continue
-
-            if window_start <= dt <= window_end:
-                sport, event = (
-                    (
-                        desc.split(".")[0].strip(),
-                        " ".join(p.strip() for p in desc.split(".")[1:]),
-                    )
-                    if desc
-                    else ("", "")
-                )
-
-                key = f"[{sport}: {event}] {title}"
-
-                if key in cached_keys:
-                    elem.clear()
-                    continue
-
-                if exist_sprts & {sport, event}:
-                    continue
-
-                events.append(
-                    {
-                        "sport": sport,
-                        "event": event,
-                        "title": title,
-                        "link": link,
-                    }
-                )
-
-            elem.clear()
-
-    return events
-
-
 async def process_event(url: str, url_num: int) -> str | None:
     async with async_playwright() as p:
         browser = await p.firefox.launch(headless=True)

@@ -176,10 +115,8 @@ async def process_event(url: str, url_num: int) -> str | None:
             ".m3u8" in req.url
             and "amazonaws" not in req.url
             and "knitcdn" not in req.url
-            and not captured
         ):
             captured.append(req.url)
-
             got_one.set()
 
     popup = None

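
Removing "and not captured" from this condition means later matching requests are appended as well, rather than only the first one seen. The URL filter itself reduces to a small predicate; the sketch below is a standalone illustration only (the helper name is made up, the blocked host substrings are the ones in the diff):

def is_wanted_playlist(url: str, blocked: tuple[str, ...] = ("amazonaws", "knitcdn")) -> bool:
    # True for .m3u8 playlist URLs that are not served from a blocked host substring.
    return ".m3u8" in url and not any(b in url for b in blocked)


print(is_wanted_playlist("https://cdn.example.com/live/stream.m3u8"))      # True
print(is_wanted_playlist("https://bucket.amazonaws.com/live/index.m3u8"))  # False
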
@@ -284,6 +221,67 @@ async def process_event(url: str, url_num: int) -> str | None:
         await browser.close()
 
 
+async def get_events(
+    url: str,
+    ssl_ctx: ssl.SSLContext,
+    cached_keys: set[str],
+) -> list[dict[str, str]]:
+
+    events: list[dict[str, str]] = []
+    pub_date_format = "%a, %d %b %Y %H:%M:%S %z"
+    now = datetime.now(TZ)
+
+    window_start, window_end = now - timedelta(hours=3), now + timedelta(hours=1)
+
+    buffer = await fetch_xml_stream(url, ssl_ctx)
+
+    for _, elem in ET.iterparse(buffer, events=("end",)):
+        if elem.tag == "item":
+            title = elem.findtext("title")
+            desc = elem.findtext("description")
+            pub_date = elem.findtext("pubDate")
+            link = elem.findtext("link")
+
+            try:
+                dt = datetime.strptime(pub_date, pub_date_format)
+                dt = dt.astimezone(TZ)
+            except Exception:
+                elem.clear()
+                continue
+
+            if window_start <= dt <= window_end:
+                sport, event = (
+                    (
+                        desc.split(".")[0].strip(),
+                        " ".join(p.strip() for p in desc.split(".")[1:]),
+                    )
+                    if desc
+                    else ("", "")
+                )
+
+                key = f"[{sport}: {event}] {title}"
+
+                if key in cached_keys:
+                    elem.clear()
+                    continue
+
+                if exist_sprts & {sport, event}:
+                    continue
+
+                events.append(
+                    {
+                        "sport": sport,
+                        "event": event,
+                        "title": title,
+                        "link": link,
+                    }
+                )
+
+            elem.clear()
+
+    return events
+
+
 async def main(client: httpx.AsyncClient) -> None:
     log.info(f'Scraping from "{BASE_URL}"')
 
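
The relocated get_events keeps the same filtering rule: pubDate is parsed with the RFC 822-style format "%a, %d %b %Y %H:%M:%S %z" and an item is kept only if it falls between three hours before and one hour after the current time. A minimal standalone sketch of that check, assuming a zoneinfo timezone in place of the module's own TZ:

from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

TZ = ZoneInfo("UTC")  # assumption: the real module defines its own TZ
PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"


def in_window(pub_date: str, now: datetime | None = None) -> bool:
    # True if an RSS pubDate lands in the [-3h, +1h] window used above.
    now = now or datetime.now(TZ)
    window_start, window_end = now - timedelta(hours=3), now + timedelta(hours=1)
    try:
        dt = datetime.strptime(pub_date, PUB_DATE_FORMAT).astimezone(TZ)
    except ValueError:
        return False
    return window_start <= dt <= window_end


print(in_window(datetime.now(TZ).strftime(PUB_DATE_FORMAT)))  # True: stamped right now
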
@@ -292,13 +290,13 @@ async def main(client: httpx.AsyncClient) -> None:
     cached_urls = load_cache()
     cached_count = len(cached_urls)
 
-    events = await parse_feed(BASE_URL, cert, set(cached_urls.keys()))
+    events = await get_events(BASE_URL, cert, set(cached_urls.keys()))
 
     log.info(f"Processing {len(events)} URLs")
 
     now_ts = datetime.now(TZ).timestamp()
 
-    for num, ev in enumerate(events, start=1):
+    for i, ev in enumerate(events, start=1):
         sport = ev["sport"]
         event = ev["event"]
         title = ev["title"]

@@ -307,8 +305,8 @@ async def main(client: httpx.AsyncClient) -> None:
         key = f"[{sport}: {event}] {title}"
 
         url = await safe_process_event(
-            lambda: process_event(link, url_num=num),
-            url_num=num,
+            lambda: process_event(link, url_num=i),
+            url_num=i,
             log=log,
         )
 
@@ -330,4 +328,4 @@ async def main(client: httpx.AsyncClient) -> None:
 
     log.info(f"Cached {cached_count} event(s)")
 
-    log.info(f"Collected {new_count} event(s)")
+    log.info(f"Collected {new_count} new event(s)")

@@ -71,7 +71,6 @@ async def process_event(url: str, url_num: int) -> str | None:
             ".m3u8" in req.url
             and "amazonaws" not in req.url
             and "knitcdn" not in req.url
-            and not captured
         ):
             captured.append(req.url)
             got_one.set()

@@ -119,12 +118,15 @@ async def process_event(url: str, url_num: int) -> str | None:
 async def get_events(
     client: httpx.AsyncClient,
     api_url: str,
-    cached_keys: list[str],
+    cached_keys: set[str],
 ) -> dict[str, dict[str, str | str]]:
-    events = []
+    events: list[dict[str, str]] = []
 
     base_url = re.match(r"(https?://.+?)/", api_url)[1]
 
+    now = datetime.now(TZ)
+
     if not (api_data := load_api_cache()):
         api_data = await refresh_api_cache(client, api_url)
         API_FILE.write_text(json.dumps(api_data, indent=2), encoding="utf-8")
+
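
Typing cached_keys as set[str] matches how the cache is consulted elsewhere in this commit: items are reduced to a "[sport: event] title" key and skipped when that key is already cached, and a set makes that membership test constant time where a list would scan. A small standalone sketch of the dedup step, with the helper name and sample data invented for illustration:

def drop_cached(items: list[dict[str, str]], cached_keys: set[str]) -> list[dict[str, str]]:
    # Keep only items whose composite key has not been cached yet.
    fresh = []
    for item in items:
        key = f"[{item['sport']}: {item['event']}] {item['title']}"
        if key in cached_keys:  # O(1) with a set, O(n) with a list
            continue
        fresh.append(item)
    return fresh


cached = {"[Soccer: Cup Final] Stream A"}
items = [
    {"sport": "Soccer", "event": "Cup Final", "title": "Stream A", "link": "https://example.com/a"},
    {"sport": "Tennis", "event": "Open Round 1", "title": "Stream B", "link": "https://example.com/b"},
]
print(drop_cached(items, cached))  # only the Tennis item remains
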
@@ -156,7 +158,7 @@ async def get_events(
 
         end_dt = datetime.fromtimestamp(end_ts, tz=TZ)
 
-        if not start_dt <= datetime.now(TZ) < end_dt:
+        if not start_dt <= now < end_dt:
             continue
 
         events.append(
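
Hoisting now = datetime.now(TZ) out of the loop means every schedule entry is compared against the same instant instead of a fresh clock read per item. The comparison itself works on epoch timestamps converted to aware datetimes; a standalone sketch under the assumption that TZ is UTC (the helper name is made up):

from datetime import datetime, timezone

TZ = timezone.utc  # assumption: the real module defines its own TZ


def is_live(start_ts: float, end_ts: float, now: datetime) -> bool:
    # True if now falls inside the half-open [start, end) interval given in epoch seconds.
    start_dt = datetime.fromtimestamp(start_ts, tz=TZ)
    end_dt = datetime.fromtimestamp(end_ts, tz=TZ)
    return start_dt <= now < end_dt


now = datetime.now(TZ)
print(is_live(now.timestamp() - 600, now.timestamp() + 3600, now))  # True: started 10 minutes ago
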
@@ -189,10 +191,10 @@ async def main(client: httpx.AsyncClient) -> None:
 
     log.info(f"Processing {len(events)} URLs")
 
-    for num, ev in enumerate(events, start=1):
+    for i, ev in enumerate(events, start=1):
         url = await safe_process_event(
-            lambda: process_event(ev["link"], url_num=num),
-            url_num=num,
+            lambda: process_event(ev["link"], url_num=i),
+            url_num=i,
             log=log,
         )
 
@@ -212,7 +214,7 @@ async def main(client: httpx.AsyncClient) -> None:
 
     log.info(f"Cached {cached_count} event(s)")
 
-    log.info(f"Collected {new_count} event(s)")
+    log.info(f"Collected {new_count} new event(s)")
 
 
 # works if no cloudflare bot detection

@@ -19,7 +19,6 @@ CACHE_FILE = Path(__file__).parent / "caches" / "tvpass.json"
 def load_cache() -> dict[str, str]:
     try:
         data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
-
         return {} if 8 <= datetime.now(TZ).hour <= 12 else data
     except (FileNotFoundError, json.JSONDecodeError):
         return {}