from __future__ import annotations
import asyncio
import time
from dataclasses import dataclass

from govcrawler.settings import get_settings

CHALLENGE_MARKERS = ("ctct-slider-canvas", "请稍候")


@dataclass
class FetchResult:
    """Outcome of a single page fetch, for both success and failure paths."""

    url: str                       # the URL originally requested
    final_url: str                 # URL after navigation/redirects (== url on error)
    status: int                    # HTTP status of the navigation; 0 when the fetch raised
    html: str                      # full page HTML snapshot; "" when the fetch raised
    fetched_at: float              # time.time() when the result was produced
    duration_ms: int               # wall-clock fetch duration in milliseconds
    is_challenge: bool             # True when the result still looks like a WAF challenge page
    error: str | None = None       # "ExceptionType: message" on failure, else None
    strategy: str = "playwright"   # "httpx" | "playwright" | "drission"


def is_challenge_page(status: int, html: str) -> bool:
    """Return True when *status*/*html* look like a WAF challenge page.

    A 412 response is always treated as a challenge.  Otherwise the HTML
    is scanned for any known challenge marker; a falsy ``html`` (empty
    string or None) is treated as marker-free.
    """
    if status == 412:
        return True
    text = html or ""
    for marker in CHALLENGE_MARKERS:
        if marker in text:
            return True
    return False


async def _fetch_async(url: str, *, user_agent: str, timeout_ms: int = 30000) -> FetchResult:
    """Fetch *url* in a headless patchright Chromium and return a FetchResult.

    Waits out the ctct WAF's two-phase JS challenge by polling until the
    challenge markers disappear AND a content container appears in <body>.
    Never raises: any exception is converted into a FetchResult with
    status=0, html="" and the error string filled in.

    Args:
        url: Target page URL.
        user_agent: User-Agent applied to the browser context.
        timeout_ms: Timeout for the initial navigation, in milliseconds.
    """
    from patchright.async_api import async_playwright

    t0 = time.time()
    try:
        async with async_playwright() as p:
            # Fresh browser per call; the flag suppresses Blink's
            # AutomationControlled signals (a common bot fingerprint).
            browser = await p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled"],
            )
            try:
                # zh-CN locale/timezone — presumably to match the target
                # sites' expected domestic-visitor profile.
                context = await browser.new_context(
                    user_agent=user_agent,
                    locale="zh-CN",
                    timezone_id="Asia/Shanghai",
                    viewport={"width": 1440, "height": 900},
                )
                page = await context.new_page()
                try:
                    # Containers that typically hold the article body;
                    # "body" is a last-resort fallback that always matches.
                    DETAIL_SELECTOR = "div.article-content, div.content, div.TRS_Editor, body"

                    response = await page.goto(
                        url, wait_until="domcontentloaded", timeout=timeout_ms
                    )

                    selector_ok = False
                    try:
                        await page.wait_for_selector(DETAIL_SELECTOR, timeout=5000)
                        selector_ok = True
                    except Exception:
                        selector_ok = False

                    # One retry with the stricter "networkidle" condition when
                    # the content container never showed up on the first pass.
                    if not selector_ok:
                        try:
                            response = await page.goto(
                                url, wait_until="networkidle", timeout=45000
                            )
                            try:
                                await page.wait_for_selector(DETAIL_SELECTOR, timeout=5000)
                                selector_ok = True
                            except Exception:
                                selector_ok = False
                        except Exception:
                            selector_ok = False

                    # Challenge-page poll: the ctct WAF's JS challenge is
                    # two-phase —
                    #  phase 1: challenge page → verification JS finishes →
                    #           swapped for a small shell document (<head> only)
                    #  phase 2: the shell's ctct_bundle then AJAXes the real
                    #           article content into <body>
                    # So waiting for the challenge markers to disappear is not
                    # enough — we must also wait for a content container to
                    # appear inside <body>.
                    BODY_READY_JS = (
                        "() => { const root = document.querySelector("
                        "'div.article-content, div.content, div.TRS_Editor, div.article, article');"
                        " return !!(root && root.innerText && root.innerText.trim().length > 50); }"
                    )
                    for _ in range(60):   # 30s budget
                        html_now = await page.content()
                        has_challenge = any(m in html_now for m in CHALLENGE_MARKERS)
                        body_ready = False
                        if not has_challenge:
                            try:
                                # evaluate can throw mid-navigation; treat as
                                # "not ready yet" and keep polling.
                                body_ready = bool(await page.evaluate(BODY_READY_JS))
                            except Exception:
                                body_ready = False
                        if not has_challenge and body_ready:
                            break
                        await asyncio.sleep(0.5)

                    html = await page.content()
                    final = page.url
                    # After poll succeeds, the real page is loaded — the initial
                    # 412 from the challenge response no longer reflects reality.
                    # Promote status to 200 when markers are gone and body is
                    # populated (we checked above, but re-check defensively).
                    raw_status = response.status if response else 0
                    markers_gone = not any(m in html for m in CHALLENGE_MARKERS)
                    if raw_status == 412 and markers_gone and len(html) > 2000:
                        status = 200
                    else:
                        status = raw_status
                    challenge = is_challenge_page(status, html)

                    # FETCH-04 Cookie pool: harvest cookies when the fetch is
                    # "clean enough" (no challenge detected, got some HTML).
                    # The WAF's session cookie lives in this set; replaying it
                    # from httpx on the next same-host request skips Chromium.
                    try:
                        if not challenge and html:
                            jar = await context.cookies(url)
                            # name→value map; skip any cookie without a name.
                            pairs = {
                                c.get("name"): c.get("value", "")
                                for c in (jar or [])
                                if c.get("name")
                            }
                            if pairs:
                                from urllib.parse import urlparse as _urlparse
                                from govcrawler.cookies import get_default_store
                                # Key the store by the post-redirect host.
                                host = _urlparse(final or url).netloc.lower()
                                get_default_store().set(host, pairs)
                    except Exception:
                        # Cookie harvest is best-effort; never fail the fetch.
                        pass

                    return FetchResult(
                        url=url,
                        final_url=final,
                        status=status,
                        html=html,
                        fetched_at=time.time(),
                        duration_ms=int((time.time() - t0) * 1000),
                        is_challenge=challenge,
                    )
                finally:
                    await context.close()
            finally:
                await browser.close()
    except Exception as e:
        # Any failure (launch, navigation, timeout, ...) collapses into an
        # error result instead of propagating to the caller.
        return FetchResult(
            url=url,
            final_url=url,
            status=0,
            html="",
            fetched_at=time.time(),
            duration_ms=int((time.time() - t0) * 1000),
            is_challenge=False,
            error=f"{type(e).__name__}: {e}",
        )


def fetch_html(url: str, *, timeout_ms: int = 30000) -> FetchResult:
    """Fetch *url* synchronously by driving the patchright async fetch.

    Runs ``_fetch_async`` to completion on a fresh event loop and returns
    its :class:`FetchResult`.
    """
    cfg = get_settings()
    # OBS: record the browser launch (Cookie-pool-miss cost proxy)
    try:
        from urllib.parse import urlparse
        from govcrawler.observability import record_browser_launch
        record_browser_launch(urlparse(url).netloc.lower())
    except Exception:
        pass
    coro = _fetch_async(url, user_agent=cfg.user_agent, timeout_ms=timeout_ms)
    return asyncio.run(coro)
