from __future__ import annotations
import asyncio
import time
from dataclasses import dataclass

from govcrawler.settings import get_settings

CHALLENGE_MARKERS = ("ctct-slider-canvas", "请稍候")


@dataclass
class FetchResult:
    """Outcome of a single page fetch, for both success and failure paths."""

    url: str                       # the URL originally requested
    final_url: str                 # URL after navigation/redirects (== url on error)
    status: int                    # HTTP status of the navigation; 0 when the fetch raised
    html: str                      # full page HTML snapshot; "" when the fetch raised
    fetched_at: float              # time.time() when the result was produced
    duration_ms: int               # wall-clock fetch duration in milliseconds
    is_challenge: bool             # True when the result still looks like a WAF challenge page
    error: str | None = None       # "ExceptionType: message" on failure, else None
    strategy: str = "playwright"   # "httpx" | "playwright" | "drission"


def is_challenge_page(status: int, html: str) -> bool:
    """Return True when *status*/*html* look like a WAF challenge page.

    A 412 response is always treated as a challenge.  Otherwise the HTML
    is scanned for any known challenge marker; a falsy ``html`` (empty
    string or None) is treated as marker-free.
    """
    if status == 412:
        return True
    text = html or ""
    for marker in CHALLENGE_MARKERS:
        if marker in text:
            return True
    return False


async def _fetch_async(url: str, *, user_agent: str, timeout_ms: int = 30000) -> FetchResult:
    """Fetch *url* in a headless patchright Chromium and return a FetchResult.

    Waits out the ctct WAF's two-phase JS challenge by polling until the
    challenge markers disappear AND a content container appears in <body>.
    Never raises: any exception is converted into a FetchResult with
    status=0, html="" and the error string filled in.

    Args:
        url: Target page URL.
        user_agent: User-Agent applied to the browser context.
        timeout_ms: Timeout for the initial navigation, in milliseconds.
    """
    from patchright.async_api import async_playwright

    t0 = time.time()
    try:
        async with async_playwright() as p:
            # Fresh browser per call; the flag suppresses Blink's
            # AutomationControlled signals (a common bot fingerprint).
            browser = await p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled"],
            )
            try:
                # zh-CN locale/timezone — presumably to match the target
                # sites' expected domestic-visitor profile.
                context = await browser.new_context(
                    user_agent=user_agent,
                    locale="zh-CN",
                    timezone_id="Asia/Shanghai",
                    viewport={"width": 1440, "height": 900},
                )
                page = await context.new_page()
                try:
                    # Containers that typically hold the article body;
                    # "body" is a last-resort fallback that always matches.
                    DETAIL_SELECTOR = "div.article-content, div.content, div.TRS_Editor, body"

                    response = await page.goto(
                        url, wait_until="domcontentloaded", timeout=timeout_ms
                    )

                    selector_ok = False
                    try:
                        await page.wait_for_selector(DETAIL_SELECTOR, timeout=5000)
                        selector_ok = True
                    except Exception:
                        selector_ok = False

                    # One retry with the stricter "networkidle" condition when
                    # the content container never showed up on the first pass.
                    if not selector_ok:
                        try:
                            response = await page.goto(
                                url, wait_until="networkidle", timeout=45000
                            )
                            try:
                                await page.wait_for_selector(DETAIL_SELECTOR, timeout=5000)
                                selector_ok = True
                            except Exception:
                                selector_ok = False
                        except Exception:
                            selector_ok = False

                    # Challenge-page poll: the ctct WAF's JS challenge is
                    # two-phase —
                    #  phase 1: challenge page → verification JS finishes →
                    #           swapped for a small shell document (<head> only)
                    #  phase 2: the shell's ctct_bundle then AJAXes the real
                    #           article content into <body>
                    # So waiting for the challenge markers to disappear is not
                    # enough — we must also wait for a content container to
                    # appear inside <body>.
                    BODY_READY_JS = (
                        "() => { const root = document.querySelector("
                        "'div.article-content, div.content, div.TRS_Editor, div.article, article');"
                        " return !!(root && root.innerText && root.innerText.trim().length > 50); }"
                    )
                    for _ in range(60):   # 30s budget
                        html_now = await page.content()
                        has_challenge = any(m in html_now for m in CHALLENGE_MARKERS)
                        body_ready = False
                        if not has_challenge:
                            try:
                                # evaluate can throw mid-navigation; treat as
                                # "not ready yet" and keep polling.
                                body_ready = bool(await page.evaluate(BODY_READY_JS))
                            except Exception:
                                body_ready = False
                        if not has_challenge and body_ready:
                            break
                        await asyncio.sleep(0.5)

                    html = await page.content()
                    final = page.url
                    # After poll succeeds, the real page is loaded — the initial
                    # 412 from the challenge response no longer reflects reality.
                    # Promote status to 200 when markers are gone and body is
                    # populated (we checked above, but re-check defensively).
                    raw_status = response.status if response else 0
                    markers_gone = not any(m in html for m in CHALLENGE_MARKERS)
                    if raw_status == 412 and markers_gone and len(html) > 2000:
                        status = 200
                    else:
                        status = raw_status
                    challenge = is_challenge_page(status, html)

                    # FETCH-04 Cookie pool: harvest cookies when the fetch is
                    # "clean enough" (no challenge detected, got some HTML).
                    # The WAF's session cookie lives in this set; replaying it
                    # from httpx on the next same-host request skips Chromium.
                    try:
                        if not challenge and html:
                            jar = await context.cookies(url)
                            # name→value map; skip any cookie without a name.
                            pairs = {
                                c.get("name"): c.get("value", "")
                                for c in (jar or [])
                                if c.get("name")
                            }
                            if pairs:
                                from urllib.parse import urlparse as _urlparse
                                from govcrawler.cookies import get_default_store
                                # Key the store by the post-redirect host.
                                host = _urlparse(final or url).netloc.lower()
                                get_default_store().set(host, pairs)
                    except Exception:
                        # Cookie harvest is best-effort; never fail the fetch.
                        pass

                    return FetchResult(
                        url=url,
                        final_url=final,
                        status=status,
                        html=html,
                        fetched_at=time.time(),
                        duration_ms=int((time.time() - t0) * 1000),
                        is_challenge=challenge,
                    )
                finally:
                    await context.close()
            finally:
                await browser.close()
    except Exception as e:
        # Any failure (launch, navigation, timeout, ...) collapses into an
        # error result instead of propagating to the caller.
        return FetchResult(
            url=url,
            final_url=url,
            status=0,
            html="",
            fetched_at=time.time(),
            duration_ms=int((time.time() - t0) * 1000),
            is_challenge=False,
            error=f"{type(e).__name__}: {e}",
        )


def fetch_html(url: str, *, timeout_ms: int = 30000) -> FetchResult:
    """Fetch *url* synchronously by driving the patchright async fetch.

    Runs ``_fetch_async`` to completion on a fresh event loop and returns
    its :class:`FetchResult`.
    """
    cfg = get_settings()
    # OBS: record the browser launch (Cookie-pool-miss cost proxy)
    try:
        from urllib.parse import urlparse
        from govcrawler.observability import record_browser_launch
        record_browser_launch(urlparse(url).netloc.lower())
    except Exception:
        pass
    coro = _fetch_async(url, user_agent=cfg.user_agent, timeout_ms=timeout_ms)
    return asyncio.run(coro)
