"""Probe the gkmlpt (政务公开目录平台) CMS to learn its real API structure.

Launches patchright, opens a known gkmlpt column URL with a fragment anchor,
records every non-asset network response during page load and for a short
settle window afterwards, then saves a summary so we can draft the adapter
YAML from real data.

Output: data/probe/<label>.json (response summary) plus <label>_list.html and
<label>_detail.html (rendered DOM snapshots).

Usage:
    uv run python scripts/probe_gkmlpt.py
"""
from __future__ import annotations

import asyncio
import json
import re
import sys
from pathlib import Path
from urllib.parse import urlparse

from patchright.async_api import async_playwright

OUT = Path(__file__).resolve().parents[1] / "data" / "probe"
OUT.mkdir(parents=True, exist_ok=True)

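# Each target pairs a column listing URL with a sample article URL. The
# trailing #<id> fragment selects the gkmlpt column: the front-end JS reads
# location.hash and loads that column's entries via XHR (hence the waits below).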
TARGETS = [
    # (site label, list URL with fragment, sample detail URL)
    (
        "qingcheng_fgw_2849",
        "http://www.qingcheng.gov.cn/qyqcfgw/gkmlpt/index#2849",
        "http://www.qingcheng.gov.cn/qyqcfgw/gkmlpt/content/2/2116/post_2116964.html#2849",
    ),
    (
        "lianzhou_fgw_5074",
        "http://www.lianzhou.gov.cn/qylzfgw/gkmlpt/index#5074",
        "http://www.lianzhou.gov.cn/qylzfgw/gkmlpt/content/2/2046/post_2046981.html#5076",
    ),
    (
        "qingxin_fgw_3860",
        "https://www.qingxin.gov.cn/qyqxfgw/gkmlpt/index#3860",
        "https://www.qingxin.gov.cn/qyqxfgw/gkmlpt/content/2/2099/post_2099047.html#3861",
    ),
]


async def probe_one(browser_ctx, label: str, list_url: str, detail_url: str) -> dict:
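    """Load one site's list + detail page, capture network traffic, save artifacts."""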
    page = await browser_ctx.new_page()
    reqs: list[dict] = []

    async def on_response(resp):
        try:
            url = resp.url
            # only keep responses from gov.cn hosts (skip CDN, analytics)
            host = urlparse(url).hostname or ""
            if "gov.cn" not in host:
                return
            ct = (resp.headers or {}).get("content-type", "")
            # skip static assets
            if any(x in ct for x in ("image/", "font/", "text/css", "javascript")):
                return
            body = None
            try:
                # cap body to 50KB to keep output readable
                raw = await resp.body()
                body = raw[:50000].decode("utf-8", "replace")
            except Exception as e:
                body = f"<body read failed: {e}>"
            reqs.append(
                {
                    "url": url,
                    "method": resp.request.method,
                    "status": resp.status,
                    "content_type": ct,
                    "body_preview": body,
                }
            )
        except Exception as e:
            print(f"[{label}] on_response err: {e}", file=sys.stderr)

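    # Register before goto() so responses from the initial navigation are captured.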
    page.on("response", on_response)

    print(f"\n=== {label} ===")
    print(f"LIST:   {list_url}")
    try:
        await page.goto(list_url, wait_until="networkidle", timeout=45000)
    except Exception as e:
        print(f"  goto list failed: {e}")
    # give fragment-triggered XHR some time
    await page.wait_for_timeout(2500)

    # snapshot the rendered DOM to see what ends up on the page
    list_html = await page.content()

    # now visit detail page in same context (cookie reuse)
    print(f"DETAIL: {detail_url}")
    try:
        await page.goto(detail_url, wait_until="networkidle", timeout=45000)
    except Exception as e:
        print(f"  goto detail failed: {e}")
    await page.wait_for_timeout(1500)
    detail_html = await page.content()

    await page.close()

    # persist artifacts
    (OUT / f"{label}_list.html").write_text(list_html, encoding="utf-8")
    (OUT / f"{label}_detail.html").write_text(detail_html, encoding="utf-8")

    summary = {
        "label": label,
        "list_url": list_url,
        "detail_url": detail_url,
        "xhr_count": len(reqs),
        "xhrs": reqs,
    }
    (OUT / f"{label}.json").write_text(
        json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # brief console digest: URLs that look like APIs (json or list-like)
    print(f"  captured {len(reqs)} responses:")
    for r in reqs:
        flags = []
        if "json" in r["content_type"]:
            flags.append("JSON")
        bp = r.get("body_preview") or ""
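        # heuristic markers that gkmlpt column/article payloads tend to contain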
        if re.search(r"(columnId|getColumnList|getColumnTree|pubDate|post_\d+)", bp):
            flags.append("LIST-LIKE")
        if flags:
            print(f"    [{'|'.join(flags):>14}] {r['status']} {r['method']} {r['url']}")
    return summary


async def main():
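    """Launch one stealth-ish Chromium context and probe each target in turn."""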
    async with async_playwright() as p:
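        # AutomationControlled is the blink feature that sets navigator.webdriver,
        # so disabling it sidesteps the most trivial bot checks.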
        browser = await p.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
        ctx = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/121.0.0.0 Safari/537.36"
            ),
            locale="zh-CN",
            viewport={"width": 1280, "height": 900},
        )

        for label, lu, du in TARGETS:
            try:
                await probe_one(ctx, label, lu, du)
            except Exception as e:
                print(f"[{label}] probe failed: {e}")
            # politeness: space out sites
            await asyncio.sleep(3)

        await ctx.close()
        await browser.close()
    print(f"\nArtifacts in: {OUT}")


if __name__ == "__main__":
    asyncio.run(main())
