"""Verify gkmlpt adapter on CITY-level sites (市级), where anti-bot likely bites.

Cluster hypothesis (user-confirmed): 市县同一套 CMS，前端皮肤可换，后端一致。
县区级 (清城/连州/清新) 裸 httpx 通过；市级 (gdqy.gov.cn) 预期会触发 ctct / 云盾。

Run both tiers side-by-side on the SAME URL to characterize the delta:
  tier-1: plain httpx (no cookies)
  tier-2: patchright once to seed Cookie, then httpx with that Cookie
Record outcome per stage → feed site.yaml strategy choice.

Targets:
  gdqy (清远市政府 / 教育局) — known API pattern 同 PoC，取两个栏目
Usage: uv run python scripts/verify_gkmlpt_city.py
"""
from __future__ import annotations

import asyncio
import json
import re
import sys
from pathlib import Path

import httpx
from parsel import Selector
from patchright.async_api import async_playwright

# Output directory for probe reports (repo_root/data/probe).
OUT = Path(__file__).resolve().parents[1] / "data" / "probe"

# City-level (Qingyuan municipal government) samples -- the entry page must be
# opened first to discover the `sid` query parameter for the catalogue API.
CITY_TARGETS: list[tuple[str, str, str, int]] = [
    # label, dept_path, index_url, column_id_frag
    ("gdqy_jyj_128",  "qyjyj",  "https://www.gdqy.gov.cn/qyjyj/gkmlpt/index#128",  128),
    ("gdqy_jyj_137",  "qyjyj",  "https://www.gdqy.gov.cn/qyjyj/gkmlpt/index#137",  137),
]
# Scheme + host used to assemble catalogue API URLs.
BASE = "https://www.gdqy.gov.cn"

# Desktop Chrome user-agent shared by both the httpx and patchright tiers so
# the two tiers present an identical browser identity.
UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
)
# Baseline request headers for the httpx tiers (JSON-oriented Accept).
HDR = {"User-Agent": UA, "Accept": "application/json, text/plain, */*",
       "Accept-Language": "zh-CN,zh;q=0.9"}


def tier1_httpx(api_url: str) -> dict:
    """Tier 1: fetch *api_url* with a bare httpx client (no cookies).

    Returns a report dict containing the HTTP status, content type, the first
    400 chars of the body, any anti-bot shield markers spotted in the body,
    and whether the body decoded as the expected JSON (`ok`). All failures are
    captured into the report rather than raised.
    """
    report: dict = {"tier": "httpx_naked"}
    shield_terms = ("ctct", "云盾", "滑块", "安全验证", "challenge", "robot")
    try:
        # verify=False: sites in this gov cluster ship broken cert chains.
        client = httpx.Client(verify=False, follow_redirects=True, timeout=20.0)
        with client as c:
            resp = c.get(api_url, headers=HDR)
        report["status"] = resp.status_code
        report["ct"] = resp.headers.get("content-type", "")
        text = resp.text
        report["body_head"] = text[:400]
        report["shield_markers"] = [term for term in shield_terms if term in text]
        try:
            payload = resp.json()
            report["articles_len"] = len(payload.get("articles") or [])
        except Exception as exc:
            # Non-JSON body (e.g. an HTML challenge page) lands here.
            report["json_err"] = str(exc)
            report["ok"] = False
        else:
            report["ok"] = True
    except Exception as exc:
        # Transport-level failure (DNS, TLS, timeout, ...).
        report["ok"] = False
        report["err"] = str(exc)
    return report


async def tier2_seeded(index_url: str, api_url: str) -> dict:
    """Tier 2: let the page's anti-bot JS set cookies in patchright, then
    replay *api_url* through httpx carrying those cookies.

    The report mirrors tier1_httpx and additionally records the cookie names
    exported from the browser and up to five sniffed catalogue API URLs.
    """
    report: dict = {"tier": "cookie_seeded_httpx"}
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
        context = await browser.new_context(user_agent=UA, locale="zh-CN")
        page = await context.new_page()

        # Sniff catalogue API hits during load (may reveal the actual sid).
        sniffed: list[str] = []

        def _on_response(resp):
            if "/gkmlpt/api/all/" in resp.url:
                sniffed.append(resp.url)

        page.on("response", _on_response)
        try:
            await page.goto(index_url, wait_until="networkidle", timeout=45000)
        except Exception as exc:
            # Keep going: the shield may still have set cookies before timeout.
            report["goto_err"] = str(exc)
        await page.wait_for_timeout(2500)
        cookies = await context.cookies()
        report["cookie_names"] = sorted({c["name"] for c in cookies})
        report["sniffed_apis"] = sniffed[:5]
        await browser.close()

    # Transplant the browser cookies into an httpx jar (best effort: skip
    # anything httpx refuses rather than aborting the probe).
    jar = httpx.Cookies()
    for ck in cookies:
        try:
            jar.set(ck["name"], ck["value"], domain=ck.get("domain", ""),
                    path=ck.get("path", "/"))
        except Exception:
            pass

    try:
        with httpx.Client(verify=False, follow_redirects=True, timeout=20.0,
                          cookies=jar) as cli:
            resp = cli.get(api_url, headers={**HDR, "Referer": index_url})
        report["status"] = resp.status_code
        report["ct"] = resp.headers.get("content-type", "")
        text = resp.text
        report["body_head"] = text[:400]
        report["shield_markers"] = [term for term in (
            "ctct", "云盾", "滑块", "安全验证", "challenge", "robot"
        ) if term in text]
        try:
            payload = resp.json()
            report["articles_len"] = len(payload.get("articles") or [])
            report["ok"] = True
        except Exception as exc:
            report["json_err"] = str(exc)
            report["ok"] = False
    except Exception as exc:
        report["ok"] = False
        report["err"] = str(exc)
    return report


async def discover_sid(index_url: str) -> dict:
    """Watch the index page's initial XHR traffic to learn this site's sid.

    Returns a dict with the sniffed catalogue API URLs (`apis`), the parsed
    `sid` (if any URL carried one), and an `err` entry on navigation failure.
    """
    report: dict = {}
    captured: list[str] = []
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
        context = await browser.new_context(user_agent=UA, locale="zh-CN")
        page = await context.new_page()

        def _sniff(resp):
            if "/gkmlpt/api/all/" in resp.url:
                captured.append(resp.url)

        page.on("response", _sniff)
        try:
            await page.goto(index_url, wait_until="networkidle", timeout=45000)
        except Exception as exc:
            report["err"] = str(exc)
        await page.wait_for_timeout(2000)
        await browser.close()
    report["apis"] = captured
    # First sniffed URL carrying a sid=<digits> query parameter wins.
    for url in captured:
        match = re.search(r"sid=(\d+)", url)
        if match:
            report["sid"] = int(match.group(1))
            break
    return report


async def main():
    """Probe every city target: discover its sid, then run both fetch tiers
    against the same API URL and persist the combined report as JSON."""
    reports = []
    for label, dept, idx_url, col in CITY_TARGETS:
        print(f"\n=== {label} — discover sid ===")
        disc = await discover_sid(idx_url)
        print(json.dumps(disc, ensure_ascii=False, indent=2))

        sid = disc.get("sid")
        if not sid:
            # Without a sid the API URL cannot be built — record and move on.
            print("  no sid discovered; skip")
            reports.append({"label": label, "disc": disc, "skipped": True})
            continue

        api = f"{BASE}/{dept}/gkmlpt/api/all/{col}?page=1&sid={sid}"
        print(f"  api = {api}")

        t1 = tier1_httpx(api)
        print(f"  [t1 naked httpx] {json.dumps(t1, ensure_ascii=False)[:240]}")

        t2 = await tier2_seeded(idx_url, api)
        print(f"  [t2 seeded httpx] {json.dumps(t2, ensure_ascii=False)[:240]}")

        reports.append({"label": label, "sid": sid, "api": api,
                        "tier1_httpx": t1, "tier2_seeded": t2})

    OUT.mkdir(parents=True, exist_ok=True)
    report_path = OUT / "verify_city.json"
    report_path.write_text(
        json.dumps(reports, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"\nwrote {OUT / 'verify_city.json'}")


# Script entry point: run the full city-tier probe once.
if __name__ == "__main__":
    asyncio.run(main())
