"""Verify gkmlpt APIs work stateless via plain httpx (no browser).

If this passes, the gkmlpt adapter can run on tier-1 (plain httpx) without
patchright or the cookie pool, a big win for throughput and cost.

Checks for each site:
  1. GET /api/all/{column_id}?page=1&sid={sid}  →  200 + JSON parseable
  2. items at $.articles[*] with fields (id, title, url, date, publisher,
     attachment, first_publish_time)
  3. detail url bucket pattern: url contains floor(post_id/1000) segment
  4. GET one detail URL via httpx  →  200 + HTML containing
     div.content h1.title / div.date-row / div.article-content
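
Illustrative /api/all payload shape (field names from the checks above;
values are placeholders modelled on the first probe case, not a real
record):

    {
      "classify": {"name": "..."},
      "articles": [
        {"id": 2116964,
         "title": "...",
         "url": ".../gkmlpt/content/2/2116/post_2116964.html",
         "date": "...",
         "first_publish_time": "...",
         "publisher": "...",
         "document_number": "...",
         "attachment": [],
         "site": 763042}
      ]
    }

Bucket rule worked example: 2116964 // 1000 == 2116, so the detail URL
must contain the "/content/2/2116/post_2116964.html" tail.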

Usage:  uv run python scripts/verify_gkmlpt_httpx.py
        (exit 0 when all checks pass, 1 otherwise; per-site reports are
        written to data/probe/verify_httpx.json)
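
A tier-1 adapter would walk a column by bumping `page` until `articles`
comes back empty.  That paging scheme is an assumption (this probe only
ever fetches page 1); minimal sketch, reusing HDR from this module:

    def iter_articles(client: httpx.Client, base: str, column_id: int, sid: int):
        # `base` is the site prefix, e.g. "http://www.qingcheng.gov.cn/qyqcfgw"
        page = 1
        while True:
            url = f"{base}/gkmlpt/api/all/{column_id}?page={page}&sid={sid}"
            arts = client.get(url, headers=HDR, timeout=20.0).json().get("articles") or []
            if not arts:
                return
            yield from arts
            page += 1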
"""
from __future__ import annotations

import json
import re
import sys
from pathlib import Path

import httpx
from parsel import Selector

OUT = Path(__file__).resolve().parents[1] / "data" / "probe"

CASES = [
    # (label, api_url, sid, detail_url)
    (
        "qingcheng_fgw_2849",
        "http://www.qingcheng.gov.cn/qyqcfgw/gkmlpt/api/all/2849?page=1&sid=763042",
        763042,
        "http://www.qingcheng.gov.cn/qyqcfgw/gkmlpt/content/2/2116/post_2116964.html",
    ),
    (
        "lianzhou_fgw_5074",
        "http://www.lianzhou.gov.cn/qylzfgw/gkmlpt/api/all/5074?page=1&sid=763156",
        763156,
        "http://www.lianzhou.gov.cn/qylzfgw/gkmlpt/content/2/2046/post_2046981.html",
    ),
    (
        "qingxin_fgw_3860",
        "https://www.qingxin.gov.cn/qyqxfgw/gkmlpt/api/all/3860?page=1&sid=763068",
        763068,
        "https://www.qingxin.gov.cn/qyqxfgw/gkmlpt/content/2/2099/post_2099047.html",
    ),
]

# Browser-like headers, reused for both API and detail fetches so the probe
# measures what a Chrome-UA httpx client would see.
HDR = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36"
    ),
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.9",
}


def verify_one(client: httpx.Client, label: str, api: str, sid: int, detail: str) -> dict:
    rep: dict = {"label": label, "api_ok": False, "detail_ok": False, "notes": []}

    # 1) API hit
    try:
        r = client.get(api, headers=HDR, timeout=20.0)
    except Exception as e:
        rep["notes"].append(f"api GET exception: {e}")
        return rep
    rep["api_status"] = r.status_code
    rep["api_ct"] = r.headers.get("content-type", "")
    if r.status_code != 200:
        rep["notes"].append(f"api status {r.status_code}")
        return rep
    try:
        data = r.json()
    except Exception as e:
        rep["notes"].append(f"api not json: {e}; body[:200]={r.text[:200]!r}")
        return rep

    arts = data.get("articles") or []
    rep["api_ok"] = True
    rep["article_count"] = len(arts)
    rep["classify"] = (data.get("classify") or {}).get("name")

    # 2) field schema probe on first 3
    sample_fields = {"id", "title", "url", "date", "publisher", "attachment", "first_publish_time"}
    missing_fields: set[str] = set()
    bucket_ok = True
    for a in arts[:3]:
        for f in sample_fields:
            if f not in a:
                missing_fields.add(f)
        # 3) bucket check
        url = a.get("url") or ""
        pid = a.get("id")
        if pid is not None:
            pid = int(pid)  # observed ids are ints; cast defensively anyway
            expect = str(pid // 1000)
            # url shape .../content/2/{bucket}/post_{pid}.html
            m = re.search(r"/content/\d+/(\d+)/post_(\d+)\.html", url)
            if not m or m.group(1) != expect or m.group(2) != str(pid):
                bucket_ok = False
                rep["notes"].append(f"bucket mismatch: pid={pid} url={url}")
    rep["missing_fields"] = sorted(missing_fields)
    rep["bucket_pattern_ok"] = bucket_ok
    # 'site' field should equal sid (cross-check)
    if arts:
        site_field = arts[0].get("site")
        rep["site_matches_sid"] = site_field == sid

    # 4) detail fetch + selector check
    try:
        r2 = client.get(detail, headers=HDR, timeout=25.0)
    except Exception as e:
        rep["notes"].append(f"detail GET exception: {e}")
        return rep
    rep["detail_status"] = r2.status_code
    if r2.status_code != 200:
        return rep
    html = r2.text
    sel = Selector(text=html)
    # gkmlpt detail DOM (empirically verified 2026-04-23 across 3 sites):
    #   title:   div.content h1.title::text
    #   date:    div.content div.date-row::text  — "发布日期：YYYY-MM-DD  浏览次数：…"
    #   body:    div.article-content
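    # Markup sketch the selectors assume (hand-written illustration, not a
    # captured page):
    #   <div class="content">
    #     <h1 class="title">…标题…</h1>
    #     <div class="date-row">发布日期：2024-01-01  浏览次数：…</div>
    #     <div class="article-content">…正文…</div>
    #   </div>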
    title = sel.css("div.content h1.title::text").get()
    date_row = " ".join(
        t.strip() for t in sel.css("div.content div.date-row::text").getall() if t.strip()
    )
    m = re.search(r"(\d{4}-\d{2}-\d{2})", date_row)
    body = sel.css("div.article-content").get()
    found = {
        "title": bool(title),
        "date": bool(m),
        "article_content": bool(body),
    }
    rep["detail_selectors"] = found
    rep["detail_title_sample"] = title
    rep["detail_date_parsed"] = m.group(1) if m else None
    rep["detail_body_bytes"] = len(body or "")
    rep["detail_ok"] = all(found.values())
    # keep a tiny sample for eyeballing
    rep["sample_article"] = (
        {
            k: arts[0].get(k)
            for k in (
                "id", "title", "url", "date", "first_publish_time",
                "publisher", "document_number", "attachment", "site",
            )
        }
        if arts
        else None
    )
    return rep


def main() -> int:
    reports: list[dict] = []
    # verify=False so a broken cert chain on any portal cannot abort the
    # probe; follow_redirects handles http→https bounces.
    with httpx.Client(verify=False, follow_redirects=True) as client:
        for label, api, sid, det in CASES:
            print(f"\n--- {label} ---")
            rep = verify_one(client, label, api, sid, det)
            reports.append(rep)
            print(json.dumps(rep, ensure_ascii=False, indent=2, default=str))
    OUT.mkdir(parents=True, exist_ok=True)
    (OUT / "verify_httpx.json").write_text(
        json.dumps(reports, ensure_ascii=False, indent=2, default=str), encoding="utf-8"
    )
    ok = all(r.get("api_ok") and r.get("detail_ok") and r.get("bucket_pattern_ok") for r in reports)
    print("\n>>> ALL GREEN" if ok else "\n>>> FAILURES — inspect JSON")
    return 0 if ok else 1


if __name__ == "__main__":
    sys.exit(main())
