"""Probe the gkmlpt (政务公开目录平台) CMS to learn its real API structure.

Launches patchright, opens a known gkmlpt column URL with a fragment anchor,
records every non-asset network response during page load and for a short
settle window afterwards, then saves a summary so we can draft the adapter
YAML from real data.

Output: data/probe/<label>.json (response summary) plus <label>_list.html and
<label>_detail.html (rendered DOM snapshots).

Usage:
    uv run python scripts/probe_gkmlpt.py
"""
from __future__ import annotations

import asyncio
import json
import re
import sys
from pathlib import Path
from urllib.parse import urlparse

from patchright.async_api import async_playwright

OUT = Path(__file__).resolve().parents[1] / "data" / "probe"
OUT.mkdir(parents=True, exist_ok=True)

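# Each target pairs a column listing URL with a sample article URL. The
# trailing #<id> fragment selects the gkmlpt column: the front-end JS reads
# location.hash and loads that column's entries via XHR (hence the waits below).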
TARGETS = [
    # (site label, list URL with fragment, sample detail URL)
    (
        "qingcheng_fgw_2849",
        "http://www.qingcheng.gov.cn/qyqcfgw/gkmlpt/index#2849",
        "http://www.qingcheng.gov.cn/qyqcfgw/gkmlpt/content/2/2116/post_2116964.html#2849",
    ),
    (
        "lianzhou_fgw_5074",
        "http://www.lianzhou.gov.cn/qylzfgw/gkmlpt/index#5074",
        "http://www.lianzhou.gov.cn/qylzfgw/gkmlpt/content/2/2046/post_2046981.html#5076",
    ),
    (
        "qingxin_fgw_3860",
        "https://www.qingxin.gov.cn/qyqxfgw/gkmlpt/index#3860",
        "https://www.qingxin.gov.cn/qyqxfgw/gkmlpt/content/2/2099/post_2099047.html#3861",
    ),
]


async def probe_one(browser_ctx, label: str, list_url: str, detail_url: str) -> dict:
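    """Load one site's list + detail page, capture network traffic, save artifacts."""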
    page = await browser_ctx.new_page()
    reqs: list[dict] = []

    async def on_response(resp):
        try:
            url = resp.url
            # only keep responses from gov.cn hosts (skip CDN, analytics)
            host = urlparse(url).hostname or ""
            if "gov.cn" not in host:
                return
            ct = (resp.headers or {}).get("content-type", "")
            # skip static assets
            if any(x in ct for x in ("image/", "font/", "text/css", "javascript")):
                return
            body = None
            try:
                # cap body to 50KB to keep output readable
                raw = await resp.body()
                body = raw[:50000].decode("utf-8", "replace")
            except Exception as e:
                body = f"<body read failed: {e}>"
            reqs.append(
                {
                    "url": url,
                    "method": resp.request.method,
                    "status": resp.status,
                    "content_type": ct,
                    "body_preview": body,
                }
            )
        except Exception as e:
            print(f"[{label}] on_response err: {e}", file=sys.stderr)

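    # Register before goto() so responses from the initial navigation are captured.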
    page.on("response", on_response)

    print(f"\n=== {label} ===")
    print(f"LIST:   {list_url}")
    try:
        await page.goto(list_url, wait_until="networkidle", timeout=45000)
    except Exception as e:
        print(f"  goto list failed: {e}")
    # give fragment-triggered XHR some time
    await page.wait_for_timeout(2500)

    # snapshot the rendered DOM to see what ends up on the page
    list_html = await page.content()

    # now visit detail page in same context (cookie reuse)
    print(f"DETAIL: {detail_url}")
    try:
        await page.goto(detail_url, wait_until="networkidle", timeout=45000)
    except Exception as e:
        print(f"  goto detail failed: {e}")
    await page.wait_for_timeout(1500)
    detail_html = await page.content()

    await page.close()

    # persist artifacts
    (OUT / f"{label}_list.html").write_text(list_html, encoding="utf-8")
    (OUT / f"{label}_detail.html").write_text(detail_html, encoding="utf-8")

    summary = {
        "label": label,
        "list_url": list_url,
        "detail_url": detail_url,
        "xhr_count": len(reqs),
        "xhrs": reqs,
    }
    (OUT / f"{label}.json").write_text(
        json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # brief console digest: URLs that look like APIs (json or list-like)
    print(f"  captured {len(reqs)} responses:")
    for r in reqs:
        flags = []
        if "json" in r["content_type"]:
            flags.append("JSON")
        bp = r.get("body_preview") or ""
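        # heuristic markers that gkmlpt column/article payloads tend to contain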
        if re.search(r"(columnId|getColumnList|getColumnTree|pubDate|post_\d+)", bp):
            flags.append("LIST-LIKE")
        if flags:
            print(f"    [{'|'.join(flags):>14}] {r['status']} {r['method']} {r['url']}")
    return summary


async def main():
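    """Launch one stealth-ish Chromium context and probe each target in turn."""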
    async with async_playwright() as p:
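        # AutomationControlled is the blink feature that sets navigator.webdriver,
        # so disabling it sidesteps the most trivial bot checks.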
        browser = await p.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
        ctx = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/121.0.0.0 Safari/537.36"
            ),
            locale="zh-CN",
            viewport={"width": 1280, "height": 900},
        )

        for label, lu, du in TARGETS:
            try:
                await probe_one(ctx, label, lu, du)
            except Exception as e:
                print(f"[{label}] probe failed: {e}")
            # politeness: space out sites
            await asyncio.sleep(3)

        await ctx.close()
        await browser.close()
    print(f"\nArtifacts in: {OUT}")


if __name__ == "__main__":
    asyncio.run(main())
