"""Adapter for 中国政府网 (gov.cn) policy libraries.

Three profiles, all shipped under the same adapter because the document
shape is identical (TRS-CMS detail pages on www.gov.cn); they differ
only in the list-API protocol:

  • profile=xxgk   — 政府信息公开 (https://www.gov.cn/zhengce/xxgk/)
        POST sousuoht.www.gov.cn/athena/forward/<list_token>
        with a session "code" obtained from a one-shot warmup GET.
        ~6178 articles, full 国务院 + 国办 catalog, sorted by
        publish_time DESC.

  • profile=zcwjk  — 政策文件库 (https://sousuo.www.gov.cn/zcwjk/)
        GET sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary&p=N&n=20
        Direct query, no session.
        Can be scoped per target with zcwjk_t + zcwjk_cat_key:
          - zhengcelibrary_gw / gongwen     => 国务院文件
          - zhengcelibrary_bm / bumenfile   => 国务院部门文件
        Carries puborg + childtype facets unavailable on the xxgk endpoint.

  • profile=rules  — 国家规章库 / 部门规章
        POST sousuoht.www.gov.cn/athena/forward/<rules_list_token>
        with fixed 规章库 Athena caller headers.
        ~2654 department regulations, sorted by publish date DESC.

Selecting profile: crawl_site.adapter_params_json.policy_profile = "xxgk" | "zcwjk" | "rules".
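
An illustrative adapter_params_json scoping zcwjk to 部门文件 (values taken
from the zcwjk mapping above; adjust per target):

    {"policy_profile": "zcwjk",
     "zcwjk_t": "zhengcelibrary_bm",
     "zcwjk_cat_key": "bumenfile",
     "zcwjk_library_label": "国务院部门文件",
     "page_size": 20}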

The pipeline picks up `fetch_list_page` as a custom fetcher (vs the
default build_list_url / GET / json.loads path) so we can do POSTs,
warmups, and non-trivial body shapes.
"""
from __future__ import annotations

import json
import logging
import re
import time
from datetime import date, datetime, timezone
from html import unescape
from typing import Any

import httpx

from govcrawler.adapters.contract import CrawlItem, FetchStrategy, Status
from govcrawler.fetcher.browser import FetchResult
from govcrawler.utils.url_norm import url_hash as compute_url_hash

log = logging.getLogger(__name__)

ADAPTER_ID = "gov_cn_policy"
DEFAULT_INTERVAL_SEC: float = 10.0

# TRS-CMS detail page selectors (verified on multiple gov.cn /zhengce/content/*
# articles). title is in <title> + <h1>; publish_time is in a <meta> tag;
# content body is `div.pages_content`.
DEFAULT_DETAIL_SELECTORS: dict[str, Any] = {
    "title": (
        "h1.article_title::text, h1::text, "
        "div.article h1::text, "
        "meta[name='ArticleTitle']::attr(content)"
    ),
    "publish_time": (
        "meta[name='others']::attr(content), "
        "meta[name='PubDate']::attr(content), "
        "div.pages-date::text, "
        "div.article-info span::text"
    ),
    "source": (
        "meta[name='source']::attr(content), "
        "meta[name='ContentSource']::attr(content), "
        "span.source::text"
    ),
    "content": (
        "div.pages_content, div.TRS_Editor, div.article-content, div.article"
    ),
    "attachment_css": (
        "div.pages_content a[href$='.pdf'], div.pages_content a[href$='.doc'], "
        "div.pages_content a[href$='.docx'], div.pages_content a[href$='.xls'], "
        "div.pages_content a[href$='.xlsx'], div.pages_content a[href$='.zip'], "
        "div.pages_content a[href$='.wps']"
    ),
}


# ---------------------------------------------------------------------------
# Profile registry
# ---------------------------------------------------------------------------
# `xxgk` Athena POST. The hex route tokens are baked into the gov.cn frontend
# bundle and have been stable across multiple probes; if gov.cn republishes
# the SPA (rare on this site) the tokens may change and this adapter needs an
# update.
ATHENA_BASE = "https://sousuoht.www.gov.cn"
ATHENA_WARMUP_ROUTE = "athena/forward/DA2FE8C6CAD0EEBC5F97F7E3F3633A7188DAA40373EEA0E4024A081201F4D546"
ATHENA_LIST_ROUTE = "athena/forward/486B5ABFBAD0FF5743F5E82E007EF04DDD6388E7989E9EC9CC7B84917AC81A5F"
ATHENA_THIRD_PARTY = "thirdparty_code_107"
ATHENA_TABLE_ID = 30
# These constants live inside the gov.cn xxgk page HTML and rarely change.
# `athenaAppKey` is the static caller id; `publicKey` is the RSA pubkey
# used to encrypt the appkey before sending it as the `athenaappkey`
# header. The frontend's JSEncrypt library uses PKCS1 v1.5 padding; we
# mirror that here.
ATHENA_APP_NAME_RAW = "国网搜索"
ATHENA_APP_KEY_RAW = "a46884b2013e4d189f2a8e2d49a23525"
ATHENA_PUBLIC_KEY_B64 = (
    "MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCSMhMJQ+XLI7oW0k9Bwufur4Ag40tc"
    "srzT7WZf6Ao0O/hyY1gZtCSYFxkxIZUXjW46j27XSW8IDX1rTJoHaMxHCWsOpTi2W5st"
    "ybGYZytsY5on8gd8AIaS1d52h9eaS2TFydtJJtE50xHmT0WmoyoinWCuVCOkdCLhh9b9"
    "jSdeSQIDAQAB"
)

# 国家规章库 (https://www.gov.cn/zhengce/xxgk/gjgzk/) uses a separate Athena
# app from 国网搜索. Constants are embedded in the page bundle.
RULES_LIST_ROUTE = "athena/forward/BD8730CDDA12515E2D9E1B21AA11C0D6"
RULES_APP_NAME_RAW = "规章库"
RULES_APP_KEY_RAW = "f8f49ea85885466598c5261f7f8607fb"
RULES_PUBLIC_KEY_B64 = (
    "MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCWGTHvPbNkzQNxTJwSZbsgHKyLl/OK11"
    "kCZNmVVSFK3lUbmHgh7Ain1gdaf7G/ETh/wQm/9BAO/U36yWPizzlwHCUcWJXBRsY10Psn"
    "YIlBXH/cjqQaEbmEghxcjdYtLtkudoMfoMDiJk+tPC7UEZd8TI2u26vttNF++6tHi1HdeQ"
    "IDAQAB"
)
RULES_CODE = "18258ab0ac9"
RULES_TABLE_NAME = "t_1860c735d31"
RULES_DEPARTMENT_CLASS = "部门规章"


def _rsa_encrypt_b64(plaintext: str, public_key_b64: str = ATHENA_PUBLIC_KEY_B64) -> str:
    """RSA-PKCS1v15 encrypt + base64 encode, matching JSEncrypt.encrypt()."""
    from base64 import b64encode
    from textwrap import wrap

    from cryptography.hazmat.primitives.asymmetric import padding
    from cryptography.hazmat.primitives.serialization import load_pem_public_key

    # Re-wrap the base64 body at 64 columns so the PEM stays well-formed even
    # for strict line-length parsers.
    body = "\n".join(wrap(public_key_b64, 64))
    pem = f"-----BEGIN PUBLIC KEY-----\n{body}\n-----END PUBLIC KEY-----".encode()
    pub = load_pem_public_key(pem)
    cipher = pub.encrypt(plaintext.encode(), padding.PKCS1v15())
    return b64encode(cipher).decode()
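

# Sanity sketch (illustrative, run by hand): PKCS1 v1.5 padding is randomized,
# so re-encrypting the same appkey yields a different ciphertext each time;
# both decrypt to the same value server-side.
#
#     c1 = _rsa_encrypt_b64(ATHENA_APP_KEY_RAW)
#     c2 = _rsa_encrypt_b64(ATHENA_APP_KEY_RAW)
#     assert c1 != c2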


def _athena_headers(
    *,
    app_name: str = ATHENA_APP_NAME_RAW,
    app_key: str = ATHENA_APP_KEY_RAW,
    public_key_b64: str = ATHENA_PUBLIC_KEY_B64,
) -> dict[str, str]:
    """Build the request headers including a fresh RSA-encrypted appkey.
    Each call re-encrypts (PKCS1v15 padding produces non-deterministic
    ciphertexts), matching what the gov.cn JS does on every request."""
    from urllib.parse import quote
    return {
        "User-Agent": "Mozilla/5.0 GovCrawler",
        "Accept": "*/*",
        "Origin": "https://www.gov.cn",
        "athenaappname": quote(app_name, safe=""),
        "athenaappkey": quote(_rsa_encrypt_b64(app_key, public_key_b64), safe=""),
    }


# Class IDs cover the 国务院/国办 fwzh families: 国发(1108) / 国办发(1107) /
# 国办函(1106) / 国办发明电(1105) / 国发明电(1104) / 国函(1103) / 国令(1102) /
# 7547/7548/7549 (newer subclasses) / 1101 / 1100. Sourced from the
# request body fired by the production xxgk page on 2026-04-28.
ATHENA_CHILDREN_INFO_IDS = [
    [1108, 1107, 1106, 1105, 1104, 1103, 1102, 7547, 7548, 7549, 1101, "1100"]
]

# `zcwjk` direct GET. No session needed.
ZCWJK_LIST_URL = "https://sousuo.www.gov.cn/search-gov/data"


# ---------------------------------------------------------------------------
# Profile dispatch
# ---------------------------------------------------------------------------
def _normalize_publish_time(s: str | int | float | None) -> datetime | None:
    """Parse '2026-04-21 17:00:00' / '2026.04.21' / unix-ms into a UTC datetime."""
    if not s:
        return None
    if isinstance(s, (int, float)):
        ts = float(s)
        # Gov.cn emits millisecond timestamps even for older policy dates.
        # Millisecond values from the 1990s onward are already ~6.3e11+, while
        # any plausible unix-seconds value stays far below 1e11, so 1e11 is a
        # safe boundary between the two scales.
        if ts > 1e11:  # unix ms
            ts /= 1000
        return datetime.fromtimestamp(ts, tz=timezone.utc)
    s = s.strip()
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y-%m-%d", "%Y.%m.%d"):
        try:
            dt = datetime.strptime(s, fmt)
            return dt.replace(tzinfo=timezone.utc)
        except ValueError:
            continue
    return None
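

# Accepted input shapes (illustrative values):
#
#     _normalize_publish_time("2026-04-21 17:00:00")  # datetime string
#     _normalize_publish_time("2026.04.21")           # dotted date string
#     _normalize_publish_time(1714291200000)          # unix ms -> 2024-04-28 08:00 UTC
#     _normalize_publish_time(1714291200)             # unix s, same instant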


def _split_policy_category(raw: str | None) -> tuple[str | None, str | None, str | None]:
    """Return (top_category, sub_category, original_category) for gov.cn facets."""
    if not raw:
        return None, None, None
    cleaned = str(raw).strip().replace("/", "\\")
    if not cleaned:
        return None, None, None
    parts = [p.strip() for p in cleaned.split("\\") if p.strip()]
    top = parts[0] if parts else None
    sub = parts[1] if len(parts) > 1 else None
    return top, sub, cleaned
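

# Shape sketch (synthetic facet value; both separator styles normalize to "\\"):
#
#     _split_policy_category("A/B")   # -> ("A", "B", "A\\B")
#     _split_policy_category("A\\B")  # -> ("A", "B", "A\\B")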


def _first_str(value: Any) -> str | None:
    if isinstance(value, list):
        value = next((v for v in value if v), None)
    if value is None:
        return None
    s = str(value).strip()
    return s or None


def _clean_text(value: Any) -> str | None:
    s = _first_str(value)
    if not s:
        return None
    s = re.sub(r"<[^>]+>", "", s)
    s = unescape(s)
    s = re.sub(r"\s+", " ", s).strip()
    return s or None


def _parse_cn_date(raw: str | None) -> datetime | None:
    if not raw:
        return None
    m = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日", raw)
    if not m:
        return None
    try:
        return datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), tzinfo=timezone.utc)
    except ValueError:
        return None


def _extract_effective_date(statement: str | None) -> date | None:
    if not statement:
        return None
    m = re.search(r"自\s*(\d{4}年\d{1,2}月\d{1,2}日)\s*起施行", statement)
    dt = _parse_cn_date(m.group(1)) if m else None
    return dt.date() if dt else None


def _extract_rules_doc_no(statement: str | None) -> str | None:
    if not statement:
        return None
    cleaned = statement.strip("（）() ")
    marker = re.search(r"(?:公布|发布)", cleaned)
    if not marker:
        return None
    dates = list(re.finditer(r"\d{4}年\d{1,2}月\d{1,2}日", cleaned[:marker.start()]))
    start = dates[-1].end() if dates else 0
    doc_no = re.sub(r"\s+", "", cleaned[start:marker.start()])
    return doc_no or None
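

# Worked example on a hypothetical 施行 statement (shape mirrors real 规章库
# records; the ministry and numbers are made up):
#
#     stmt = "2023年12月1日交通运输部令2023年第19号公布 自2024年1月1日起施行"
#     _extract_rules_doc_no(stmt)    # -> "交通运输部令2023年第19号"
#     _extract_effective_date(stmt)  # -> datetime.date(2024, 1, 1)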


def _make_fetch_result(url: str, status: int, body: str, *, t0: float) -> FetchResult:
    return FetchResult(
        url=url, final_url=url, status=status, html=body or "",
        fetched_at=time.time(),
        duration_ms=int((time.time() - t0) * 1000),
        is_challenge=False,
        strategy="httpx",
    )


def _athena_warmup() -> str:
    """One-shot GET to obtain a session `code` string used by every
    subsequent list POST. Returns the code or raises."""
    url = f"{ATHENA_BASE}/{ATHENA_WARMUP_ROUTE}?thirdPartyName=hycloud&thirdPartyTenantId=8"
    r = httpx.get(url, timeout=20, headers=_athena_headers())
    r.raise_for_status()
    j = r.json()
    code = j.get("result", {}).get("data")
    if not code:
        raise RuntimeError(f"athena warmup returned no code: {j!r}")
    return code
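

# Response shape consumed above (only result.data is used; the session code
# is then echoed back as `code` in every subsequent list POST body):
#
#     {"result": {"data": "<session code>", ...}, ...}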


def _fetch_list_page_athena(
    *, page_num: int, page_size: int = 20
) -> tuple[str, list[CrawlItem], FetchResult]:
    """xxgk profile: warmup + POST list."""
    t0 = time.time()
    code = _athena_warmup()
    list_url = f"{ATHENA_BASE}/{ATHENA_LIST_ROUTE}"
    body = {
        "code": code,
        "thirdPartyCode": ATHENA_THIRD_PARTY,
        "thirdPartyTableId": ATHENA_TABLE_ID,
        "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
        "trackTotalHits": "true",
        "searchFields": [{"fieldName": "maintitle", "searchWord": ""}],
        "isPreciseSearch": 0,
        "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}],
        "childrenInfoIds": ATHENA_CHILDREN_INFO_IDS,
        "pageSize": page_size,
        "pageNo": page_num,
    }
    r = httpx.post(
        list_url, json=body, timeout=30,
        headers={**_athena_headers(), "Content-Type": "application/json"},
    )
    r.raise_for_status()
    payload = r.json()
    raw_items = (
        payload.get("result", {}).get("data", {}).get("list") or []
    )
    items = []
    for it in raw_items:
        url = it.get("pub_url") or ""
        if not url:
            continue
        title = (it.get("maintitle") or "").strip() or "(无标题)"
        pub_dt = _normalize_publish_time(it.get("publish_time"))
        cwrq = _normalize_publish_time(it.get("cwrq"))
        items.append(CrawlItem(
            site_id="gov_cn_policy",  # placeholder; fetch_list_page rewrites with site_code
            url=url,
            url_hash=compute_url_hash(url),
            title=title,
            publish_time=pub_dt,
            publish_date=cwrq.date() if cwrq else (pub_dt.date() if pub_dt else None),
            doc_no=it.get("fwzh") or None,
            publisher="国务院" if (it.get("fwzh") or "").startswith("国发") or
                                  (it.get("fwzh") or "").startswith("国令") else
                      ("国务院办公厅" if (it.get("fwzh") or "").startswith("国办") else None),
            metadata_json={"raw": it},
        ))
    return list_url, items, _make_fetch_result(list_url, r.status_code, r.text, t0=t0)


def _fetch_list_page_zcwjk(
    *,
    page_num: int,
    page_size: int = 20,
    search_t: str = "zhengcelibrary_gw",
    cat_key: str = "gongwen",
    library_label: str = "国务院文件",
) -> tuple[str, list[CrawlItem], FetchResult]:
    """zcwjk profile: direct GET search-gov/data."""
    t0 = time.time()
    params = {
        "t": search_t,
        "q": "",
        "sort": "pubtime",
        "sortType": "1",
        "searchfield": "title",
        "p": str(page_num),
        "n": str(page_size),
        "type": "gwyzcwjk",
    }
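    # With the defaults above, page 1 expands to roughly (illustrative):
    #   https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_gw&q=
    #   &sort=pubtime&sortType=1&searchfield=title&p=1&n=20&type=gwyzcwjk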
    r = httpx.get(
        ZCWJK_LIST_URL, params=params, timeout=20,
        headers={"User-Agent": "Mozilla/5.0 GovCrawler",
                 "Referer": "https://sousuo.www.gov.cn/zcwjk/policyDocumentLibrary"},
    )
    r.raise_for_status()
    payload = r.json()
    raw_items = (
        payload.get("searchVO", {}).get("catMap", {}).get(cat_key, {}).get("listVO") or []
    )
    items = []
    for it in raw_items:
        url = it.get("url") or ""
        if not url:
            continue
        title = (it.get("title") or "").strip() or "(无标题)"
        # gov.cn zcwjk carries both dates:
        #   pubtime/pubtimeStr => 发布日期
        #   ptime              => 成文日期
        pub_dt = _normalize_publish_time(it.get("pubtime") or it.get("pubtimeStr"))
        if pub_dt is None:
            pub_dt = _normalize_publish_time(it.get("ptime"))
        doc_dt = _normalize_publish_time(it.get("ptime"))
        top_category, sub_category, open_category = _split_policy_category(it.get("childtype"))
        publisher = it.get("puborg") or None
        doc_no = it.get("pcode") or None
        items.append(CrawlItem(
            site_id="gov_cn_policy",  # placeholder; fetch_list_page rewrites with site_code
            native_post_id=str(it.get("id")) if it.get("id") else None,
            url=url,
            url_hash=compute_url_hash(url),
            title=title,
            publish_time=pub_dt,
            source_raw=publisher,
            publisher=publisher,
            doc_no=doc_no,
            index_no=it.get("index") or None,
            # `publish_date` is the schema's date-only public attribute.
            # For gov.cn policy pages it corresponds to 成文日期; the precise
            # 发布日期 remains in publish_time.
            publish_date=doc_dt.date() if doc_dt else (pub_dt.date() if pub_dt else None),
            topic_words=it.get("subjectword") or None,
            open_category=open_category,
            content_category=top_category,
            content_subcategory=sub_category,
            metadata_json={
                "raw": it,
                "zcwjk_t": search_t,
                "zcwjk_cat_key": cat_key,
                "zcwjk_library_label": library_label,
            },
        ))
    # Build display URL for logs
    log_url = f"{ZCWJK_LIST_URL}?{httpx.QueryParams(params)}&cat_key={cat_key}"
    return log_url, items, _make_fetch_result(log_url, r.status_code, r.text, t0=t0)


def _fetch_list_page_rules(
    *, page_num: int, page_size: int = 50
) -> tuple[str, list[CrawlItem], FetchResult]:
    """国家规章库 profile: POST list API filtered to 部门规章."""
    t0 = time.time()
    list_url = f"{ATHENA_BASE}/{RULES_LIST_ROUTE}"
    body = {
        "code": RULES_CODE,
        "preference": None,
        "searchFields": [
            {
                "fieldName": "f_202321807875",
                "searchWord": RULES_DEPARTMENT_CLASS,
                "searchType": "TERM",
                "withHighLight": True,
            },
            {"fieldName": "f_202321360426", "searchWord": "", "withHighLight": True},
            {"fieldName": "f_202321758948", "searchWord": "", "withHighLight": True},
            {"fieldName": "f_202321423473", "searchType": "TERM", "withHighLight": True},
            {"fieldName": "f_202321159816", "searchWord": "", "searchType": "TERM"},
            {"fieldName": "f_20232380533", "searchType": "TERM", "withHighLight": True},
            {"fieldName": "f_202328191239", "withHighLight": True, "searchType": "TERM"},
            {"fieldName": "f_20221110222856", "withHighLight": True, "searchType": "TERM"},
        ],
        "sorts": [{}, {"sortField": "f_202321915922", "sortOrder": "DESC"}],
        "resultFields": [
            "f_202355832506",
            "f_20232124962",
            "f_202321124775",
            "f_202321159816",
            "f_202321360426",
            "f_202321423473",
            "f_202321758948",
            "f_202321807875",
            "f_202321864401",
            "f_202321915922",
            "f_202323394765",
            "f_202328191239",
            "f_202344311304",
            "f_2023425676953",
            "f_2023425808265",
            "f_202321136868",
            "f_20232380533",
            "f_20232151076",
            "doc_pub_url",
        ],
        "trackTotalHits": "true",
        "tableName": RULES_TABLE_NAME,
        "pageSize": page_size,
        "pageNo": page_num,
        "granularity": "ALL",
    }
    r = httpx.post(
        list_url,
        json=body,
        timeout=30,
        headers={
            **_athena_headers(
                app_name=RULES_APP_NAME_RAW,
                app_key=RULES_APP_KEY_RAW,
                public_key_b64=RULES_PUBLIC_KEY_B64,
            ),
            "Content-Type": "application/json;charset=UTF-8",
            "Referer": "https://www.gov.cn/zhengce/xxgk/gjgzk/index.htm",
        },
    )
    r.raise_for_status()
    payload = r.json()
    raw_items = payload.get("result", {}).get("data", {}).get("list") or []
    items = []
    for it in raw_items:
        url = _first_str(it.get("doc_pub_url")) or _first_str(it.get("f_20232124962"))
        if not url:
            continue
        title = _clean_text(it.get("f_202321360426")) or "(无标题)"
        pub_dt = _normalize_publish_time(it.get("f_202321915922"))
        statement = (
            _clean_text(it.get("f_202344311304"))
            or _clean_text(it.get("f_202321136868"))
        )
        publisher = (
            _clean_text(it.get("f_202355832506"))
            or _clean_text(it.get("f_202323394765"))
            or _clean_text(it.get("f_20232151076"))
        )
        source = _clean_text(it.get("f_202323394765")) or publisher
        native_id = _first_str(it.get("f_202321124775"))
        items.append(CrawlItem(
            site_id="gov_cn_policy",
            native_post_id=native_id,
            url=url,
            url_hash=compute_url_hash(url),
            title=title,
            publish_time=pub_dt,
            source_raw=source,
            publisher=publisher,
            doc_no=_extract_rules_doc_no(statement),
            publish_date=pub_dt.date() if pub_dt else None,
            effective_date=_extract_effective_date(statement),
            open_category=RULES_DEPARTMENT_CLASS,
            content_category="法规文件",
            content_subcategory=RULES_DEPARTMENT_CLASS,
            metadata_json={
                "raw": it,
                "rules_statement": statement,
                "rules_profile": "department_rules",
            },
        ))
    log_url = f"{list_url}?pageNo={page_num}&pageSize={page_size}&rule_class={RULES_DEPARTMENT_CLASS}"
    return log_url, items, _make_fetch_result(log_url, r.status_code, r.text, t0=t0)


# ---------------------------------------------------------------------------
# Adapter contract
# ---------------------------------------------------------------------------
def fetch_list_page(
    rt, *, page_num: int, params: dict[str, Any] | None = None,
    interval_sec: float | None = None,
):
    """Pipeline entry — dispatches to the right profile based on
    `crawl_site.adapter_params_json.policy_profile` ('xxgk', 'zcwjk', or 'rules').
    """
    params = params or {}
    profile = (params.get("policy_profile") or "xxgk").lower()
    page_size = int(params.get("page_size") or 20)

    if profile == "xxgk":
        list_url, items, fr = _fetch_list_page_athena(page_num=page_num, page_size=page_size)
    elif profile == "zcwjk":
        list_url, items, fr = _fetch_list_page_zcwjk(
            page_num=page_num,
            page_size=page_size,
            search_t=str(params.get("zcwjk_t") or "zhengcelibrary_gw"),
            cat_key=str(params.get("zcwjk_cat_key") or "gongwen"),
            library_label=str(params.get("zcwjk_library_label") or "国务院文件"),
        )
    elif profile in {"rules", "department_rules", "gjgzk_rules"}:
        list_url, items, fr = _fetch_list_page_rules(page_num=page_num, page_size=page_size)
    else:
        raise ValueError(
            f"unknown gov_cn_policy profile {profile!r} — set "
            "crawl_site.adapter_params_json.policy_profile to 'xxgk', 'zcwjk', or 'rules'"
        )

    # Stamp site/target/dept references on every item now that we have rt.
    # CrawlItem is a Pydantic model, so copy via model_copy with overrides.
    # site_id carries the site_code (string), not the int FK; the pipeline
    # later resolves it via insert_article_from_contract.
    new_items = []
    for it in items:
        new_items.append(it.model_copy(update={
            "site_id": rt.site.site_code,
            "target_id": rt.target.id,
            "dept_id": rt.target.dept_id,
            "channel_name": rt.target.channel_name,
            "channel_path": rt.target.channel_path,
            "content_category": it.content_category or rt.target.content_category,
            "content_subcategory": it.content_subcategory or rt.target.content_subcategory,
            "fetch_strategy": FetchStrategy.HTTPX,
            "status": Status.RAW,
        }))
    return list_url, new_items, fr
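

# Pipeline-side usage sketch (hypothetical `rt` runtime object; the real
# pipeline supplies it via the adapter contract):
#
#     list_url, items, fr = fetch_list_page(
#         rt,
#         page_num=1,
#         params={"policy_profile": "zcwjk",
#                 "zcwjk_t": "zhengcelibrary_bm",
#                 "zcwjk_cat_key": "bumenfile"},
#     )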


def parse_list_response(*args, **kwargs):
    """Not used — fetch_list_page returns CrawlItems directly. Kept for
    contract symmetry with adapters that follow the GET-and-parse model."""
    raise NotImplementedError(
        "gov_cn_policy uses fetch_list_page directly; parse_list_response is unused"
    )


def build_list_url(*args, **kwargs):
    """Not used — see fetch_list_page docstring."""
    raise NotImplementedError(
        "gov_cn_policy uses POST + session warmup, no static list URL"
    )
