"""gkmlpt 适配器 — 清远政务云 CMS 族 (§7.5.3 配套).

工作模型：
  * 列表 URL 形如 `{base_url}/{dept_path}/gkmlpt/api/all/{column_id}?page=N&sid={sid}`
  * 响应是 JSON，结构为 `{articles: [...], total: N, ...}`
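  * Each article carries a CMS-level stable ID (`articles[*].id`); its URL looks like
    `{base_url}/{dept_path}/gkmlpt/content/{bucket}/{id}/post_{id}.html`
    where `bucket = id // 1000`.
    NOTE(review): `verify_bucket_invariant` compares the *second* numeric path
    segment after `/content/` with `id // 1000` and leaves the first segment
    unchecked, which does not match the shape written above. Confirm against a
    live URL which of the two is correct.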

本模块的责任：**把 list-API JSON → `CrawlItem[]`**。正文抓取（detail stage）
由下游另一个函数跑（当前 Phase-A 仅做 list projection；正文阶段预留接口）。

项目分成两层：

  1. `build_list_url(...)`：纯函数，根据 `adapter_params` 拼出 list URL。
     由 `crawl_site.adapter_params_json.list_api_path_tpl` 驱动，不在代码里
     硬编码 URL 模板。

  2. `parse_list_response(...)`：JSON → `list[CrawlItem]`，status=raw。
     这是 §7.5.7 契约测试的主要对象。

没有独立的 CMS client — HTTP 抓取走现成的 `govcrawler.fetcher.*` 层。
"""
from __future__ import annotations

import logging
import re
from datetime import datetime, timezone
from typing import Any, Iterable
from urllib.parse import quote

from govcrawler.adapters.contract import ContractViolation, CrawlItem, Status
from govcrawler.utils.url_norm import url_hash as compute_url_hash


# Identifier of this adapter.
ADAPTER_ID = "gkmlpt"

# Sanity-check regex for the article URL shape. It matches
# `/content/<digits>/<digits>/post_<digits>.html`. Only the SECOND numeric path
# segment (group 1) and the number after `post_` (group 2) are captured; the
# first numeric segment is matched but not captured.
# NOTE(review): the module docstring describes the path as
# `{bucket}/{id}/post_{id}.html`, which would put the bucket in the first
# (uncaptured) segment — confirm against live URLs which description is right.
_URL_RE = re.compile(r"/content/\d+/(\d+)/post_(\d+)\.html")


# Empirically-tuned fallback when `crawl_target.interval_sec` is NULL. The
# gkmlpt fleet (Qingyuan government cloud) visibly rate-limits
# ~1 req/host/sec; we saw every crawl after the first URL fail with transient
# timeouts when this was 0.
# Operators can still override per-target in the admin UI.
DEFAULT_INTERVAL_SEC: float = 3.0


# Default detail-page CSS/xpath selectors for Qingyuan government cloud gkmlpt
# content pages. These are the baseline; per-target overrides live in
# `crawl_target.parser_override_json.detail` and take precedence.
# Each value is a comma-separated list of alternative selectors.
DEFAULT_DETAIL_SELECTORS: dict[str, Any] = {
    "title": "h1.article-title::text, div.article-title::text, h1::text",
    "publish_time": "div.article-info span::text, div.info span::text",
    "source": "div.article-info span.source::text, div.info span.source::text",
    "content": "div.article-content, div.content, div.TRS_Editor",
    "attachment_css": "div.article-content a[href], div.content a[href]",
}


# ---------------------------------------------------------------------------
# list URL building
# ---------------------------------------------------------------------------
def build_list_url(
    *,
    base_url: str,
    dept_path: str,
    column_id: str,
    page: int = 1,
    sid: str | None = None,
    path_tpl: str | None = None,
) -> str:
    """Assemble the list-API URL for one page of one column.

    `path_tpl` comes from `crawl_site.adapter_params_json.list_api_path_tpl`.
    Defaults match the gkmlpt live probe; override per site when a CMS variant
    drifts (e.g. some depts expose `/api/v2/all/`).

    Args:
        base_url: Site origin; a trailing ``/`` is stripped.
        dept_path: Department path segment; leading/trailing ``/`` are
            stripped. When it is empty, a ``/{dept_path}/`` segment in the
            template is collapsed to ``/`` so the URL contains no ``//``.
        column_id: Column identifier substituted into the template.
        page: Page number, always appended as the ``page`` query parameter.
        sid: Optional value for the ``sid`` query parameter. It is
            percent-encoded; falsy values omit the parameter.
        path_tpl: Optional ``str.format`` template using the placeholders
            ``{base_url}``, ``{dept_path}`` and ``{column_id}``. It may already
            contain a query string, in which case ``page`` is appended with
            ``&`` instead of ``?``.

    Returns:
        The fully assembled URL.

    Raises:
        KeyError: If `path_tpl` references a placeholder other than the three
            listed above.
    """
    tpl = path_tpl or "{base_url}/{dept_path}/gkmlpt/api/all/{column_id}"
    dept = dept_path.strip("/")
    if not dept:
        # No department segment: drop the placeholder together with one of its
        # slashes, otherwise the result would contain `//` in the path.
        tpl = tpl.replace("/{dept_path}/", "/")
    url = tpl.format(
        base_url=base_url.rstrip("/"),
        dept_path=dept,
        column_id=column_id,
    )
    sep = "&" if "?" in url else "?"
    url = f"{url}{sep}page={page}"
    if sid:
        # Percent-encode so characters such as `&`, `=` or spaces cannot break
        # or inject query parameters. `str()` tolerates numeric sids.
        encoded_sid = quote(str(sid), safe="")
        url = f"{url}&sid={encoded_sid}"
    return url


# ---------------------------------------------------------------------------
# article projection
# ---------------------------------------------------------------------------
def _coerce_publish_time(a: dict[str, Any]) -> datetime | None:
    """Extract an article's publish time as a timezone-aware UTC datetime.

    gkmlpt exposes unix seconds in `first_publish_time` (preferred, UTC) and
    sometimes only `date` (can be unix seconds OR an ISO string). `date` is
    consulted only when `first_publish_time` is missing or ``None``.

    Args:
        a: One article dict from the list-API response.

    Returns:
        A UTC-aware ``datetime``, or ``None`` when no usable value is present:
        both fields missing, an unsupported type, an unparseable string, or a
        number that cannot be represented as a timestamp (NaN, infinity, out
        of the platform's range). This function never raises for bad data, so
        a garbage timestamp costs the row its publish time rather than
        aborting the caller.
    """
    ts = a.get("first_publish_time")
    if ts is None:
        ts = a.get("date")
    # `bool` is a subclass of `int`; a JSON true/false is not a timestamp.
    if ts is None or isinstance(ts, bool):
        return None
    try:
        if isinstance(ts, (int, float)):
            return datetime.fromtimestamp(ts, tz=timezone.utc)
        if isinstance(ts, str):
            # Rarely a CMS variant sends "YYYY-MM-DD"; be forgiving.
            # NOTE(review): a string without a UTC offset parses to a naive
            # datetime, which `astimezone` interprets in the host's local
            # timezone — confirm that is the intended reading.
            parsed = datetime.fromisoformat(ts.replace("Z", "+00:00"))
            return parsed.astimezone(timezone.utc)
    except (ValueError, OverflowError, OSError):
        # ValueError: unparseable string, NaN, or year out of range.
        # OverflowError / OSError: value outside what the platform's time
        # functions can represent.
        return None
    return None


def _project_one(
    a: dict[str, Any],
    *,
    site_id: str,
    target_id: int | None,
    dept_id: int | None,
) -> CrawlItem:
    """Project a single list-API article dict onto a `CrawlItem`.

    Field mapping, as performed below:
      * `url` is mandatory; `url_hash` is derived from it.
      * `native_post_id` is `str(id)`, or ``None`` when `id` is absent/None.
      * `title` falls back to ``"(untitled)"`` when missing or falsy.
      * `source_raw` and `publisher` both carry the `publisher` field.
      * `doc_no` / `index_no` come from `document_number` / `identifier`,
        with falsy values normalised to ``None``.
      * `status` is always `Status.RAW`.

    Args:
        a: One article dict from the list-API response.
        site_id: Passed through to the item unchanged.
        target_id: Passed through to the item unchanged.
        dept_id: Passed through to the item unchanged.

    Raises:
        ContractViolation: If the article has no (or a falsy) `url`.
    """
    article_url = a.get("url")
    if not article_url:
        raise ContractViolation(f"gkmlpt article missing 'url': id={a.get('id')}")

    raw_id = a.get("id")
    post_id = None if raw_id is None else str(raw_id)
    digest = compute_url_hash(article_url)
    heading = a.get("title") or "(untitled)"
    published_at = _coerce_publish_time(a)
    publisher_name = a.get("publisher")
    document_number = a.get("document_number") or None
    index_number = a.get("identifier") or None

    return CrawlItem(
        site_id=site_id,
        target_id=target_id,
        dept_id=dept_id,
        native_post_id=post_id,
        url=article_url,
        url_hash=digest,
        title=heading,
        publish_time=published_at,
        source_raw=publisher_name,
        publisher=publisher_name,
        doc_no=document_number,
        index_no=index_number,
        status=Status.RAW,
    )


def parse_list_response(
    payload: dict[str, Any] | list[dict[str, Any]],
    *,
    site_id: str,
    target_id: int | None = None,
    dept_id: int | None = None,
) -> list[CrawlItem]:
    """Project gkmlpt list-API JSON → `CrawlItem[]`.

    Accepts either:
      * a dict with `{articles: [...]}` (the common shape), or
      * a bare list of article dicts (when caller pre-extracted).

    All items come back with `status=raw`: they have no body yet, that's the
    detail stage's job. Caller is responsible for de-dup against `article`
    table (by `native_post_id` or `url_hash`).

    On per-row failure the row is skipped and a warning is logged — one broken
    article shouldn't kill a whole page. There is no strict mode in this
    function; rows that fail are always skipped.

    Args:
        payload: Decoded list-API response, or a pre-extracted list of
            article dicts. A dict whose `articles` key is missing or falsy
            yields an empty result.
        site_id: Passed through to every produced item.
        target_id: Passed through to every produced item.
        dept_id: Passed through to every produced item.

    Returns:
        One `CrawlItem` per well-formed article, in input order.

    Raises:
        ContractViolation: If `payload` is neither a dict nor a list.
    """
    if isinstance(payload, list):
        articles: Iterable[dict[str, Any]] = payload
    elif isinstance(payload, dict):
        articles = payload.get("articles") or []
    else:
        raise ContractViolation(
            f"gkmlpt list response must be dict or list, got {type(payload).__name__}"
        )

    log = logging.getLogger(__name__)
    out: list[CrawlItem] = []
    for a in articles:
        if not isinstance(a, dict):
            log.warning(
                "gkmlpt: skipping non-dict list entry of type %s",
                type(a).__name__,
            )
            continue
        try:
            item = _project_one(
                a, site_id=site_id, target_id=target_id, dept_id=dept_id
            )
        except (ContractViolation, ValueError, OverflowError, OSError) as exc:
            # Malformed row — skip, don't crash the whole page. OverflowError
            # and OSError cover timestamps the platform cannot represent; they
            # are data errors just like the ValueError cases.
            log.warning(
                "gkmlpt: skipping malformed article id=%r: %s", a.get("id"), exc
            )
            continue
        out.append(item)
    return out


# ---------------------------------------------------------------------------
# bucket-invariant probe (§5.5 sanity check used during adapter health-checks)
# ---------------------------------------------------------------------------
def verify_bucket_invariant(article: dict[str, Any]) -> bool:
    """Return True iff the article's `url` is consistent with its `id`.

    Two conditions must both hold for the `_URL_RE` match in `url`:
      * the second numeric path segment after `/content/` equals
        `int(id) // 1000`, and
      * the number after `post_` equals `str(id)`.

    The first numeric segment after `/content/` is not checked.

    Used by adapter smoke tests and dept probe to catch CMS format drift early.

    Args:
        article: One article dict; `id` and `url` are read.

    Returns:
        ``True`` when both conditions hold. ``False`` otherwise, including
        when `id` is missing or not convertible to an integer, or when `url`
        is missing, not a string, or does not match the expected shape. This
        probe never raises on malformed input.
    """
    pid = article.get("id")
    url = article.get("url")
    if pid is None or not isinstance(url, str):
        return False
    m = _URL_RE.search(url)
    if m is None:
        return False
    try:
        bucket = int(pid) // 1000
    except (TypeError, ValueError, OverflowError):
        # Non-numeric id: it cannot satisfy the invariant, so report drift
        # instead of crashing the health check.
        return False
    return m.group(1) == str(bucket) and m.group(2) == str(pid)


# Public API of this module. The DEFAULT_* constants are listed because they
# are not used inside this module and so exist only for external consumers;
# without an entry here a star-import would not expose them.
__all__ = [
    "ADAPTER_ID",
    "DEFAULT_DETAIL_SELECTORS",
    "DEFAULT_INTERVAL_SEC",
    "build_list_url",
    "parse_list_response",
    "verify_bucket_invariant",
]
