"""gkmlpt 适配器 — 清远政务云 CMS 族 (§7.5.3 配套).

工作模型：
  * 列表 URL 形如 `{base_url}/{dept_path}/gkmlpt/api/all/{column_id}?page=N&sid={sid}`
  * 响应是 JSON，结构为 `{articles: [...], total: N, ...}`
  * 每条 article 有 CMS 级稳定 ID（`articles[*].id`），URL 形如
    `{base_url}/{dept_path}/gkmlpt/content/{prefix}/{bucket}/post_{id}.html`
    其中 `bucket = id // 1000`（与 `verify_bucket_invariant` 的校验一致；
    首段 `{prefix}` 为数字目录，本模块不校验其取值）

本模块的责任：**把 list-API JSON → `CrawlItem[]`**。正文抓取（detail stage）
由下游另一个函数跑（当前 Phase-A 仅做 list projection；正文阶段预留接口）。

项目分成两层：

  1. `build_list_url(...)`：纯函数，根据 `adapter_params` 拼出 list URL。
     由 `crawl_site.adapter_params_json.list_api_path_tpl` 驱动，不在代码里
     硬编码 URL 模板。

  2. `parse_list_response(...)`：JSON → `list[CrawlItem]`，status=raw。
     这是 §7.5.7 契约测试的主要对象。

没有独立的 CMS client — HTTP 抓取走现成的 `govcrawler.fetcher.*` 层。
"""
from __future__ import annotations

import re
from datetime import date, datetime, timedelta, timezone
from typing import Any, Iterable

from govcrawler.adapters.contract import ContractViolation, CrawlItem, Status
from govcrawler.utils.url_norm import url_hash as compute_url_hash


# Identifier of this adapter.
ADAPTER_ID = "gkmlpt"

# Sanity-check regex for the article URL shape
# `/content/<digits>/<digits>/post_<digits>.html`. The first directory segment
# is matched but not captured; group 1 is the second directory segment and
# group 2 is the number in the `post_<n>.html` file name.
_URL_RE = re.compile(r"/content/\d+/(\d+)/post_(\d+)\.html")


# Empirically-tuned fallback when `crawl_target.interval_sec` is NULL. The
# gkmlpt fleet (清远政务云) visibly rate-limits ~1 req/host/sec; we saw every
# crawl after the first URL fail with transient timeouts when this was 0.
# Operators can still override per-target in the admin UI.
DEFAULT_INTERVAL_SEC: float = 3.0


# Default detail-page CSS/xpath selectors for 清远政务云 gkmlpt content pages.
# These are the baseline; per-target overrides live in
# `crawl_target.parser_override_json.detail` and take precedence.
# Each value is a comma-separated list of alternative selectors.
DEFAULT_DETAIL_SELECTORS: dict[str, Any] = {
    "title": "h1.article-title::text, div.article-title::text, h1::text",
    "publish_time": "div.article-info span::text, div.info span::text",
    "source": "div.article-info span.source::text, div.info span.source::text",
    "content": "div.article-content, div.content, div.TRS_Editor",
    "attachment_css": "div.article-content a[href], div.content a[href]",
}


# ---------------------------------------------------------------------------
# list URL building
# ---------------------------------------------------------------------------
def build_list_url(
    *,
    base_url: str,
    dept_path: str,
    column_id: str,
    page: int = 1,
    sid: str | None = None,
    path_tpl: str | None = None,
) -> str:
    """Assemble the list-API URL for one page of a column.

    `path_tpl` comes from `crawl_site.adapter_params_json.list_api_path_tpl`.
    Defaults match the gkmlpt live probe; override per site when a CMS variant
    drifts (e.g. some depts expose `/api/v2/all/`).

    Args:
        base_url: Site root; trailing slashes are stripped.
        dept_path: Department path segment; leading/trailing slashes are
            stripped. May be empty for office-level entries.
        column_id: Column identifier substituted into the template.
        page: Page number appended as the `page` query parameter.
        sid: Optional value appended as the `sid` query parameter; omitted
            when falsy.
        path_tpl: Optional `str.format` template using `{base_url}`,
            `{dept_path}` and `{column_id}`. It may already carry a query
            string, in which case `page`/`sid` are appended with `&`.

    Returns:
        The fully assembled list URL.
    """
    tpl = path_tpl or "{base_url}/{dept_path}/gkmlpt/api/all/{column_id}"
    url = tpl.format(
        base_url=base_url.rstrip("/"),
        dept_path=dept_path.strip("/"),
        column_id=column_id,
    )
    # Office-level entries (e.g. https://www.gdqy.gov.cn/gkmlpt/index) have
    # no dept_path → the default template produces 'gov.cn//gkmlpt/...'.
    # Most servers tolerate the double slash, but normalize it so the URL
    # we log + send out is clean. Only the part before '?' is touched: a
    # query string coming from `path_tpl` may legitimately contain '//'
    # (e.g. an embedded URL) and must be passed through verbatim. The '//'
    # of the scheme separator is kept as well.
    location, qmark, query = url.partition("?")
    if "://" in location:
        scheme, rest = location.split("://", 1)
        rest = re.sub(r"/{2,}", "/", rest)
        location = f"{scheme}://{rest}"
    url = f"{location}{qmark}{query}"

    sep = "&" if "?" in url else "?"
    url = f"{url}{sep}page={page}"
    if sid:
        url = f"{url}&sid={sid}"
    return url


# ---------------------------------------------------------------------------
# article projection
# ---------------------------------------------------------------------------
# Fixed UTC+8 offset (China time), used wherever a value has to be read or
# rendered as a local calendar date/time for these sites.
_CN_TZ = timezone(timedelta(hours=8))


def _coerce_datetime(v: Any) -> datetime | None:
    """Best-effort conversion of an API timestamp to an aware UTC datetime.

    Accepted inputs:
      * int / float — treated as epoch seconds;
      * all-digit strings — treated as epoch seconds;
      * ISO-8601 strings (a trailing ``Z`` is accepted). Strings without an
        explicit offset are interpreted as China time (`_CN_TZ`) so the
        result does not depend on the timezone of the host running the
        crawler.

    Returns:
        A timezone-aware datetime in UTC, or ``None`` when the value is
        missing, of an unsupported type (including ``bool``), unparsable, or
        outside the range `datetime` can represent (e.g. a millisecond
        epoch passed where seconds are expected).
    """
    # bool is a subclass of int; a JSON true/false is never a timestamp.
    if isinstance(v, bool):
        return None
    if isinstance(v, str):
        raw = v.strip()
        if not raw:
            return None
        if not raw.isdigit():
            try:
                parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
            except ValueError:
                return None
            if parsed.tzinfo is None:
                # Naive value: astimezone() would silently assume the host's
                # local zone, so pin it to China time first.
                parsed = parsed.replace(tzinfo=_CN_TZ)
            return parsed.astimezone(timezone.utc)
        try:
            # str.isdigit() also accepts characters int() rejects (e.g. "²").
            v = int(raw)
        except ValueError:
            return None
    if isinstance(v, (int, float)):
        try:
            return datetime.fromtimestamp(v, tz=timezone.utc)
        except (ValueError, OverflowError, OSError):
            # NaN/inf, or a value outside the supported year / time_t range.
            return None
    return None


def _coerce_publish_time(a: dict[str, Any]) -> datetime | None:
    """Publication timestamp (发布日期) of an article.

    gd.gov.cn detail pages display this as a date-only value. The API carries
    both `display_publish_time` and `first_publish_time`; prefer the display
    timestamp because it is the value intended for the public metadata table.

    Keys are tried in order: `display_publish_time`, `first_publish_time`,
    `publish_time`, `date`. A key is skipped when it is absent, holds an
    "unset" placeholder (``None``, ``""``, ``0``, ``"0"``) or cannot be
    parsed, so one bad field does not hide a usable fallback.

    Returns:
        The first value `_coerce_datetime` can convert, or ``None`` when no
        candidate is usable.
    """
    for key in ("display_publish_time", "first_publish_time", "publish_time", "date"):
        raw = a.get(key)
        # 0 / "0" would otherwise be coerced to 1970-01-01.
        if raw is None or raw in ("", 0, "0"):
            continue
        parsed = _coerce_datetime(raw)
        if parsed is not None:
            return parsed
    return None


def _coerce_publish_date(a: dict[str, Any]) -> date | None:
    """Document date (成文日期, `publish_date`) — a date-only field.

    gd.gov.cn's gkmlpt API uses `date` for the document date displayed as
    成文日期. Numeric values are midnight in China time, so convert using UTC+8
    before taking the calendar day.

    Lookup order:
      1. `publish_date`, then `document_date`, when it is a non-blank string
         whose first 10 characters parse as an ISO date;
      2. `date`, as epoch seconds (number or all-digit string) or as an ISO
         date string.

    Returns:
        The calendar date, or ``None`` when nothing usable is present. A
        `date` of ``0`` / ``"0"`` is treated as "unset" rather than
        1970-01-01, and out-of-range timestamps yield ``None`` instead of
        raising.
    """
    raw = a.get("publish_date") or a.get("document_date")
    if isinstance(raw, str) and raw.strip():
        try:
            return date.fromisoformat(raw.strip()[:10])
        except ValueError:
            pass  # unparsable — fall through to `date`

    raw = a.get("date")
    # bool is a subclass of int; a JSON true/false is never a timestamp.
    if isinstance(raw, bool):
        return None
    if isinstance(raw, str):
        text = raw.strip()
        if not text:
            return None
        if not text.isdigit():
            try:
                return date.fromisoformat(text[:10])
            except ValueError:
                return None
        try:
            # str.isdigit() also accepts characters int() rejects (e.g. "²").
            raw = int(text)
        except ValueError:
            return None
    if isinstance(raw, (int, float)):
        if raw == 0:
            return None
        try:
            return datetime.fromtimestamp(raw, tz=_CN_TZ).date()
        except (ValueError, OverflowError, OSError):
            # NaN/inf, or a value outside the supported year / time_t range.
            return None
    return None


def _coerce_status_date(v: Any) -> date | None:
    """Convert a status timestamp (e.g. an expiry time) to a China-time date.

    ``None``, ``""``, ``0`` and ``"0"`` are treated as "not set". Anything
    `_coerce_datetime` understands is shifted to UTC+8 before the calendar day
    is taken. As a last resort, the first 10 characters of a string are tried
    as an ISO date.

    Returns:
        The calendar date, or ``None`` when the value is unset or unparsable.
    """
    if v in (None, "", 0, "0"):
        return None

    moment = _coerce_datetime(v)
    if moment is not None:
        return moment.astimezone(_CN_TZ).date()

    if not isinstance(v, str):
        return None
    # e.g. "2024-01-01 <trailing text>" that the full parser rejected
    leading = v.strip()[:10]
    try:
        return date.fromisoformat(leading)
    except ValueError:
        return None


def _coerce_is_effective(a: dict[str, Any]) -> bool | None:
    """Derive the current-validity flag of a document.

    gkmlpt exposes separate status hints. `is_expired` / `is_abolished` are
    the most explicit and win when either is set (``1``, ``"1"`` or ``True``).
    Otherwise `validity` decides: ``0`` / ``"0"`` is the observed "active"
    value on several Guangdong sites, any other non-empty value means the
    document is no longer fully effective.

    Returns:
        ``False`` / ``True`` as described above, or ``None`` when the payload
        carries no usable hint (unknown is left unknown rather than guessed).
    """
    set_markers = (1, "1", True)
    if any(a.get(flag) in set_markers for flag in ("is_expired", "is_abolished")):
        return False

    state = a.get("validity")
    if state in (0, "0"):
        return True
    return None if state in (None, "") else False


def _coerce_expiry_date(a: dict[str, Any]) -> date | None:
    """Return the first usable expiry/abolition date found on the article.

    Keys are probed in order — `expired_time`, `abolished_time`,
    `expiry_date`, `invalid_date` — each through `_coerce_status_date`.
    ``None`` is returned when none of them yields a date.
    """
    keys = ("expired_time", "abolished_time", "expiry_date", "invalid_date")
    # Generator keeps the lookup lazy: stop at the first key that parses.
    parsed = (_coerce_status_date(a.get(key)) for key in keys)
    return next((day for day in parsed if day is not None), None)


def _coerce_topic_words(a: dict[str, Any]) -> str | None:
    """Topic words (主题词) normalized to a single string.

    gkmlpt stores these under `keywords` (with `topic_words` used as a
    fallback when `keywords` is missing or falsy), sometimes as a list,
    sometimes as a single string.

    * A list/tuple is joined with ``、`` after stripping each entry; ``None``
      and blank entries are dropped so a JSON ``null`` never shows up as the
      literal text "None".
    * Any other value is stringified and stripped; separators inside a plain
      string are left as they are.

    Returns:
        The normalized string, or ``None`` when nothing non-blank remains.
    """
    raw = a.get("keywords") or a.get("topic_words")
    if raw is None or raw == "":
        return None
    if isinstance(raw, (list, tuple)):
        words = (str(x).strip() for x in raw if x is not None)
        return "、".join(w for w in words if w) or None
    return str(raw).strip() or None


def _coerce_open_category(a: dict[str, Any]) -> str | None:
    """Open category (开放属性 / 公开类别) label for an article.

    Resolution order, first non-blank hit wins:
      1. a human-readable label in `open_category`, `category_name` or
         `category_text` (strings only);
      2. a string `category` code other than ``"0"`` — gkmlpt's
         ``category: 0|""`` means unspecified and is dropped;
      3. `classify_theme_name` and/or `classify_genre_name`, joined with
         ``、`` when both are present;
      4. `classify_main_name`.

    Returns:
        The stripped label, or ``None`` when none of the above is set.
    """
    for field in ("open_category", "category_name", "category_text"):
        label = a.get(field)
        if isinstance(label, str) and label.strip():
            return label.strip()

    code = a.get("category")
    if isinstance(code, str):
        code = code.strip()
        if code and code != "0":
            return code

    theme = (a.get("classify_theme_name") or "").strip()
    genre = (a.get("classify_genre_name") or "").strip()
    combined = "、".join(part for part in (theme, genre) if part)
    if combined:
        return combined

    return (a.get("classify_main_name") or "").strip() or None


def _coerce_content_category(a: dict[str, Any]) -> str | None:
    """Top-level subject classification (主题分类).

    For gd.gov.cn policy documents, `classify_theme_name` is the actual theme
    used in the public metadata table. `classify_main_name` is the fallback
    for non-policy columns such as 财政预决算/政务动态 where no theme is present.

    Returns:
        The first non-blank of the two fields, stripped, or ``None``.
    """
    for field in ("classify_theme_name", "classify_main_name"):
        label = (a.get(field) or "").strip()
        if label:
            return label
    return None


def _coerce_content_subcategory(a: dict[str, Any]) -> str | None:
    """Return the stripped `classify_genre_name`, or ``None`` when missing/blank."""
    genre = a.get("classify_genre_name")
    if not genre:
        return None
    return genre.strip() or None


def _project_one(
    a: dict[str, Any],
    *,
    site_id: str,
    target_id: int | None,
    dept_id: int | None,
) -> CrawlItem:
    """Project a single list-API article dict onto a `CrawlItem`.

    Args:
        a: One raw article dict from the list response.
        site_id: Copied onto the item as-is.
        target_id: Copied onto the item as-is.
        dept_id: Copied onto the item as-is.

    Returns:
        A `CrawlItem` with `status=Status.RAW`; the untouched article dict is
        kept under `metadata_json["raw"]`.

    Raises:
        ContractViolation: If the article has no (truthy) `url`.
    """
    url = a.get("url")
    if not url:
        raise ContractViolation(f"gkmlpt article missing 'url': id={a.get('id')}")

    post_id = a.get("id")
    publisher = a.get("publisher")

    # Built as a mapping first so each target field sits next to its source.
    fields: dict[str, Any] = {
        "site_id": site_id,
        "target_id": target_id,
        "dept_id": dept_id,
        "native_post_id": None if post_id is None else str(post_id),
        "url": url,
        "url_hash": compute_url_hash(url),
        "title": a.get("title") or "(untitled)",
        "publish_time": _coerce_publish_time(a),
        # the list API offers a single `publisher` value for both fields
        "source_raw": publisher,
        "publisher": publisher,
        # empty strings are stored as None
        "doc_no": a.get("document_number") or None,
        "index_no": a.get("identifier") or None,
        "publish_date": _coerce_publish_date(a),
        "is_effective": _coerce_is_effective(a),
        "expiry_date": _coerce_expiry_date(a),
        "topic_words": _coerce_topic_words(a),
        "open_category": _coerce_open_category(a),
        "content_category": _coerce_content_category(a),
        "content_subcategory": _coerce_content_subcategory(a),
        "metadata_json": {"raw": a},
        "status": Status.RAW,
    }
    return CrawlItem(**fields)


def parse_list_response(
    payload: dict[str, Any] | list[dict[str, Any]],
    *,
    site_id: str,
    target_id: int | None = None,
    dept_id: int | None = None,
) -> list[CrawlItem]:
    """Project gkmlpt list-API JSON → `CrawlItem[]`.

    Accepts either:
      * a dict with `{articles: [...]}` (the common shape), or
      * a bare list of article dicts (when caller pre-extracted).

    All items come back with `status=raw`: they have no body yet, that's the
    detail stage's job. Caller is responsible for de-dup against `article`
    table (by `native_post_id` or `url_hash`).

    Per-row failures never abort the page: entries that are not dicts are
    ignored, and a row whose projection fails is skipped with a warning log.
    Besides `ContractViolation` / `ValueError`, this covers the
    `OverflowError` / `OSError` that `datetime.fromtimestamp` raises for
    out-of-range timestamps.

    Args:
        payload: Decoded list-API response, or a pre-extracted article list.
        site_id: Passed through to every item.
        target_id: Passed through to every item.
        dept_id: Passed through to every item.

    Returns:
        One `CrawlItem` per successfully projected article, in input order.

    Raises:
        ContractViolation: If `payload` is neither a dict nor a list.
    """
    import logging

    if isinstance(payload, list):
        articles: Iterable[dict[str, Any]] = payload
    elif isinstance(payload, dict):
        # a missing or null `articles` is treated as an empty page
        articles = payload.get("articles") or []
    else:
        raise ContractViolation(
            f"gkmlpt list response must be dict or list, got {type(payload).__name__}"
        )

    log = logging.getLogger(__name__)
    out: list[CrawlItem] = []
    for a in articles:
        if not isinstance(a, dict):
            continue
        try:
            item = _project_one(
                a, site_id=site_id, target_id=target_id, dept_id=dept_id
            )
        except (ContractViolation, ValueError, OverflowError, OSError) as exc:
            # malformed row — skip, don't crash the whole page
            log.warning(
                "gkmlpt: skipping malformed list row (site_id=%s, id=%r): %s",
                site_id,
                a.get("id"),
                exc,
            )
            continue
        out.append(item)
    return out


# ---------------------------------------------------------------------------
# bucket-invariant probe (§5.5 sanity check used during adapter health-checks)
# ---------------------------------------------------------------------------
def verify_bucket_invariant(article: dict[str, Any]) -> bool:
    """Return True iff `url` matches `bucket = id // 1000`.

    Used by adapter smoke tests and dept probe to catch CMS format drift early.

    The URL must contain `/content/<n>/<bucket>/post_<id>.html` (see
    `_URL_RE`). The check passes when `<bucket>` equals ``id // 1000`` and
    `<id>` equals ``str(id)``; the first numeric segment is not inspected.

    Args:
        article: Raw article dict; only `id` and `url` are read.

    Returns:
        ``True`` when the invariant holds. ``False`` when `id` is missing or
        not convertible to an integer, when `url` is missing or not a string,
        or when the URL does not have the expected shape — a health probe
        should report drift, not raise.
    """
    pid = article.get("id")
    url = article.get("url")
    if pid is None or not isinstance(url, str):
        return False
    m = _URL_RE.search(url)
    if m is None:
        return False
    try:
        bucket = int(pid) // 1000
    except (TypeError, ValueError, OverflowError):
        # non-numeric id (e.g. "abc", a list, NaN/inf) cannot satisfy the invariant
        return False
    return m.group(1) == str(bucket) and m.group(2) == str(pid)


# Names exported by `from <this module> import *`.
# NOTE(review): DEFAULT_INTERVAL_SEC and DEFAULT_DETAIL_SELECTORS are public
# constants defined in this module but not listed here — confirm that is
# intentional.
__all__ = [
    "ADAPTER_ID",
    "build_list_url",
    "parse_list_response",
    "verify_bucket_invariant",
]