"""v2 site-config loader — per design doc §7.5.3.

Shape:

    site:                              # required, 1 per YAML
      code: qingcheng_fgw              # → crawl_site.site_code
      name: 清城区发改局
      base_url: http://fgw.qingcheng.gov.cn
      role: qingyuan_local             # qingyuan_local / county_local / province_ref / nation_ref
      cms_adapter: gkmlpt              # XOR with yaml_path
      # yaml_path: auto                # if cms_adapter absent, yaml_path = current file
      default_fetch_strategy: httpx    # httpx / playwright
      respect_robots: true
      enabled: true
      remark: "..."

    depts:                             # OPTIONAL; Qingyuan-specific dept_path layer
      - dept_path: qycsj               # CMS-side dept segment; unique per site
        dept_binding: mapped           # pending / mapped / city_level / cross_dept / external_ref
        local_dept_id: 301             # iff dept_binding == mapped
        dept_display_name: 统计局
        enabled: true
        columns:
          - column_id: "1234"
            name: 财政预决算
            entry_url: http://.../qycsj/gkmlpt/index/1234
            sample_article_url: http://.../post_987654.html
            channel_name: 信息公开
            channel_path: 政务公开/财政信息
            content_category: 法规文件
            schedule_cron: "0 2 * * *"
            expected_cadence_days: 30
            enabled: true

    columns:                           # OPTIONAL; for sites that don't have a
                                       # dept_path concept (legacy HTML scrape
                                       # sites). crawl_target.site_department_id
                                       # stays NULL for these.
      - column_id: ...

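**Exactly one of `cms_adapter` / `yaml_path` is set** — matches the DB CHECK
on crawl_site. Loader enforces this. If `cms_adapter` is absent we auto-fill
`yaml_path` with the YAML file's own path, which is the common case. Note that
`yaml_path` is not a key of the `site` mapping in this module's schema (unknown
keys are rejected), so it is derived from the document's `source_path` rather
than written by hand.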

**dept_binding ↔ local_dept_id coupling** is enforced at load time (same rule
as the DB CHECK), so bad YAML gets rejected before touching the database.
"""
from __future__ import annotations

from pathlib import Path
from typing import Any

import yaml
from pydantic import BaseModel, ConfigDict, Field, model_validator


# ---------------------------------------------------------------------------
# Column — lives under a dept or directly under the site
# ---------------------------------------------------------------------------
class ColumnV2(BaseModel):
    """One column entry, declared under a dept or directly under the site.

    Unknown keys are rejected and string values are stripped of surrounding
    whitespace. Only ``column_id`` is required; every other field has a
    default.
    """

    model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")

    # --- identity -----------------------------------------------------------
    column_id: str = Field(min_length=1, max_length=64)
    name: str | None = Field(None, max_length=200)

    # --- URLs ---------------------------------------------------------------
    entry_url: str | None = Field(None, max_length=1000)
    sample_article_url: str | None = Field(None, max_length=1000)

    # --- channel / category labels -----------------------------------------
    channel_name: str | None = Field(None, max_length=200)
    channel_path: str | None = Field(None, max_length=1000)
    content_category: str | None = Field(None, max_length=100)
    content_subcategory: str | None = Field(None, max_length=100)

    # --- scheduling and pacing ----------------------------------------------
    schedule_cron: str | None = Field(None, max_length=100)
    expected_cadence_days: int = Field(30, ge=1, le=3650)
    interval_sec: int | None = Field(None, ge=0)
    interval_jitter_sec: int | None = Field(None, ge=0)

    # --- behaviour switches (interpreted by the consumers of this model) ----
    track_checkpoint: bool = False
    parser_override: dict[str, Any] | None = None
    enabled: bool = True


# ---------------------------------------------------------------------------
# Dept — Qingyuan-specific enhancement; optional
# ---------------------------------------------------------------------------
# Allowed values for DeptV2.dept_binding. Only "mapped" may (and must) carry a
# local_dept_id; the tuple's repr is embedded in the validation error message.
DEPT_BINDINGS = ("pending", "mapped", "city_level", "cross_dept", "external_ref")


class DeptV2(BaseModel):
    """A ``dept_path`` entry of a site together with the columns it owns.

    Unknown keys are rejected and string values are whitespace-stripped.
    After field validation the model checks that ``dept_binding`` is a known
    value and that ``local_dept_id`` is present exactly when the binding is
    ``"mapped"``.
    """

    model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")

    dept_path: str = Field(min_length=1, max_length=100)
    dept_binding: str = "pending"
    local_dept_id: int | None = None
    dept_display_name: str | None = Field(None, max_length=200)
    enabled: bool = True
    columns: list[ColumnV2] = Field(default_factory=list)

    @model_validator(mode="after")
    def _binding_consistent(self) -> "DeptV2":
        """Reject unknown bindings and binding/local_dept_id mismatches."""
        binding = self.dept_binding
        if binding not in DEPT_BINDINGS:
            raise ValueError(
                f"dept_binding must be one of {DEPT_BINDINGS}, got {binding!r}"
            )

        is_mapped = binding == "mapped"
        has_local_id = self.local_dept_id is not None

        # A mapped dept must point at a local dept ...
        if is_mapped and not has_local_id:
            raise ValueError(
                f"dept_path={self.dept_path!r}: dept_binding='mapped' requires local_dept_id"
            )
        # ... and any other binding must not.
        if has_local_id and not is_mapped:
            raise ValueError(
                f"dept_path={self.dept_path!r}: dept_binding={binding!r} "
                "must have local_dept_id unset (it's a non-OA dept)"
            )
        return self


# ---------------------------------------------------------------------------
# Site header — one per YAML
# ---------------------------------------------------------------------------
# Allowed non-null values for SiteHeaderV2.role.
SITE_ROLES = ("qingyuan_local", "county_local", "province_ref", "nation_ref")
# Allowed non-null values for SiteHeaderV2.default_fetch_strategy.
FETCH_STRATEGIES = ("httpx", "playwright")


class SiteHeaderV2(BaseModel):
    """The ``site:`` mapping of a v2 YAML document.

    Unknown keys are rejected and string values are whitespace-stripped.
    Only ``code`` is required. ``role`` and ``default_fetch_strategy`` may be
    omitted, but when given they must be one of the module-level allowed
    values.
    """

    model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")

    code: str = Field(min_length=1, max_length=50)
    name: str | None = Field(None, max_length=200)
    base_url: str | None = Field(None, max_length=500)
    role: str | None = None
    cms_adapter: str | None = Field(None, max_length=50)
    adapter_params: dict[str, Any] | None = None
    default_fetch_strategy: str | None = None
    strategy_override_reason: str | None = Field(None, max_length=200)
    respect_robots: bool = True
    enabled: bool = True
    remark: str | None = Field(None, max_length=500)

    @model_validator(mode="after")
    def _check_enums(self) -> "SiteHeaderV2":
        """Validate the two enum-like string fields when they are set."""
        role = self.role
        if role is not None and role not in SITE_ROLES:
            raise ValueError(f"role must be one of {SITE_ROLES}, got {role!r}")

        strategy = self.default_fetch_strategy
        if strategy is not None and strategy not in FETCH_STRATEGIES:
            raise ValueError(
                f"default_fetch_strategy must be one of {FETCH_STRATEGIES}, "
                f"got {strategy!r}"
            )
        return self


# ---------------------------------------------------------------------------
# Top-level document
# ---------------------------------------------------------------------------
class SiteDocV2(BaseModel):
    """A complete, validated v2 YAML document.

    Holds the required ``site`` header plus optional ``depts`` and root-level
    ``columns``. After field validation it checks that every ``dept_path`` is
    unique and that no ``column_id`` occurs twice anywhere in the document.
    """

    model_config = ConfigDict(extra="forbid")

    site: SiteHeaderV2
    depts: list[DeptV2] = Field(default_factory=list)
    columns: list[ColumnV2] = Field(default_factory=list)

    # Path of the file the document came from; injected at load time so that
    # yaml_path can be filled in on DB sync.
    source_path: str | None = None

    @model_validator(mode="after")
    def _structure_consistent(self) -> "SiteDocV2":
        """Enforce dept_path and column_id uniqueness within the document."""
        # 1) every dept_path may occur only once
        known_paths = set()
        for dept in self.depts:
            path = dept.dept_path
            if path in known_paths:
                raise ValueError(f"duplicate dept_path: {path!r}")
            known_paths.add(path)

        # 2) gather (owner, column_id) pairs: dept columns first, in document
        #    order, then the root-level columns under the "<root>" label
        placements: list[tuple[str, str]] = []
        for dept in self.depts:
            for col in dept.columns:
                placements.append((dept.dept_path, col.column_id))
        for col in self.columns:
            placements.append(("<root>", col.column_id))

        # 3) a column_id may be claimed by one owner only; remember who
        #    claimed it so the error can name both places
        owner_of: dict[str, str] = {}
        for owner, column_id in placements:
            if column_id in owner_of:
                raise ValueError(
                    f"column_id {column_id!r} appears under both {owner_of[column_id]!r} "
                    f"and {owner!r}; column_id must be site-unique"
                )
            owner_of[column_id] = owner
        return self


# ---------------------------------------------------------------------------
# Loading
# ---------------------------------------------------------------------------
def load_v2(path: str | Path) -> SiteDocV2:
    """Load and validate a single v2 site YAML file.

    Args:
        path: Location of the YAML file. Its stem (file name without the
            extension) must equal ``site.code``.

    Returns:
        The validated document, with ``source_path`` set to ``str(path)``
        (any ``source_path`` key present in the YAML is overridden).

    Raises:
        ValueError: If the YAML root is not a mapping, if the document fails
            schema validation (pydantic's ``ValidationError`` is a
            ``ValueError``), or if the file stem differs from ``site.code``.

    Errors from reading the file or parsing the YAML propagate unchanged.
    """
    p = Path(path)
    raw: Any = yaml.safe_load(p.read_text(encoding="utf-8"))
    if not isinstance(raw, dict):
        raise ValueError(f"{p}: YAML root must be a mapping")

    # Validate through model_validate rather than SiteDocV2(**raw): YAML allows
    # non-string mapping keys (e.g. `1: x`), and ** unpacking would raise a
    # bare TypeError before any schema validation ran. Building a new dict
    # also leaves the parsed mapping untouched.
    doc = SiteDocV2.model_validate({**raw, "source_path": str(p)})

    if doc.site.code != p.stem:
        raise ValueError(
            f"{p}: filename stem {p.stem!r} must equal site.code {doc.site.code!r}"
        )
    return doc


def load_v2_dir(directory: str | Path) -> dict[str, SiteDocV2]:
    """Load every *.yaml in a directory. Keyed by site.code."""
    d = Path(directory)
    out: dict[str, SiteDocV2] = {}
    if not d.is_dir():
        return out
    for p in sorted(d.iterdir()):
        if p.suffix.lower() not in (".yaml", ".yml"):
            continue
        doc = load_v2(p)
        if doc.site.code in out:
            raise ValueError(f"duplicate site.code {doc.site.code!r}: {p}")
        out[doc.site.code] = doc
    return out
