"""yaml → DB unidirectional sync (§7.5.3 "职责边界铁律").

One direction only: YAML is authoritative for config; runtime state
(last_crawled_at, last_article_time, etc.) lives in DB and is NEVER written
back to YAML.

Sync is idempotent: running twice produces the same DB state. We upsert by
natural keys:

  * crawl_site.site_code
  * (site_id, dept_path) for site_department
  * crawl_target.target_code  ← synthesized as "<site_code>__<dept_path>__<column_id>"
                               (the "<dept_path>__" segment is omitted for site-level columns)

Removed-from-YAML rows are **disabled**, not deleted — we never risk cascading
an accidental delete of historical article rows.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable

from sqlalchemy.orm import Session

from govcrawler.config.v2 import ColumnV2, DeptV2, SiteDocV2, load_v2_dir
from govcrawler.models import CrawlSite, CrawlTarget, LocalDepartment, SiteDepartment
from govcrawler.repositories import depts, sites, targets


class UnknownLocalDept(ValueError):
    """A mapped department points at a `local_dept_id` that has no row.

    Raised by the pre-flight check when `local_department` lacks one or more
    of the ids referenced by the YAML. The OA snapshot has to be synced
    before the YAML sync can succeed.
    """


# ---------------------------------------------------------------------------
# report structures — returned by sync_* functions for CLI / tests
# ---------------------------------------------------------------------------
@dataclass
class SyncReport:
    """Counters and warnings accumulated over one sync run.

    The sync functions mutate an instance in place: each ``*_created`` /
    ``*_updated`` / ``*_disabled`` counter is incremented per affected row and
    human-readable notes are appended to ``warnings``.

    Field order is part of the (positional) constructor signature, so new
    fields should only ever be appended.
    """

    sites_created: int = 0
    sites_updated: int = 0
    depts_created: int = 0
    depts_updated: int = 0
    targets_created: int = 0
    targets_updated: int = 0
    targets_disabled: int = 0
    depts_disabled: int = 0
    sites_disabled: int = 0
    warnings: list[str] = field(default_factory=list)

    def as_dict(self) -> dict[str, object]:
        """Return a nested plain-dict snapshot of the report.

        Returns:
            A dict with the keys ``"sites"``, ``"depts"`` and ``"targets"``
            (each mapping ``"created"`` / ``"updated"`` / ``"disabled"`` to a
            count) plus ``"warnings"`` (a list of strings).
        """
        return {
            "sites": {
                "created": self.sites_created,
                "updated": self.sites_updated,
                "disabled": self.sites_disabled,
            },
            "depts": {
                "created": self.depts_created,
                "updated": self.depts_updated,
                "disabled": self.depts_disabled,
            },
            "targets": {
                "created": self.targets_created,
                "updated": self.targets_updated,
                "disabled": self.targets_disabled,
            },
            # Copy the list: the counters above are captured by value, so the
            # warnings must be too. Otherwise a warning appended to the report
            # later would silently show up in an already-taken snapshot (and
            # mutating the snapshot would corrupt the report).
            "warnings": list(self.warnings),
        }


# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
def _target_code(site_code: str, dept_path: str | None, column_id: str) -> str:
    """Build the synthesized natural key for a crawl target.

    The result is ``<site_code>__<dept_path>__<column_id>``; when `dept_path`
    is falsy (``None`` or empty) the middle segment is left out.
    """
    prefix = f"{site_code}__{dept_path}" if dept_path else site_code
    return f"{prefix}__{column_id}"


def _site_fields_from_doc(doc: SiteDocV2) -> dict[str, object]:
    """Translate a site document into the keyword fields used to upsert a site.

    `cms_adapter` and `yaml_path` are kept mutually exclusive, as required by
    the CHECK constraint on crawl_site: a site that declares a `cms_adapter`
    gets ``yaml_path=None``; otherwise `yaml_path` is stamped with the
    document's own source path (or ``""`` when that is unset) so the row is
    self-describing.
    """
    site = doc.site
    if site.cms_adapter:
        yaml_path = None
    else:
        yaml_path = doc.source_path or ""
    return dict(
        site_name=site.name,
        base_url=site.base_url,
        site_role=site.role,
        cms_adapter=site.cms_adapter,
        adapter_params_json=site.adapter_params,
        default_fetch_strategy=site.default_fetch_strategy,
        strategy_override_reason=site.strategy_override_reason,
        respect_robots=site.respect_robots,
        yaml_path=yaml_path,
        enabled=site.enabled,
        remark=site.remark,
    )


def _target_fields_from_column(
    col: ColumnV2,
    *,
    site_id: int,
    site_department_id: int | None,
    local_dept_id: int | None = None,
) -> dict[str, object]:
    """Translate a YAML column into the field values of a crawl target.

    `local_dept_id` is stored under ``dept_id``: a denormalized FK to
    local_department so pipeline.fetch_and_store can put it straight onto
    Article without re-joining site_department. It is only populated when the
    parent dept is `mapped` (i.e. has a local_dept_id).
    """
    fields: dict[str, object] = {
        "site_id": site_id,
        "site_department_id": site_department_id,
        "dept_id": local_dept_id,
        "target_name": col.name,
    }
    # These attributes are copied over under the very same name.
    same_name_attrs = (
        "entry_url",
        "sample_article_url",
        "channel_name",
        "channel_path",
        "content_category",
        "content_subcategory",
        "schedule_cron",
        "expected_cadence_days",
        "interval_sec",
        "interval_jitter_sec",
        "track_checkpoint",
    )
    for attr in same_name_attrs:
        fields[attr] = getattr(col, attr)
    fields["parser_override_json"] = col.parser_override
    fields["enabled"] = col.enabled
    return fields


# ---------------------------------------------------------------------------
# per-site sync
# ---------------------------------------------------------------------------
def _preflight_local_dept_ids(session: Session, doc: SiteDocV2) -> None:
    """Verify, before any write, that all referenced local_dept_ids exist.

    Only depts whose binding is ``"mapped"`` and whose `local_dept_id` is not
    ``None`` are checked. When none qualify, no query is issued.

    Raises:
        UnknownLocalDept: Lists every missing id at once, so the OA sync can
            be fixed in one go rather than one id per run.
    """
    wanted: set[int] = set()
    for dept in doc.depts:
        if dept.dept_binding == "mapped" and dept.local_dept_id is not None:
            wanted.add(dept.local_dept_id)
    if not wanted:
        return

    dept_id_col = LocalDepartment.dept_id
    stmt = (
        LocalDepartment.__table__.select()
        .with_only_columns(dept_id_col)
        .where(dept_id_col.in_(wanted))
    )
    found = set(session.scalars(stmt))

    absent = sorted(wanted - found)
    if not absent:
        return
    raise UnknownLocalDept(
        f"site {doc.site.code!r}: local_dept_id(s) not in local_department: "
        f"{absent}. Run OA snapshot sync first."
    )


def sync_site(session: Session, doc: SiteDocV2, report: SyncReport) -> CrawlSite:
    """Upsert a single site together with its departments and targets.

    The caller owns the transaction: this function flushes but never commits
    or rolls back. Departments and targets that exist in the DB for this site
    but are absent from `doc` are disabled, not deleted.

    Args:
        session: Session used for every read and write.
        doc: Parsed YAML document describing one site.
        report: Updated in place with counters and warnings.

    Returns:
        The site row. For a UI-managed site this is the existing row, and
        nothing below it (depts, targets) is touched.

    Raises:
        UnknownLocalDept: From the pre-flight check, before any write.
    """
    _preflight_local_dept_ids(session, doc)

    site_code = doc.site.code
    current = sites.get_by_code(session, site_code)
    fields = _site_fields_from_doc(doc)

    if current is not None and getattr(current, "managed_by", "yaml") == "ui":
        # §7.5.3 [2.0 patch]: a UI-managed site must not be clobbered by a
        # YAML doc that happens to share its code. Leave a warning so ops can
        # either rename the YAML site_code or delete the UI row. Returning
        # here also skips depts/targets, keeping the UI-managed tree clean.
        report.warnings.append(
            f"site {site_code!r} is UI-managed; YAML doc ignored. "
            f"Rename the YAML site_code or delete the UI-managed row first."
        )
        return current

    if current is None:
        # A site that is brand-new to the DB is yaml-origin by construction.
        site_row = sites.upsert_by_code(
            session, site_code, managed_by="yaml", **fields
        )
        report.sites_created += 1
    else:
        sites.upsert_by_code(session, site_code, **fields)
        report.sites_updated += 1
        site_row = current
    session.flush()  # site id is needed before inserting depts/targets
    site_id = site_row.id

    # --- depts ---------------------------------------------------------
    declared_paths = set()
    for dept in doc.depts:
        declared_paths.add(dept.dept_path)
        _upsert_dept(session, site_id, dept, report)

    # In the DB but no longer in YAML → disable, never delete.
    for stale_dept in depts.list_for_site(session, site_id):
        if stale_dept.dept_path in declared_paths or not stale_dept.enabled:
            continue
        stale_dept.enabled = False
        report.depts_disabled += 1

    session.flush()

    # --- targets -------------------------------------------------------
    declared_codes: set[str] = set()

    # Columns that hang off a department.
    for dept in doc.depts:
        dept_row = depts.get(session, site_id, dept.dept_path)
        assert dept_row is not None  # upserted in the dept pass above
        for col in dept.columns:
            dept_target_code = _target_code(site_code, dept.dept_path, col.column_id)
            declared_codes.add(dept_target_code)
            _upsert_target(
                session,
                dept_target_code,
                col,
                site_id=site_id,
                site_department_id=dept_row.id,
                local_dept_id=dept_row.local_dept_id,
                report=report,
            )

    # Columns declared directly on the site (no dept_path segment).
    for col in doc.columns:
        site_target_code = _target_code(site_code, None, col.column_id)
        declared_codes.add(site_target_code)
        _upsert_target(
            session,
            site_target_code,
            col,
            site_id=site_id,
            site_department_id=None,
            local_dept_id=None,
            report=report,
        )

    # Targets of this site that YAML no longer declares → disable.
    for stale_target in targets.list_for_site(session, site_id, enabled_only=False):
        if stale_target.target_code in declared_codes or not stale_target.enabled:
            continue
        stale_target.enabled = False
        report.targets_disabled += 1

    return site_row


def _upsert_dept(
    session: Session, site_id: int, d: DeptV2, report: SyncReport
) -> None:
    """Upsert one department of a site and count it as created or updated.

    Whether the row already existed is looked up *before* the upsert, keyed by
    ``(site_id, dept_path)``; afterwards the lookup would always succeed.
    """
    already_present = depts.get(session, site_id, d.dept_path) is not None
    depts.upsert(
        session,
        site_id=site_id,
        dept_path=d.dept_path,
        dept_binding=d.dept_binding,
        local_dept_id=d.local_dept_id,
        dept_display_name=d.dept_display_name,
        enabled=d.enabled,
    )
    if already_present:
        report.depts_updated += 1
    else:
        report.depts_created += 1


def _upsert_target(
    session: Session,
    target_code: str,
    col: ColumnV2,
    *,
    site_id: int,
    site_department_id: int | None,
    local_dept_id: int | None = None,
    report: SyncReport,
) -> None:
    """Upsert one crawl target by `target_code` and count it in `report`.

    Whether the target already existed is looked up *before* the upsert, so
    the created/updated counters reflect the state prior to this call.
    """
    already_present = targets.get_by_code(session, target_code) is not None

    extra = _target_fields_from_column(
        col,
        site_id=site_id,
        site_department_id=site_department_id,
        local_dept_id=local_dept_id,
    )
    # These two are handed over as explicit keywords below; drop them from the
    # expanded mapping so they are not passed twice.
    for explicit_key in ("site_id", "site_department_id"):
        extra.pop(explicit_key, None)

    targets.upsert_by_code(
        session,
        target_code=target_code,
        site_id=site_id,
        site_department_id=site_department_id,
        **extra,
    )

    if already_present:
        report.targets_updated += 1
    else:
        report.targets_created += 1


# ---------------------------------------------------------------------------
# directory sync — the CLI entrypoint
# ---------------------------------------------------------------------------
def sync_dir(session: Session, directory: str | Path) -> SyncReport:
    """Sync every YAML document found in `directory` into the database.

    After all documents are synced, every still-enabled site whose code is
    not among the loaded documents is **disabled** (never deleted) and a
    warning is recorded. Per §7.5.3, YAML is the source of truth for config,
    while historical article/log rows must stay referenceable forever.

    NOTE(review): the orphan check compares `site_code` against the keys of
    the mapping returned by `load_v2_dir` — this assumes that mapping is keyed
    by site_code; confirm against the loader.

    Returns:
        A fresh `SyncReport` covering the whole run.
    """
    report = SyncReport()
    docs = load_v2_dir(directory)
    for doc in docs.values():
        sync_site(session, doc, report)

    known_codes = set(docs)
    for db_site in sites.list_enabled(session):
        # Skip sites still declared in YAML, and — per §7.5.3 [2.0 patch] —
        # sites with managed_by='ui': those were created by hand in the admin
        # console and are outside sync_dir's jurisdiction.
        if (
            db_site.site_code in known_codes
            or getattr(db_site, "managed_by", "yaml") == "ui"
        ):
            continue
        db_site.enabled = False
        report.sites_disabled += 1
        report.warnings.append(
            f"site {db_site.site_code!r} no longer in YAML dir → disabled"
        )
    return report


# ---------------------------------------------------------------------------
# convenience for callers that want to sync a single file
# ---------------------------------------------------------------------------
def sync_file(session: Session, path: str | Path) -> SyncReport:
    """Sync the single YAML document at `path` and return its report.

    Unlike `sync_dir`, no other site is looked at, so nothing gets disabled
    for being absent.
    """
    from govcrawler.config.v2 import load_v2

    outcome = SyncReport()
    sync_site(session, load_v2(path), outcome)
    return outcome


# Public API. UnknownLocalDept is exported as well: sync_site, sync_dir and
# sync_file can all raise it, so callers need the name in order to catch it.
__all__ = ["SyncReport", "UnknownLocalDept", "sync_site", "sync_dir", "sync_file"]
