"""Backfill normalized article metadata from saved detail HTML.

This is the generic companion to the ingest-time detail parser. It recovers
public-document fields from saved `raw_html_path` snapshots when available:

  索引号 (index number), 分类 (category), 发布机构 (issuing agency),
  成文日期 (date of writing), 文号 (document number), 发布日期 (publish date),
  主题词 (subject keywords)

For pages without a government public-metadata table, it still normalizes the
common RAG filters:

  publisher    <- existing publisher OR source_raw
  publish_date <- existing publish_date OR publish_time.date()
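
For example (illustrative values): an article with publisher=NULL,
source_raw="来源：某市司法局" and publish_time=2024-03-05 09:30 would get
publisher backfilled from the cleaned source_raw and publish_date=2024-03-05.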

Run inside the api container:

    docker exec docker-api-1 python /app/scripts/backfill_article_detail_metadata.py --dry-run
    docker exec docker-api-1 python /app/scripts/backfill_article_detail_metadata.py
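
Typical summary output (numbers illustrative):

    mode=dry_run scope=all scanned=1200 changed=340 skipped_no_source=15
    field_counts=publish_date:300,publisher:220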
"""
from __future__ import annotations

import argparse
import sys
from collections import Counter
from dataclasses import dataclass
from datetime import date, datetime
from pathlib import Path, PurePosixPath
from typing import Any

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from parsel import Selector  # noqa: E402
from sqlalchemy import select  # noqa: E402

from govcrawler.db import get_sessionmaker  # noqa: E402
from govcrawler.models import Article, CrawlSite, CrawlTarget  # noqa: E402
from govcrawler.parser.detail_parser import (  # noqa: E402
    _clean_source_text,
    _extract_gov_public_meta,
)
from govcrawler.settings import get_settings  # noqa: E402
from govcrawler.storage.paths import to_os_path  # noqa: E402


@dataclass(frozen=True)
class BackfillStats:
    scanned: int
    changed: int
    skipped_no_source: int
    field_counts: Counter[str]


def _is_empty(v: Any) -> bool:
    # Only None and "" count as empty; 0, False and {} are treated as values.
    return v is None or v == ""


def _same_value(left: Any, right: Any) -> bool:
    # datetime is a subclass of date, so the datetime branch must come first.
    # Sub-second precision is ignored so that re-parsed timestamps (which may
    # lose microseconds) do not register as changes.
    if isinstance(left, datetime) and isinstance(right, datetime):
        return left.replace(microsecond=0) == right.replace(microsecond=0)
    if isinstance(left, date) and isinstance(right, date):
        return left == right
    return left == right


def _public_values_from_html(html: str) -> dict[str, Any]:
    meta = _extract_gov_public_meta(Selector(text=html))
    public_meta = meta.pop("public_meta", {}) or {}
    values = {k: v for k, v in meta.items() if not _is_empty(v)}
    if public_meta:
        values["metadata_json"] = {"public_meta": public_meta}
    return values
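

# Illustrative shape of the dict built above (hypothetical values; the keys
# other than "metadata_json" are whatever _extract_gov_public_meta emits,
# e.g. "publisher" / "publish_date" as consumed by values_for_article below):
#
#     {
#         "publisher": "某市人民政府办公室",
#         "publish_date": date(2024, 1, 2),
#         "metadata_json": {"public_meta": {"索引号": "...", "文号": "..."}},
#     }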


def _html_values_for_article(article: Article, *, data_dir: Path) -> dict[str, Any]:
    if not article.raw_html_path:
        return {}
    try:
        path = to_os_path(data_dir, PurePosixPath(article.raw_html_path))
        # Containment check: relative_to() raises ValueError if the snapshot
        # path escapes data_dir, which the handler below treats as "no HTML".
        path.resolve().relative_to(data_dir.resolve())
        return _public_values_from_html(path.read_text(encoding="utf-8", errors="ignore"))
    except Exception:
        # Missing, unreadable or out-of-tree snapshots are expected here;
        # the caller falls back to DB-only normalization.
        return {}


def values_for_article(article: Article, *, data_dir: Path) -> dict[str, Any]:
    values = _html_values_for_article(article, data_dir=data_dir)
    if _is_empty(values.get("publisher")) and not _is_empty(article.source_raw):
        values["publisher"] = _clean_source_text(article.source_raw)
    if _is_empty(values.get("publish_date")) and article.publish_time is not None:
        values["publish_date"] = article.publish_time.date()
    return values


def apply_values(article: Article, values: dict[str, Any], *, only_missing: bool) -> list[str]:
    changed_fields: list[str] = []
    for field, new_value in values.items():
        if _is_empty(new_value):
            continue
        old_value = getattr(article, field)
        # Evaluate --only-missing against the stored value *before* the
        # metadata_json merge below; otherwise a missing metadata_json
        # (None coerced to {}) would never count as empty and never be filled.
        if only_missing and not _is_empty(old_value):
            continue
        if field == "metadata_json":
            # Shallow merge: incoming top-level keys (e.g. "public_meta")
            # replace stored ones wholesale; unrelated keys are preserved.
            if not isinstance(old_value, dict):
                old_value = {}
            new_value = {**old_value, **new_value}
        if _same_value(old_value, new_value):
            continue
        setattr(article, field, new_value)
        changed_fields.append(field)
    return changed_fields
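

# Shallow-merge sketch for metadata_json (hypothetical values): unrelated
# top-level keys survive, but an incoming "public_meta" replaces the stored
# one wholesale rather than being deep-merged.
#
#     old = {"public_meta": {"索引号": "A-1"}, "extra": 1}
#     new = {"public_meta": {"文号": "X发〔2024〕5号"}}
#     {**old, **new} == {"public_meta": {"文号": "X发〔2024〕5号"}, "extra": 1}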


def _article_stmt(s, *, site_code: str | None, target_code: str | None):
    stmt = select(Article).order_by(Article.id.asc())
    if target_code:
        target = s.scalar(select(CrawlTarget).where(CrawlTarget.target_code == target_code))
        if target is None:
            raise SystemExit(f"target not found: {target_code}")
        return stmt.where(Article.target_id == target.id)
    if site_code:
        site = s.scalar(select(CrawlSite).where(CrawlSite.site_code == site_code))
        if site is None:
            raise SystemExit(f"site not found: {site_code}")
        return stmt.where(Article.site_id == site.id)
    return stmt


def run_backfill(
    *,
    site_code: str | None,
    target_code: str | None,
    dry_run: bool,
    only_missing: bool,
    limit: int | None,
) -> BackfillStats:
    Session = get_sessionmaker()
    data_dir = Path(get_settings().data_dir)
    scanned = 0
    changed = 0
    skipped_no_source = 0
    field_counts: Counter[str] = Counter()

    with Session() as s:
        stmt = _article_stmt(s, site_code=site_code, target_code=target_code)
        if limit is not None:
            stmt = stmt.limit(limit)
        for article in s.scalars(stmt):
            scanned += 1
            values = values_for_article(article, data_dir=data_dir)
            if not values:
                skipped_no_source += 1
                continue
            fields = apply_values(article, values, only_missing=only_missing)
            if fields:
                changed += 1
                field_counts.update(fields)

        if dry_run:
            s.rollback()
        else:
            s.commit()

    return BackfillStats(scanned, changed, skipped_no_source, field_counts)
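

# Programmatic use (sketch; "example-target" is a hypothetical target_code):
#
#     stats = run_backfill(site_code=None, target_code="example-target",
#                          dry_run=True, only_missing=True, limit=100)
#     print(stats.changed, dict(stats.field_counts))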


def _build_parser() -> argparse.ArgumentParser:
    # RawDescriptionHelpFormatter keeps the docstring's indented command
    # examples intact in --help output.
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--site-code", default=None, help="restrict to one crawl site")
    p.add_argument("--target-code", default=None, help="restrict to one crawl target (takes precedence over --site-code)")
    p.add_argument("--dry-run", action="store_true", help="report changes without committing")
    p.add_argument("--only-missing", action="store_true", help="only fill fields that are currently empty")
    p.add_argument("--limit", type=int, default=None, help="stop after scanning this many articles")
    return p


def main(argv: list[str] | None = None) -> int:
    args = _build_parser().parse_args(argv)
    stats = run_backfill(
        site_code=args.site_code,
        target_code=args.target_code,
        dry_run=args.dry_run,
        only_missing=args.only_missing,
        limit=args.limit,
    )
    scope = args.target_code or args.site_code or "all"
    mode = "dry_run" if args.dry_run else "committed"
    print(
        f"mode={mode} scope={scope} scanned={stats.scanned} "
        f"changed={stats.changed} skipped_no_source={stats.skipped_no_source}"
    )
    if stats.field_counts:
        print("field_counts=" + ",".join(
            f"{k}:{v}" for k, v in sorted(stats.field_counts.items())
        ))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
