"""Backfill short FLK/NPC article text from downloaded DOCX attachments.

Some FLK detail records expose only metadata through the OFD text reader while
the downloaded DOCX contains the full legal text. This script repairs already
stored rows by reading local DOCX attachments and rewriting article.content_text
plus the cached text file.
"""
from __future__ import annotations

from pathlib import PurePosixPath

from govcrawler.adapters.flk_npc import (
    MIN_USEFUL_BODY_CHARS,
    _drop_leading_duplicate_title,
    _extract_docx_text,
    normalize_content_text,
)
from govcrawler.db import get_sessionmaker
from govcrawler.models import Article, Attachment, CrawlSite, CrawlTarget
from govcrawler.settings import get_settings
from govcrawler.storage.paths import to_os_path


def _header_from_article(article: Article) -> str:
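    """Rebuild the metadata header (title, issuer, dates) for the body text."""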
    lines = [
        article.title or "",
        f"制定机关：{article.publisher or article.source_raw or ''}",
        f"法律法规分类：{article.content_category or ''}",
        f"时效性：{_validity_label(article)}",
        f"公布日期：{article.publish_date.isoformat() if article.publish_date else ''}",
        f"施行日期：{article.effective_date.isoformat() if article.effective_date else ''}",
    ]
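    # Keep a line only if something survives stripping fullwidth colons, so an
    # empty title is dropped while labelled lines are kept even with blank values.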
    return "\n".join(line for line in lines if line.strip("："))


def _validity_label(article: Article) -> str:
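    """Prefer the crawled validity label, falling back to the is_effective flag."""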
    meta = article.metadata_json or {}
    for container in (meta, meta.get("public_meta") or {}):
        label = container.get("validity_state_label")
        if label:
            return str(label)
    if article.is_effective is True:
        return "有效"
    if article.is_effective is False:
        return "非现行有效"
    return ""


def _read_docx_attachment(settings, attachment: Attachment) -> str:
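    """Extract text from a locally stored DOCX attachment; "" when unusable."""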
    if not attachment.file_path or (attachment.file_ext or "").lower() != "docx":
        return ""
    path = to_os_path(settings.data_dir, PurePosixPath(attachment.file_path))
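    # The file may be missing or corrupt on disk; treat any read or parse
    # failure as "no usable text" so the caller can try the next attachment.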
    try:
        return _extract_docx_text(path.read_bytes())
    except Exception:
        return ""


def main() -> None:
    settings = get_settings()
    Session = get_sessionmaker()
    checked = 0
    updated = 0
    skipped = 0
    with Session() as s:
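        # Scope the backfill to articles belonging to flk_npc crawl targets.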
        target_ids = [
            tid
            for (tid,) in (
                s.query(CrawlTarget.id)
                .join(CrawlSite, CrawlSite.id == CrawlTarget.site_id)
                .filter(CrawlSite.site_code == "flk_npc")
                .all()
            )
        ]
        if not target_ids:
            print("checked=0 updated=0 skipped=0 reason=no_flk_targets")
            return

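        # Candidates: FLK articles with at least one attachment and some stored
        # text; rows whose text is already long enough are skipped below.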
        rows = (
            s.query(Article)
            .filter(Article.target_id.in_(target_ids))
            .filter(Article.has_attachment.is_(True))
            .filter(Article.content_text.isnot(None))
            .all()
        )
        for article in rows:
            checked += 1
            if len(article.content_text or "") >= MIN_USEFUL_BODY_CHARS:
                skipped += 1
                continue

            docx_text = ""
            for attachment in article.attachments:
                docx_text = _read_docx_attachment(settings, attachment)
                if docx_text:
                    break
            docx_text = _drop_leading_duplicate_title(docx_text, article.title or "")
            if len(docx_text) < MIN_USEFUL_BODY_CHARS:
                skipped += 1
                continue

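            # Rebuild header + body, then refresh both the database row and
            # the cached text file (when one exists).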
            new_text = normalize_content_text(_header_from_article(article) + "\n\n" + docx_text)
            article.content_text = new_text
            article.status = "ready"
            if article.text_path:
                path = to_os_path(settings.data_dir, PurePosixPath(article.text_path))
                path.parent.mkdir(parents=True, exist_ok=True)
                path.write_text(new_text, encoding="utf-8")
            updated += 1
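        # A single commit at the end applies the whole backfill atomically.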
        s.commit()

    print(f"checked={checked} updated={updated} skipped={skipped}")


if __name__ == "__main__":
    main()
