"""Normalize already-stored FLK/NPC article text.

The FLK OFD reader returns visual layout lines. Older rows stored those hard-wraps
directly in article.content_text and the cached text file. This backfill keeps
the metadata header intact and normalizes only the article body.
"""
from __future__ import annotations

from pathlib import PurePosixPath

from govcrawler.adapters.flk_npc import normalize_content_text
from govcrawler.db import get_sessionmaker
from govcrawler.models import Article, CrawlSite, CrawlTarget
from govcrawler.settings import get_settings
from govcrawler.storage.paths import to_os_path


def main() -> None:
    """Re-normalize the stored text of every article under the ``flk_npc`` site.

    Steps, all inside one database session:

    1. Collect the ids of every ``CrawlTarget`` whose ``CrawlSite`` has
       ``site_code == "flk_npc"``. If there are none, print a summary line
       with ``reason=no_flk_targets`` and return without touching anything.
    2. Load every ``Article`` of those targets whose ``content_text`` is not
       NULL and pass the text through ``normalize_content_text``.
    3. When the result differs from the stored value, assign it back to
       ``article.content_text`` and, if the article has a ``text_path``,
       overwrite that file (located via ``to_os_path`` relative to
       ``settings.data_dir``) with the new text encoded as UTF-8.
    4. Commit once after all rows were processed and print
       ``checked=<n> updated=<m>``.
    """
    settings = get_settings()
    session_factory = get_sessionmaker()
    checked = 0
    updated = 0
    with session_factory() as session:
        flk_target_query = (
            session.query(CrawlTarget.id)
            .join(CrawlSite, CrawlSite.id == CrawlTarget.site_id)
            .filter(CrawlSite.site_code == "flk_npc")
        )
        # Each result row is a 1-tuple holding just the target id.
        target_ids = [row[0] for row in flk_target_query.all()]
        if not target_ids:
            print("checked=0 updated=0 reason=no_flk_targets")
            return

        article_query = session.query(Article).filter(
            Article.target_id.in_(target_ids),
            Article.content_text.isnot(None),
        )
        for article in article_query.all():
            checked += 1
            stored_text = article.content_text or ""
            normalized_text = normalize_content_text(stored_text)
            if normalized_text != stored_text:
                article.content_text = normalized_text
                if article.text_path:
                    # Keep the on-disk copy in step with the database column.
                    cache_file = to_os_path(
                        settings.data_dir, PurePosixPath(article.text_path)
                    )
                    cache_file.parent.mkdir(parents=True, exist_ok=True)
                    cache_file.write_text(normalized_text, encoding="utf-8")
                updated += 1
        # Single commit at the end: all row changes land together.
        session.commit()
    print(f"checked={checked} updated={updated}")


# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
