"""One-shot: backfill article.channel_name / channel_path / content_category /
content_subcategory from the article's crawl_target row.

Background: pipeline.fetch_and_store didn't carry these fields onto the
article until 2026-04-28. Older rows are NULL — joining article ↔
crawl_target every time the RAG / search side reads is wasteful.

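Idempotent: only fills NULLs. Operator overrides on individual articles
(if any) are preserved. Skips orphan articles (target_id IS NULL, or
target_id pointing at no existing crawl_target row) and articles whose
crawl_target has none of the four fields set.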

Run inside the api container:

    docker exec docker-api-1 python /app/scripts/backfill_article_classification.py
"""
from __future__ import annotations

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from sqlalchemy import or_  # noqa: E402

from govcrawler.db import get_sessionmaker  # noqa: E402
from govcrawler.models import Article, CrawlTarget  # noqa: E402


def main() -> int:
    """Fill NULL classification columns on articles from their crawl target.

    Reads every crawl target once into an in-memory lookup, then walks all
    articles that have at least one of the four classification columns NULL
    and copies over whichever values the target provides. Columns that are
    already set on the article are never overwritten. All changes are
    committed in a single transaction after the loop, and a one-line summary
    of the counters is printed.

    Returns:
        Always 0, for use as the process exit status.
    """
    field_names = (
        "channel_name",
        "channel_path",
        "content_category",
        "content_subcategory",
    )
    scanned = 0
    updated = 0
    orphans = 0
    empty_targets = 0

    session_factory = get_sessionmaker()
    with session_factory() as session:
        # Load the target metadata once so the article loop only does dict
        # lookups instead of touching crawl_target per row.
        meta_by_target: dict[int, tuple] = {}
        for target in session.query(CrawlTarget).all():
            meta_by_target[target.id] = tuple(
                getattr(target, name) for name in field_names
            )

        # Candidate articles: any of the four columns is still NULL.
        missing_any = or_(
            *[getattr(Article, name).is_(None) for name in field_names]
        )
        candidates = session.query(Article).filter(missing_any).all()

        for article in candidates:
            scanned += 1

            # Both "no target_id" and "target_id without a matching target"
            # are reported under the same orphan counter.
            target_id = article.target_id
            meta = None if target_id is None else meta_by_target.get(target_id)
            if meta is None:
                orphans += 1
                continue

            # The target itself carries nothing to copy.
            if not any(meta):
                empty_targets += 1
                continue

            touched = False
            for name, value in zip(field_names, meta):
                # Only fill gaps; falsy target values are never copied.
                if getattr(article, name) is None and value:
                    setattr(article, name, value)
                    touched = True
            if touched:
                updated += 1

        session.commit()

    print(
        f"scanned={scanned} updated={updated} "
        f"skipped_orphan={orphans} "
        f"skipped_target_no_data={empty_targets}"
    )
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
