"""Download missing FLK/NPC DOCX/PDF attachments for stored articles."""
from __future__ import annotations

import json
from datetime import datetime, timezone
from pathlib import PurePosixPath
from urllib.parse import parse_qs, urlparse

from sqlalchemy import exists

from govcrawler.adapters.flk_npc import _attachment_urls
from govcrawler.db import get_sessionmaker
from govcrawler.models import Article, Attachment, CrawlSite, CrawlTarget
from govcrawler.pipeline import _derive_article_key
from govcrawler.settings import get_settings
from govcrawler.storage.attachments import download_attachment
from govcrawler.storage.paths import to_os_path
from govcrawler.storage.repo import insert_attachments


def _bbbs_from_article(article: Article, data: dict) -> str | None:
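    """Return the article's "bbbs" document id from the payload, else the URL's id param."""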
    bbbs = str(data.get("bbbs") or "").strip()
    if bbbs:
        return bbbs
    # Fall back to the "id" query parameter in the article URL.
    parsed = urlparse(article.url or "")
    return (parse_qs(parsed.query).get("id") or [None])[0]


def _payload_from_raw_html(article: Article) -> dict | None:
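    """Load the JSON payload stored at the article's raw_html_path, if readable."""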
    if not article.raw_html_path:
        return None
    settings = get_settings()
    path = to_os_path(settings.data_dir, PurePosixPath(article.raw_html_path))
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):  # missing file, bad encoding, or invalid JSON
        return None
    return payload if isinstance(payload, dict) else None


def main() -> None:
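    """Backfill attachments for FLK/NPC articles that have none on record."""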
    Session = get_sessionmaker()
    checked = 0
    downloaded = 0
    skipped = 0
    failed = 0

    with Session() as s:
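        # Resolve every crawl target under the flk_npc site; each target code
        # is later passed as the `column` argument when downloading attachments.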
        targets = (
            s.query(CrawlTarget.id, CrawlTarget.target_code)
            .join(CrawlSite, CrawlSite.id == CrawlTarget.site_id)
            .filter(CrawlSite.site_code == "flk_npc")
            .all()
        )
        target_codes = {int(tid): str(code) for tid, code in targets}
        if not target_codes:
            print("checked=0 downloaded=0 skipped=0 failed=0 reason=no_flk_targets")
            return

        # Only articles under FLK targets that have no attachment rows yet.
        rows = (
            s.query(Article)
            .filter(Article.target_id.in_(target_codes.keys()))
            .filter(~exists().where(Attachment.article_id == Article.id))
            .all()
        )
        for article in rows:
            checked += 1
            payload = _payload_from_raw_html(article)
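            # Attachment metadata lives under data["ossFile"]; raw_html_path
            # appears to hold the crawler's stored JSON API response.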
            data = (payload or {}).get("data") or {}
            oss = data.get("ossFile") or {}
            bbbs = _bbbs_from_article(article, data)
            if not bbbs or not oss:
                skipped += 1
                continue

            urls = _attachment_urls(str(bbbs), data, oss)
            if not urls:
                skipped += 1
                continue

            # Fall back to "now" (UTC) when the article has no publish time;
            # the timestamp and key are passed to download_attachment below.
            when = article.publish_time or datetime.now(timezone.utc)
            article_key = _derive_article_key(article.url or str(bbbs))
            # Fall back to the site code if the target id is missing or unmapped.
            target_code = target_codes.get(int(article.target_id or 0)) or "flk_npc"
            records = []
            for url in urls:
                try:
                    item = download_attachment(
                        url,
                        site="flk_npc",
                        column=target_code,
                        when=when,
                        article_key=article_key,
                    )
                except Exception as e:
                    failed += 1
                    print(f"download_failed article_id={article.id} url={url} err={e}")
                    continue
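                # Record the stored file's metadata for insert_attachments.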
                records.append(
                    {
                        "file_name": item.file_name,
                        "file_ext": item.file_ext,
                        "size_bytes": item.size_bytes,
                        "file_path": str(item.file_path),
                        "file_hash": item.file_hash,
                    }
                )
            if records:
                insert_attachments(s, article.id, records)
                article.has_attachment = True
                downloaded += len(records)
                # Commit per article so earlier progress survives a failure.
                s.commit()

    print(
        f"checked={checked} downloaded={downloaded} skipped={skipped} failed={failed}"
    )


if __name__ == "__main__":
    main()
