"""Backfill missing link attachments from stored raw HTML.

This repairs legacy rows where the article was stored but attachment downloads
failed, for example, historical gd.gov.cn PDF URLs that are reachable over HTTP
but were incorrectly upgraded to HTTPS during download. Only <a href> links
parsed from the article body are used; inline <img> content is intentionally
ignored.
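
Example (dry run first; the script name is illustrative, adjust to your checkout):

    python backfill_link_attachments.py --site-code gd_wjk --dry-run
    python backfill_link_attachments.py --site-code gd_wjk --limit 50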
"""
from __future__ import annotations

import argparse
import random
import time
from datetime import datetime
from pathlib import PurePosixPath

from sqlalchemy import exists

from govcrawler.db import get_sessionmaker
from govcrawler.models import Article, Attachment, CrawlSite
from govcrawler.parser.detail_parser import parse_detail
from govcrawler.pipeline import _derive_article_key
from govcrawler.settings import get_settings
from govcrawler.storage.attachments import download_attachment
from govcrawler.storage.paths import to_os_path
from govcrawler.storage.repo import insert_attachments


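# Selectors for re-parsing stored detail pages; ::text / ::attr(...) are
# parsel-style CSS extensions consumed by parse_detail.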
DETAIL_SELECTORS = {
    "title": "meta[name=ArticleTitle]::attr(content), title::text, h1::text",
    "publish_time": "meta[name=PubDate]::attr(content), div.date-row::text",
    "source": "meta[name=ContentSource]::attr(content)",
    "content": "div.article-content, div.content, div.zw, div.TRS_Editor, div.TRS_UEDITOR",
    "attachment_css": (
        "a[href$='.pdf'], a[href$='.doc'], a[href$='.docx'], "
        "a[href$='.xls'], a[href$='.xlsx'], a[href$='.zip'], "
        "a[href$='.wps'], a[href$='.rar'], a[href$='.txt']"
    ),
}


def _sleep(base: float, jitter: float) -> None:
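    """Sleep for `base` seconds plus uniform jitter; negative inputs clamp to zero."""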
    delay = max(0.0, base) + (random.uniform(0, max(0.0, jitter)) if jitter else 0.0)
    if delay:
        time.sleep(delay)


def _load_fields(article: Article):
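    """Re-parse an article's stored raw HTML snapshot into detail fields.

    Returns None when no snapshot path is recorded; raises FileNotFoundError
    (caught by the caller) when the recorded file is missing on disk.
    """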
    settings = get_settings()
    if not article.raw_html_path:
        return None
    raw_abs = to_os_path(settings.data_dir, PurePosixPath(article.raw_html_path))
    raw_html = raw_abs.read_text(encoding="utf-8", errors="ignore")
    return parse_detail(raw_html, article.url or "", DETAIL_SELECTORS)


def main() -> None:
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--site-code", default="gd_wjk", help="site to repair; default: gd_wjk")
    ap.add_argument("--target-code", help="optional target_code filter")
    ap.add_argument("--article-id", type=int, help="repair one article")
    ap.add_argument("--limit", type=int, help="maximum articles to check")
    ap.add_argument("--sleep-sec", type=float, default=80.0, help="base delay between downloads")
    ap.add_argument("--jitter-sec", type=float, default=20.0, help="extra random delay")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    Session = get_sessionmaker()
    checked = 0
    candidate_articles = 0
    downloaded = 0
    skipped = 0
    failed = 0
    missing_raw = 0

    with Session() as s:
        q = (
            s.query(Article)
            .join(CrawlSite, Article.site_id == CrawlSite.id)
            .filter(CrawlSite.site_code == args.site_code)
        )
        if args.article_id is not None:
            q = q.filter(Article.id == args.article_id)
        else:
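            # Anti-join: keep only articles that have no attachment rows at all.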
            q = q.filter(~exists().where(Attachment.article_id == Article.id))
        if args.target_code:
            q = q.filter(Article.target.has(target_code=args.target_code))
        q = q.order_by(Article.id.asc())
        if args.limit:
            q = q.limit(args.limit)
        articles = q.all()

        for article in articles:
            checked += 1
            try:
                fields = _load_fields(article)
            except FileNotFoundError:
                missing_raw += 1
                continue
            except Exception as exc:
                failed += 1
                print(f"parse_failed article_id={article.id} err={type(exc).__name__}:{exc}")
                continue

            if not fields or not fields.attachment_urls:
                skipped += 1
                continue

            candidate_articles += 1
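            # Prefer the article's own target code, then the CLI filter, then
            # the site code; fall back to "now" for the storage date when
            # publish_time was never parsed.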
            target_code = (
                article.target.target_code
                if article.target
                else (args.target_code or args.site_code)
            )
            when = article.publish_time or datetime.utcnow()
            article_key = _derive_article_key(article.url or f"a_{article.id}")
            records = []
            print(
                f"candidate article_id={article.id} urls={len(fields.attachment_urls)} "
                f"title={article.title!r}"
            )

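            # Fetch every parsed <a href> target; anchor text captured by the
            # parser is passed through as the preferred file name.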
            for idx, url in enumerate(fields.attachment_urls, start=1):
                preferred = fields.attachment_names.get(url)
                if args.dry_run:
                    print(f"dry_run download article_id={article.id} #{idx} url={url} name={preferred!r}")
                    continue
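                # Pace requests: sleep before each attempt once at least one
                # file has been fetched this run.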
                if downloaded or records:
                    _sleep(args.sleep_sec, args.jitter_sec)
                try:
                    item = download_attachment(
                        url,
                        site=args.site_code,
                        column=target_code,
                        when=when,
                        article_key=article_key,
                        preferred_name=preferred,
                        timeout_s=60,
                    )
                except Exception as exc:
                    failed += 1
                    print(f"download_failed article_id={article.id} url={url} err={type(exc).__name__}:{exc}")
                    continue
                records.append(
                    {
                        "file_name": item.file_name,
                        "file_ext": item.file_ext,
                        "size_bytes": item.size_bytes,
                        "file_path": str(item.file_path),
                        "file_hash": item.file_hash,
                        "source_url": item.source_url,
                    }
                )
                downloaded += 1
                print(f"downloaded article_id={article.id} file={item.file_name!r} size={item.size_bytes}")

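            # Commit per article so progress survives interruption; otherwise
            # roll back defensively so no stray session state carries over.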
            if records:
                insert_attachments(s, article.id, records)
                article.has_attachment = True
                s.commit()
            elif not args.dry_run:
                s.rollback()

    print(
        f"checked={checked} candidate_articles={candidate_articles} downloaded={downloaded} "
        f"skipped={skipped} failed={failed} missing_raw={missing_raw} dry_run={args.dry_run}"
    )


if __name__ == "__main__":
    main()
