"""Rename stored attachments from article link text.

Some legacy attachment downloads used the URL basename as the stored filename,
for example ``post_146594_p020180122371885620172.pdf``. The article body often
carries a better human-readable link label ("附件：...", i.e. "Attachment: ...").
This script re-parses the stored raw HTML, maps attachment URLs to link labels,
renames the files on disk, and updates the matching attachment DB rows.
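
Typical invocations (the script path is illustrative; the flags are defined
in ``main()`` below)::

    python rename_attachments_from_labels.py --dry-run --site-code gd_wjk
    python rename_attachments_from_labels.py --only-old-post-names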
"""
from __future__ import annotations

import argparse
import re
from pathlib import Path, PurePosixPath

from govcrawler.config.registry import get_detail_selectors
from govcrawler.db import get_sessionmaker
from govcrawler.models import Article, Attachment, CrawlSite, CrawlTarget
from govcrawler.parser.detail_parser import parse_detail
from govcrawler.settings import get_settings
from govcrawler.storage.filenames import safe_filename, with_best_extension
from govcrawler.storage.paths import to_os_path

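# Generic fallback selectors, used when no per-target selectors are registered.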
DETAIL_SELECTORS = {
    "title": "meta[name=ArticleTitle]::attr(content), title::text, h1::text",
    "publish_time": "meta[name=PubDate]::attr(content), div.date-row::text",
    "source": "meta[name=ContentSource]::attr(content)",
    "content": (
        "div.pages_content, div.article-content, div.content, "
        "div.zw, div.TRS_Editor, div.TRS_UEDITOR"
    ),
    "attachment_css": "a[href]",
}

FILE_LABEL_EXTS = "docx|xlsx|pdf|doc|xls|zip|rar|txt|wps|ofd"
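# Capture a short, human-readable label ending in a known file extension,
# e.g. the segment "附件：2024年预算表.xlsx" matches in full.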
FILE_LABEL_RE = re.compile(
    rf"([^<>\r\n]{{1,180}}?\.(?:{FILE_LABEL_EXTS}))",
    re.IGNORECASE,
)


def _candidate_name(label: str, old_name: str, source_url: str | None) -> str:
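    """Normalize *label* (drop whitespace before a trailing extension, e.g.
    "foo .pdf" -> "foo.pdf"), then delegate extension selection and
    sanitizing to ``with_best_extension`` / ``safe_filename``."""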
    label = re.sub(r"\s+(\.[A-Za-z0-9]+)$", r"\1", label.strip())
    return safe_filename(with_best_extension(label, old_name, source_url))


def _dedupe_target_name(
    abs_dir: Path, old_abs: Path, desired_name: str, digest: str | None
) -> str:
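    """Return *desired_name* unless a different file already occupies that
    name in *abs_dir*; in that case append a short digest-derived suffix."""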
    candidate = desired_name
    candidate_abs = abs_dir / candidate
    if not candidate_abs.exists() or candidate_abs == old_abs:
        return candidate
    stem, dot, ext = candidate.rpartition(".")
    suffix = (digest or "attachment")[:12]
    return f"{stem}_{suffix}.{ext}" if dot else f"{candidate}_{suffix}"


def _basename(url: str) -> str:
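    """Lowercased last path segment of *url*, without the query string.

    >>> _basename("https://example.gov/P020180122.PDF?download=1")
    'p020180122.pdf'
    """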
    return (url.rsplit("/", 1)[-1].split("?", 1)[0] or "").lower()


def _match_attachment(
    att: Attachment, urls: list[str], names: dict[str, str], index: int
) -> str | None:
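    """Pick the parsed URL that corresponds to *att*.

    Tries, in order: an exact ``source_url`` match, a URL whose basename
    matches or is contained in the stored filename, the sole URL when the
    page has exactly one, and finally the URL at position *index*.
    """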
    if att.source_url and att.source_url in names:
        return att.source_url

    old_name = (att.file_name or "").lower()
    for url in urls:
        base = _basename(url)
        if base and (old_name == base or base in old_name):
            return url

    if len(urls) == 1:
        return urls[0]
    if 0 <= index < len(urls):
        return urls[index]
    return None


def _target_code_from_raw_path(raw_html_path: str | None) -> str | None:
    parts = (raw_html_path or "").split("/")
    # raw_html/<site>/<target>/YYYY/MM/file.html
    return parts[2] if len(parts) > 2 else None


def _is_old_post_attachment_name(file_name: str | None) -> bool:
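    """True for legacy URL-derived names: a ``post_`` prefix plus a known
    document extension, e.g. ``post_146594_p020180122371885620172.pdf``."""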
    name = (file_name or "").lower()
    if not name.startswith("post_"):
        return False
    return name.endswith(tuple(f".{ext}" for ext in FILE_LABEL_EXTS.split("|")))


def _is_hash_attachment_name(file_name: str | None) -> bool:
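    """True for bare hex-digest stems (24-64 hex chars) with a known
    extension, e.g. ``fce4387297db43bf921ef133376b44d7.doc``."""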
    name = (file_name or "").lower().strip()
    stem, dot, ext = name.rpartition(".")
    if not dot or ext not in FILE_LABEL_EXTS.split("|"):
        return False
    return bool(re.fullmatch(r"[0-9a-f]{24,64}", stem))


def _labels_from_content_text(text: str | None) -> list[str]:
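    """Extract de-duplicated, filename-like labels from plain article text.

    Fallback for articles whose raw payload cannot be parsed into
    attachment URL/name pairs (JSON payloads, missing raw HTML).
    """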
    if not text:
        return []
    normalized = re.sub(r"[ \t\u3000]+", " ", text)
    # Split compact attachment lists into separate candidate segments, e.g.
    # "1.foo.xlsx 2.bar.doc" or "附件1：foo.xlsx 附件2：bar.doc".
    normalized = re.sub(r"\s+(?=(?:附件\s*)?\d+[.、])", "\n", normalized)
    normalized = re.sub(r"\s+(?=附件\d+[：:_])", "\n", normalized)
    labels: list[str] = []
    seen: set[str] = set()
    for segment in re.split(r"[\r\n]+", normalized):
        segment = segment.strip()
        if not segment:
            continue
        for match in FILE_LABEL_RE.finditer(segment):
            label = match.group(1).strip(" ：:-—\t")
            # Drop common article metadata ("发布日期" = publish date,
            # "浏览次数" = view count) that may precede the only file
            # label in text-only attachment pages.
            if "发布日期" in label and " " in label:
                label = label.rsplit(" ", 1)[-1].strip(" ：:-—\t")
            if "浏览次数" in label and " " in label:
                label = label.rsplit(" ", 1)[-1].strip(" ：:-—\t")
            if label and label not in seen:
                labels.append(label)
                seen.add(label)
    return labels


def _labels_to_attachment_fields(labels: list[str]) -> tuple[list[str], dict[str, str]]:
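    """Wrap text-derived labels in synthetic ``content-text:<idx>`` URLs so
    they flow through the same matching path as HTML-parsed attachments."""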
    urls = [f"content-text:{idx}" for idx, _ in enumerate(labels)]
    return urls, dict(zip(urls, labels))


def main() -> None:
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--site-code", help="limit to one crawl site, e.g. gd_wjk")
    ap.add_argument("--target-code", help="limit to one crawl target, e.g. gd_wjk__qbwj")
    ap.add_argument("--article-id", type=int, help="limit to one article id")
    ap.add_argument(
        "--only-old-post-names",
        action="store_true",
        help="only rename legacy URL-derived names like post_123_123.pdf",
    )
    ap.add_argument(
        "--only-hash-names",
        action="store_true",
        help="only rename URL-derived hash names like fce4387297db43bf921ef133376b44d7.doc",
    )
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    settings = get_settings()
    Session = get_sessionmaker()
    checked = 0
    renamed = 0
    skipped = 0
    missing_raw = 0
    errors = 0
    text_fallback = 0
    renamed_paths: dict[str, tuple[PurePosixPath, str]] = {}

    with Session() as s:
        q = s.query(Article)
        if args.article_id:
            q = q.filter(Article.id == args.article_id)
        if args.site_code:
            q = q.join(CrawlSite, Article.site_id == CrawlSite.id).filter(
                CrawlSite.site_code == args.site_code
            )
        if args.target_code:
            q = q.join(CrawlTarget, Article.target_id == CrawlTarget.id).filter(
                CrawlTarget.target_code == args.target_code
            )
        articles = q.order_by(Article.id.asc()).all()

        for article in articles:
            attachments = list(article.attachments or [])
            if args.only_old_post_names:
                attachments = [
                    att for att in attachments
                    if _is_old_post_attachment_name(att.file_name)
                ]
            if args.only_hash_names:
                attachments = [
                    att for att in attachments
                    if _is_hash_attachment_name(att.file_name)
                ]
            if not attachments:
                continue
            checked += len(attachments)
            if not article.raw_html_path:
                missing_raw += len(attachments)
                continue

            try:
                raw_abs = to_os_path(settings.data_dir, PurePosixPath(article.raw_html_path))
                raw_html = raw_abs.read_text(encoding="utf-8", errors="ignore")
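                # A raw payload that looks like JSON rather than HTML cannot
                # be parsed with CSS selectors; fall back to scanning the
                # extracted plain text for filename-like labels.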
                if raw_html.lstrip().startswith(("{", "[")):
                    labels = _labels_from_content_text(article.content_text)
                    if labels:
                        fields_urls, fields_names = _labels_to_attachment_fields(labels)
                        text_fallback += len(attachments)
                    else:
                        skipped += len(attachments)
                        continue
                else:
                    site_code = article.site.site_code if article.site else None
                    target_code = (
                        article.target.target_code
                        if article.target
                        else _target_code_from_raw_path(article.raw_html_path)
                    )
                    selectors = (
                        get_detail_selectors(site_code, target_code)
                        if site_code and target_code
                        else None
                    ) or DETAIL_SELECTORS
                    fields = parse_detail(raw_html, article.url or "", selectors)
                    fields_urls = fields.attachment_urls
                    fields_names = fields.attachment_names
                    if not fields_urls or not fields_names:
                        labels = _labels_from_content_text(article.content_text)
                        if labels:
                            fields_urls, fields_names = _labels_to_attachment_fields(labels)
                            text_fallback += len(attachments)
                if not fields_urls or not fields_names:
                    skipped += len(attachments)
                    continue
            except FileNotFoundError:
                labels = _labels_from_content_text(article.content_text)
                if labels:
                    fields_urls, fields_names = _labels_to_attachment_fields(labels)
                    text_fallback += len(attachments)
                else:
                    missing_raw += len(attachments)
                    continue
            except Exception as exc:
                errors += len(attachments)
                print(f"error article_id={article.id} parse_failed={type(exc).__name__}:{exc}")
                continue

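            # Iterate in stable id order so the positional fallback in
            # _match_attachment lines up with the parsed URL order.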
            for idx, att in enumerate(sorted(attachments, key=lambda x: x.id)):
                match_url = _match_attachment(att, fields_urls, fields_names, idx)
                label = fields_names.get(match_url or "")
                if not label:
                    skipped += 1
                    continue

                old_name = att.file_name or ""
                desired = _candidate_name(label, old_name, match_url)
                if not desired or desired == old_name:
                    if match_url and not att.source_url and not args.dry_run:
                        att.source_url = match_url
                    skipped += 1
                    continue

                if not att.file_path:
                    if not args.dry_run:
                        att.file_name = desired
                        att.source_url = att.source_url or match_url
                    renamed += 1
                    print(f"rename article_id={article.id} attachment_id={att.id} {old_name!r} -> {desired!r}")
                    continue

                old_rel = PurePosixPath(att.file_path)
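                # Several attachment rows can share one stored file; reuse a
                # previously computed rename so the file is only moved once.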
                cached = renamed_paths.get(str(old_rel))
                if cached:
                    new_rel, final_name = cached
                    if not args.dry_run:
                        att.file_name = final_name
                        att.file_path = str(new_rel)
                        att.source_url = att.source_url or match_url
                    renamed += 1
                    print(f"rename article_id={article.id} attachment_id={att.id} {old_name!r} -> {final_name!r}")
                    continue

                old_abs = to_os_path(settings.data_dir, old_rel)
                abs_dir = old_abs.parent
                final_name = _dedupe_target_name(abs_dir, old_abs, desired, att.file_hash)
                new_rel = old_rel.with_name(final_name)
                new_abs = to_os_path(settings.data_dir, new_rel)
                renamed_paths[str(old_rel)] = (new_rel, final_name)

                if not args.dry_run:
                    if old_abs.exists() and old_abs != new_abs:
                        new_abs.parent.mkdir(parents=True, exist_ok=True)
                        old_abs.rename(new_abs)
                    att.file_name = final_name
                    att.file_path = str(new_rel)
                    att.source_url = att.source_url or match_url
                renamed += 1
                print(f"rename article_id={article.id} attachment_id={att.id} {old_name!r} -> {final_name!r}")

        if args.dry_run:
            s.rollback()
        else:
            s.commit()

    print(
        f"checked={checked} renamed={renamed} skipped={skipped} "
        f"missing_raw={missing_raw} errors={errors} "
        f"text_fallback={text_fallback} dry_run={args.dry_run}"
    )


if __name__ == "__main__":
    main()
