"""Delete legacy HTML attachments.

HTML/HTM attachment rows usually represent related links such as policy
interpretations, media reports, or H5 pages. They are not downloadable files
like PDF/DOC/XLS attachments and should not be exported as article attachments.

The script is dry-run by default. Pass --execute to delete DB rows and remove
the corresponding files from the attachment storage directory.
"""
from __future__ import annotations

import argparse
from collections import Counter
from pathlib import PurePosixPath

from sqlalchemy import func, or_

from govcrawler.db import get_sessionmaker
from govcrawler.models import Article, Attachment, CrawlSite, CrawlTarget
from govcrawler.settings import get_settings
from govcrawler.storage.paths import to_os_path


# Extensions (lowercase, no leading dot) that mark an attachment as an HTML
# page rather than a downloadable document.
HTML_EXTS = {"html", "htm", "shtml", "shtm"}


def _is_html_attachment(att: Attachment) -> bool:
    ext = (att.file_ext or "").lower().lstrip(".")
    name = (att.file_name or "").lower()
    path = (att.file_path or "").lower()
    return (
        ext in HTML_EXTS
        or any(name.endswith(f".{x}") for x in HTML_EXTS)
        or any(path.endswith(f".{x}") for x in HTML_EXTS)
    )


def main() -> None:
    """Find and optionally delete HTML/HTM attachment rows plus their files.

    By default this is a dry run that only reports matches. With --execute,
    matching attachment rows are deleted from the database and the backing
    files are unlinked from the attachment storage directory.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--site-code", help="limit to one crawl site")
    ap.add_argument("--target-code", help="limit to one crawl target")
    ap.add_argument("--limit", type=int, help="process at most N rows")
    ap.add_argument("--execute", action="store_true", help="actually delete rows and files")
    args = ap.parse_args()

    settings = get_settings()
    Session = get_sessionmaker()
    deleted_rows = 0
    deleted_files = 0
    missing_files = 0  # rows with a file_path whose file was already gone
    groups: Counter[tuple[str, str | None]] = Counter()  # (site, target) -> match count
    samples: list[str] = []  # first few matches, for eyeballing the report

    with Session() as s:
        # SQL-side prefilter: match when the extension column, the file name,
        # or the stored path ends in one of the HTML extensions
        # (case-insensitive via func.lower).
        q = (
            s.query(Attachment, Article, CrawlSite, CrawlTarget)
            .join(Article, Attachment.article_id == Article.id)
            .join(CrawlSite, Article.site_id == CrawlSite.id)
            .outerjoin(CrawlTarget, Article.target_id == CrawlTarget.id)
            .filter(
                or_(
                    func.lower(Attachment.file_ext).in_(HTML_EXTS),
                    *[
                        func.lower(Attachment.file_name).like(f"%.{ext}")
                        for ext in HTML_EXTS
                    ],
                    *[
                        func.lower(Attachment.file_path).like(f"%.{ext}")
                        for ext in HTML_EXTS
                    ],
                )
            )
        )
        if args.site_code:
            q = q.filter(CrawlSite.site_code == args.site_code)
        if args.target_code:
            q = q.filter(CrawlTarget.target_code == args.target_code)
        q = q.order_by(Attachment.id.asc())
        # `is not None` so an explicit `--limit 0` means "process nothing"
        # rather than silently disabling the limit.
        if args.limit is not None:
            q = q.limit(args.limit)

        rows = q.all()
        for att, article, site, target in rows:
            # Defensive Python-side re-check; the SQL filter should already
            # guarantee this, but normalization (e.g. a leading-dot file_ext)
            # happens here.
            if not _is_html_attachment(att):
                continue
            target_code = target.target_code if target else None
            groups[(site.site_code, target_code)] += 1
            if len(samples) < 30:
                samples.append(
                    "att_id=%s article_id=%s site=%s target=%s name=%r url=%s"
                    % (
                        att.id,
                        article.id,
                        site.site_code,
                        target_code,
                        att.file_name,
                        att.source_url,
                    )
                )

            if args.execute:
                if att.file_path:
                    file_abs = to_os_path(settings.data_dir, PurePosixPath(att.file_path))
                    try:
                        file_abs.unlink()
                        deleted_files += 1
                    except FileNotFoundError:
                        # Row existed but the file was already removed; count
                        # it separately so the report surfaces the mismatch.
                        missing_files += 1
                s.delete(att)
                # Fix: previously incremented outside this branch, so a dry
                # run reported deleted_rows equal to the match count even
                # though nothing was deleted.
                deleted_rows += 1

        if args.execute:
            s.commit()
        else:
            # Nothing was mutated on a dry run, but roll back explicitly so
            # the session ends in a clean state either way.
            s.rollback()

    print("groups:")
    for (site_code, target_code), count in groups.most_common():
        print(f"  {site_code} {target_code or '-'} {count}")
    print("samples:")
    for sample in samples:
        print(f"  {sample}")
    print(
        "matched=%s deleted_rows=%s deleted_files=%s missing_files=%s execute=%s"
        % (sum(groups.values()), deleted_rows, deleted_files, missing_files, args.execute)
    )


# Script entry point: run the cleanup when invoked directly.
if __name__ == "__main__":
    main()
