from __future__ import annotations
import hashlib
import os
import tempfile
from dataclasses import dataclass
from datetime import datetime
from email.message import Message
from pathlib import PurePosixPath
from urllib.parse import unquote

import httpx
from slugify import slugify

from govcrawler.settings import get_settings
from govcrawler.storage.paths import build_reldir, to_os_path
from govcrawler.utils.url_norm import normalize_url

# Hard ceiling on the number of body bytes accepted for one attachment (200 MiB).
MAX_ATTACHMENT_BYTES = 200 * 1024**2
# Chunk size, in bytes, requested from the HTTP stream on each read (64 KiB).
CHUNK = 64 * 1024
# Lower-case extensions, without the leading dot.
# NOTE(review): not referenced by the functions visible here; presumably
# consulted by callers to decide which links count as attachments.
SAFE_EXT = {
    "pdf",
    "doc",
    "docx",
    "xls",
    "xlsx",
    "zip",
    "rar",
    "txt",
}


@dataclass
class DownloadedAttachment:
    """Metadata describing one attachment stored by ``download_attachment``."""

    # Name of the stored file: ``<article_key>_<sanitised original name>``.
    file_name: str
    # Lower-cased extension of the original name, without the dot ("" if none).
    file_ext: str
    # Number of body bytes written to disk.
    size_bytes: int
    # Hex SHA-256 digest of the downloaded bytes.
    file_hash: str
    # Relative directory returned by ``build_reldir`` joined with ``file_name``.
    file_path: PurePosixPath


def parse_disposition_filename(header: str | None) -> str | None:
    """Extract the filename from a ``Content-Disposition`` header value.

    When the header carries both the extended ``filename*`` parameter
    (RFC 2231 / RFC 5987 encoding) and the plain ``filename`` parameter, the
    extended one wins, as RFC 6266 recommends; the plain one is only an ASCII
    fallback for old clients.

    Args:
        header: Raw header value, or ``None``/empty when the header is absent.

    Returns:
        The decoded filename, or ``None`` when the header has no non-empty
        filename parameter.
    """
    if not header:
        return None
    msg = Message()
    msg["content-disposition"] = header

    # The email package folds ``filename*`` into a parameter that is also
    # called ``filename`` but whose value is a (charset, language, text)
    # tuple, and it lists such parameters after the plain ones.  Looking up
    # "filename*" by name therefore never matches; tell the two forms apart
    # by value type instead.
    extended = None
    plain = None
    for key, value in msg.get_params(failobj=[], header="content-disposition"):
        if key.lower() != "filename":
            continue
        if isinstance(value, tuple):
            if extended is None:
                extended = value
        elif plain is None:
            plain = value

    if extended is not None:
        charset, _, raw = extended
        try:
            # The email lib decodes %XX escapes as latin-1 characters;
            # re-encode to get the original bytes back, then decode them with
            # the declared charset (utf-8 when none was declared).
            decoded = raw.encode("latin-1").decode(charset or "utf-8")
        except (LookupError, ValueError):
            # Unknown charset or undecodable bytes: keep the text as received.
            decoded = unquote(raw)
        if decoded:
            return decoded
    return plain or None


def safe_filename(name: str, max_len: int = 180) -> str:
    """Return a sanitised version of *name* that is at most *max_len* characters.

    ``slugify`` is given a custom pattern so that only path separators, the
    characters ``: * ? " < > |`` and ASCII control characters are treated as
    unwanted; Unicode text and dots are kept.

    When the sanitised name is too long, the stem is shortened and a short
    extension is kept, so an over-long name does not lose its file type.

    Args:
        name: Original file name (from a header or a URL).
        max_len: Maximum length of the result, in characters.

    Returns:
        The sanitised name, or ``"attachment"`` if nothing is left.
    """
    cleaned = slugify(
        name,
        allow_unicode=True,
        separator="_",
        regex_pattern=r'[\\/:*?"<>|\x00-\x1f]',
    )
    if len(cleaned) > max_len:
        stem, dot, ext = cleaned.rpartition(".")
        suffix = dot + ext
        # Only treat the tail as an extension when it is short and leaves room
        # for at least one stem character; otherwise fall back to a plain cut.
        if stem and ext and len(ext) <= 10 and len(suffix) < max_len:
            cleaned = stem[: max_len - len(suffix)] + suffix
        else:
            cleaned = cleaned[:max_len]
    return cleaned or "attachment"


def _ext_of(name: str) -> str:
    """Return the lower-cased text after the last ``.`` in *name*, or ``""`` if there is no dot."""
    if "." not in name:
        return ""
    return name.rsplit(".", 1)[1].lower()


def download_attachment(
    url: str,
    *,
    site: str,
    column: str,
    when: datetime,
    article_key: str,
) -> DownloadedAttachment:
    """Stream-download an attachment; enforce size cap + path-traversal guard.

    The response body is streamed into a temporary file inside the target
    directory while being hashed, then moved to its final name with
    ``os.replace``.  The temporary file is removed on every failure path.

    Args:
        url: Attachment URL; passed through ``normalize_url`` before fetching.
        site: Forwarded to ``build_reldir`` to select the storage directory.
        column: Forwarded to ``build_reldir``.
        when: Forwarded to ``build_reldir``.
        article_key: Prefix of the stored file name.

    Returns:
        A ``DownloadedAttachment`` describing the stored file.

    Raises:
        ValueError: If the body exceeds ``MAX_ATTACHMENT_BYTES`` or the final
            path would fall outside the attachment directory.
        httpx.HTTPError: On transport failures or an error status reported by
            ``raise_for_status``.
    """
    norm = normalize_url(url)
    reldir = build_reldir(site, column, when, "attachments")
    settings = get_settings()
    abs_dir = to_os_path(settings.data_dir, reldir)
    abs_dir.mkdir(parents=True, exist_ok=True)

    hasher = hashlib.sha256()
    size = 0
    headers = {"User-Agent": settings.user_agent}
    # Name used when the server sends no usable Content-Disposition filename:
    # last URL path segment with the query string removed.
    raw_name_fallback = norm.rsplit("/", 1)[-1].split("?", 1)[0] or "attachment.bin"
    raw_name: str = raw_name_fallback

    fd, tmp = tempfile.mkstemp(prefix=".dl_", dir=str(abs_dir))
    try:
        # Take ownership of the descriptor before any network I/O so that it
        # is closed even when the request fails before a byte is written.
        with os.fdopen(fd, "wb") as f, httpx.stream(
            "GET", norm, headers=headers, follow_redirects=True, timeout=60
        ) as r:
            r.raise_for_status()
            disp_name = parse_disposition_filename(r.headers.get("content-disposition"))
            raw_name = disp_name or raw_name_fallback
            for chunk in r.iter_bytes(chunk_size=CHUNK):
                if not chunk:
                    continue
                size += len(chunk)
                if size > MAX_ATTACHMENT_BYTES:
                    raise ValueError(
                        f"attachment exceeds {MAX_ATTACHMENT_BYTES} bytes: {norm}"
                    )
                hasher.update(chunk)
                f.write(chunk)

        ext = _ext_of(raw_name)
        final_name = f"{article_key}_{safe_filename(raw_name)}"
        final_abs = abs_dir / final_name
        # Refuse any final path that does not resolve to a location inside
        # the attachment directory.
        try:
            final_abs.resolve().relative_to(abs_dir.resolve())
        except Exception as exc:
            raise ValueError(
                f"path traversal blocked for filename={raw_name!r}"
            ) from exc
        os.replace(tmp, final_abs)
    except BaseException:
        # The file object is already closed here (the ``with`` block has
        # exited), so the temp file can be removed on every platform.
        try:
            os.unlink(tmp)
        except FileNotFoundError:
            pass
        raise

    return DownloadedAttachment(
        file_name=final_name,
        file_ext=ext,
        size_bytes=size,
        file_hash=hasher.hexdigest(),
        file_path=reldir / final_name,
    )
