from __future__ import annotations
import hashlib
import os
import tempfile
from dataclasses import dataclass
from datetime import datetime
from email.message import Message
from pathlib import PurePosixPath
from urllib.parse import unquote

import httpx
from slugify import slugify

from govcrawler.settings import get_settings
from govcrawler.storage.paths import build_reldir, to_os_path
from govcrawler.utils.url_norm import normalize_url

# Hard cap on a single attachment; exceeding it aborts the download mid-stream.
MAX_ATTACHMENT_BYTES = 200 * 1024 * 1024   # 200 MB
# Streaming read size passed to httpx's iter_bytes().
CHUNK = 64 * 1024
# Extensions considered safe to store.
# NOTE(review): SAFE_EXT is not referenced anywhere in this module —
# confirm whether enforcement happens elsewhere or is missing.
SAFE_EXT = {"pdf", "doc", "docx", "xls", "xlsx", "zip", "rar", "txt"}


@dataclass
class DownloadedAttachment:
    """Metadata describing one successfully downloaded attachment."""

    # Final stored file name: "<article_key>_<sanitized original name>".
    file_name: str
    # Lowercased extension taken from the server-advertised / URL name
    # (may be "" when the name has no dot).
    file_ext: str
    # Total bytes written to disk.
    size_bytes: int
    # SHA-256 hex digest of the payload.
    file_hash: str
    # Storage-relative POSIX path (relative to settings.data_dir).
    file_path: PurePosixPath


def parse_disposition_filename(header: str | None) -> str | None:
    if not header:
        return None
    m = Message()
    m["content-disposition"] = header
    name = m.get_param("filename*", header="content-disposition") or m.get_param(
        "filename", header="content-disposition"
    )
    if not name:
        return None
    if isinstance(name, tuple):
        charset, _, raw = name
        try:
            # Python's email lib decodes %XX as latin-1 bytes; re-encode then decode
            # with the declared charset to recover the original (e.g. utf-8) text.
            if isinstance(raw, str):
                cs = (charset or "utf-8").lower() or "utf-8"
                try:
                    return raw.encode("latin-1").decode(cs)
                except Exception:
                    return unquote(raw)
            return raw
        except Exception:
            return raw
    return name


def safe_filename(name: str, max_len: int = 180) -> str:
    """Sanitize *name* for use on disk, keeping unicode text readable.

    Only filesystem-hostile characters (path separators, wildcards, control
    chars) are replaced with underscores; the result is truncated to
    *max_len* and never empty.
    """
    slug = slugify(
        name,
        allow_unicode=True,
        separator="_",
        regex_pattern=r'[\\/:*?"<>|\x00-\x1f]',
    )
    # Slicing is a no-op when the slug is already short enough.
    return slug[:max_len] or "attachment"


def _ext_of(name: str) -> str:
    _, dot, ext = name.rpartition(".")
    return ext.lower() if dot else ""


def download_attachment(
    url: str,
    *,
    site: str,
    column: str,
    when: datetime,
    article_key: str,
    timeout_s: float | None = None,
) -> DownloadedAttachment:
    """Stream-download *url* into the site/column attachments directory.

    The payload is written to a temp file inside the target directory (so the
    final ``os.replace`` is a same-filesystem atomic rename), hashed with
    SHA-256 while streaming, and capped at MAX_ATTACHMENT_BYTES. The final
    path is verified to stay inside the directory before the rename.

    Args:
        url: Attachment URL; normalized before fetching.
        site / column / when: Components of the storage-relative directory.
        article_key: Prefix for the stored file name, tying it to its article.
        timeout_s: Per-request timeout; defaults to settings.attachment_timeout_s.

    Returns:
        DownloadedAttachment with the stored name, extension, size, hash and
        storage-relative path.

    Raises:
        httpx.HTTPError: on connection/HTTP-status failures.
        ValueError: when the size cap is exceeded or the resolved target
            escapes the attachment directory.
    """
    norm = normalize_url(url)
    settings = get_settings()
    timeout = float(timeout_s if timeout_s is not None else settings.attachment_timeout_s)
    reldir = build_reldir(site, column, when, "attachments")
    abs_dir = to_os_path(settings.data_dir, reldir)
    abs_dir.mkdir(parents=True, exist_ok=True)

    hasher = hashlib.sha256()
    size = 0
    headers = {"User-Agent": settings.user_agent}
    # Inject cookies from the same pool that fetcher.http_client uses, so
    # ctct-shielded hosts (gdqy.gov.cn etc.) accept attachment downloads
    # after playwright primed them. Without this, /attachment/*.pdf URLs
    # under such hosts would return 412 challenge HTML and the bare
    # httpx.stream below would happily save it as a 'pdf' — but the file
    # would actually be the JS challenge page. Pulling cookies from the
    # pool keeps the request authenticated.
    try:
        from urllib.parse import urlparse as _up
        from govcrawler.cookies import get_default_store
        pool_cookies = get_default_store().get(_up(norm).netloc.lower()) or {}
    except Exception:
        pool_cookies = {}

    fd, tmp = tempfile.mkstemp(prefix=".dl_", dir=str(abs_dir))
    raw_name_fallback = norm.rsplit("/", 1)[-1].split("?", 1)[0] or "attachment.bin"
    raw_name: str = raw_name_fallback
    try:
        # Open the descriptor FIRST: previously a connection error (or a
        # non-2xx from raise_for_status) raised before os.fdopen() ran, so
        # the mkstemp fd leaked — the file was unlinked but the descriptor
        # stayed open. The outer `with` now guarantees it is closed.
        with os.fdopen(fd, "wb") as f:
            with httpx.stream(
                "GET", norm, headers=headers, cookies=pool_cookies,
                follow_redirects=True, timeout=timeout,
            ) as r:
                r.raise_for_status()
                disp_name = parse_disposition_filename(r.headers.get("content-disposition"))
                raw_name = disp_name or raw_name_fallback
                for chunk in r.iter_bytes(chunk_size=CHUNK):
                    if not chunk:
                        continue
                    size += len(chunk)
                    if size > MAX_ATTACHMENT_BYTES:
                        raise ValueError(
                            f"attachment exceeds {MAX_ATTACHMENT_BYTES} bytes: {norm}"
                        )
                    hasher.update(chunk)
                    f.write(chunk)
    except Exception:
        # Best-effort cleanup of the partial temp file; re-raise the cause.
        try:
            os.unlink(tmp)
        except FileNotFoundError:
            pass
        raise

    # NOTE(review): SAFE_EXT is defined at module level but never checked
    # here — confirm whether unknown extensions should be rejected.
    ext = _ext_of(raw_name)
    final_name = f"{article_key}_{safe_filename(raw_name)}"
    final_abs = abs_dir / final_name
    try:
        # Defense in depth: safe_filename() already strips separators, but
        # verify the resolved target really stays inside abs_dir.
        final_abs.resolve().relative_to(abs_dir.resolve())
    except Exception:
        os.unlink(tmp)
        raise ValueError(f"path traversal blocked for filename={raw_name!r}")
    os.replace(tmp, final_abs)

    return DownloadedAttachment(
        file_name=final_name,
        file_ext=ext,
        size_bytes=size,
        file_hash=hasher.hexdigest(),
        file_path=reldir / final_name,
    )
