from __future__ import annotations
import hashlib
import os
import tempfile
from dataclasses import dataclass
from datetime import datetime
from email.message import Message
from pathlib import PurePosixPath
from urllib.parse import unquote

import httpx

from govcrawler.settings import get_settings
from govcrawler.storage.filenames import dedupe_filename, safe_filename, with_best_extension
from govcrawler.storage.paths import build_reldir, to_os_path
from govcrawler.utils.url_norm import normalize_url

MAX_ATTACHMENT_BYTES = 200 * 1024 * 1024   # 200 MB
CHUNK = 64 * 1024
SAFE_EXT = {"pdf", "doc", "docx", "xls", "xlsx", "zip", "rar", "txt"}


@dataclass
class DownloadedAttachment:
    """Result record for an attachment persisted to local storage."""

    # Final on-disk filename (may carry a short digest suffix on collision).
    file_name: str
    # Lowercased extension without the dot; "" when the name has none.
    file_ext: str
    # Total bytes written to disk.
    size_bytes: int
    # SHA-256 hex digest of the file contents.
    file_hash: str
    # Storage-relative POSIX path (reldir / file_name), portable across OSes.
    file_path: PurePosixPath
    # URL the bytes came from, when known.
    source_url: str | None = None


def parse_disposition_filename(header: str | None) -> str | None:
    """Extract a filename from a Content-Disposition header value.

    Handles both the plain ``filename=`` parameter and the RFC 5987/2231
    extended ``filename*=`` form (charset-annotated, percent-encoded).
    Returns ``None`` when the header is absent or carries no filename.
    """
    if not header:
        return None
    # Reuse the stdlib email parser for the parameter grammar: feed the raw
    # header into a synthetic message and read the parameter back out.
    m = Message()
    m["content-disposition"] = header
    name = m.get_param("filename*", header="content-disposition") or m.get_param(
        "filename", header="content-disposition"
    )
    if not name:
        return None
    # RFC 2231-encoded parameters come back as a (charset, language, value)
    # tuple rather than a plain string.
    if isinstance(name, tuple):
        charset, _, raw = name
        try:
            # Python's email lib decodes %XX as latin-1 bytes; re-encode then decode
            # with the declared charset to recover the original (e.g. utf-8) text.
            if isinstance(raw, str):
                cs = (charset or "utf-8").lower() or "utf-8"
                try:
                    return raw.encode("latin-1").decode(cs)
                except Exception:
                    # Charset round-trip failed (bad declaration or non-latin-1
                    # chars); fall back to plain percent-decoding.
                    return unquote(raw)
            return raw
        except Exception:
            return raw
    if isinstance(name, str):
        decoded = unquote(name)
        # Some servers send an already URL-encoded value inside the regular
        # filename= parameter instead of RFC 5987 filename*=. Browsers decode
        # this for downloads; match that behavior for stored attachment names.
        return decoded
    # Defensive fallback for any other type get_param might return.
    return name


def _ext_of(name: str) -> str:
    _, dot, ext = name.rpartition(".")
    return ext.lower() if dot else ""


def _dedupe_filename(abs_dir, name: str, digest: str) -> str:
    """Thin local alias for the shared helper: returns a filesystem-safe name,
    suffixed with a short digest only when it would collide in *abs_dir*."""
    return dedupe_filename(abs_dir, name, digest)


def save_attachment_bytes(
    data: bytes,
    *,
    headers: dict | None = None,
    fallback_name: str = "attachment.bin",
    site: str,
    column: str,
    when: datetime,
    article_key: str,
    source_url: str | None = None,
) -> DownloadedAttachment:
    """Persist attachment bytes already fetched by a specialized client.

    Writes atomically (temp file in the target dir + ``os.replace``) into the
    site/column/date attachments directory and returns a record describing
    the stored file.

    Args:
        data: The full attachment payload.
        headers: Optional response headers; only Content-Disposition is read,
            to recover the server-suggested filename.
        fallback_name: Used when no disposition filename is available.
        site/column/when: Determine the storage subdirectory.
        article_key: Accepted for interface parity; not used here.
        source_url: Recorded on the result when known.

    Raises:
        ValueError: if the payload exceeds MAX_ATTACHMENT_BYTES, or the
            resolved filename would escape the target directory.
    """
    headers = headers or {}
    if len(data) > MAX_ATTACHMENT_BYTES:
        raise ValueError(f"attachment exceeds {MAX_ATTACHMENT_BYTES} bytes")
    settings = get_settings()
    reldir = build_reldir(site, column, when, "attachments")
    abs_dir = to_os_path(settings.data_dir, reldir)
    abs_dir.mkdir(parents=True, exist_ok=True)

    digest = hashlib.sha256(data).hexdigest()
    # `headers` is a plain dict here (unlike httpx.Headers it is NOT
    # case-insensitive), so accept the canonical capitalization too.
    disposition = headers.get("content-disposition") or headers.get(
        "Content-Disposition"
    )
    raw_name = (
        parse_disposition_filename(disposition)
        or fallback_name
        or "attachment.bin"
    )
    final_name = _dedupe_filename(abs_dir, raw_name, digest)
    final_abs = abs_dir / final_name
    try:
        # Path-traversal guard: the resolved target must stay inside abs_dir.
        final_abs.resolve().relative_to(abs_dir.resolve())
    except Exception:
        raise ValueError(f"path traversal blocked for filename={raw_name!r}")

    # Atomic write: stage into a temp file in the same directory, then rename.
    fd, tmp = tempfile.mkstemp(prefix=".dl_", dir=str(abs_dir))
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        os.replace(tmp, final_abs)
    except Exception:
        try:
            os.unlink(tmp)
        except FileNotFoundError:
            pass
        raise

    return DownloadedAttachment(
        file_name=final_name,
        file_ext=_ext_of(final_name),
        size_bytes=len(data),
        file_hash=digest,
        file_path=reldir / final_name,
        source_url=source_url,
    )


def download_attachment(
    url: str,
    *,
    site: str,
    column: str,
    when: datetime,
    article_key: str,
    preferred_name: str | None = None,
    timeout_s: float | None = None,
) -> DownloadedAttachment:
    """Stream-download an attachment; enforce size cap + path-traversal guard.

    Streams the response body in CHUNK-sized pieces into a temp file in the
    target directory, hashing as it goes, then atomically renames it into
    place once the filename is finalized.

    Args:
        url: Attachment URL; normalized before fetching.
        site/column/when: Determine the storage subdirectory.
        article_key: Accepted for interface parity; not used here.
        preferred_name: Caller-supplied name; when given, the best extension
            is chosen from it, the disposition name, the URL tail, and the
            Content-Type.
        timeout_s: Per-request timeout; defaults to settings.attachment_timeout_s.

    Raises:
        ValueError: if the body exceeds MAX_ATTACHMENT_BYTES or the resolved
            filename would escape the target directory.
        httpx.HTTPStatusError: on non-2xx responses (via raise_for_status).
    """
    norm = normalize_url(url)
    settings = get_settings()
    timeout = float(timeout_s if timeout_s is not None else settings.attachment_timeout_s)
    reldir = build_reldir(site, column, when, "attachments")
    abs_dir = to_os_path(settings.data_dir, reldir)
    abs_dir.mkdir(parents=True, exist_ok=True)

    hasher = hashlib.sha256()
    size = 0
    headers = {"User-Agent": settings.user_agent}
    # Inject cookies from the same pool that fetcher.http_client uses, so
    # ctct-shielded hosts (gdqy.gov.cn etc.) accept attachment downloads
    # after playwright primed them. Without this, /attachment/*.pdf URLs
    # under such hosts would return 412 challenge HTML and the bare
    # httpx.stream below would happily save it as a 'pdf' — but the file
    # would actually be the JS challenge page. Pulling cookies from the
    # pool keeps the request authenticated.
    try:
        from urllib.parse import urlparse as _up
        from govcrawler.cookies import get_default_store
        pool_cookies = get_default_store().get(_up(norm).netloc.lower()) or {}
    except Exception:
        pool_cookies = {}
    raw_name_fallback = norm.rsplit("/", 1)[-1].split("?", 1)[0] or "attachment.bin"
    raw_name: str = raw_name_fallback
    fd, tmp = tempfile.mkstemp(prefix=".dl_", dir=str(abs_dir))
    try:
        # Take ownership of the fd immediately: the previous version only
        # fdopen()'d after the request succeeded, leaking the descriptor
        # whenever httpx.stream / raise_for_status failed first. With the
        # file object opened up front, it is closed on ANY exit path.
        with os.fdopen(fd, "wb") as f:
            with httpx.stream(
                "GET", norm, headers=headers, cookies=pool_cookies,
                follow_redirects=True, timeout=timeout,
            ) as r:
                r.raise_for_status()
                disp_name = parse_disposition_filename(r.headers.get("content-disposition"))
                raw_name = (
                    with_best_extension(
                        preferred_name,
                        disp_name,
                        raw_name_fallback,
                        r.headers.get("content-type"),
                    )
                    if preferred_name
                    else (disp_name or raw_name_fallback)
                )
                for chunk in r.iter_bytes(chunk_size=CHUNK):
                    if not chunk:
                        continue
                    size += len(chunk)
                    # Enforce the cap mid-stream so a huge body is aborted
                    # early instead of fully downloaded.
                    if size > MAX_ATTACHMENT_BYTES:
                        raise ValueError(
                            f"attachment exceeds {MAX_ATTACHMENT_BYTES} bytes: {norm}"
                        )
                    hasher.update(chunk)
                    f.write(chunk)
    except Exception:
        try:
            os.unlink(tmp)
        except FileNotFoundError:
            pass
        raise

    digest = hasher.hexdigest()
    final_name = _dedupe_filename(abs_dir, raw_name, digest)
    final_abs = abs_dir / final_name
    try:
        # Path-traversal guard: the resolved target must stay inside abs_dir.
        final_abs.resolve().relative_to(abs_dir.resolve())
    except Exception:
        os.unlink(tmp)
        raise ValueError(f"path traversal blocked for filename={raw_name!r}")
    os.replace(tmp, final_abs)

    return DownloadedAttachment(
        file_name=final_name,
        # Derive the extension from the name actually stored, matching
        # save_attachment_bytes (dedupe may alter the name on collision).
        file_ext=_ext_of(final_name),
        size_bytes=size,
        file_hash=digest,
        file_path=reldir / final_name,
        source_url=norm,
    )
