from __future__ import annotations
import hashlib
from urllib.parse import urlparse, urlunparse, urlencode, parse_qsl

# Schemes that normalize_url() accepts; every other scheme is rejected.
ALLOWED_SCHEMES = {"http", "https"}


def normalize_url(u: str, *, force_https: bool = False) -> str:
    """Return a canonical form of an http(s) URL.

    Canonicalization steps: lowercase the scheme and the host[:port] part,
    drop the fragment and any ``;params``, sort the query pairs, and strip
    trailing slashes from the path (an empty path becomes ``/``). The
    original scheme is kept by default.

    Set ``force_https=True`` to rewrite the scheme to https. This is meant
    only for cross-protocol ``url_hash`` de-duplication (so the http:// and
    https:// versions of one URL are not treated as two articles). Actual
    attachment downloads keep the original scheme, because many government
    sites serve HTTPS certificates with a hostname mismatch.

    Args:
        u: The URL to normalize; ``None``/empty is treated as ``""``.
        force_https: When true, always emit ``https`` as the scheme.

    Returns:
        The normalized URL string.

    Raises:
        ValueError: If the scheme is not http/https (rejects e.g. file://
            and javascript:), or if the URL has no host.
    """
    u = (u or "").strip()
    p = urlparse(u)
    if p.scheme.lower() not in ALLOWED_SCHEMES:
        raise ValueError(f"refuse non-http scheme: {p.scheme!r}")
    scheme = "https" if force_https else p.scheme.lower()
    # Check the parsed hostname rather than the raw netloc so that hostless
    # authorities such as ":80" or "user@" are rejected too.
    if not p.hostname:
        raise ValueError(f"url missing host: {u!r}")
    # Only host[:port] is case-insensitive; the userinfo part
    # ("user:password@") must keep its case.
    userinfo, at, hostport = p.netloc.rpartition("@")
    host = f"{userinfo}{at}{hostport.lower()}"
    path = p.path.rstrip("/") or "/"
    # Round-trip with surrogateescape so percent-escapes that are not valid
    # UTF-8 (e.g. GBK-encoded values) survive byte-for-byte instead of being
    # replaced by U+FFFD. Output for valid UTF-8 queries is unchanged.
    pairs = sorted(
        parse_qsl(p.query, keep_blank_values=True, errors="surrogateescape")
    )
    q = urlencode(pairs, errors="surrogateescape")
    return urlunparse((scheme, host, path, "", q, ""))


def url_hash(u: str) -> str:
    """Return the SHA-256 hex digest (64 chars) of the canonicalized URL.

    The URL is normalized with ``force_https=True`` before hashing, so the
    http:// and https:// versions of the same endpoint hash identically and
    de-duplication keeps working when a site flips between protocols.
    """
    canonical = normalize_url(u, force_https=True)
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()


def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 hex digest of ``data``.

    Used for the attachment ``file_hash`` (consumed by Plan 03).
    """
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()