"""Text normalization utilities for government document processing.

Provides ``normalize_title`` — a deterministic title normalizer used for
document deduplication, placeholder merging, and governance-chain matching.

Usage::

    from app.utils.text_normalize import normalize_title

    norm = normalize_title("《关于优化营商环境的 通知》")
    # => "《关于优化营商环境的 通知》"  (书名号 are preserved, never stripped)

政务文档标题规范化工具模块。
对文档标题执行确定性的规范化处理（括号统一、引号统一、空白折叠等），
用于文档去重、占位节点合并和政策链匹配。
规范化结果是稳定的：相同输入始终产生相同输出。
"""

from __future__ import annotations

import re
import unicodedata


# ── Bracket / quote normalization maps ──────────────────────────────────────

# Bracket variants → ASCII brackets.
# ``normalize_title`` applies NFKC before this table, which already folds the
# full-width forms (U+FF08/FF09/FF3B/FF3D); they stay listed so the table is
# also correct when used on its own.  The lenticular brackets 【】 are not
# changed by NFKC and are handled only here.
_BRACKET_MAP = str.maketrans({
    "\uff08": "(",   # FULLWIDTH LEFT PARENTHESIS        → (
    "\uff09": ")",   # FULLWIDTH RIGHT PARENTHESIS       → )
    "\u3010": "[",   # LEFT BLACK LENTICULAR BRACKET     → [
    "\u3011": "]",   # RIGHT BLACK LENTICULAR BRACKET    → ]
    "\uff3b": "[",   # FULLWIDTH LEFT SQUARE BRACKET     → [
    "\uff3d": "]",   # FULLWIDTH RIGHT SQUARE BRACKET    → ]
})

# Curly quotation marks → 书名号 (book-title marks 《 》).
# Double and single curly quotes map to the same pair, so a title wrapped in
# either quote style compares equal to the same title wrapped in 《》.
# NOTE(review): U+2019 is also the typographic apostrophe in Latin text
# ("China’s" would become "China》s") — confirm this is acceptable for the
# titles this module sees.
_QUOTE_MAP = str.maketrans({
    "\u201c": "\u300a",  # LEFT DOUBLE QUOTATION MARK   → 《
    "\u201d": "\u300b",  # RIGHT DOUBLE QUOTATION MARK  → 》
    "\u2018": "\u300a",  # LEFT SINGLE QUOTATION MARK   → 《
    "\u2019": "\u300b",  # RIGHT SINGLE QUOTATION MARK  → 》
})

# Any run of whitespace — including a lone tab or line break — is replaced by
# a single space, so titles that differ only in the *kind* of whitespace
# between words normalize to the same string.
_MULTI_SPACE_RE = re.compile(r"\s+")

# Leading / trailing punctuation to strip (excluding 书名号 《》)
_EDGE_PUNCT_RE = re.compile(
    r"^[\s\u3000,，.。;；:：!！?？、\-—–]+|[\s\u3000,，.。;；:：!！?？、\-—–]+$"
)

# Whitespace directly before or after a 书名号 character
_TITLE_MARK_SPACE_RE = re.compile(r"\s*([《》])\s*")


def normalize_title(title: str) -> str:
    """Normalize a document title for deduplication and matching.

    The transformations are applied in this order:

    1. Unicode NFKC normalization (full-width letters, digits and ASCII
       punctuation become half-width; the ideographic space becomes a
       plain space)
    2. Strip leading/trailing whitespace
    3. Map bracket variants （）【】［］ to ``()`` / ``[]``
    4. Map curly quotes to 书名号 《 》
    5. Strip leading/trailing punctuation and whitespace (书名号 and
       brackets are kept)
    6. Replace every whitespace run — including a single tab or line
       break — with one space
    7. Remove whitespace directly before or after 《 and 》

    The core title text (years, serial numbers, attachment markers) is
    never removed, to avoid false merges.

    Parameters
    ----------
    title:
        Raw title string.  ``None`` and the empty string are tolerated.

    Returns
    -------
    Normalized title string.  Returns an empty string for ``None``,
    empty, or blank input.
    """
    if not title:
        return ""

    # 1-2. NFKC first, so every later step only has to deal with the
    #      half-width forms; then drop surrounding whitespace.
    text = unicodedata.normalize("NFKC", title).strip()

    # 3-4. Single-character substitutions: brackets, then quotes.
    text = text.translate(_BRACKET_MAP).translate(_QUOTE_MAP)

    # 5. Strip edge punctuation (runs after the quote mapping, so quotes
    #    that became 《》 are kept rather than stripped).
    text = _EDGE_PUNCT_RE.sub("", text)

    # 6. Fold every whitespace run to one plain space.
    text = _MULTI_SPACE_RE.sub(" ", text)

    # 7. Tighten whitespace around 书名号.
    text = _TITLE_MARK_SPACE_RE.sub(r"\1", text)

    return text.strip()
