"""
元数据提取器 —— 基于 LLM 从政务文档文本中自动提取标题、文号、发文机关等结构化元数据。
LLM-based metadata extractor for government documents.

Usage
-----
    extractor = MetadataExtractor(llm_client)
    meta = await extractor.extract(full_text)
    # meta = {"title": "...", "doc_number": "...", ...}
"""

from __future__ import annotations

import re
from typing import Any

from app.infrastructure.llm_client import LLMClient
from app.prompts.metadata_extraction import SYSTEM_PROMPT, build_user_prompt
from app.utils.logger import get_logger

# Module-level logger; call sites in this file pass an event name plus keyword fields.
logger = get_logger(__name__)

# Number of leading characters of the document that are forwarded to the LLM
# for metadata extraction (fixed at 2000).
# This is deliberately separate from settings.summary_max_content_chars
# (used for summary generation, default 8000): metadata fields such as the
# title and document number appear in the document header, so less text is
# needed.
_MAX_CONTENT_CHARS = 2000

# Matches non-printable ASCII control characters (\x00-\x08, \x0B-\x0C,
# \x0E-\x1F) so they can be stripped before text is sent to the LLM.
# Tab (\x09), line feed (\x0A) and carriage return (\x0D) are not matched.
_CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

# Fields we expect the LLM to return, mapped to the Python type each value
# must have after normalisation (see MetadataExtractor._sanitise, which
# iterates this mapping).
_EXPECTED_FIELDS = {
    "title": str,
    "doc_number": str,
    "issuing_org": str,
    "doc_type": str,
    "document_scene_type": str,
    "publish_date": str,
    "signer": str,
    "subject_words": list,
}


class MetadataExtractor:
    """Extract structured metadata from government document text using an LLM.

    The extractor sends the head of the document to ``LLMClient.chat_json``
    and normalises the reply (title, document number, issuing organisation,
    etc.) to the field set and types declared in ``_EXPECTED_FIELDS``.

    ``chat_json`` is expected to return an already-parsed JSON object; any
    other reply type is treated as a failure.

    All failures are handled gracefully: ``extract`` never raises; it returns
    an empty dict on any error so the ingest pipeline can continue.
    """

    def __init__(self, llm_client: LLMClient) -> None:
        """Store the LLM client used for all extraction calls."""
        self._llm = llm_client

    @staticmethod
    def _sanitize_text(text: str) -> str:
        """Return *text* with non-printable control characters removed.

        Only the characters matched by ``_CONTROL_CHAR_RE`` are stripped, so
        that stray control bytes are not forwarded to the LLM.  This is input
        hygiene only; it does not alter any printable content.
        """
        return _CONTROL_CHAR_RE.sub("", text)

    async def extract(self, text: str) -> dict[str, Any]:
        """Extract metadata fields from document text.

        Parameters
        ----------
        text:
            Full document text (only the first ``_MAX_CONTENT_CHARS``
            characters are forwarded to the LLM).  Non-string input such as
            ``None`` is treated like empty text.

        Returns
        -------
        dict
            On success, a dict with the keys title, doc_number, issuing_org,
            doc_type, document_scene_type, publish_date, signer (each a
            stripped ``str``, ``""`` when missing) and subject_words (a
            ``list`` of non-empty strings, ``[]`` when missing).
            On empty input or any failure, an empty dict ``{}``.
            Never raises.
        """
        # Guard the slice: a non-string would raise before reaching the
        # try-block below and break the "never raises" contract.
        content = ""
        if isinstance(text, str):
            content = self._sanitize_text(text[:_MAX_CONTENT_CHARS]).strip()
        if not content:
            logger.warning("metadata_extraction_empty_text")
            return {}

        try:
            # Prompt construction lives inside the try-block so that a
            # failure in the prompt builder is also downgraded to ``{}``.
            messages = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": build_user_prompt(content)},
            ]

            result = await self._llm.chat_json(
                messages,
                temperature=0.1,
                max_tokens=512,
                # Disable chain-of-thought / thinking for this task –
                # speed matters more than reasoning depth here.
                extra_body={"enable_thinking": False},
            )

            # Valid JSON is not necessarily an object (it may be a list or a
            # scalar); reject those explicitly instead of failing later with
            # an opaque AttributeError.
            if not isinstance(result, dict):
                raise TypeError(
                    "expected a JSON object from the LLM, "
                    f"got {type(result).__name__}"
                )

            # Normalise: ensure every expected field exists with the right type.
            cleaned = self._sanitise(result)

            logger.info(
                "metadata_extracted",
                title=cleaned.get("title", "")[:50],
                doc_number=cleaned.get("doc_number", ""),
                issuing_org=cleaned.get("issuing_org", ""),
                doc_type=cleaned.get("doc_type", ""),
            )
            return cleaned

        except Exception as exc:
            # Deliberate best-effort boundary: log and continue the pipeline.
            logger.warning("metadata_extraction_failed", error=str(exc))
            return {}

    # ── helpers ──────────────────────────────────────────────────────────────

    @staticmethod
    def _clean_string_list(values: Any) -> list[str]:
        """Return *values* as a list of stripped, non-empty strings.

        Anything that is not a list or tuple yields ``[]``.  Within the
        sequence, only ``str`` and non-boolean numeric items are kept (numbers
        are converted with ``str``); ``None``, nested containers and blank
        strings are dropped.
        """
        if not isinstance(values, (list, tuple)):
            return []
        words: list[str] = []
        for item in values:
            # bool is a subclass of int, so exclude it explicitly.
            if isinstance(item, bool) or not isinstance(item, (str, int, float)):
                continue
            word = str(item).strip()
            if word:
                words.append(word)
        return words

    @staticmethod
    def _sanitise(raw: dict[str, Any]) -> dict[str, Any]:
        """Ensure every expected field is present and has the right type.

        Parameters
        ----------
        raw:
            The JSON object returned by the LLM.

        Returns
        -------
        dict
            Exactly the keys of ``_EXPECTED_FIELDS``.  String fields are
            stripped of surrounding whitespace (``""`` when the value is
            missing or ``None``; other types are converted with ``str``).
            List fields are normalised to a list of non-empty strings.
            Unexpected keys in *raw* are discarded.
        """
        out: dict[str, Any] = {}
        for field, expected_type in _EXPECTED_FIELDS.items():
            val = raw.get(field)
            if expected_type is list:
                out[field] = MetadataExtractor._clean_string_list(val)
            elif expected_type is not str and isinstance(val, expected_type):
                # Non-string, non-list declared types are passed through as-is.
                out[field] = val
            else:
                # Strip both native strings and coerced values consistently.
                out[field] = "" if val is None else str(val).strip()
        return out