"""
统一文档分析器 —— 将元数据提取和摘要生成合并为单次 LLM 调用，减少延迟和 token 消耗。
Unified document analyzer – extracts metadata and generates summary in a single LLM call.

Usage
-----
    analyzer = DocumentAnalyzer(llm_client)
    metadata, summary, prompt_info = await analyzer.analyze(full_text)
    # metadata = {"title": "...", "doc_number": "...", ...}
    # summary = "本文件由××发布，主要内容为……"
    # prompt_info = {"system_prompt": "...", "user_prompt_template": "..."}
"""

from __future__ import annotations

import re
from typing import Any

from app.config import settings
from app.infrastructure.llm_client import LLMClient
from app.prompts.document_analysis import SYSTEM_PROMPT, _USER_PROMPT_TEMPLATE, build_user_prompt
from app.utils.logger import get_logger

logger = get_logger(__name__)

# Regex matching C0 control characters (NUL..BS, VT, FF, SO..US) while keeping
# \t, \n and \r; matched characters are stripped from document text before it
# is forwarded to the LLM.
_CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

# Expected metadata fields and their types.
# Each key is a metadata field the LLM is asked to extract; the value is the
# Python type the field must have after DocumentAnalyzer._sanitise_metadata
# runs (strings default to "", lists default to []).
_EXPECTED_FIELDS: dict[str, type] = {
    "title": str,
    "doc_number": str,
    "issuing_org": str,
    "doc_type": str,
    "document_scene_type": str,
    "publish_date": str,
    "signer": str,
    "subject_words": list,
}


class DocumentAnalyzer:
    """Unified document analyzer: extracts structured metadata and generates
    a summary in a single LLM call, reducing latency and token usage.

    All failures are handled gracefully: ``analyze`` never raises. On any
    error the metadata and summary degrade to ``({}, "")``, while the prompt
    templates are still returned, so the ingest pipeline can continue.
    """

    def __init__(
        self,
        llm_client: LLMClient,
        *,
        max_content_chars: int | None = None,
    ) -> None:
        """
        Parameters
        ----------
        llm_client:
            Client used to perform the JSON chat completion.
        max_content_chars:
            Maximum number of document characters forwarded to the LLM.
            Falls back to ``settings.analysis_max_content_chars`` when
            omitted (or falsy).
        """
        self._llm = llm_client
        self._max_chars = max_content_chars or settings.analysis_max_content_chars

    async def analyze(self, text: str) -> tuple[dict[str, Any], str, dict[str, str]]:
        """Extract metadata and generate a summary from document text.

        Parameters
        ----------
        text:
            Full document text. Only the first ``max_content_chars``
            characters (after control-character stripping) are forwarded
            to the LLM.

        Returns
        -------
        tuple[dict, str, dict]
            ``(metadata_dict, summary_string, prompt_info)``. On failure
            the first two elements are ``{}`` and ``""``; ``prompt_info``
            is always populated with the raw prompt templates (without the
            document content filled in).
        """
        prompt_info: dict[str, str] = {
            "system_prompt": SYSTEM_PROMPT,
            "user_prompt_template": _USER_PROMPT_TEMPLATE,
        }

        # Truncate first, then strip control characters and whitespace so an
        # effectively-empty document is detected before spending an LLM call.
        content = _CONTROL_CHAR_RE.sub("", text[: self._max_chars]).strip()
        if not content:
            logger.warning("document_analysis_empty_text")
            return {}, "", prompt_info

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": build_user_prompt(content, self._max_chars)},
        ]

        try:
            result = await self._llm.chat_json(
                messages,
                temperature=0.1,
                max_tokens=1024,
                # Disable model "thinking" output: only structured JSON is wanted.
                extra_body={"enable_thinking": False},
            )

            # Parse metadata; tolerate a malformed (non-dict) "metadata" value.
            raw_meta = result.get("metadata", {})
            if not isinstance(raw_meta, dict):
                raw_meta = {}
            metadata = self._sanitise_metadata(raw_meta)

            # Parse summary; coerce non-string truthy values, drop falsy ones.
            summary = result.get("summary", "")
            if not isinstance(summary, str):
                summary = str(summary) if summary else ""
            summary = summary.strip()

            logger.info(
                "document_analyzed",
                title=metadata.get("title", "")[:50],
                doc_number=metadata.get("doc_number", ""),
                summary_length=len(summary),
            )
            return metadata, summary, prompt_info

        except Exception as exc:
            # Deliberate broad catch: analysis is best-effort and must never
            # break the ingest pipeline; the failure is logged and degraded.
            logger.warning("document_analysis_failed", error=str(exc))
            return {}, "", prompt_info

    @staticmethod
    def _sanitise_metadata(raw: dict[str, Any]) -> dict[str, Any]:
        """Coerce every expected field to its declared type.

        String fields: kept as-is when already ``str``; otherwise stringified
        and stripped, with missing/``None`` values becoming ``""``. List
        fields: lists pass through, tuples are converted, anything else
        becomes ``[]``.
        """
        out: dict[str, Any] = {}
        for field, expected_type in _EXPECTED_FIELDS.items():
            val = raw.get(field)
            if isinstance(val, expected_type):
                out[field] = val
            elif expected_type is list:
                out[field] = list(val) if isinstance(val, (list, tuple)) else []
            else:
                out[field] = str(val).strip() if val is not None else ""
        return out
