"""
多格式文档处理器 —— 基于 Docling 将 PDF/DOCX/PPTX/XLSX/图片/Markdown 等解析为带结构元数据的分块。
Multi-format document processor using Docling.

Parses PDF, DOCX, PPTX, XLSX, images, Markdown and more into structured
chunks with heading hierarchy and page number metadata.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from app.config import settings
from app.utils.logger import get_logger

logger = get_logger(__name__)


@dataclass
class ProcessedChunk:
    """A chunk produced by Docling, carrying structural metadata
    (page numbers, heading hierarchy, element type)."""

    content: str  # chunk text (Docling chunk text, or str(chunk) as fallback)
    chunk_index: int  # 0-based position of this chunk within the document
    page_number: int | None = None  # first (lowest) page the chunk appears on, if known
    page_numbers: list[int] = field(default_factory=list)  # all pages the chunk spans, sorted ascending
    heading_hierarchy: list[str] = field(default_factory=list)  # headings from Docling chunk meta (order as provided by Docling)
    element_type: str = ""  # Docling label of the chunk's first doc item (e.g. "table")

@dataclass
class ProcessedDocument:
    """Result of processing a document with Docling: full text,
    structured chunk list, page count and file type."""

    full_text: str  # whole-document text (markdown export, or xlsx cell-text fallback)
    chunks: list[ProcessedChunk]  # structured chunks with page/heading metadata
    page_count: int  # highest page number observed; 0 when no page info is available
    file_type: str  # lowercased file extension without the leading dot
    metadata: dict[str, Any] = field(default_factory=dict)  # extra info (source_format, chunk_count, xlsx_fallback)


class DoclingProcessor:
    """Multi-format document processor built on Docling.

    Parses PDF, DOCX, PPTX, XLSX, images, Markdown and more into structured
    chunks with heading-hierarchy and page-number metadata.  Supports OCR,
    and defers Docling's heavy imports until first use (lazy init).
    """

    def __init__(
        self,
        *,
        max_tokens: int | None = None,
        ocr_enabled: bool | None = None,
    ) -> None:
        """Configure the processor; no heavy work happens here.

        Args:
            max_tokens: Chunk-size limit for the HybridChunker; falls back to
                ``settings.docling_max_tokens`` when None (or falsy).
            ocr_enabled: Whether to run OCR; falls back to
                ``settings.docling_ocr_enabled`` when None.
        """
        self._max_tokens = max_tokens or settings.docling_max_tokens
        self._ocr_enabled = ocr_enabled if ocr_enabled is not None else settings.docling_ocr_enabled

        # Lazy-init: Docling imports are heavy, defer until first use.
        self._converter = None  # set to a docling DocumentConverter in _ensure_initialized()
        self._chunker = None  # set to a docling HybridChunker in _ensure_initialized()
        self._initialized = False

    def _ensure_initialized(self) -> None:
        """Lazy-initialize the Docling converter and chunker (heavy imports)."""
        if self._initialized:
            return

        from docling.datamodel.base_models import InputFormat
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.document_converter import DocumentConverter, PdfFormatOption

        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = self._ocr_enabled

        if self._ocr_enabled:
            # Prefer RapidOCR when installed; otherwise keep Docling's default engine.
            try:
                from docling.datamodel.pipeline_options import RapidOcrOptions
                pipeline_options.ocr_options = RapidOcrOptions()
                logger.info("docling_ocr_engine", engine="RapidOCR")
            except ImportError:
                logger.warning("rapidocr_not_available", fallback="default OCR")

        # Fix: key format_options by the InputFormat enum as documented by the
        # Docling API, not the raw string "pdf" (which only works by accident
        # because InputFormat is a str-based enum).
        self._converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            },
        )

        # Reuse one HybridChunker instance instead of recreating it on every
        # process() call.
        from docling.chunking import HybridChunker
        self._chunker = HybridChunker(max_tokens=self._max_tokens)

        self._initialized = True
        logger.info("docling_initialized", max_tokens=self._max_tokens, ocr=self._ocr_enabled)

    @staticmethod
    def _extract_xlsx_text(file_path: Path) -> str:
        """Fallback: read xlsx cell text directly via openpyxl.

        Docling's markdown export can come back empty for table-only xlsx
        files; this reads every sheet row by row as a safety net.

        Returns:
            The joined cell text, or "" when the workbook cannot be opened.
        """
        import openpyxl

        try:
            wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
        except Exception as exc:
            # Best-effort fallback: log and return empty rather than raising.
            logger.warning("xlsx_openpyxl_open_failed", path=str(file_path), error=str(exc))
            return ""

        lines: list[str] = []
        try:
            # Hoisted out of the loop: whether to emit per-sheet headers.
            multi_sheet = len(wb.sheetnames) > 1
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]
                if multi_sheet:
                    # Label each sheet so text from different sheets stays distinguishable.
                    lines.append(f"## {sheet_name}")
                    lines.append("")
                for row in ws.iter_rows(values_only=True):
                    cells = [str(c).strip() if c is not None else "" for c in row]
                    # Skip fully empty rows.
                    if not any(cells):
                        continue
                    lines.append(" | ".join(cells))
                lines.append("")
        finally:
            wb.close()

        text = "\n".join(lines).strip()
        if text:
            logger.info(
                "xlsx_openpyxl_fallback_ok",
                path=str(file_path),
                chars=len(text),
            )
        return text

    @staticmethod
    def _patch_xlsx_compat(file_path: Path) -> Path:
        """Remove openpyxl-incompatible attributes from xlsx files in-place.

        Java POI may produce xlsx with attributes like ``defaultColWidthPt``,
        ``widthPt`` etc. that openpyxl 3.x does not recognize (causing
        TypeError).  This strips all ``*Pt`` XML attributes from the
        worksheet/styles XML under ``xl/``.

        Returns:
            The same path; the file is rewritten in place only when needed.
        """
        import re
        import zipfile
        from io import BytesIO

        pt_attrs = re.compile(r'\s+\w+Pt="[^"]*"')

        # Pass 1: scan for offending attributes; most files need no rewrite.
        needs_patch = False
        with zipfile.ZipFile(file_path, "r") as zin:
            for name in zin.namelist():
                if name.startswith("xl/") and name.endswith(".xml"):
                    if pt_attrs.search(zin.read(name).decode("utf-8")):
                        needs_patch = True
                        break

        if not needs_patch:
            return file_path

        # Pass 2: rewrite the whole archive with the attributes stripped.
        logger.info("xlsx_compat_patch", path=str(file_path))
        buf = BytesIO()
        with zipfile.ZipFile(file_path, "r") as zin, \
             zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                data = zin.read(item.filename)
                if item.filename.startswith("xl/") and item.filename.endswith(".xml"):
                    text = data.decode("utf-8")
                    if pt_attrs.search(text):
                        data = pt_attrs.sub("", text).encode("utf-8")
                zout.writestr(item, data)

        file_path.write_bytes(buf.getvalue())
        return file_path

    def process(self, file_path: Path) -> ProcessedDocument:
        """Parse a document and produce structured chunks.

        Pipeline: Docling conversion -> HybridChunker chunking -> per-chunk
        page-number / heading extraction -> markdown full-text export (with
        an openpyxl fallback for empty xlsx exports).

        Args:
            file_path: Path to the document file (PDF, DOCX, PPTX, XLSX, image, MD, etc.)

        Returns:
            ProcessedDocument with full text, chunks, and metadata.

        Raises:
            RuntimeError: If the Docling conversion fails.
        """
        self._ensure_initialized()

        file_path = Path(file_path)
        file_type = file_path.suffix.lstrip(".").lower()

        logger.info("docling_processing", path=str(file_path), file_type=file_type)

        # Workaround: POI-generated xlsx may contain unsupported attributes
        # (e.g. defaultColWidthPt) that cause openpyxl to fail.
        # Strip them before passing to Docling.
        if file_type == "xlsx":
            file_path = self._patch_xlsx_compat(file_path)

        # Step 1: convert the document with Docling.  Wrap failures with file
        # context so the error is actionable in logs and upstream handlers.
        try:
            result = self._converter.convert(str(file_path))
        except Exception as exc:
            logger.error(
                "docling_convert_failed",
                path=str(file_path),
                file_type=file_type,
                error=str(exc),
            )
            raise RuntimeError(
                f"Docling 文档转换失败 (file={file_path.name}, type={file_type}): {exc}"
            ) from exc
        doc = result.document

        # Step 2: chunk with the HybridChunker created in _ensure_initialized().
        doc_chunks = list(self._chunker.chunk(doc))

        # Step 3: extract structural metadata from each chunk.  getattr with a
        # default (instead of bare hasattr + direct access) tolerates Docling
        # API versions where an attribute is missing OR present but None --
        # the original hasattr form crashed on `item.prov is None`.
        chunks: list[ProcessedChunk] = []
        all_page_numbers: set[int] = set()

        for i, dc in enumerate(doc_chunks):
            meta = getattr(dc, "meta", None)
            doc_items = getattr(meta, "doc_items", None) or []

            # Page numbers come from each doc item's provenance records.
            page_numbers: set[int] = set()
            for item in doc_items:
                for prov in getattr(item, "prov", None) or []:
                    page_no = getattr(prov, "page_no", None)
                    if page_no is not None:
                        page_numbers.add(page_no)

            sorted_pages = sorted(page_numbers)
            all_page_numbers.update(page_numbers)

            # Heading hierarchy, when the chunker recorded one.
            headings = list(getattr(meta, "headings", None) or [])

            # Element type from the first doc item's label; guard against a
            # None label so we never store the literal string "None".
            element_type = ""
            if doc_items:
                label = getattr(doc_items[0], "label", None)
                if label is not None:
                    element_type = str(label)

            # Chunk text; fall back to str(dc) when no usable text attribute.
            chunk_text = getattr(dc, "text", None)
            if chunk_text is None:
                chunk_text = str(dc)

            chunks.append(ProcessedChunk(
                content=chunk_text,
                chunk_index=i,
                page_number=sorted_pages[0] if sorted_pages else None,
                page_numbers=sorted_pages,
                heading_hierarchy=headings,
                element_type=element_type,
            ))

        # Step 4: full text for summary/graph generation (markdown export).
        full_text = doc.export_to_markdown()

        # Fallback: Docling's markdown export may be empty for xlsx
        # spreadsheets -- read the cells directly via openpyxl instead.
        xlsx_fallback_used = False
        if not full_text.strip() and file_type == "xlsx":
            logger.warning(
                "docling_xlsx_empty_text_fallback",
                path=str(file_path),
            )
            full_text = self._extract_xlsx_text(file_path)
            xlsx_fallback_used = True
            # If Docling also produced no chunks, build one basic chunk from
            # the openpyxl text so downstream indexing still has content.
            if full_text.strip() and not chunks:
                chunks = [
                    ProcessedChunk(
                        content=full_text,
                        chunk_index=0,
                        element_type="table",
                    )
                ]

        # Page count = highest page number observed (0 when unknown).
        page_count = max(all_page_numbers) if all_page_numbers else 0

        logger.info(
            "docling_processed",
            path=str(file_path),
            chunks=len(chunks),
            pages=page_count,
            full_text_chars=len(full_text),
            xlsx_fallback=xlsx_fallback_used,
        )

        return ProcessedDocument(
            full_text=full_text,
            chunks=chunks,
            page_count=page_count,
            file_type=file_type,
            metadata={
                "source_format": file_type,
                "chunk_count": len(chunks),
                "xlsx_fallback": xlsx_fallback_used,
            },
        )
