"""
PDF 文档处理器 —— 使用 PyMuPDF 提取 PDF 文本，清理页眉页脚和水印，规范化空白字符。
Document processor – extract text from PDF files using PyMuPDF, strip page headers/footers and watermarks, and normalize whitespace.

注意：此模块已被 docling_processor.py 替代，仅用于回退兼容。
NOTE: This module has been superseded by docling_processor.py and is kept only for fallback compatibility.
"""

from __future__ import annotations

import re
from pathlib import Path
from typing import Any

import fitz  # PyMuPDF

from app.utils.logger import get_logger

# Module-level logger obtained from the project's get_logger helper, named after this module.
logger = get_logger(__name__)

# Header/footer patterns (page-number formats and the like) that are common in
# government documents; a line matching any of them is removed during cleaning.
_HEADER_FOOTER_PATTERNS = [
    re.compile(expr)
    for expr in (
        r"^[-—]\s*\d+\s*[-—]$",  # dash-wrapped page numbers, e.g. "— 1 —"
        r"^\s*第\s*\d+\s*页\s*$",  # "page X" written in Chinese
        r"^\s*共\s*\d+\s*页\s*$",  # "X pages in total" written in Chinese
        r"^\s*- \d+ -\s*$",  # hyphen-wrapped page numbers, e.g. "- 1 -"
    )
]

# Watermark phrases; every match is removed from the text (case-insensitive).
_WATERMARK_PATTERNS = [
    re.compile(expr, re.IGNORECASE)
    for expr in (r"内部资料|仅供参考|草稿|DRAFT",)
]


class DocumentProcessor:
    """Extract and clean text from PDF files using PyMuPDF.

    Text is pulled page by page. Depending on the constructor flags, lines
    that look like page numbers and known watermark phrases are removed, and
    whitespace is always normalized.
    """

    def __init__(self, *, remove_headers: bool = True, remove_watermarks: bool = True):
        """Configure which cleaning passes run during text extraction.

        Args:
            remove_headers: Drop lines matching ``_HEADER_FOOTER_PATTERNS``.
            remove_watermarks: Delete text matching ``_WATERMARK_PATTERNS``.
        """
        self._remove_headers = remove_headers
        self._remove_watermarks = remove_watermarks

    def extract_text(self, file_path: str | Path) -> dict[str, Any]:
        """Extract text from a PDF file.

        Args:
            file_path: Location of the PDF to read.

        Returns:
            {
                "full_text": str,          # cleaned page texts joined by blank lines
                "pages": list[str],        # cleaned text of each non-empty page
                "page_count": int,         # total pages in the document
                "metadata": dict,          # title, author and page_count
            }

            Pages whose cleaned text is empty are omitted from ``pages``, so
            ``len(pages)`` may be smaller than ``page_count``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Use a context manager to ensure the file handle is released promptly.
        with fitz.open(str(file_path)) as doc:
            total_pages = len(doc)
            # PyMuPDF may expose ``metadata`` as None, and individual values
            # may be None as well; fall back to empty strings in both cases.
            raw_metadata = doc.metadata or {}
            doc_metadata = {
                "title": raw_metadata.get("title") or "",
                "author": raw_metadata.get("author") or "",
                "page_count": total_pages,
            }

            pages: list[str] = []
            for page in doc:
                text = page.get_text("text")

                if self._remove_headers:
                    text = self._clean_headers_footers(text)
                if self._remove_watermarks:
                    text = self._clean_watermarks(text)

                # _normalize_text already strips the result, so an empty
                # string here means the page had no usable text.
                text = self._normalize_text(text)
                if text:
                    pages.append(text)

        full_text = "\n\n".join(pages)

        logger.info(
            "document_extracted",
            path=str(file_path),
            pages=len(pages),
            chars=len(full_text),
        )

        return {
            "full_text": full_text,
            "pages": pages,
            "page_count": total_pages,  # cached value; the document is already closed
            "metadata": doc_metadata,
        }

    def _clean_headers_footers(self, text: str) -> str:
        """Remove lines that match a known header/footer pattern.

        Each line is tested after stripping surrounding whitespace; lines that
        are kept retain their original whitespace.
        """
        kept = [
            line
            for line in text.split("\n")
            if not any(pat.match(line.strip()) for pat in _HEADER_FOOTER_PATTERNS)
        ]
        return "\n".join(kept)

    def _clean_watermarks(self, text: str) -> str:
        """Delete every occurrence of the known watermark phrases from ``text``."""
        for pat in _WATERMARK_PATTERNS:
            text = pat.sub("", text)
        return text

    def _normalize_text(self, text: str) -> str:
        """Normalize line endings, invisible characters and blank-line runs.

        Returns the text with CR/CRLF converted to LF, zero-width characters
        removed, runs of ideographic/non-breaking spaces replaced by a single
        space, three or more consecutive newlines collapsed to two, and
        leading/trailing whitespace stripped.
        """
        # Line endings must be unified first; otherwise "\r\n\r\n\r\n" would
        # never be seen as a run of newlines by the collapse step below.
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        # Remove zero-width characters before collapsing, so that lines that
        # contain only such characters count as blank lines.
        text = re.sub(r"[\u200b\u200c\u200d\ufeff]", "", text)
        # Replace ideographic and non-breaking spaces with a regular space.
        text = re.sub(r"[\u3000\xa0]+", " ", text)
        # Collapse runs of blank lines down to a single blank line.
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()

    def extract_tables(self, file_path: str | Path) -> list[list[list[str]]]:
        """Extract tables from PDF (best-effort using PyMuPDF).

        Tables with fewer than two rows are skipped.

        Unlike ``extract_text``, no existence check is performed here; a
        missing or unreadable file surfaces as whatever ``fitz.open`` raises.

        Args:
            file_path: Location of the PDF to read.

        Returns:
            A list of tables, where each table is a list of rows,
            and each row is a list of cell strings.
        """
        file_path = Path(file_path)
        all_tables: list[list[list[str]]] = []

        # Use a context manager to ensure the file handle is released promptly.
        with fitz.open(str(file_path)) as doc:
            for page in doc:
                for tab in page.find_tables():
                    # NOTE(review): cells are returned exactly as PyMuPDF
                    # produces them; confirm whether empty cells can be None.
                    table_data = tab.extract()
                    if table_data and len(table_data) > 1:
                        all_tables.append(table_data)

        return all_tables