"""File type detection and validation utilities.

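Detection relies on magic bytes (file signatures), matching the Java
converter's FileFormatDetector logic. Plain-text formats carry no signature,
so they are recognised by file extension before any bytes are read.

Covers the formats common in government document workflows:
DOC/DOCX/XLS/XLSX/PPT/PPTX/OFD/PDF and images. Also provides helper
functions for checking format support and looking up conversion targets.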
"""

from __future__ import annotations

from pathlib import Path

from app.config import settings

# Extensions resolved by name alone: plain-text formats carry no file
# signature, so magic-byte matching cannot recognise them.
_TEXT_EXTENSIONS = {
    "txt",
    "md",
    "markdown",
    "csv",
}

# Formats that Docling can process directly.
DOCLING_FORMATS = {
    "pdf",
    "docx",
    "pptx",
    "xlsx",
    "png",
    "jpg",
    "jpeg",
    "tiff",
    "bmp",
    "md",
    "markdown",
}

# Formats that are processed as plain text (no Docling needed).
PLAIN_TEXT_FORMATS = {
    "txt",
}

# Source format -> target format when converting via the Java service.
CONVERSION_TARGET: dict[str, str] = {
    # Word-processing documents
    "doc": "docx",
    "wps": "docx",
    # Spreadsheets
    "xls": "xlsx",
    "et": "xlsx",
    # Presentations
    "ppt": "pptx",
    # Fixed-layout documents
    "ofd": "pdf",
}

# Magic byte signatures, compared against the first bytes of a file.
_MAGIC_PDF = b"%PDF"                        # 25 50 44 46
_MAGIC_ZIP = b"PK\x03\x04"                  # 50 4B 03 04
_MAGIC_OLE2 = bytes.fromhex("D0CF11E0")     # OLE2 Compound Document
_MAGIC_PNG = b"\x89PNG"                     # 89 50 4E 47
_MAGIC_JPEG = bytes.fromhex("FFD8FF")       # JPEG
_MAGIC_BMP = b"BM"                          # 42 4D
_MAGIC_TIFF_LE = b"II*\x00"                 # 49 49 2A 00 (little-endian)
_MAGIC_TIFF_BE = b"MM\x00*"                 # 4D 4D 00 2A (big-endian)


def detect_file_type(file_path: str | Path) -> str:
    """Work out the real format of a file, preferring content over name.

    Detection order:
      1. Text extensions (txt/md/markdown/csv) are answered from the
         extension alone, because such files have no magic bytes.
      2. Magic bytes: PDF, ZIP containers (DOCX/XLSX/PPTX/OFD), OLE2
         containers (DOC/XLS/PPT), then images (PNG/JPEG/BMP/TIFF).
      3. The file extension, when no signature matches, the file holds
         fewer than 4 bytes, or it cannot be read.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A lower-case format string such as "pdf", "doc", "docx" or "xlsx";
        "unknown" when nothing matched and the path has no extension.
    """
    source = Path(file_path)
    suffix = source.suffix.lstrip(".").lower()
    # Answer used whenever content inspection is impossible or inconclusive.
    fallback = suffix or "unknown"

    # Text formats are never opened; "markdown" is normalised to "md".
    if suffix in _TEXT_EXTENSIONS:
        return "md" if suffix == "markdown" else suffix

    try:
        with open(source, "rb") as stream:
            # Only 8 bytes are read up front to keep I/O cheap; the larger
            # 64KB read happens solely for ZIP/OLE2 containers.
            signature = stream.read(8)
            if len(signature) < 4:
                return fallback

            if signature.startswith(_MAGIC_PDF):
                return "pdf"

            # ZIP and OLE2 are containers: the signature only identifies the
            # wrapper, so the first 64KB is handed to a format-specific
            # helper that looks for characteristic entry/stream names.
            inspect_container = None
            if signature.startswith(_MAGIC_ZIP):
                inspect_container = _detect_zip_format
            elif signature.startswith(_MAGIC_OLE2):
                inspect_container = _detect_ole2_format
            if inspect_container is not None:
                stream.seek(0)
                return inspect_container(stream.read(65536), suffix)

            # Image signatures, tried in order; both TIFF byte orders map
            # to the same format name.
            image_signatures = (
                (_MAGIC_PNG, "png"),
                (_MAGIC_JPEG, "jpg"),
                (_MAGIC_BMP, "bmp"),
                (_MAGIC_TIFF_LE, "tiff"),
                (_MAGIC_TIFF_BE, "tiff"),
            )
            for magic, image_type in image_signatures:
                if signature.startswith(magic):
                    return image_type
    except OSError:
        # Missing or unreadable file: fall back to the extension.
        return fallback

    # No known signature matched.
    return fallback


def _detect_zip_format(header: bytes, ext: str) -> str:
    """Tell ZIP-based document formats apart by the entry names they contain.

    The supplied bytes are searched for paths that are characteristic of
    OFD, DOCX, XLSX and PPTX packages.

    Args:
        header: Leading bytes of the ZIP file to search.
        ext: Lower-cased file extension without the dot; may be empty.

    Returns:
        "ofd", "docx", "xlsx" or "pptx" when a characteristic path is
        present; otherwise ``ext``, or "unknown" when ``ext`` is empty.
    """
    # latin-1 maps each byte to exactly one character (same as Java's
    # ISO_8859_1) and can decode any byte, so substring searches on the
    # text are equivalent to searches on the raw bytes.
    text = header.decode("latin-1")

    # OFD is only accepted when both of its markers are present.
    if all(marker in text for marker in ("OFD.xml", "Doc_0/Pages/")):
        return "ofd"

    # Office Open XML main parts, checked in this order; first hit wins.
    ooxml_main_parts = (
        ("word/document.xml", "docx"),
        ("xl/workbook.xml", "xlsx"),
        ("ppt/presentation.xml", "pptx"),
    )
    for part_path, detected in ooxml_main_parts:
        if part_path in text:
            return detected

    # Generic ZIP: trust the extension (this is also how an ".ofd" file
    # without recognisable markers is still reported as "ofd").
    return ext or "unknown"


def _detect_ole2_format(header: bytes, ext: str) -> str:
    """Tell legacy Office formats apart inside an OLE2 compound document.

    OLE2 directory entries store stream names in UTF-16LE, so the supplied
    bytes are searched for the UTF-16LE spelling of known stream names.

    When no stream name is found, a known OLE2 extension is trusted before
    defaulting to "doc". Previously only wps/et/dps were trusted, so an
    ".xls" or ".ppt" file whose stream names were not within the scanned
    bytes was reported as "doc".

    Args:
        header: Leading bytes of the OLE2 file to search.
        ext: Lower-cased file extension without the dot; may be empty.

    Returns:
        "doc", "xls" or "ppt" when a matching stream name is found;
        otherwise ``ext`` if it is one of doc/xls/ppt/wps/et/dps;
        otherwise "doc".
    """
    # latin-1 can decode any byte and maps bytes 1:1 to characters, so the
    # searches below behave exactly like searches on the raw bytes.
    content = header.decode("latin-1")

    # Stream names per format, checked in this order. Each name is matched
    # in its UTF-16LE form (ASCII characters separated by NUL bytes).
    # Note that "Word" is a prefix of "WordDocument", so the short form
    # alone decides the Word match; both are listed to document intent.
    stream_names = (
        ("doc", ("WordDocument", "Word")),
        ("xls", ("Workbook", "Book")),
        ("ppt", ("PowerPoint", "Current User")),
    )
    for detected, names in stream_names:
        if any("\x00".join(name) in content for name in names):
            return detected

    # No stream name found in the scanned bytes: trust a known OLE2
    # extension rather than guessing.
    if ext in ("doc", "xls", "ppt", "wps", "et", "dps"):
        return ext

    # Default: most common OLE2 in government documents is DOC.
    return "doc"


def is_supported(file_type: str) -> bool:
    """Report whether a file type appears in the configured supported list.

    Args:
        file_type: Format string such as "pdf" or "docx".

    Returns:
        True when ``file_type`` is a member of
        ``settings.supported_file_types``.
    """
    supported_types = settings.supported_file_types
    return file_type in supported_types


def needs_conversion(file_type: str) -> bool:
    """Report whether a file type must be converted via the Java service.

    Args:
        file_type: Format string such as "doc" or "ofd".

    Returns:
        True when ``file_type`` is a member of
        ``settings.formats_need_conversion``.
    """
    convertible_types = settings.formats_need_conversion
    return file_type in convertible_types


def get_conversion_target(file_type: str) -> str | None:
    """Look up the format a file type is converted into.

    Args:
        file_type: Source format string such as "doc" or "ofd".

    Returns:
        The target format from ``CONVERSION_TARGET``, or None when the
        source format has no entry there.
    """
    target_format = CONVERSION_TARGET.get(file_type, None)
    return target_format


def is_docling_format(file_type: str) -> bool:
    """Report whether Docling can process this format directly.

    Args:
        file_type: Format string such as "pdf" or "docx".

    Returns:
        True when ``file_type`` is a member of ``DOCLING_FORMATS``.
    """
    handled_by_docling = file_type in DOCLING_FORMATS
    return handled_by_docling


def is_plain_text(file_type: str) -> bool:
    """Report whether this format should be processed as plain text.

    Args:
        file_type: Format string such as "txt".

    Returns:
        True when ``file_type`` is a member of ``PLAIN_TEXT_FORMATS``.
    """
    treated_as_text = file_type in PLAIN_TEXT_FORMATS
    return treated_as_text
