package com.gzzm.lobster.parse;

import com.gzzm.lobster.config.LobsterConfig;
import com.gzzm.platform.commons.Tools;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfDocumentInformation;
import com.spire.pdf.PdfPageBase;
import com.spire.pdf.bookmarks.PdfBookmark;
import com.spire.pdf.general.PdfDestination;
import com.spire.pdf.texts.PdfTextExtractOptions;
import com.spire.pdf.texts.PdfTextExtractor;
import net.cyan.arachne.annotation.Service;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import static com.gzzm.lobster.parse.ParserSupport.escapeYaml;
import static com.gzzm.lobster.parse.ParserSupport.safeMsg;

/**
 * PdfParser —— 基于 Spire.Pdf 的 .pdf 解析 / PDF parser on top of Spire.Pdf.
 *
 * <p>决策（对齐 design doc §3.4）：
 * <ul>
 *   <li>优先按 Bookmarks 切节；无书签时按页切节</li>
 *   <li>书签嵌套超过 {@link LobsterConfig#getPdfBookmarkMaxDepth} 层扁平化到该层</li>
 *   <li>表格抽取第一期走 {@code page.extractText()} 的阅读序文本，不调 PdfTableExtractor；
 *       原因：Spire 的表格识别对复杂版式不稳定，reading-order 文本在多数政务 PDF 上够用，
 *       复杂表格留到第二期按真实样本打磨</li>
 *   <li>页眉/页脚：extractText 返回的是全页文本（含页眉页脚）；第一期不做启发式剔除，
 *       标记为已知损耗——真实场景大多数政务 PDF 的页眉页脚短、对问答干扰小</li>
 *   <li>注释/表单：第一期忽略</li>
 * </ul>
 */

public class PdfParser implements DocumentParser {

    /** 页文本里常见的连续空白/空行. */
    private static final Pattern MULTI_BLANK = Pattern.compile("\\n{3,}");

    @Override public String kind() { return "pdf"; }

    @Override
    public ParseResult parse(InputStream in, String originalName, String mimeType) throws Exception {
        return parseAs(in, originalName, mimeType, "pdf");
    }

    /**
     * 同 {@link #parse}，但允许外部覆盖 kind 标签——
     * 给 {@link OfdParser} 这种"转成 PDF 再解析"的委托使用：outline、YAML 头、ParseResult.kind
     * 都落目标 kind，不用事后字符串替换。
     */
    ParseResult parseAs(InputStream in, String originalName, String mimeType, String kindOverride) throws Exception {
        String kind = kindOverride == null || kindOverride.isEmpty() ? "pdf" : kindOverride;
        PdfDocument pdf = new PdfDocument();
        try {
            pdf.loadFromStream(in);
            return renderPdf(pdf, originalName, mimeType, kind);
        } finally {
            try { pdf.close(); } catch (Throwable ignore) { /* Spire close 偶尔抛，忽略 */ }
        }
    }

    private ParseResult renderPdf(PdfDocument pdf, String originalName, String mimeType, String kind) {
        int pageCount = pdf.getPages().getCount();
        Outline outline = new Outline(kind, originalName);
        MarkdownBuilder mb = new MarkdownBuilder();

        // ---- 元信息 ----
        String title = originalName;
        String author = null;
        String created = null;
        try {
            PdfDocumentInformation info = pdf.getDocumentInformation();
            if (info != null) {
                if (info.getTitle() != null && !info.getTitle().isEmpty()) title = info.getTitle();
                if (info.getAuthor() != null && !info.getAuthor().isEmpty()) author = info.getAuthor();
                try {
                    java.util.Date cd = info.getCreationDate();
                    if (cd != null) created = String.valueOf(cd);
                } catch (Throwable ignore) { /* 某些 PDF 日期字段损坏，忽略 */ }
            }
        } catch (Throwable t) {
            logParseWarn("pdf metadata", t);
        }
        outline.setTitle(title);

        mb.appendLine("---");
        mb.appendLine("kind: " + kind);
        if (originalName != null) mb.appendLine("source: " + originalName);
        if (title != null) mb.appendLine("title: " + escapeYaml(title));
        if (author != null) mb.appendLine("author: " + escapeYaml(author));
        if (created != null) mb.appendLine("created: " + escapeYaml(created));
        mb.appendLine("pages: " + pageCount);
        if (mimeType != null) mb.appendLine("mimeType: " + mimeType);
        mb.appendLine("---");
        mb.appendBlankLine();

        // ---- 预抽每页文本 ----
        PdfTextExtractOptions options = new PdfTextExtractOptions();
        options.setSimpleExtraction(true); // reading-order 文本，不按坐标还原版式
        String[] pageText = new String[pageCount];
        for (int i = 0; i < pageCount; i++) {
            try {
                PdfPageBase page = pdf.getPages().get(i);
                PdfTextExtractor extractor = new PdfTextExtractor(page);
                String t = extractor.extract(options);
                pageText[i] = normalizePage(t);
            } catch (Throwable t) {
                logParseWarn("pdf page " + i, t);
                pageText[i] = "<!-- [parse error: page " + (i + 1) + ": " + safeMsg(t) + "] -->";
            }
        }

        // ---- 路由：有书签走书签，否则按页切 ----
        List<BookmarkEntry> bookmarks = collectBookmarks(pdf);
        boolean usingBookmarks = !bookmarks.isEmpty();

        if (usingBookmarks) {
            renderWithBookmarks(mb, outline, bookmarks, pageText);
            outline.getStats().put("hasBookmarks", true);
        } else {
            renderPerPage(mb, outline, pageText);
            outline.getStats().put("hasBookmarks", false);
        }
        outline.getStats().put("pages", pageCount);
        outline.getStats().put("sections", outline.getSections().size());

        mb.closeAllOpen();

        String md = mb.toMarkdown();
        int cap = LobsterConfig.getParsedMarkdownMaxChars();
        if (md.length() > cap) {
            md = md.substring(0, cap)
                    + "\n\n> 全文超过 " + cap + " 字符已截断。"
                    + "调用 `read_file` 传 `sectionId` 或更大 offset 继续阅读。\n";
        }
        outline.setTotalChars(md.length());
        return new ParseResult(kind, md, outline);
    }

    // ---------------- bookmark path ----------------

    private void renderWithBookmarks(MarkdownBuilder mb, Outline outline,
                                     List<BookmarkEntry> bookmarks, String[] pageText) {
        int pageCount = pageText.length;
        // 按页码排序（Spire 书签通常已经是顺序，但兜底一下）
        bookmarks.sort((a, b) -> Integer.compare(a.pageIndex, b.pageIndex));

        for (int i = 0; i < bookmarks.size(); i++) {
            BookmarkEntry cur = bookmarks.get(i);
            int start = clamp(cur.pageIndex, 0, pageCount - 1);
            int end;
            if (i + 1 < bookmarks.size()) {
                end = clamp(bookmarks.get(i + 1).pageIndex - 1, start, pageCount - 1);
            } else {
                end = pageCount - 1;
            }
            String id = "b" + (i + 1);
            int level = Math.max(1, Math.min(cur.level, 6));
            String displayTitle = truncate(cur.title, 80);
            OutlineSection sec = mb.openSection(id, level, displayTitle);
            sec.putExtra("kind", "bookmark");
            sec.putExtra("pageStart", start + 1);
            sec.putExtra("pageEnd", end + 1);

            mb.appendBlankLine();
            for (int k = 0; k < level; k++) mb.append("#");
            mb.append(" ").append(displayTitle)
              .append(" (第 ").append(String.valueOf(start + 1))
              .append("-").append(String.valueOf(end + 1)).appendLine(" 页)");
            mb.appendBlankLine();

            for (int p = start; p <= end; p++) {
                String pt = pageText[p];
                if (pt == null || pt.isEmpty()) continue;
                mb.appendLine(pt);
                mb.appendBlankLine();
            }

            mb.closeSection(sec);
            outline.getSections().add(sec);
        }
    }

    // ---------------- per-page path ----------------

    private void renderPerPage(MarkdownBuilder mb, Outline outline, String[] pageText) {
        for (int p = 0; p < pageText.length; p++) {
            String id = "p" + (p + 1);
            String displayTitle = "第 " + (p + 1) + " 页";
            OutlineSection sec = mb.openSection(id, 2, displayTitle);
            sec.putExtra("kind", "page");
            sec.putExtra("pageStart", p + 1);
            sec.putExtra("pageEnd", p + 1);
            mb.appendLine("## " + displayTitle);
            mb.appendBlankLine();
            String pt = pageText[p];
            if (pt != null && !pt.isEmpty()) {
                mb.appendLine(pt);
                mb.appendBlankLine();
            }
            mb.closeSection(sec);
            outline.getSections().add(sec);
        }
    }

    // ---------------- bookmark collection ----------------

    /**
     * 递归收集所有书签，扁平为有序列表，level 超过 {@code maxDepth} 的归并到 {@code maxDepth}.
     */
    private List<BookmarkEntry> collectBookmarks(PdfDocument pdf) {
        List<BookmarkEntry> out = new ArrayList<>();
        try {
            if (pdf.getBookmarks() == null) return out;
            int maxDepth = LobsterConfig.getPdfBookmarkMaxDepth();
            int n = pdf.getBookmarks().getCount();
            for (int i = 0; i < n; i++) {
                PdfBookmark b = pdf.getBookmarks().get(i);
                walkBookmark(b, 1, maxDepth, out);
            }
        } catch (Throwable t) {
            logParseWarn("pdf bookmarks", t);
        }
        return out;
    }

    private void walkBookmark(PdfBookmark b, int depth, int maxDepth, List<BookmarkEntry> out) {
        if (b == null) return;
        int effectiveLevel = Math.min(depth, maxDepth);
        BookmarkEntry e = new BookmarkEntry();
        e.title = safeTitle(b);
        e.pageIndex = pageIndexOf(b);
        e.level = effectiveLevel;
        if (e.pageIndex >= 0 && e.title != null && !e.title.isEmpty()) {
            out.add(e);
        }
        try {
            int childCount = b.getCount();
            for (int i = 0; i < childCount; i++) {
                walkBookmark(b.get(i), depth + 1, maxDepth, out);
            }
        } catch (Throwable ignore) { /* 某些 PDF 书签结构 Spire 会抛，忽略 */ }
    }

    private static String safeTitle(PdfBookmark b) {
        try {
            String t = b.getTitle();
            return t == null ? null : t.trim();
        } catch (Throwable ignore) {
            return null;
        }
    }

    private static int pageIndexOf(PdfBookmark b) {
        try {
            PdfDestination dest = b.getDestination();
            if (dest == null) return -1;
            return dest.getPageNumber();
        } catch (Throwable ignore) {
            return -1;
        }
    }

    // ---------------- helpers ----------------

    private static class BookmarkEntry {
        String title;
        int pageIndex;
        int level;
    }

    private static String normalizePage(String pageText) {
        if (pageText == null) return "";
        String t = pageText.replace("\r\n", "\n").replace('\r', '\n').trim();
        return MULTI_BLANK.matcher(t).replaceAll("\n\n");
    }

    private static int clamp(int v, int lo, int hi) { return Math.max(lo, Math.min(hi, v)); }

    private static String truncate(String s, int max) {
        if (s == null) return "";
        if (s.length() <= max) return s;
        return s.substring(0, max - 1) + "…";
    }

    private static void logParseWarn(String where, Throwable t) {
        try { Tools.log("[PdfParser] " + where + " failed", t); } catch (Throwable ignore) { /* ignore */ }
    }
}
