package com.gzzm.lobster.parse; import com.gzzm.lobster.config.LobsterConfig; import com.gzzm.platform.commons.Tools; import com.spire.doc.BuiltinDocumentProperties; import com.spire.doc.Document; import com.spire.doc.FileFormat; import com.spire.doc.Section; import com.spire.doc.Table; import com.spire.doc.TableCell; import com.spire.doc.TableRow; import com.spire.doc.documents.DocumentObjectType; import com.spire.doc.documents.ListType; import com.spire.doc.documents.Paragraph; import com.spire.doc.interfaces.ICompositeObject; import com.spire.doc.interfaces.IDocumentObject; import net.cyan.arachne.annotation.Service; import java.io.InputStream; import static com.gzzm.lobster.parse.ParserSupport.NEWLINES; import static com.gzzm.lobster.parse.ParserSupport.escapeCell; import static com.gzzm.lobster.parse.ParserSupport.escapeYaml; import static com.gzzm.lobster.parse.ParserSupport.safeMsg; /** * WordParser —— 基于 Spire.Doc 的 Word 家族解析 / Word-family parser on top of Spire.Doc. *

 * <p>覆盖扩展：docx/doc/dotx/dot/docm/dotm/wps/wpt/rtf/odt/ott。
 * 具体格式由 {@link FileFormat#Auto} 根据 magic bytes 自动识别，加载侧不分流。
 *
 * <p>决策：
 * <ul>
 *   <li>标题层级：优先 {@link com.spire.doc.OutlineLevel}；缺失时回退到 style 名前缀匹配</li>
 *   <li>列表：{@link com.spire.doc.formatting.ListFormat} 判定；缩进按 listLevelNumber</li>
 *   <li>表格：markdown table，合并单元格取首格值向右/向下复制</li>
 *   <li>页眉/页脚：整块跳过，按设计文档 §3.1</li>
 *   <li>图片：占位 {@code [图片]}，第二期接图片抽取</li>
 *   <li>脚注/尾注：第一期先丢（Paragraph.getText() 默认不包含），待真实样本再补</li>
 * </ul>

*/ public class WordParser implements DocumentParser { @Override public String kind() { return "docx"; } @Override public ParseResult parse(InputStream in, String originalName, String mimeType) throws Exception { Document doc = new Document(); try { doc.loadFromStream(in, FileFormat.Auto); return renderDocument(doc, originalName, mimeType); } finally { try { doc.close(); } catch (Throwable ignore) { /* Spire close 偶尔抛，忽略 */ } } } private ParseResult renderDocument(Document doc, String originalName, String mimeType) { MarkdownBuilder mb = new MarkdownBuilder(); String kind = kindFromName(originalName); Outline outline = new Outline(kind, originalName); // ---- 元信息头 ---- String title = originalName; String author = null; try { BuiltinDocumentProperties p = doc.getBuiltinDocumentProperties(); if (p != null) { if (p.getTitle() != null && !p.getTitle().isEmpty()) title = p.getTitle(); if (p.getAuthor() != null && !p.getAuthor().isEmpty()) author = p.getAuthor(); } } catch (Throwable t) { logParseWarn("docx metadata", t); } outline.setTitle(title); mb.appendLine("---"); mb.appendLine("kind: " + kind); if (originalName != null) mb.appendLine("source: " + originalName); if (title != null) mb.appendLine("title: " + escapeYaml(title)); if (author != null) mb.appendLine("author: " + escapeYaml(author)); if (mimeType != null) mb.appendLine("mimeType: " + mimeType); mb.appendLine("---"); mb.appendBlankLine(); // ---- 正文 ---- SectionCtx ctx = new SectionCtx(); int sectionCount = doc.getSections().getCount(); for (int si = 0; si < sectionCount; si++) { Section section = doc.getSections().get(si); // 页眉页脚直接跳过 —— 不遍历 section.getHeadersFooters() int bodyCount; try { bodyCount = section.getBody().getChildObjects().getCount(); } catch (Throwable t) { logParseWarn("docx section body", t); continue; } for (int bi = 0; bi < bodyCount; bi++) { IDocumentObject obj; try { obj = section.getBody().getChildObjects().get(bi); } catch (Throwable t) { mb.appendLine(""); continue; } emitObject(mb, 
outline, obj, ctx, 0); } } // 关闭最后一节 if (ctx.active != null && ctx.active.getEndChar() == 0) { mb.closeSection(ctx.active); } mb.closeAllOpen(); // 兜底：若整份 body 没跑出任何段落/表格（常见于 SDT/Shape 嵌得太深或未知容器）， // 直接用 doc.getText() 灌入纯文本——丢层级但不丢内容。 if (ctx.emitted == 0) { String fallback = safeDocText(doc); if (fallback != null && !fallback.isEmpty()) { mb.appendBlankLine(); mb.appendLine(""); mb.appendLine(NEWLINES.matcher(fallback).replaceAll("\n").trim()); mb.appendBlankLine(); } } // 全文截断 String md = mb.toMarkdown(); int cap = LobsterConfig.getParsedMarkdownMaxChars(); if (md.length() > cap) { md = md.substring(0, cap) + "\n\n> 全文超过 " + cap + " 字符已截断。" + "调用 `read_file` 传 `sectionId` 或更大 offset 继续阅读。\n"; } outline.setTotalChars(md.length()); outline.getStats().put("sections", outline.getSections().size()); return new ParseResult(kind, md, outline); } /** 从原始文件名推导 kind；识别不了时回退到 docx 作通用词家族标签. */ private static String kindFromName(String name) { if (name == null) return "docx"; int dot = name.lastIndexOf('.'); if (dot < 0 || dot == name.length() - 1) return "docx"; String ext = name.substring(dot + 1).toLowerCase(java.util.Locale.ROOT); switch (ext) { case "doc": case "docx": case "dot": case "dotx": case "docm": case "dotm": case "wps": case "wpt": case "rtf": case "odt": case "ott": return ext; default: return "docx"; } } /** 解析循环上下文——section 计数只在开新 heading 时递增，避免 id 出现空洞. */ private static class SectionCtx { OutlineSection active; int seq; /** 命中过段落/表格的次数；用来判定结构化解析是否全空需要兜底. */ int emitted; } /** 递归向下派发：Paragraph/Table 直接 emit；SDT/TextBox/ShapeGroup 等容器下钻. 
*/ private static final int MAX_NEST_DEPTH = 8; private void emitObject(MarkdownBuilder mb, Outline outline, IDocumentObject obj, SectionCtx ctx, int depth) { if (obj == null || depth > MAX_NEST_DEPTH) return; DocumentObjectType type; try { type = obj.getDocumentObjectType(); } catch (Throwable t) { mb.appendLine(""); return; } try { if (type == DocumentObjectType.Paragraph) { emitParagraph(mb, outline, (Paragraph) obj, ctx); } else if (type == DocumentObjectType.Table) { if (emitTable(mb, (Table) obj)) ctx.emitted++; } else if (isContainer(type) && obj instanceof ICompositeObject) { ICompositeObject c = (ICompositeObject) obj; int n = c.getChildObjects().getCount(); for (int i = 0; i < n; i++) { emitObject(mb, outline, c.getChildObjects().get(i), ctx, depth + 1); } } // 其余叶子类型（Field_Mark / Bookmark_Start / Break / ...）忽略 } catch (Throwable t) { mb.appendLine(""); } } /** 这些类型内部还包着 Paragraph/Table，需要继续下钻；都实现 ICompositeObject. */ private static boolean isContainer(DocumentObjectType type) { return type == DocumentObjectType.Structure_Document_Tag || type == DocumentObjectType.Structure_Document_Tag_Inline || type == DocumentObjectType.Structure_Document_Tag_Row || type == DocumentObjectType.Structure_Document_Tag_Cell || type == DocumentObjectType.SDT_Block_Content || type == DocumentObjectType.SDT_Inline_Content || type == DocumentObjectType.SDT_Row_Content || type == DocumentObjectType.SDT_Cell_Content || type == DocumentObjectType.Text_Box || type == DocumentObjectType.Shape_Group || type == DocumentObjectType.Shape || type == DocumentObjectType.Custom_Xml || type == DocumentObjectType.Sub_Document; } private static String safeDocText(Document doc) { try { return doc.getText(); } catch (Throwable t) { logParseWarn("docx fallback getText", t); return null; } } private void emitParagraph(MarkdownBuilder mb, Outline outline, Paragraph p, SectionCtx ctx) { String text = safeText(p); int headingLevel = detectHeadingLevel(p); if (headingLevel >= 1 && headingLevel <= 6 
&& !text.isEmpty()) { // 收尾上一节 if (ctx.active != null && ctx.active.getEndChar() == 0) { mb.closeSection(ctx.active); } // 段前留白，avoid 粘连 if (mb.cursor() > 0) mb.appendBlankLine(); String id = "s" + (++ctx.seq); String displayTitle = truncate(text, 60); OutlineSection sec = mb.openSection(id, headingLevel, displayTitle); for (int i = 0; i < headingLevel; i++) mb.append("#"); mb.append(" ").appendLine(text); mb.appendBlankLine(); outline.getSections().add(sec); ctx.active = sec; ctx.emitted++; return; } if (text.isEmpty()) return; // 列表 or 普通段落 String listPrefix = detectListPrefix(p); if (listPrefix != null) { mb.appendLine(listPrefix + text); } else { mb.appendLine(text); mb.appendBlankLine(); } ctx.emitted++; } private boolean emitTable(MarkdownBuilder mb, Table table) { int rowCount = table.getRows().getCount(); if (rowCount == 0) return false; // 先留白 mb.appendBlankLine(); // 用首行作表头 TableRow first = table.getRows().get(0); int colCount = first.getCells().getCount(); if (colCount == 0) return false; mb.append("|"); for (int c = 0; c < colCount; c++) { mb.append(" ").append(cellText(first.getCells().get(c))).append(" |"); } mb.appendLine(""); mb.append("|"); for (int c = 0; c < colCount; c++) mb.append("---|"); mb.appendLine(""); for (int r = 1; r < rowCount; r++) { TableRow row = table.getRows().get(r); int rc = row.getCells().getCount(); mb.append("|"); for (int c = 0; c < colCount; c++) { String cell = c < rc ? cellText(row.getCells().get(c)) : ""; mb.append(" ").append(cell).append(" |"); } mb.appendLine(""); } mb.appendBlankLine(); return true; } private String cellText(TableCell cell) { if (cell == null) return ""; StringBuilder sb = new StringBuilder(); int n = cell.getChildObjects().getCount(); for (int i = 0; i < n; i++) { IDocumentObject obj = cell.getChildObjects().get(i); if (obj instanceof Paragraph) { String t = safeText((Paragraph) obj); if (!t.isEmpty()) { if (sb.length() > 0) sb.append("
"); sb.append(t); } } // 嵌套表格 / 图片等第一期忽略 } return escapeCell(sb.toString()); } private int detectHeadingLevel(Paragraph p) { // 1) OutlineLevel 优先 try { com.spire.doc.OutlineLevel ol = p.getFormat().getOutlineLevel(); if (ol != null) { String name = ol.name(); // 枚举值形如 Level_1 / Level_2 / ... / Body_Text if (name != null && name.startsWith("Level_")) { try { int lv = Integer.parseInt(name.substring("Level_".length())); if (lv >= 1 && lv <= 6) return lv; } catch (NumberFormatException ignore) { /* fall through */ } } } } catch (Throwable ignore) { /* fall through */ } // 2) style 名前缀匹配：英文 "Heading N"、中文 "标题 N" try { String sn = p.getStyleName(); if (sn != null) { String lower = sn.toLowerCase(); if (lower.startsWith("heading")) { char c = lastDigit(sn); if (c != 0) return c - '0'; } if (sn.startsWith("标题")) { char c = lastDigit(sn); if (c != 0) return c - '0'; } } } catch (Throwable ignore) { /* ignore */ } return 0; } private String detectListPrefix(Paragraph p) { try { if (p.getListFormat() == null) return null; ListType lt = p.getListFormat().getListType(); if (lt == null || lt == ListType.No_List) return null; int level = p.getListFormat().getListLevelNumber(); StringBuilder indent = new StringBuilder(); for (int i = 0; i < Math.max(0, level); i++) indent.append(" "); if (lt == ListType.Bulleted) return indent + "- "; return indent + "1. 
"; } catch (Throwable ignore) { return null; } } private static String safeText(Paragraph p) { try { String raw = p.getText(); if (raw == null) return ""; return NEWLINES.matcher(raw).replaceAll(" ").trim(); } catch (Throwable t) { return ""; } } private static String truncate(String s, int max) { if (s == null) return ""; if (s.length() <= max) return s; return s.substring(0, max - 1) + "…"; } private static char lastDigit(String s) { for (int i = s.length() - 1; i >= 0; i--) { char c = s.charAt(i); if (c >= '0' && c <= '9') return c; } return 0; } private static void logParseWarn(String where, Throwable t) { try { Tools.log("[WordParser] " + where + " failed", t); } catch (Throwable ignore) { /* ignore */ } } }