"""Rule-first extractor for standard service guide documents."""

from __future__ import annotations

import hashlib
import re
from datetime import datetime, timezone
from typing import Any

from pydantic import BaseModel, Field, ValidationError

from app.prompts.service_guide_extraction import (
    PROCESS_STEP_KEYWORDS,
    SCENE_DETECTION_KEYWORDS,
    SECTION_ALIASES,
)
from app.schemas.service_guide_profile import StandardServiceGuideProfile
from app.utils.logger import get_logger

logger = get_logger(__name__)

# --- Patterns and keyword tables shared by the extraction helpers below. ---

_URL_RE = re.compile(r"https?://\S+")
# Phone formats: optional +86 country code, optional area code plus 7-8 digit
# landline, 11-digit mobile (1xxxxxxxxxx), or 5-digit numbers starting with 1
# (presumably short government hotlines such as 12345 — TODO confirm).
_PHONE_RE = re.compile(r"(?:\+?86[-\s]?)?(?:0\d{2,3}[-\s]?)?\d{7,8}|1\d{10}|1\d{4}")
# Dates like 2024-01-31, 2024/1/31, or 2024.1.31.
_DATE_RE = re.compile(r"\d{4}[-/.]\d{1,2}[-/.]\d{1,2}")
# Durations such as "5个工作日" / "3天" / "24小时"; the named groups are
# consumed by _extract_time_limit.
_TIME_LIMIT_RE = re.compile(r"(?P<duration>\d+)\s*(?P<unit>个?工作日|个?自然日|日|天|小时)")
# A short CJK/ASCII label (2-30 chars) followed by a full- or half-width colon,
# e.g. "事项名称：".
_FIELD_HEADER_RE = re.compile(r"^[\u4e00-\u9fa5A-Za-z0-9（）()/_-]{2,30}\s*[：:]")
# Affirmative / negative tokens (presumably used by the boolean field parser,
# which is not visible in this chunk — verify).
_TRUE_WORDS = ("是", "支持", "可", "允许", "已关联", "有")
_FALSE_WORDS = ("否", "不支持", "不可", "不允许", "无", "未关联")
# Markdown table separator row, e.g. "| --- | :---: |".
_TABLE_SEP_RE = re.compile(r"^\s*\|?(?:\s*:?-{2,}:?\s*\|)+\s*:?-{2,}:?\s*\|?\s*$")
# Delimiters used to split enumerations inside a single field value.
_LIST_SPLIT_RE = re.compile(r"[、，,；;／/]+")
# Keywords that route contact lines to the consultation vs complaint channels.
_CONSULTATION_KEYWORDS = ("咨询", "咨询电话", "咨询方式", "咨询网址")
_COMPLAINT_KEYWORDS = ("投诉", "监督", "举报", "投诉电话", "监督电话", "投诉网址", "监督网址")
# Safety cap for parsed table rows (not referenced in this chunk —
# presumably enforced by _extract_markdown_table; verify).
_MAX_TABLE_ROWS = 200


class ServiceGuideExtractionInput(BaseModel):
    """Input envelope handed to :meth:`ServiceGuideExtractor.extract`.

    All text fields default to empty so callers can supply whichever
    representations of the document they have available.
    """

    doc_id: str
    content_hash: str
    title: str = ""
    doc_type: str = ""
    knowledge_category: str = ""
    source_url: str = ""
    acl_ids: list[str] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
    plain_text: str = ""
    markdown_text: str = ""  # preferred over plain_text when both are present
    # Not referenced by the visible portion of this module.
    structured_blocks: list[dict[str, Any]] = Field(default_factory=list)


class ServiceGuideExtractionOutput(BaseModel):
    """Result of an extraction attempt.

    ``detected`` is always meaningful; the other fields are populated only
    for detected documents.  ``profile`` may still be ``None`` for a
    detected document when extraction or schema validation failed (see
    ``warnings`` and ``artifacts`` for details).
    """

    detected: bool
    scene_type: str = ""
    detection_confidence: float = 0.0
    detection_reasons: list[str] = Field(default_factory=list)
    profile: StandardServiceGuideProfile | None = None
    root_fields: dict[str, Any] = Field(default_factory=dict)
    bindings: dict[str, Any] = Field(default_factory=dict)
    quality: dict[str, Any] = Field(default_factory=dict)
    raw_sections: dict[str, str] = Field(default_factory=dict)
    artifacts: dict[str, Any] = Field(default_factory=dict)
    warnings: list[str] = Field(default_factory=list)


class ServiceGuideExtractor:
    """Extract a conservative structured profile from standard service guides."""

    extractor_version = "guide_extractor_v0"

    async def extract(
        self,
        payload: ServiceGuideExtractionInput,
    ) -> ServiceGuideExtractionOutput:
        """Detect a standard service guide and extract its structured profile.

        Returns a non-detected output for other document types.  Extraction
        stage failures are downgraded to a detected-but-unprofiled output
        with ``needs_review`` set; profile schema validation failures drop
        only the structured ``profile`` while keeping the flat fields.
        """
        # Markdown is preferred over plain text because it preserves tables.
        text = (payload.markdown_text or payload.plain_text or "").strip()
        detected, confidence, reasons = self._detect(payload, text)
        if not detected:
            return ServiceGuideExtractionOutput(
                detected=False,
                scene_type="other",
                detection_confidence=confidence,
                detection_reasons=reasons,
            )

        raw_sections = self._split_sections(text)
        warnings: list[str] = []
        try:
            # Stage 1: per-section rule extraction; each helper is independent.
            matter_identity = self._extract_matter_identity(payload, text)
            basic_info = self._extract_basic_info(raw_sections.get("basic_info", ""), text)
            cross_region_service = self._extract_cross_region_service(
                raw_sections.get("cross_region_service", "")
            )
            process_info = self._extract_process_info(raw_sections.get("process_info", ""), text)
            materials, material_warnings = self._extract_materials(raw_sections.get("materials", ""))
            fees = self._extract_fees(raw_sections.get("fees", ""))
            service_windows = self._extract_service_windows(raw_sections.get("service_windows", ""))
            legal_basis = self._extract_legal_basis(raw_sections.get("legal_basis", ""))
            consultation, consultation_warnings = self._extract_consultation_and_supervision(
                raw_sections.get("consultation_and_supervision", "")
            )
            review_info = self._extract_review_info(raw_sections.get("review_info", ""))
            acceptance_info = self._extract_acceptance_info(raw_sections.get("acceptance_info", ""))

            # Stage 2: aggregate warnings, score quality, build bindings.
            warnings.extend(material_warnings)
            warnings.extend(consultation_warnings)
            quality = self._build_quality(raw_sections, materials, fees, service_windows, warnings)
            bindings = {
                "matter_ids": [],
                "organization_bindings": self._build_org_bindings(matter_identity),
                "region_bindings": [],
            }

            # Flat, search-friendly projection of the nested sections.
            root_fields = self._build_root_fields(
                matter_identity,
                basic_info,
                cross_region_service,
                materials,
                fees,
                service_windows,
                legal_basis,
                quality,
            )
            artifacts = {"section_names": list(raw_sections.keys())}
            # Stage 3: assemble the full payload for schema validation.
            profile_payload = {
                "schema_version": "service_guide_v1",
                "profile_id": self._build_profile_id(payload.doc_id, payload.content_hash, matter_identity),
                "doc_id": payload.doc_id,
                "content_hash": payload.content_hash,
                "acl_ids": payload.acl_ids,
                "scene_type": "standard_service_guide",
                "source": payload.metadata.get("source", ""),
                "source_url": payload.source_url,
                "is_current": True,
                "guide_version": matter_identity.get("matter_version", ""),
                "extractor_version": self.extractor_version,
                "extracted_at": datetime.now(timezone.utc).isoformat(),
                "updated_at": datetime.now(timezone.utc).isoformat(),
                **root_fields,
                "document_info": {
                    "title": payload.title,
                    "normalized_title": self._normalize_text(payload.title),
                    "doc_type": payload.doc_type,
                    "knowledge_category": payload.knowledge_category or "办事指南",
                    "publish_date": payload.metadata.get("publish_date", ""),
                    "effective_date": payload.metadata.get("effective_date", ""),
                    "expiry_date": payload.metadata.get("expiry_date", ""),
                    "issuing_org": payload.metadata.get("issuing_org", ""),
                    "status": payload.metadata.get("status", ""),
                },
                "matter_identity": matter_identity,
                "basic_info": basic_info,
                "cross_region_service": cross_region_service,
                "review_info": review_info,
                "result_info": [],
                "acceptance_info": acceptance_info,
                "process_info": process_info,
                "materials": materials,
                "fees": fees,
                "legal_basis": legal_basis,
                "rights_and_obligations": {"rights": [], "obligations": []},
                "remedies": {},
                "consultation_and_supervision": consultation,
                "service_windows": service_windows,
                "bindings": bindings,
                "quality": quality,
                "raw_sections": raw_sections,
            }
            try:
                profile = StandardServiceGuideProfile.model_validate(profile_payload)
            except ValidationError as exc:
                # Degrade gracefully: keep the flat output, drop the structured
                # profile, and flag the document for manual review.
                validation_warning = "profile validation failed; structured profile omitted"
                warnings.append(validation_warning)
                quality = {
                    **quality,
                    "needs_review": True,
                    "warnings": list(dict.fromkeys([*quality.get("warnings", []), validation_warning])),
                }
                artifacts["profile_validation_errors"] = exc.errors()
                logger.warning(
                    "service_guide_profile_validation_failed",
                    doc_id=payload.doc_id,
                    error_count=len(exc.errors()),
                )
                return ServiceGuideExtractionOutput(
                    detected=True,
                    scene_type="standard_service_guide",
                    detection_confidence=confidence,
                    detection_reasons=reasons,
                    profile=None,
                    root_fields=root_fields,
                    bindings=bindings,
                    quality=quality,
                    raw_sections=raw_sections,
                    artifacts=artifacts,
                    warnings=warnings,
                )

            return ServiceGuideExtractionOutput(
                detected=True,
                scene_type="standard_service_guide",
                detection_confidence=confidence,
                detection_reasons=reasons,
                profile=profile,
                root_fields=root_fields,
                bindings=bindings,
                quality=quality,
                raw_sections=raw_sections,
                artifacts=artifacts,
                warnings=warnings,
            )
        except Exception as exc:
            # Deliberate best-effort boundary: any extraction-stage failure
            # yields an empty, review-flagged output rather than raising.
            extraction_warning = f"service guide extraction failed: {exc}"
            warnings.append(extraction_warning)
            logger.warning(
                "service_guide_extraction_stage_failed",
                doc_id=payload.doc_id,
                error=str(exc),
            )
            return ServiceGuideExtractionOutput(
                detected=True,
                scene_type="standard_service_guide",
                detection_confidence=confidence,
                detection_reasons=reasons,
                profile=None,
                quality={
                    "completeness_score": 0.0,
                    "confidence_score": 0.0,
                    "missing_fields": [],
                    "needs_review": True,
                    "warnings": warnings,
                },
                raw_sections=raw_sections,
                artifacts={
                    "section_names": list(raw_sections.keys()),
                    "extract_error": str(exc),
                },
                warnings=warnings,
            )

    def _detect(
        self,
        payload: ServiceGuideExtractionInput,
        text: str,
    ) -> tuple[bool, float, list[str]]:
        reasons: list[str] = []
        title = payload.title.strip()
        doc_type = payload.doc_type.strip()
        category = payload.knowledge_category.strip()

        if doc_type == "办事指南":
            reasons.append("doc_type=办事指南")
        if category in {"办事指南", "政务服务事项"}:
            reasons.append(f"knowledge_category={category}")

        markers = [kw for kw in SCENE_DETECTION_KEYWORDS if kw in text or kw in title]
        if markers:
            reasons.append(f"markers={','.join(markers[:4])}")

        strong_hit = doc_type == "办事指南" or category in {"办事指南", "政务服务事项"}
        detected = strong_hit or len(markers) >= 3
        confidence = 0.96 if strong_hit else min(0.55 + len(markers) * 0.1, 0.9)
        log_payload = {
            "doc_id": payload.doc_id,
            "detected": detected,
            "confidence": confidence if detected else min(confidence, 0.49),
            "reasons": reasons,
        }
        if detected:
            logger.info("service_guide_detected", **log_payload)
        else:
            logger.debug("service_guide_not_detected", **log_payload)
        return detected, confidence if detected else min(confidence, 0.49), reasons

    def _split_sections(self, text: str) -> dict[str, str]:
        if not text:
            return {}

        lines = [line.rstrip() for line in text.splitlines()]
        sections: dict[str, str] = {}
        current: str | None = None
        buffer: list[str] = []

        for line in lines:
            matched = self._match_section_header(line)
            if matched:
                if current and buffer:
                    sections[current] = "\n".join(buffer).strip()
                current = matched
                buffer = []
                continue
            if current:
                buffer.append(line)

        if current and buffer:
            sections[current] = "\n".join(buffer).strip()

        filtered_sections = {name: value for name, value in sections.items() if value}
        logger.debug(
            "service_guide_sections_split",
            section_count=len(filtered_sections),
            section_names=list(filtered_sections.keys()),
        )
        return filtered_sections

    def _match_section_header(self, line: str) -> str | None:
        clean = re.sub(r"^[一二三四五六七八九十0-9（()）.、\s-]+", "", line.strip())
        if ("：" in clean or ":" in clean) and not clean.endswith(("：", ":")):
            return None
        for section_name, aliases in SECTION_ALIASES.items():
            if any(alias in clean for alias in aliases):
                return section_name
        return None

    def _extract_matter_identity(
        self,
        payload: ServiceGuideExtractionInput,
        text: str,
    ) -> dict[str, Any]:
        matter_name = self._extract_field(text, ("事项名称",)) or payload.title.strip()
        colloquial = self._split_items(self._extract_field(text, ("日常用语",)))
        return {
            "matter_name": matter_name,
            "colloquial_names": colloquial,
            "matter_type": self._extract_field(text, ("事项类型", "事项类别")),
            "basic_code": self._extract_field(text, ("基本编码",)),
            "implementation_code": self._extract_field(text, ("实施编码",)),
            "business_item_code": self._extract_field(text, ("业务办理项编码",)),
            "matter_version": self._extract_field(text, ("事项版本", "版本号")),
            "implementing_subject": self._extract_field(text, ("实施主体",)),
            "subject_nature": self._extract_field(text, ("实施主体性质",)),
            "delegated_department": self._extract_field(text, ("委托部门", "受委托部门")),
        }

    def _extract_basic_info(self, section_text: str, full_text: str) -> dict[str, Any]:
        text = section_text or full_text
        promised = self._extract_time_limit(self._extract_field(text, ("承诺办结时限",)))
        legal = self._extract_time_limit(self._extract_field(text, ("法定办结时限",)))
        return {
            "service_object": self._split_items(self._extract_field(text, ("服务对象",))),
            "promised_time_limit": promised,
            "legal_time_limit": legal,
            "visit_count_to_hall": self._extract_int(self._extract_field(text, ("到办事现场次数", "跑动次数"))),
            "must_onsite": self._extract_bool(self._extract_field(text, ("是否必须现场办理",))),
            "must_onsite_reason": self._extract_field(text, ("必须现场办理原因",)),
            "case_type": self._extract_field(text, ("办件类型",)),
            "notified_commitment_enabled": self._extract_bool(self._extract_field(text, ("告知承诺制",))),
            "hall_required": self._extract_bool(self._extract_field(text, ("是否进驻政务大厅",))),
            "express_supported": self._extract_bool(self._extract_field(text, ("是否支持物流快递", "是否支持快递",))),
            "reservation_supported": self._extract_bool(self._extract_field(text, ("是否支持预约办理", "是否支持预约",))),
            "reservation_url": self._extract_field(text, ("在线预约地址", "预约地址")),
            "service_modes": self._split_items(self._extract_field(text, ("办理形式", "服务形式", "办理方式"))),
            "online_depth": self._extract_field(text, ("网办深度",)),
            "linked_agencies": self._split_items(self._extract_field(text, ("联办机构",))),
        }

    def _extract_cross_region_service(self, section_text: str) -> list[dict[str, Any]]:
        if not section_text.strip():
            return []
        scope = "指定区域"
        if "全国" in section_text or "跨省" in section_text:
            scope = "全国"
        elif any(word in section_text for word in ("全省", "省内", "跨市")):
            scope = "全省"
        summary = self._trim_text(section_text.splitlines()[0] if section_text.splitlines() else section_text, 60)
        return [{
            "service_scope_type": scope,
            "regions_summary": [summary] if summary else [],
            "regions_detail": self._extract_regions(section_text),
            "regions_truncated": len(self._extract_regions(section_text)) > 20,
            "service_modes": self._split_items(self._extract_field(section_text, ("通办形式", "办理形式"))),
            "notes": "",
            "raw_text": section_text.strip(),
        }]

    def _extract_process_info(self, section_text: str, full_text: str) -> dict[str, Any]:
        text = section_text or full_text
        step_titles: list[str] = []
        for keyword in PROCESS_STEP_KEYWORDS:
            if keyword in text and keyword not in step_titles:
                step_titles.append(keyword)
        return {
            "summary": self._trim_text(section_text or text, 200),
            "step_titles": step_titles,
            "raw_text": section_text.strip(),
            "notes": "",
            "needs_review": False,
        }

    def _extract_materials(self, section_text: str) -> tuple[list[dict[str, Any]], list[str]]:
        """Extract required-material rows from the materials section.

        Prefers a markdown table; falls back to treating each list line as a
        bare material name.  Returns ``(materials, warnings)``; a warning is
        recorded for every row whose columns could not be mapped — table rows
        without a recognizable name column, and all fallback rows.
        """
        warnings: list[str] = []
        rows = self._extract_markdown_table(section_text)
        materials: list[dict[str, Any]] = []

        if rows:
            for idx, row in enumerate(rows):
                # Preserve the original row verbatim for audit / manual review.
                raw_row_text = " | ".join(str(v) for v in row.values() if v)
                material_name = self._pick_first(row, ("材料名称", "名称", "申请材料", "材料"))
                if not material_name:
                    warnings.append(f"materials[{idx}]: field mapping uncertain, raw text preserved")
                materials.append({
                    "guide_material_id": f"mat_{idx + 1}",
                    "material_name": material_name,
                    "linked_material_id": "",
                    "requirement_level": self._normalize_requirement_level(self._pick_first(row, ("必要性", "是否必要", "要求"))),
                    "original_count": self._extract_int(self._pick_first(row, ("原件", "原件份数", "原件数量"))),
                    "copy_count": self._extract_int(self._pick_first(row, ("复印件", "复印件份数", "复印件数量"))),
                    "form_types": self._split_items(self._pick_first(row, ("材料形式", "形式", "提交形式"))),
                    "paper_spec": self._pick_first(row, ("规格", "纸张规格")),
                    "electronic_license_linked": self._extract_bool(self._pick_first(row, ("电子证照", "关联电子证照"))),
                    "exempt_submission": self._extract_bool(self._pick_first(row, ("免提交", "是否免提交"))),
                    "reusable_previous_submission": self._extract_bool(self._pick_first(row, ("可复用", "历史材料复用"))),
                    "material_type": self._pick_first(row, ("材料类型",)),
                    "source_channel": self._pick_first(row, ("来源渠道", "材料来源")),
                    "fill_instructions": self._pick_first(row, ("填报须知", "填写说明")),
                    "notes": self._pick_first(row, ("备注",)),
                    "applicable_conditions": self._split_items(self._pick_first(row, ("适用条件",))),
                    "blank_form_available": self._extract_bool(self._pick_first(row, ("空白表格",))),
                    "sample_available": self._extract_bool(self._pick_first(row, ("示例样本", "示例"))),
                    "download_hint": self._pick_first(row, ("下载说明",)),
                    "raw_row_text": raw_row_text,
                })
            logger.debug(
                "service_guide_materials_extracted",
                source="table",
                material_count=len(materials),
                warning_count=len(warnings),
            )
            return materials, warnings

        # Fallback: no table found — treat each numbered/bulleted line as a
        # material name and leave every other attribute unset (None/empty).
        for idx, line in enumerate(self._fallback_list_rows(section_text)):
            # Strip leading numbering and bullet punctuation.
            clean = re.sub(r"^[0-9一二三四五六七八九十()（）.、\s-]+", "", line)
            materials.append({
                "guide_material_id": f"mat_{idx + 1}",
                "material_name": clean,
                "linked_material_id": "",
                "requirement_level": "",
                "original_count": None,
                "copy_count": None,
                "form_types": [],
                "paper_spec": "",
                "electronic_license_linked": None,
                "exempt_submission": None,
                "reusable_previous_submission": None,
                "material_type": "",
                "source_channel": "",
                "fill_instructions": "",
                "notes": "",
                "applicable_conditions": [],
                "blank_form_available": None,
                "sample_available": None,
                "download_hint": "",
                "raw_row_text": line,
            })
            warnings.append(f"materials[{idx}]: field mapping uncertain, raw text preserved")
        logger.debug(
            "service_guide_materials_extracted",
            source="fallback",
            material_count=len(materials),
            warning_count=len(warnings),
        )
        return materials, warnings

    def _extract_fees(self, section_text: str) -> list[dict[str, Any]]:
        rows = self._extract_markdown_table(section_text)
        fees: list[dict[str, Any]] = []
        if rows:
            for row in rows:
                amount_text = self._pick_first(row, ("金额", "收费标准", "收费金额"))
                fees.append({
                    "fee_name": self._pick_first(row, ("收费项目", "项目名称", "收费名称")),
                    "amount_text": amount_text,
                    "amount_value": self._extract_amount(amount_text),
                    "currency": "CNY",
                    "charging_body": self._pick_first(row, ("收费主体",)),
                    "charging_method": self._pick_first(row, ("收费方式",)),
                    "reducible": self._extract_bool(self._pick_first(row, ("是否减免", "减免"))),
                    "notes": self._pick_first(row, ("备注",)),
                })
            return fees

        amount_text = self._extract_money_text(section_text)
        if amount_text:
            fees.append({
                "fee_name": self._extract_field(section_text, ("收费项目",)) or "收费项目",
                "amount_text": amount_text,
                "amount_value": self._extract_amount(amount_text),
                "currency": "CNY",
                "charging_body": self._extract_field(section_text, ("收费主体",)),
                "charging_method": self._extract_field(section_text, ("收费方式",)),
                "reducible": self._extract_bool(self._extract_field(section_text, ("减免",))),
                "notes": "",
            })
        return fees

    def _extract_service_windows(self, section_text: str) -> list[dict[str, Any]]:
        rows = self._extract_markdown_table(section_text)
        windows: list[dict[str, Any]] = []
        for row in rows:
            windows.append({
                "window_name": self._pick_first(row, ("窗口名称", "名称")),
                "location": self._pick_first(row, ("办理地点", "地址", "地点")),
                "office_phone": self._pick_first(row, ("办公电话", "联系电话", "电话")),
                "office_hours": self._pick_first(row, ("办公时间", "工作时间")),
                "navigation": self._pick_first(row, ("位置指引", "导航")),
                "scope": self._pick_first(row, ("服务范围", "办理范围")),
            })
        return windows

    def _extract_legal_basis(self, section_text: str) -> list[dict[str, Any]]:
        rows = self._extract_markdown_table(section_text)
        legal_basis: list[dict[str, Any]] = []
        for row in rows:
            legal_basis.append({
                "law_name": self._pick_first(row, ("法律法规名称", "法律名称", "依据名称")),
                "document_no": self._pick_first(row, ("文号",)),
                "article_no": self._pick_first(row, ("条款号", "条款")),
                "issuing_body": self._pick_first(row, ("颁布机关", "制定机关")),
                "effective_date": self._normalize_date(self._pick_first(row, ("实施日期", "生效日期"))),
                "article_content": self._pick_first(row, ("条款内容", "内容")),
            })
        return legal_basis

    def _extract_consultation_and_supervision(
        self,
        section_text: str,
    ) -> tuple[dict[str, Any], list[str]]:
        lines = [line.strip() for line in (section_text or "").splitlines() if line.strip()]
        consultation_lines = [
            line for line in lines
            if any(keyword in line for keyword in _CONSULTATION_KEYWORDS)
        ]
        complaint_lines = [
            line for line in lines
            if any(keyword in line for keyword in _COMPLAINT_KEYWORDS)
        ]

        warnings: list[str] = []
        if not consultation_lines and not complaint_lines and section_text.strip():
            warnings.append("consultation and complaint channels not distinguished")
            consultation_text = section_text
            complaint_text = ""
        else:
            consultation_text = "\n".join(consultation_lines)
            complaint_text = "\n".join(complaint_lines)
            if consultation_text and complaint_text:
                consultation_phones = self._unique_matches(_PHONE_RE, consultation_text)
                complaint_phones = self._unique_matches(_PHONE_RE, complaint_text)
                consultation_urls = self._unique_matches(_URL_RE, consultation_text)
                complaint_urls = self._unique_matches(_URL_RE, complaint_text)
                if (
                    consultation_phones == complaint_phones
                    and consultation_urls == complaint_urls
                    and (consultation_phones or consultation_urls)
                ):
                    warnings.append("consultation and complaint channels not distinguished")

        return {
            "consultation_phones": self._unique_matches(_PHONE_RE, consultation_text),
            "consultation_urls": self._unique_matches(_URL_RE, consultation_text),
            "complaint_phones": self._unique_matches(_PHONE_RE, complaint_text),
            "complaint_urls": self._unique_matches(_URL_RE, complaint_text),
        }, warnings

    def _extract_review_info(self, section_text: str) -> dict[str, Any]:
        return {
            "power_level": self._extract_field(section_text, ("行使层级",)),
            "power_source": self._extract_field(section_text, ("权力来源",)),
            "service_forms": self._split_items(self._extract_field(section_text, ("审批服务形式", "服务形式"))),
            "business_system": self._extract_field(section_text, ("业务系统",)),
        }

    def _extract_acceptance_info(self, section_text: str) -> dict[str, Any]:
        return {
            "service_targets": self._split_items(self._extract_field(section_text, ("服务对象",))),
            "natural_person_topics": [],
            "legal_person_topics": [],
            "local_feature_topics": [],
            "application_scope": self._extract_field(section_text, ("申请内容", "受理范围")),
            "acceptance_conditions": self._fallback_list_rows(section_text),
        }

    def _build_root_fields(
        self,
        matter_identity: dict[str, Any],
        basic_info: dict[str, Any],
        cross_region_service: list[dict[str, Any]],
        materials: list[dict[str, Any]],
        fees: list[dict[str, Any]],
        service_windows: list[dict[str, Any]],
        legal_basis: list[dict[str, Any]],
        quality: dict[str, Any],
    ) -> dict[str, Any]:
        """Flatten the nested sections into top-level fields.

        The flat projection duplicates key values (names, codes, scores) and
        concatenates searchable names into ``guide_search_text``, presumably
        for indexing/filtering downstream — verify against the consumer.
        """
        # Only the first cross-region entry feeds the flat fields (the
        # extractor currently emits at most one).
        cross_region_scope = cross_region_service[0]["service_scope_type"] if cross_region_service else ""
        cross_region_summary = cross_region_service[0]["regions_summary"] if cross_region_service else []
        promised_days = self._extract_int((basic_info.get("promised_time_limit") or {}).get("duration"))
        legal_days = self._extract_int((basic_info.get("legal_time_limit") or {}).get("duration"))
        # Free-text blob of every searchable name in the guide.
        guide_search_parts = [
            matter_identity.get("matter_name", ""),
            " ".join(matter_identity.get("colloquial_names", [])),
            " ".join(item.get("material_name", "") for item in materials),
            " ".join(item.get("fee_name", "") for item in fees),
            " ".join(item.get("window_name", "") for item in service_windows),
            " ".join(item.get("law_name", "") for item in legal_basis),
        ]
        return {
            "matter_name": matter_identity.get("matter_name", ""),
            "colloquial_names": matter_identity.get("colloquial_names", []),
            "matter_type": matter_identity.get("matter_type", ""),
            "implementation_code": matter_identity.get("implementation_code", ""),
            "basic_code": matter_identity.get("basic_code", ""),
            "business_item_code": matter_identity.get("business_item_code", ""),
            "matter_version": matter_identity.get("matter_version", ""),
            "implementing_subject": matter_identity.get("implementing_subject", ""),
            "implementing_subject_nature": matter_identity.get("subject_nature", ""),
            "delegated_department": matter_identity.get("delegated_department", ""),
            "service_objects": basic_info.get("service_object", []),
            "service_modes": basic_info.get("service_modes", []),
            "online_depth": basic_info.get("online_depth", ""),
            "hall_required": basic_info.get("hall_required"),
            "express_supported": basic_info.get("express_supported"),
            "reservation_supported": basic_info.get("reservation_supported"),
            "must_onsite": basic_info.get("must_onsite"),
            "must_onsite_reason": basic_info.get("must_onsite_reason", ""),
            "visit_count_to_hall": basic_info.get("visit_count_to_hall"),
            "promised_time_limit_days": promised_days,
            "legal_time_limit_days": legal_days,
            "handled_org_names": [matter_identity.get("implementing_subject", "")] if matter_identity.get("implementing_subject") else [],
            "region_names": cross_region_summary,
            "linked_matter_ids": [],
            "material_names": [item.get("material_name", "") for item in materials if item.get("material_name")],
            "fee_names": [item.get("fee_name", "") for item in fees if item.get("fee_name")],
            "window_names": [item.get("window_name", "") for item in service_windows if item.get("window_name")],
            "legal_basis_names": [item.get("law_name", "") for item in legal_basis if item.get("law_name")],
            "cross_region_scope": cross_region_scope,
            "cross_region_summary": cross_region_summary,
            "guide_search_text": " ".join(part for part in guide_search_parts if part).strip(),
            "needs_review": quality.get("needs_review", False),
            "completeness_score": quality.get("completeness_score", 0.0),
            "confidence_score": quality.get("confidence_score", 0.0),
        }

    def _build_quality(
        self,
        raw_sections: dict[str, str],
        materials: list[dict[str, Any]],
        fees: list[dict[str, Any]],
        service_windows: list[dict[str, Any]],
        warnings: list[str],
    ) -> dict[str, Any]:
        present_sections = len(raw_sections)
        completeness = min(0.35 + present_sections * 0.05, 0.95)
        if materials:
            completeness += 0.1
        if fees:
            completeness += 0.05
        if service_windows:
            completeness += 0.05
        completeness = min(completeness, 0.98)
        confidence = max(0.5, completeness - len(warnings) * 0.03)
        missing = []
        if not materials:
            missing.append("materials")
        if not service_windows:
            missing.append("service_windows")
        return {
            "completeness_score": round(completeness, 4),
            "confidence_score": round(confidence, 4),
            "missing_fields": missing,
            "needs_review": bool(warnings),
            "warnings": warnings,
        }

    def _build_org_bindings(self, matter_identity: dict[str, Any]) -> list[dict[str, Any]]:
        name = matter_identity.get("implementing_subject", "")
        if not name:
            return []
        return [{"name": name, "organization_id": "", "role": "handled_by"}]

    def _build_profile_id(
        self,
        doc_id: str,
        content_hash: str,
        matter_identity: dict[str, Any],
    ) -> str:
        seed = f"{doc_id}:{content_hash}:{matter_identity.get('matter_name', '')}:{matter_identity.get('matter_version', '')}"
        return f"guide_{hashlib.sha256(seed.encode('utf-8')).hexdigest()[:12]}"

    def _extract_field(self, text: str, labels: tuple[str, ...]) -> str:
        """Return the value of the first "label: value" field found in *text*.

        Scans line by line; a line matches when, after stripping leading list
        numbering, it starts with any of *labels* followed by a full-width
        "：" or ASCII ":". The value may continue over following lines:
        collection stops at the first blank line once some value text has
        been gathered, at the next labeled line, or at a section header.
        Returns "" when no label matches.
        """
        if not text:
            return ""
        lines = text.splitlines()
        for idx, line in enumerate(lines):
            # Drop leading numbering such as "1、" or "（二）" before matching.
            clean_line = self._normalize_field_line(line)
            for label in labels:
                # Accept both full-width and half-width colon separators.
                for prefix in (f"{label}：", f"{label}:"):
                    if not clean_line.startswith(prefix):
                        continue
                    value_lines = [clean_line[len(prefix):].strip()]
                    # Gather continuation lines belonging to this field.
                    for next_line in lines[idx + 1:]:
                        normalized_next = next_line.strip()
                        if not normalized_next:
                            # Blank line terminates the value only after some
                            # content was collected; leading blanks are skipped.
                            if any(value_lines):
                                break
                            continue
                        # Next labeled field or section header ends the value.
                        if self._looks_like_labeled_line(normalized_next) or self._match_section_header(normalized_next):
                            break
                        value_lines.append(normalized_next)
                    value = "\n".join(item for item in value_lines if item).strip()
                    if len(value_lines) > 1:
                        logger.debug(
                            "service_guide_multiline_field",
                            labels=list(labels),
                            line_count=len(value_lines),
                        )
                    # First matching label wins; no further scanning.
                    return value
        return ""

    def _extract_time_limit(self, raw_text: str) -> dict[str, Any]:
        """Parse a processing time limit (e.g. "5个工作日") into structured parts.

        Returns the raw text plus duration, unit, and an is_working_day flag;
        the flag is None and the parts empty when nothing matched.
        """
        text = raw_text or ""
        match = _TIME_LIMIT_RE.search(text)
        if match is None:
            return {"raw_text": text, "duration": "", "unit": "", "is_working_day": None}
        unit = match.group("unit")
        return {
            "raw_text": text,
            "duration": match.group("duration"),
            "unit": unit,
            "is_working_day": "工作日" in unit,
        }

    def _extract_bool(self, raw_value: str) -> bool | None:
        """Map an affirmative/negative phrase to a boolean, or None if unclear."""
        text = (raw_value or "").strip()
        if not text:
            return None
        # Negative cues are checked first: e.g. "不支持" contains "支持".
        for words, verdict in ((_FALSE_WORDS, False), (_TRUE_WORDS, True)):
            if any(word in text for word in words):
                return verdict
        return None

    def _extract_int(self, raw_value: str | None) -> int | None:
        if not raw_value:
            return None
        match = re.search(r"\d+", str(raw_value))
        return int(match.group(0)) if match else None

    def _extract_amount(self, raw_value: str | None) -> float | None:
        if not raw_value:
            return None
        match = re.search(r"\d+(?:\.\d+)?", raw_value)
        return float(match.group(0)) if match else None

    def _extract_money_text(self, text: str) -> str:
        match = re.search(r"\d+(?:\.\d+)?\s*元", text or "")
        return match.group(0) if match else ""

    def _split_items(self, raw_value: str | None) -> list[str]:
        """Split a delimiter-separated enumeration into unique, ordered items."""
        if not raw_value:
            return []
        pieces = (piece.strip() for piece in _LIST_SPLIT_RE.split(raw_value))
        # dict.fromkeys deduplicates while keeping first-seen order.
        return list(dict.fromkeys(piece for piece in pieces if piece))

    def _extract_regions(self, text: str) -> list[str]:
        candidates = re.findall(r"[\u4e00-\u9fa5]{2,12}(?:省|市|区|县|旗|自治州)", text or "")
        return list(dict.fromkeys(candidates))

    def _trim_text(self, text: str, limit: int) -> str:
        clean = re.sub(r"\s+", " ", (text or "").strip())
        return clean[:limit]

    def _normalize_requirement_level(self, raw_value: str) -> str:
        value = (raw_value or "").strip()
        if not value:
            return ""
        if "非必要" in value or "可不提交" in value:
            return "optional"
        if "条件" in value or "仅" in value:
            return "conditional"
        if any(keyword in value for keyword in ("必要", "必需", "必须", "需提交", "应提交", "提交")):
            return "required"
        return ""

    def _normalize_text(self, text: str) -> str:
        return re.sub(r"\s+", "", (text or "").strip())

    def _normalize_date(self, raw_value: str) -> str | None:
        """Find the first date in *raw_value* and normalize separators to "-"."""
        match = _DATE_RE.search(raw_value or "")
        if match is None:
            return None
        return match.group(0).replace("/", "-").replace(".", "-")

    def _unique_matches(self, pattern: re.Pattern[str], text: str) -> list[str]:
        return list(
            dict.fromkeys(
                match.group(0).rstrip("，。；;,.）)]")
                for match in pattern.finditer(text or "")
            )
        )

    def _pick_first(self, row: dict[str, str], labels: tuple[str, ...]) -> str:
        for label in labels:
            for header, value in row.items():
                if label in header:
                    return value.strip()
        return ""

    def _extract_markdown_table(self, text: str) -> list[dict[str, str]]:
        """Parse the first markdown pipe table in *text* into header->cell dicts.

        A table is a header line with at least two "|" followed by a
        separator line (---). Rows whose column count differs from the
        header are skipped with a debug log; output is capped at
        _MAX_TABLE_ROWS rows. Returns [] when no table is found.
        """
        lines = [raw.strip() for raw in (text or "").splitlines() if raw.strip()]
        if len(lines) < 2:
            return []

        # Locate the header: a pipe-heavy line directly above a separator row.
        start = next(
            (
                i
                for i in range(len(lines) - 1)
                if lines[i].count("|") >= 2 and _TABLE_SEP_RE.match(lines[i + 1])
            ),
            -1,
        )
        if start < 0:
            return []

        headers = self._split_table_row(lines[start])
        parsed: list[dict[str, str]] = []
        for candidate in lines[start + 2:]:
            if candidate.count("|") < 2:
                # First non-table line ends the table.
                break
            cells = self._split_table_row(candidate)
            if len(cells) != len(headers):
                logger.debug(
                    "service_guide_table_row_skipped",
                    reason="column_count_mismatch",
                    expected=len(headers),
                    actual=len(cells),
                )
                continue
            parsed.append(dict(zip(headers, cells)))
            if len(parsed) >= _MAX_TABLE_ROWS:
                logger.warning(
                    "service_guide_table_truncated",
                    max_rows=_MAX_TABLE_ROWS,
                )
                break
        return parsed

    def _normalize_field_line(self, line: str) -> str:
        return re.sub(r"^[0-9一二三四五六七八九十()（）.、\s-]+", "", line.strip())

    def _looks_like_labeled_line(self, line: str) -> bool:
        """True when *line* (numbering removed) starts with a "label:" header."""
        candidate = self._normalize_field_line(line)
        return _FIELD_HEADER_RE.match(candidate) is not None

    def _split_table_row(self, line: str) -> list[str]:
        stripped = line.strip().strip("|")
        return [cell.strip() for cell in stripped.split("|")]

    def _fallback_list_rows(self, text: str) -> list[str]:
        rows = []
        for line in (text or "").splitlines():
            stripped = line.strip()
            if not stripped or len(stripped) < 3:
                continue
            if any(keyword in stripped for keyword in ("材料名称", "收费项目", "窗口名称")):
                continue
            rows.append(stripped)
        return rows[:50]