from __future__ import annotations

from types import SimpleNamespace

from govcrawler.rag.exporter import (
    KNOWLEDGE_CATEGORY,
    RagExporter,
    article_doc_id,
    attachment_doc_id,
    build_article_metadata,
    build_attachment_metadata,
    split_subject_words,
)


def _article(**overrides):
    base = {
        "id": 123,
        "title": "清远市政务公开测试",
        "url": "https://www.example.gov.cn/zwgk/post_123.html",
        "url_hash": "a" * 64,
        "native_post_id": "post_123",
        "doc_no": "清府〔2026〕1号",
        "publisher": "清远市人民政府",
        "source_raw": "清远市人民政府门户网站",
        "publish_date": None,
        "publish_time": None,
        "topic_words": "政务服务、信息公开, 数据",
        "channel_name": "政务公开",
        "channel_path": "政务公开/通知公告",
        "content_category": "主题分类",
        "content_subcategory": "子类",
        "open_category": "主动公开",
        "index_no": "000000/2026-00001",
        "metadata_json": {"public_meta": {"发布机构": "清远市人民政府"}},
        "site": SimpleNamespace(site_code="gdqy"),
        "target": SimpleNamespace(target_code="gdqy__zwgk"),
    }
    base.update(overrides)
    return SimpleNamespace(**base)


def test_doc_ids_are_stable_and_windows_filename_safe():
    """Doc ids are deterministic and avoid ':' (illegal in Windows filenames)."""
    article_id = article_doc_id(12)
    attachment_id = attachment_doc_id(12, 34)

    assert article_id == "govcrawler_article_12"
    assert attachment_id == "govcrawler_article_12_attachment_34"
    for doc_id in (article_id, attachment_id):
        assert ":" not in doc_id


def test_split_subject_words_deduplicates_common_separators():
    """Chinese and ASCII separators all split; repeated words collapse to one."""
    expected = ["政务服务", "信息公开", "数据"]

    assert split_subject_words("政务服务、信息公开, 数据；政务服务") == expected


def test_article_metadata_maps_all_govcrawler_docs_to_information_collection():
    """Article metadata carries provenance fields and the fixed knowledge category."""
    meta = build_article_metadata(_article(), filename="post_123.txt")

    expected = {
        "doc_id": "govcrawler_article_123",
        "knowledge_category": KNOWLEDGE_CATEGORY,
        "acl_ids": [],
        "doc_type": "网页正文",
        "doc_number": "清府〔2026〕1号",
        "issuing_org": "清远市人民政府",
        "source_system": "GovCrawler",
        "source_article_id": "123",
        "source_site_code": "gdqy",
        "source_target_code": "gdqy__zwgk",
        "source_url": "https://www.example.gov.cn/zwgk/post_123.html",
        "subject_words": ["政务服务", "信息公开", "数据"],
    }
    for key, value in expected.items():
        assert meta[key] == value


def test_attachment_metadata_keeps_article_provenance_and_attachment_id():
    """Attachment metadata inherits article provenance and adds attachment identity."""
    parent = _article()
    pdf = SimpleNamespace(
        id=456,
        file_name="附件.pdf",
        file_ext="pdf",
        file_hash="b" * 64,
        size_bytes=1024,
    )

    meta = build_attachment_metadata(parent, pdf, filename="附件.pdf")

    expected = {
        "doc_id": "govcrawler_article_123_attachment_456",
        "knowledge_category": KNOWLEDGE_CATEGORY,
        "doc_type": "网页附件",
        "source_article_id": "123",
        "source_attachment_id": "456",
    }
    for key, value in expected.items():
        assert meta[key] == value
    assert meta["source_metadata"]["attachment_file_hash"] == "b" * 64


def test_export_document_waits_for_final_ingest_status(tmp_path):
    """With wait_completion enabled, the exporter polls the ingest task to a final state."""

    class StubClient:
        # Sentinel recording which task id the exporter waited on.
        waited_for = None

        def ingest_file(self, file_path, metadata):
            return {"status": "queued", "task_id": "task-1"}

        def wait_for_task(self, task_id):
            self.waited_for = task_id
            return {"task_id": task_id, "status": "COMPLETED"}

    stub = StubClient()
    exporter = RagExporter(
        settings=SimpleNamespace(rag_export_wait_completion=True),
        client=stub,
    )
    payload = tmp_path / "source.txt"
    payload.write_text("hello", encoding="utf-8")

    document = exporter._export_document(
        file_path=payload,
        metadata={"doc_id": "doc-1", "title": "Doc 1", "original_filename": "source.txt"},
        kind="article",
        dry_run=False,
    )

    assert stub.waited_for == "task-1"
    assert (document.status, document.task_id) == ("COMPLETED", "task-1")


def test_export_pending_skips_when_ingest_url_is_empty():
    """An empty ingest URL short-circuits the export without touching the client."""

    class ForbiddenClient:
        def ingest_file(self, file_path, metadata):
            raise AssertionError("ingest_file should not be called")

    exporter = RagExporter(
        settings=SimpleNamespace(
            rag_ingest_url="",
            rag_export_wait_completion=True,
        ),
        client=ForbiddenClient(),
    )

    result = exporter.export_pending()

    assert (result.total, result.exported, result.failed) == (0, 0, 0)
    assert result.items == []
