from __future__ import annotations

from types import SimpleNamespace

import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from govcrawler.models import Article, ArticleRagPushLog, Base, CrawlSite, CrawlTarget
from govcrawler.rag.exporter import (
    KNOWLEDGE_CATEGORY,
    RagExporter,
    RagExportError,
    RagIngestClient,
    article_doc_id,
    attachment_doc_id,
    build_article_metadata,
    build_attachment_metadata,
    split_subject_words,
)


def _article(**overrides):
    base = {
        "id": 123,
        "title": "清远市政务公开测试",
        "url": "https://www.example.gov.cn/zwgk/post_123.html",
        "url_hash": "a" * 64,
        "native_post_id": "post_123",
        "doc_no": "清府〔2026〕1号",
        "publisher": "清远市人民政府",
        "source_raw": "清远市人民政府门户网站",
        "publish_date": None,
        "publish_time": None,
        "topic_words": "政务服务、信息公开, 数据",
        "channel_name": "政务公开",
        "channel_path": "政务公开/通知公告",
        "content_category": "主题分类",
        "content_subcategory": "子类",
        "open_category": "主动公开",
        "index_no": "000000/2026-00001",
        "metadata_json": {"public_meta": {"发布机构": "清远市人民政府"}},
        "site": SimpleNamespace(site_code="gdqy"),
        "target": SimpleNamespace(target_code="gdqy__zwgk"),
    }
    base.update(overrides)
    return SimpleNamespace(**base)


def test_doc_ids_are_stable_and_windows_filename_safe():
    """Doc ids are deterministic and avoid ':' (illegal in Windows filenames)."""
    article_id = article_doc_id(12)
    attachment_id = attachment_doc_id(12, 34)

    assert article_id == "govcrawler_article_12"
    assert attachment_id == "govcrawler_article_12_attachment_34"
    assert ":" not in article_id
    assert ":" not in attachment_id


def test_split_subject_words_deduplicates_common_separators():
    """Mixed CJK/ASCII separators are split; duplicates drop, order is kept."""
    words = split_subject_words("政务服务、信息公开, 数据；政务服务")
    assert words == ["政务服务", "信息公开", "数据"]


def test_article_metadata_maps_govcrawler_docs_to_public_knowledge_base():
    """Article metadata carries doc id, provenance fields and attachment links."""
    pdf = SimpleNamespace(id=456, file_name="attachment.pdf")
    meta = build_article_metadata(_article(attachments=[pdf]), filename="post_123.txt")

    # Scalar fields checked table-style so a failure names the offending key.
    expected_scalars = {
        "doc_id": "govcrawler_article_123",
        "knowledge_category": KNOWLEDGE_CATEGORY,
        "acl_ids": [],
        "doc_type": "网页正文",
        "doc_number": "清府〔2026〕1号",
        "issuing_org": "清远市人民政府",
        "source_system": "GovCrawler",
        "source_article_id": "123",
        "source_site_code": "gdqy",
        "source_target_code": "gdqy__zwgk",
        "source_url": "https://www.example.gov.cn/zwgk/post_123.html",
    }
    for key, expected in expected_scalars.items():
        assert meta[key] == expected, key

    assert meta["related_docs"] == [
        {
            "doc_id": "govcrawler_article_123_attachment_456",
            "title": "attachment.pdf",
            "relation_type": "附件",
        }
    ]
    assert meta["subject_words"] == ["政务服务", "信息公开", "数据"]


def test_article_metadata_prefers_registry_admin_level():
    """A registry-supplied admin level wins over anything derived elsewhere."""
    article = _article()
    article._registry_admin_level = "county"

    meta = build_article_metadata(article, filename="post_123.txt")
    assert meta["administrative_level"] == "county"


def test_attachment_metadata_keeps_article_provenance_and_attachment_id():
    """Attachment docs link back to the article and keep both source URLs."""
    article = _article()
    pdf = SimpleNamespace(
        id=456,
        file_name="附件.pdf",
        file_ext="pdf",
        file_hash="b" * 64,
        size_bytes=1024,
        source_url="https://www.example.gov.cn/files/attachment.pdf",
    )

    meta = build_attachment_metadata(article, pdf, filename="附件.pdf")

    assert meta["doc_id"] == "govcrawler_article_123_attachment_456"
    assert meta["knowledge_category"] == KNOWLEDGE_CATEGORY
    assert meta["doc_type"] == "网页附件"
    assert meta["source_article_id"] == "123"
    assert meta["source_attachment_id"] == "456"
    assert meta["source_url"] == "https://www.example.gov.cn/files/attachment.pdf"

    provenance = meta["source_metadata"]
    assert provenance["article_source_url"] == (
        "https://www.example.gov.cn/zwgk/post_123.html"
    )
    assert provenance["attachment_source_url"] == (
        "https://www.example.gov.cn/files/attachment.pdf"
    )
    assert provenance["attachment_file_hash"] == "b" * 64

    assert meta["related_docs"] == [
        {
            "doc_id": "govcrawler_article_123",
            "title": article.title,
            "relation_type": "正文",
        }
    ]


def test_export_document_waits_for_final_ingest_status(tmp_path):
    """With wait_completion enabled, export polls the task until terminal."""

    class RecordingClient:
        # Class-level default; overwritten per-instance by wait_for_task.
        waited_for = None

        def ingest_file(self, file_path, metadata):
            return {"status": "queued", "task_id": "task-1"}

        def wait_for_task(self, task_id):
            self.waited_for = task_id
            return {"task_id": task_id, "status": "COMPLETED"}

    source_file = tmp_path / "source.txt"
    source_file.write_text("hello", encoding="utf-8")
    client = RecordingClient()
    exporter = RagExporter(
        settings=SimpleNamespace(rag_export_wait_completion=True),
        client=client,
    )

    document = exporter._export_document(
        file_path=source_file,
        metadata={"doc_id": "doc-1", "title": "Doc 1", "original_filename": "source.txt"},
        kind="article",
        dry_run=False,
    )

    assert client.waited_for == "task-1"
    assert document.status == "COMPLETED"
    assert document.task_id == "task-1"


def test_ingest_client_uses_gov_public_kb_endpoint_and_token(tmp_path):
    """Ingest must POST to the configured gov-public-kb URL with a Bearer token."""
    ingest_url = "http://rag.example.com/api/v1/gov-public-kb/ingest/document"

    class StubResponse:
        status_code = 200
        text = '{"status":"queued","task_id":"task-1"}'

        def raise_for_status(self):
            return None

        def json(self):
            return {"status": "queued", "task_id": "task-1"}

    class RecordingHttpClient:
        def __init__(self):
            self.calls = []

        def post(self, url, headers, data, files):
            self.calls.append(
                {"url": url, "headers": headers, "data": data, "files": files}
            )
            return StubResponse()

        def close(self):
            return None

    source_file = tmp_path / "source.txt"
    source_file.write_text("hello", encoding="utf-8")
    http = RecordingHttpClient()
    client = RagIngestClient(
        SimpleNamespace(
            rag_gov_public_kb_ingest_url=ingest_url,
            rag_gov_public_kb_status_url="",
            rag_gov_public_kb_ingest_token="gov-public-token",
            rag_export_timeout_s=60.0,
        )
    )
    # Swap the real HTTP transport for the recorder.
    client._client = http

    result = client.ingest_file(
        source_file,
        {"doc_id": "doc-1", "original_filename": "source.txt"},
    )

    assert result["task_id"] == "task-1"
    call = http.calls[0]
    assert call["url"] == ingest_url
    assert call["headers"] == {"Authorization": "Bearer gov-public-token"}


def test_ingest_client_rejects_missing_gov_public_kb_token(tmp_path):
    """An empty token must raise (naming the env var) before any HTTP call."""
    source_file = tmp_path / "source.txt"
    source_file.write_text("hello", encoding="utf-8")
    settings = SimpleNamespace(
        rag_gov_public_kb_ingest_url=(
            "http://rag.example.com/api/v1/gov-public-kb/ingest/document"
        ),
        rag_gov_public_kb_status_url="",
        rag_gov_public_kb_ingest_token="",
        rag_export_timeout_s=60.0,
    )
    client = RagIngestClient(settings)

    with pytest.raises(RagExportError, match="RAG_GOV_PUBLIC_KB_INGEST_TOKEN"):
        client.ingest_file(
            source_file,
            {"doc_id": "doc-1", "original_filename": "source.txt"},
        )


def test_export_pending_logs_failure_when_ingest_url_is_empty(tmp_path, monkeypatch):
    """A blank gov-public-kb ingest URL fails the export before any HTTP call.

    Runs against a real sqlite database (via the project's db module) so the
    failure is verified as persisted on both the Article row and an
    ArticleRagPushLog entry.
    """
    class FailIfCalledClient:
        # Guard client: the export must fail on configuration alone and
        # never reach the ingest transport.
        def ingest_file(self, file_path, metadata):
            raise AssertionError("ingest_file should not be called")

    from govcrawler import db as db_mod

    # Point the (module-global) db layer at a throwaway sqlite file and
    # drop any cached engine/sessionmaker so the env var takes effect.
    db_path = tmp_path / "empty-url.db"
    monkeypatch.setenv("DB_URL", "sqlite:///" + str(db_path))
    db_mod._reset_for_tests()
    try:
        Base.metadata.create_all(db_mod.get_engine())
        Session = db_mod.get_sessionmaker()
        # Seed one exportable article (site -> target -> article, flushing
        # between inserts so FK ids are available).
        with Session() as s:
            site = CrawlSite(
                site_code="demo",
                base_url="https://demo.example.com",
                yaml_path="config/sites/demo.yaml",
            )
            s.add(site)
            s.flush()
            target = CrawlTarget(
                site_id=site.id,
                target_code="demo__news",
                entry_url="https://demo.example.com/news/",
            )
            s.add(target)
            s.flush()
            article = Article(
                site_id=site.id,
                target_id=target.id,
                url="https://demo.example.com/a.html",
                url_hash="d" * 64,
                title="Demo Article",
                status="ready",
                text_path="articles_text/a.txt",
                content_text="hello",
            )
            s.add(article)
            s.commit()

        # Legacy rag_ingest_url is set but the gov-public-kb URL is empty:
        # the exporter must treat that as a configuration error.
        settings = SimpleNamespace(
            rag_ingest_url="http://rag.example.com/api/v1/ingest/webhook/document",
            rag_gov_public_kb_ingest_url="",
            rag_export_wait_completion=True,
            rag_export_batch_size=50,
            rag_export_running_stale_s=21600,
        )
        exporter = RagExporter(settings=settings, client=FailIfCalledClient())

        result = exporter.export_pending(source="auto")

        assert result.total == 1
        assert result.exported == 0
        assert result.failed == 1
        assert result.items[0].status == "failed"
        # Failure must be persisted on the article AND in a push-log row.
        with Session() as s:
            article = s.query(Article).one()
            log = s.query(ArticleRagPushLog).one()
            assert article.rag_export_status == "failed"
            assert "RAG_GOV_PUBLIC_KB_INGEST_URL is empty" in (
                article.rag_export_error or ""
            )
            assert log.article_id == article.id
            assert log.source == "auto"
            assert log.status == "failed"
            assert log.file_count == 1
            assert log.duration_ms == 0
            assert "RAG_GOV_PUBLIC_KB_INGEST_URL is empty" in (log.error_msg or "")
    finally:
        # Restore the db module's global state for subsequent tests.
        db_mod._reset_for_tests()


def test_export_article_writes_push_log(tmp_path):
    """A successful single-article export records a completed push-log row."""
    class FakeClient:
        # Always queues and then reports the task as COMPLETED.
        def ingest_file(self, file_path, metadata):
            return {"status": "queued", "task_id": "task-123"}

        def wait_for_task(self, task_id):
            return {"task_id": task_id, "status": "COMPLETED"}

    # Private engine/session for this test; no shared db-module state.
    engine = create_engine("sqlite:///" + str(tmp_path / "rag-log.db"), future=True)
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine, expire_on_commit=False)
    # Materialize the article body file the exporter will upload.
    data_dir = tmp_path / "data"
    body = data_dir / "articles_text" / "a.txt"
    body.parent.mkdir(parents=True)
    body.write_text("hello", encoding="utf-8")

    with Session() as s:
        site = CrawlSite(
            site_code="demo",
            base_url="https://demo.example.com",
            yaml_path="config/sites/demo.yaml",
        )
        s.add(site)
        s.flush()
        target = CrawlTarget(
            site_id=site.id,
            target_code="demo__news",
            entry_url="https://demo.example.com/news/",
        )
        s.add(target)
        s.flush()
        article = Article(
            site_id=site.id,
            target_id=target.id,
            url="https://demo.example.com/a.html",
            url_hash="c" * 64,
            title="Demo Article",
            status="ready",
            text_path="articles_text/a.txt",
            content_text="hello",
        )
        s.add(article)
        s.commit()
        article = s.get(Article, article.id)
        # Touch relationships to force eager loading within this session.
        _ = article.site, article.target, article.attachments

        settings = SimpleNamespace(
            data_dir=str(data_dir),
            rag_ingest_url="http://rag.example.com/api/v1/ingest/webhook/document",
            rag_gov_public_kb_ingest_url="http://rag.example.com/api/v1/gov-public-kb/ingest/document",
            rag_export_wait_completion=True,
        )
        exporter = RagExporter(settings=settings, client=FakeClient())
        result = exporter.export_article(s, article, source="manual")

        assert result.status == "exported"
        log = s.query(ArticleRagPushLog).one()
        assert log.article_id == article.id
        assert log.source == "manual"
        assert log.status == "completed"
        assert log.file_count == 1
        assert log.duration_ms is not None
        assert log.task_ids == ["task-123"]
        # The log records the gov-public-kb endpoint, not the legacy webhook.
        assert log.rag_ingest_url == "http://rag.example.com/api/v1/gov-public-kb/ingest/document"


def test_export_article_marks_partial_failed_status_as_failed(tmp_path):
    """A PARTIAL_FAILED terminal ingest status is treated as a full failure."""
    class FakeClient:
        # Queues normally but terminates with PARTIAL_FAILED and an error.
        def ingest_file(self, file_path, metadata):
            return {"status": "queued", "task_id": "task-456"}

        def wait_for_task(self, task_id):
            return {"task_id": task_id, "status": "PARTIAL_FAILED", "error": "chunk failed"}

    # Private engine/session for this test; no shared db-module state.
    engine = create_engine("sqlite:///" + str(tmp_path / "rag-partial.db"), future=True)
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine, expire_on_commit=False)
    # Materialize the article body file the exporter will upload.
    data_dir = tmp_path / "data"
    body = data_dir / "articles_text" / "a.txt"
    body.parent.mkdir(parents=True)
    body.write_text("hello", encoding="utf-8")

    with Session() as s:
        site = CrawlSite(
            site_code="demo",
            base_url="https://demo.example.com",
            yaml_path="config/sites/demo.yaml",
        )
        s.add(site)
        s.flush()
        target = CrawlTarget(
            site_id=site.id,
            target_code="demo__news",
            entry_url="https://demo.example.com/news/",
        )
        s.add(target)
        s.flush()
        article = Article(
            site_id=site.id,
            target_id=target.id,
            url="https://demo.example.com/a.html",
            url_hash="e" * 64,
            title="Demo Article",
            status="ready",
            text_path="articles_text/a.txt",
            content_text="hello",
        )
        s.add(article)
        s.commit()
        article = s.get(Article, article.id)
        # Touch relationships to force eager loading within this session.
        _ = article.site, article.target, article.attachments

        settings = SimpleNamespace(
            data_dir=str(data_dir),
            rag_gov_public_kb_ingest_url="http://rag.example.com/api/v1/gov-public-kb/ingest/document",
            rag_export_wait_completion=True,
        )
        exporter = RagExporter(settings=settings, client=FakeClient())
        result = exporter.export_article(s, article, source="manual")

        assert result.status == "failed"
        assert "chunk failed" in (result.error or "")
        # The failure must propagate to the article row and the push log.
        s.refresh(article)
        assert article.rag_export_status == "failed"
        assert "chunk failed" in (article.rag_export_error or "")
        log = s.query(ArticleRagPushLog).one()
        assert log.status == "failed"
        assert log.file_count == 1
        assert log.task_ids == ["task-456"]
        assert "chunk failed" in (log.error_msg or "")