"""gkmlpt adapter — unit + real-probe integration.

Three layers:
  1. URL builder is pure + templatable
  2. `parse_list_response` projects real probe JSON without contract violations
  3. bucket-invariant probe catches CMS format drift
"""
from __future__ import annotations

import json
from pathlib import Path

import pytest

from govcrawler.adapters import get_adapter, gkmlpt
from govcrawler.adapters.contract import CrawlItem, Status
from govcrawler.models import Article
from scripts.backfill_gd_gkmlpt_metadata import (
    apply_values,
    values_from_detail_html,
    values_from_raw,
)

# Directory holding recorded probe output: `data/probe` under the directory
# one level above the one that contains this test module.
PROBE = Path(__file__).resolve().parents[1].joinpath("data", "probe")


# ---------------------------------------------------------------------------
# registry
# ---------------------------------------------------------------------------
def test_registry_resolves_gkmlpt():
    """Looking up the ``"gkmlpt"`` key returns the very same `gkmlpt` object imported here."""
    resolved = get_adapter("gkmlpt")
    assert resolved is gkmlpt


def test_registry_rejects_unknown():
    """An unregistered key raises `KeyError` whose message matches "unknown cms_adapter"."""
    bogus_key = "not_a_real_cms"
    with pytest.raises(KeyError, match="unknown cms_adapter"):
        get_adapter(bogus_key)


# ---------------------------------------------------------------------------
# URL builder
# ---------------------------------------------------------------------------
class TestBuildListUrl:
    """`gkmlpt.build_list_url`: default path template, per-site override, slash handling."""

    def test_default_template(self):
        """Without ``path_tpl`` the URL uses the ``/gkmlpt/api/all/`` path plus page and sid."""
        expected = (
            "http://fgw.qingcheng.gov.cn/qycsj/gkmlpt/api/all/1234"
            "?page=2&sid=abc"
        )
        built = gkmlpt.build_list_url(
            base_url="http://fgw.qingcheng.gov.cn",
            dept_path="qycsj",
            column_id="1234",
            page=2,
            sid="abc",
        )
        assert built == expected

    def test_custom_template(self):
        """Per-site override via adapter_params.list_api_path_tpl.

        The template already carries a query string (``?fmt=json``), so the
        page parameter is expected to be appended with ``&``.
        """
        override_tpl = "{base_url}/{dept_path}/api/v2/list/{column_id}?fmt=json"
        built = gkmlpt.build_list_url(
            base_url="http://example.gov.cn",
            dept_path="foo",
            column_id="77",
            page=1,
            path_tpl=override_tpl,
        )
        expected = (
            "http://example.gov.cn/foo/api/v2/list/77?fmt=json&page=1"
        )
        assert built == expected

    def test_trailing_slash_normalised(self):
        """Stray slashes around ``base_url`` / ``dept_path`` do not produce ``//`` in the path.

        ``page`` and ``sid`` are omitted here; the expected URL shows
        ``page=1`` and no ``sid`` parameter.
        """
        built = gkmlpt.build_list_url(
            base_url="http://x/",
            dept_path="/y/",
            column_id="9",
        )
        expected = "http://x/y/gkmlpt/api/all/9?page=1"
        assert built == expected


# ---------------------------------------------------------------------------
# list response projection
# ---------------------------------------------------------------------------
# One article row in the shape of a gkmlpt list-API entry, used as the shared
# fixture for the projection and backfill tests in this module.
_SAMPLE_ARTICLE = {
    "id": 2116964,
    "identifier": "006939748/2025-00175",
    "title": "测试通知",
    "url": "http://www.qingcheng.gov.cn/qyqcfgw/gkmlpt/content/2/2116/post_2116964.html",
    # Unix epoch seconds:
    "date": 1750262400,  # 2025-06-18T16:00:00Z
    "first_publish_time": 1751528380,  # 2025-07-03T07:39:40Z
    "display_publish_time": 1751507722,  # 2025-07-03T01:55:22Z
    "publisher": "清远市清城区发展和改革局",
    "document_number": "粤府办〔2025〕15号",
    # Validity flags / timestamps, all zero in this sample.
    "validity": 0,
    "is_expired": 0,
    "is_abolished": 0,
    "expired_time": 0,
    "abolished_time": 0,
    "classify_main_name": "其他文件",
    "classify_genre_name": "意见",
    "classify_theme_name": "公安、安全、司法",
}

# Minimal list payload: a single article under the "articles" key.
SAMPLE_MIN = {"articles": [_SAMPLE_ARTICLE]}


class TestParseListResponse:
    """`gkmlpt.parse_list_response`: field projection, input tolerance, fallbacks."""

    def test_projects_minimal_article(self):
        """Each field of the `SAMPLE_MIN` article is projected onto one `CrawlItem`."""
        raw_article = SAMPLE_MIN["articles"][0]
        projected = gkmlpt.parse_list_response(SAMPLE_MIN, site_id="qingcheng_fgw")
        assert len(projected) == 1
        item = projected[0]

        # Identity and lifecycle.
        assert isinstance(item, CrawlItem)
        assert item.native_post_id == "2116964"
        assert item.status == Status.RAW

        # Timestamps: publish_time must be timezone-aware.
        assert item.publish_time is not None
        assert item.publish_time.tzinfo is not None
        assert item.publish_time.date().isoformat() == "2025-07-03"
        assert item.publish_date is not None
        assert item.publish_date.isoformat() == "2025-06-19"

        # Validity: all expiry/abolish flags are 0 in the sample.
        assert item.is_effective is True
        assert item.expiry_date is None

        # Document metadata.
        assert item.index_no == "006939748/2025-00175"
        assert item.publisher == "清远市清城区发展和改革局"
        assert item.source_raw == "清远市清城区发展和改革局"
        assert item.doc_no == "粤府办〔2025〕15号"

        # Classification.
        assert item.content_category == "公安、安全、司法"
        assert item.content_subcategory == "意见"
        assert item.open_category == "公安、安全、司法、意见"

        # The untouched source row is kept under "raw".
        assert item.metadata_json == {"raw": raw_article}

    def test_accepts_bare_list(self):
        """A plain list of article rows (no ``{"articles": ...}`` wrapper) is accepted."""
        bare_rows = SAMPLE_MIN["articles"]
        projected = gkmlpt.parse_list_response(bare_rows, site_id="qingcheng_fgw")
        assert len(projected) == 1

    def test_invalid_payload_type_raises(self):
        """A payload that is a string raises `ContractViolation`."""
        from govcrawler.adapters.contract import ContractViolation

        with pytest.raises(ContractViolation):
            gkmlpt.parse_list_response("bad", site_id="x")  # type: ignore[arg-type]

    def test_skips_malformed_row_without_killing_page(self):
        """Bad rows are dropped individually; the one good row still comes through."""
        empty_url_row = {"id": 1, "url": ""}  # url present but empty → skip
        good_row = SAMPLE_MIN["articles"][0]
        noise_row = {"not_a_dict": None}  # dict with neither id nor url
        payload = {"articles": [empty_url_row, good_row, noise_row]}

        projected = gkmlpt.parse_list_response(payload, site_id="qingcheng_fgw")

        assert len(projected) == 1
        assert projected[0].native_post_id == "2116964"

    def test_string_date_fallback(self):
        """An ISO-8601 string in ``date`` (instead of epoch seconds) still yields a publish_time."""
        row = {
            "id": 42,
            "title": "t",
            "url": "http://x/content/0/0/post_42.html",
            "date": "2026-04-01T00:00:00Z",
        }
        projected = gkmlpt.parse_list_response({"articles": [row]}, site_id="x")
        first = projected[0]
        assert first.publish_time is not None
        assert first.publish_time.year == 2026

    def test_display_publish_time_zero_falls_back_to_first_publish_time(self):
        """``display_publish_time == 0`` must not leave publish_time unset."""
        row = {
            "id": 4889387,
            "title": "广东省发展改革委关于印发管理办法的通知",
            "url": "https://drc.gd.gov.cn/gkmlpt/content/4/4889/post_4889387.html",
            "display_publish_time": 0,
            "first_publish_time": 1777027943,
            "date": 1776960000,
            "publisher": "广东省发展和改革委员会",
            "document_number": "粤发改规〔2026〕4号",
            "identifier": "006939756/2026-00196",
        }
        projected = gkmlpt.parse_list_response({"articles": [row]}, site_id="gd_drc")

        assert len(projected) == 1
        only = projected[0]
        assert only.publish_time is not None
        assert only.publish_time.year == 2026
        assert only.doc_no == "粤发改规〔2026〕4号"

    def test_metadata_falls_back_to_main_category_for_non_policy_rows(self):
        """With empty genre/theme names, the main classification fills the category fields."""
        row = {
            "id": 43,
            "title": "t",
            "url": "http://x/content/0/0/post_43.html",
            "first_publish_time": 1771916570,
            "classify_main_name": "政务动态",
            "classify_genre_name": "",
            "classify_theme_name": "",
        }
        projected = gkmlpt.parse_list_response({"articles": [row]}, site_id="x")
        item = projected[0]
        assert item.content_category == "政务动态"
        assert item.content_subcategory is None
        assert item.open_category == "政务动态"

    def test_projects_expired_validity_fields(self):
        """``is_expired=1`` with an ``expired_time`` yields a non-effective item with an expiry date."""
        row = {
            "id": 44,
            "title": "expired",
            "url": "http://x/content/0/0/post_44.html",
            "first_publish_time": 1771916570,
            "is_expired": 1,
            # 2026-02-26T16:00:00Z; the expected expiry date is 2026-02-27.
            "expired_time": 1772121600,
        }
        projected = gkmlpt.parse_list_response({"articles": [row]}, site_id="x")
        item = projected[0]
        assert item.is_effective is False
        assert item.expiry_date is not None
        assert item.expiry_date.isoformat() == "2026-02-27"


# ---------------------------------------------------------------------------
# bucket invariant
# ---------------------------------------------------------------------------
class TestBucketInvariant:
    """`gkmlpt.verify_bucket_invariant`: checks an article's ``id`` against its ``url``."""

    def test_matches(self):
        """A ``content/2/2116/post_2116964.html`` URL is accepted for id 2116964."""
        article = {
            "id": 2116964,
            "url": "http://x/qycsj/gkmlpt/content/2/2116/post_2116964.html",
        }
        assert gkmlpt.verify_bucket_invariant(article)

    def test_mismatched_bucket_fails(self):
        """Same post id, but ``content/99/0/`` path segments → rejected."""
        article = {"id": 2116964, "url": "http://x/content/99/0/post_2116964.html"}
        outcome = gkmlpt.verify_bucket_invariant(article)
        assert not outcome

    def test_non_post_url_fails(self):
        """A URL with no ``content/.../post_<id>.html`` part at all → rejected."""
        article = {"id": 2116964, "url": "http://x/other.html"}
        outcome = gkmlpt.verify_bucket_invariant(article)
        assert not outcome


# ---------------------------------------------------------------------------
# real-probe integration — use verify_httpx.json sample articles
# ---------------------------------------------------------------------------
def _load_probe_samples() -> list[dict]:
    """Return the probe reports from ``verify_httpx.json`` that carry a sample article.

    Calls `pytest.skip` when the probe file does not exist. Reports whose
    ``sample_article`` entry is absent or falsy are left out.
    """
    probe_file = PROBE / "verify_httpx.json"
    if not probe_file.exists():
        pytest.skip(f"probe file missing: {probe_file}")
    with probe_file.open(encoding="utf-8") as handle:
        reports = json.load(handle)
    return [report for report in reports if report.get("sample_article")]


@pytest.mark.parametrize(
    "report",
    # Collected at import time; an absent probe file gives an empty parameter set.
    [] if not (PROBE / "verify_httpx.json").exists() else _load_probe_samples(),
    ids=lambda rep: rep["label"],
)
def test_real_probe_sample_projects_and_passes_bucket_check(report):
    """Each recorded sample article projects to exactly one item and passes the bucket check."""
    sample = report["sample_article"]
    projected = gkmlpt.parse_list_response(
        {"articles": [sample]},
        site_id=report["label"],
    )
    assert len(projected) == 1
    assert gkmlpt.verify_bucket_invariant(sample)


def test_gd_gkmlpt_backfill_values_from_raw():
    """`values_from_raw` turns the `SAMPLE_MIN` article row into backfill column values."""
    values = values_from_raw(SAMPLE_MIN["articles"][0])

    # Fields compared as plain strings.
    expected_text = {
        "native_post_id": "2116964",
        "index_no": "006939748/2025-00175",
        "publisher": "清远市清城区发展和改革局",
        "doc_no": "粤府办〔2025〕15号",
        "content_category": "公安、安全、司法",
        "content_subcategory": "意见",
        "open_category": "公安、安全、司法、意见",
    }
    for field, wanted in expected_text.items():
        assert values[field] == wanted

    # Temporal fields are compared via their ISO date form.
    assert values["publish_time"].date().isoformat() == "2025-07-03"
    assert values["publish_date"].isoformat() == "2025-06-19"


def test_gd_gkmlpt_backfill_values_from_detail_html():
    """`values_from_detail_html` reads the label/value cells of a detail-page metadata table."""
    detail_html = """
    <table>
      <tr>
        <td>索引号：</td><td><span>006939748/2025-00175</span></td>
        <td>分类：</td><td><span>公安、安全、司法、意见</span></td>
      </tr>
      <tr>
        <td>发布机构：</td><td><span>广东省人民政府办公厅</span></td>
        <td>成文日期：</td><td><span>2025-06-19</span></td>
      </tr>
      <tr>
        <td>文号：</td><td><span>粤府办〔2025〕15号</span></td>
        <td>发布日期：</td><td><span>2025-07-03</span></td>
      </tr>
      <tr>
        <td>主题词：</td><td colspan="3"><span>行政检查</span></td>
      </tr>
    </table>
    """
    values = values_from_detail_html(detail_html)

    # Fields compared as plain strings.
    expected_text = {
        "index_no": "006939748/2025-00175",
        "publisher": "广东省人民政府办公厅",
        "doc_no": "粤府办〔2025〕15号",
        "content_category": "公安、安全、司法",
        "content_subcategory": "意见",
        "open_category": "公安、安全、司法、意见",
        "topic_words": "行政检查",
    }
    for field, wanted in expected_text.items():
        assert values[field] == wanted

    # Temporal fields are compared via their ISO date form.
    assert values["publish_time"].date().isoformat() == "2025-07-03"
    assert values["publish_date"].isoformat() == "2025-06-19"


def test_gd_gkmlpt_backfill_corrects_legacy_category_values():
    """`apply_values(..., only_missing=False)` writes category and date onto an `Article`.

    The article starts with all three target fields set to ``None``; the test
    checks both the returned list of changed field names and the resulting
    attribute values.
    """
    sample_publish_date = values_from_raw(SAMPLE_MIN["articles"][0])["publish_date"]
    new_values = {
        "content_category": "公安、安全、司法",
        "content_subcategory": "意见",
        "publish_date": sample_publish_date,
    }
    article = Article(
        site_id=1,
        url="https://www.gd.gov.cn/gkmlpt/content/4/4738/post_4738821.html",
        url_hash="h",
        content_category=None,
        content_subcategory=None,
        publish_date=None,
    )

    touched = apply_values(article, new_values, only_missing=False)

    assert touched == ["content_category", "content_subcategory", "publish_date"]
    assert article.content_category == "公安、安全、司法"
    assert article.content_subcategory == "意见"
    assert article.publish_date.isoformat() == "2025-06-19"
