"""Adapter output contract tests — gatekeeper for §7.5.7.

Two layers of tests:

  A. **Contract self-tests** — the Pydantic models in
     `govcrawler.adapters.contract` reject malformed input (proves the
     guard is real, not ceremonial).

  B. **Real-data projection** — load the gkmlpt API JSON we captured in
     `data/probe/verify_httpx.json` and mechanically project each
     article into `CrawlItem`. If the contract is well-designed, this
     should never raise; if it does, the contract is too strict or the
     adapter will need a cleanup step before constructing `CrawlItem`.

New adapters add their own layer-B tests here (or in a sibling file)
pointing at their saved probe sample. Code review blocks the merge when
contract tests fail, enforcing §7.5.7 "no merge unless the contract passes".
"""
from __future__ import annotations

import hashlib
import json
import re
from datetime import datetime, timezone
from pathlib import Path

import pytest
from pydantic import ValidationError

from govcrawler.adapters.contract import (
    AttachmentItem,
    CrawlItem,
    CrawlLogItem,
    FetchStrategy,
    Status,
)
from govcrawler.utils.url_norm import url_hash as compute_url_hash

PROBE = Path(__file__).resolve().parents[1] / "data" / "probe"

# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
SHA_EMPTY = hashlib.sha256(b"").hexdigest()  # valid 64-char hex for fixtures


def _make_minimal_item(**overrides) -> dict:
    """Smallest dict that should pass CrawlItem validation."""
    base = {
        "site_id": "qingcheng_fgw",
        "url": "http://www.qingcheng.gov.cn/post/1.html",
        "url_hash": compute_url_hash("http://www.qingcheng.gov.cn/post/1.html"),
        "title": "测试标题",
    }
    base.update(overrides)
    return base


# ===========================================================================
# LAYER A — Contract self-tests
# ===========================================================================
class TestCrawlItemBasics:
    def test_minimal_item_parses(self):
        item = CrawlItem(**_make_minimal_item())
        assert item.status == Status.RAW
        assert item.attachments == []
        assert item.native_post_id is None  # allowed to be None

    def test_empty_title_rejected(self):
        with pytest.raises(ValidationError):
            CrawlItem(**_make_minimal_item(title=""))

    def test_bad_url_hash_rejected(self):
        with pytest.raises(ValidationError):
            CrawlItem(**_make_minimal_item(url_hash="not-hex"))

    def test_extra_fields_forbidden(self):
        # extra=forbid guards against adapter typos silently dropping fields
        with pytest.raises(ValidationError):
            CrawlItem(**_make_minimal_item(sneaky="oops"))

    def test_content_simhash_forbidden(self):
        # §5.4 explicitly drops content_simhash; contract must refuse it
        with pytest.raises(ValidationError):
            CrawlItem(**_make_minimal_item(content_simhash="deadbeefcafe1234"))


class TestPublishTime:
    def test_naive_datetime_rejected(self):
        with pytest.raises(ValidationError):
            CrawlItem(**_make_minimal_item(publish_time=datetime(2026, 4, 1)))

    def test_utc_datetime_accepted(self):
        dt = datetime(2026, 4, 1, tzinfo=timezone.utc)
        item = CrawlItem(**_make_minimal_item(publish_time=dt))
        assert item.publish_time == dt

    def test_absurd_year_rejected(self):
        with pytest.raises(ValidationError):
            CrawlItem(**_make_minimal_item(
                publish_time=datetime(1980, 1, 1, tzinfo=timezone.utc)
            ))


class TestAttachmentConsistency:
    def _att(self, **k):
        base = {
            "file_path": "storage/a.pdf",
            "file_hash": SHA_EMPTY,
            "source_url": "http://x/a.pdf",
        }
        base.update(k)
        return AttachmentItem(**base)

    def test_has_attachment_requires_list(self):
        with pytest.raises(ValidationError):
            CrawlItem(**_make_minimal_item(has_attachment=True))

    def test_list_requires_has_attachment(self):
        with pytest.raises(ValidationError):
            CrawlItem(**_make_minimal_item(attachments=[self._att()]))

    def test_consistent_pair_ok(self):
        item = CrawlItem(**_make_minimal_item(
            has_attachment=True, attachments=[self._att()]
        ))
        assert item.has_attachment and len(item.attachments) == 1

    def test_bad_attachment_hash_rejected(self):
        with pytest.raises(ValidationError):
            AttachmentItem(file_path="x", file_hash="nope", source_url="http://x")


class TestReadyStatus:
    def test_ready_without_paths_rejected(self):
        with pytest.raises(ValidationError):
            CrawlItem(**_make_minimal_item(status=Status.READY))

    def test_ready_with_paths_ok(self):
        item = CrawlItem(**_make_minimal_item(
            status=Status.READY,
            content_text="正文",
            raw_html_path="storage/1.html",
            text_path="storage/1.txt",
        ))
        assert item.status == Status.READY


class TestCrawlLogItem:
    def test_success_with_error_msg_rejected(self):
        with pytest.raises(ValidationError):
            CrawlLogItem(
                site_id="x", strategy=FetchStrategy.HTTPX,
                success=True, error_msg="oops",
            )

    def test_fail_without_diagnostic_rejected(self):
        with pytest.raises(ValidationError):
            CrawlLogItem(site_id="x", strategy=FetchStrategy.HTTPX, success=False)

    def test_happy_path(self):
        log = CrawlLogItem(
            site_id="x", strategy=FetchStrategy.HTTPX, success=True,
            http_status=200, duration_ms=123,
        )
        assert log.strategy == FetchStrategy.HTTPX
        assert log.occurred_at.tzinfo is timezone.utc


# ===========================================================================
# LAYER B — Project real gkmlpt probe data through the contract
# ===========================================================================
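# A sketch of the probe-report shape this layer assumes (only the keys the code
# below actually reads, with placeholder values; the real report may carry more
# fields):
#
#   [{"label": "<site_id>",
#     "sample_article": {"id": <int>, "title": "...",
#                        "url": ".../content/<digits>/<bucket>/post_<id>.html",
#                        "first_publish_time": <unix seconds, may be absent>,
#                        "date": <unix seconds fallback>,
#                        "publisher": "...",
#                        "document_number": "...",
#                        "identifier": "..."}},
#    ...]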
def _load_gkmlpt_samples() -> list[dict]:
    """Grab the sample_article dicts from the httpx verify report."""
    report = PROBE / "verify_httpx.json"
    if not report.exists():
        pytest.skip(f"probe report not found; run scripts/verify_gkmlpt_httpx.py first: {report}")
    data = json.loads(report.read_text(encoding="utf-8"))
    return [r for r in data if r.get("sample_article")]


def _project_gkmlpt_article(sample: dict, site_id: str) -> CrawlItem:
    """Mechanical projection of gkmlpt list-API article → CrawlItem.

    This is what the real gkmlpt adapter's list-parse step must produce.
    This test-level version is kept minimal and explicit so that contract
    violations surface instead of being silently coerced.
    """
    a = sample["sample_article"]
    # `first_publish_time` is unix seconds; some rows have only `date`.
    ts = a.get("first_publish_time") or a.get("date")
    publish_time = (
        datetime.fromtimestamp(ts, tz=timezone.utc) if ts is not None else None
    )
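    # Illustrative conversion (hypothetical timestamp, not taken from probe data):
    #   ts == 1711929600  →  datetime(2024, 4, 1, tzinfo=timezone.utc)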
    return CrawlItem(
        site_id=site_id,
        native_post_id=str(a["id"]),
        url=a["url"],
        url_hash=compute_url_hash(a["url"]),
        title=a["title"],
        publish_time=publish_time,
        source_raw=a.get("publisher"),
        publisher=a.get("publisher"),
        doc_no=a.get("document_number") or None,
        index_no=a.get("identifier") or None,
        # list stage: no body yet → status stays raw
        status=Status.RAW,
    )


@pytest.mark.parametrize(
    "report",
    _load_gkmlpt_samples() if (PROBE / "verify_httpx.json").exists() else [],
    ids=lambda r: r["label"],
)
def test_gkmlpt_sample_projects_cleanly(report):
    site_id = report["label"]
    item = _project_gkmlpt_article(report, site_id=site_id)
    # basic invariants that matter to RAG
    assert item.site_id == site_id
    assert item.native_post_id and item.native_post_id.isdigit()
    assert item.url_hash and len(item.url_hash) == 64
    assert item.title
    assert item.publish_time is not None
    assert item.publish_time.tzinfo is timezone.utc


def test_gkmlpt_bucket_invariant_holds():
    """url bucket = floor(native_post_id / 1000). Not part of the contract
    itself, but a gkmlpt-specific sanity check useful to keep near the
    contract tests (catches CMS format drift early)."""
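    # Worked example (hypothetical id, not taken from probe data):
    #   native_post_id 123456 → bucket 123456 // 1000 == 123, so the URL should
    #   end in .../content/<digits>/123/post_123456.html.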

    for report in _load_gkmlpt_samples():
        a = report["sample_article"]
        pid = a["id"]
        m = re.search(r"/content/\d+/(\d+)/post_(\d+)\.html", a["url"])
        assert m, f"{report['label']}: url shape drifted: {a['url']}"
        assert m.group(1) == str(pid // 1000)
        assert m.group(2) == str(pid)
