"""Phase-A schema v2 — contract alignment + invariant tests.

Three layers of gatekeeping:

  1. **Field alignment** — every field the Pydantic `CrawlItem` contract
     declares that maps to DB storage exists on `Article`. Catches drift
     when either side is edited in isolation.

  2. **dept_binding CHECK** — the DB refuses
     - an unknown binding value
     - `mapped` without `local_dept_id`
     - a non-`mapped` binding carrying a `local_dept_id`

  3. **FK chain** — site → site_department → crawl_target → article
     inserts work end-to-end, and `ON DELETE CASCADE` collapses the chain.

Uses an in-memory SQLite engine with FK enforcement — good enough to catch
DDL bugs without spinning up PG.
"""
from __future__ import annotations

from datetime import datetime, timezone

import pytest
from sqlalchemy import create_engine, event, select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session

from govcrawler.adapters.contract import CrawlItem
from govcrawler.models import (
    DEPT_BINDINGS,
    Article,
    Base,
    CrawlSite,
    CrawlTarget,
    SiteDepartment,
)
from govcrawler.repositories import depts, sites, targets
from govcrawler.storage.repo import insert_article


# ---------------------------------------------------------------------------
# engine fixture — in-memory SQLite with FK + CHECK enforcement
# ---------------------------------------------------------------------------
@pytest.fixture
def engine():
    """Fresh in-memory SQLite engine per test, with FK enforcement enabled.

    SQLite ships with foreign keys OFF by default; the connect hook flips
    the pragma on every new DBAPI connection so FK and CASCADE tests bite.
    """
    eng = create_engine("sqlite://", future=True)

    def _enable_fk(dbapi_conn, _record):
        cursor = dbapi_conn.cursor()
        try:
            cursor.execute("PRAGMA foreign_keys=ON")
        finally:
            cursor.close()

    event.listen(eng, "connect", _enable_fk)
    Base.metadata.create_all(eng)
    try:
        yield eng
    finally:
        eng.dispose()


@pytest.fixture
def session(engine):
    """ORM session bound to the per-test engine; closed on teardown."""
    s = Session(engine)
    try:
        yield s
    finally:
        s.close()


# ---------------------------------------------------------------------------
# 1. field alignment — Pydantic contract ↔ SQLA Article
# ---------------------------------------------------------------------------
# These Pydantic fields are storage-bound. Fields omitted here are either
# type-converted at the pipeline (e.g. str site_id → int FK) or don't map
# to `article` at all (attachments list, fetch_strategy on log).
_CONTRACT_TO_ARTICLE = {
    "native_post_id",
    "url",
    "url_hash",
    "title",
    "publish_time",
    "source_raw",
    "publisher",
    "content_text",
    "raw_html_path",
    "text_path",
    "channel_name",
    "channel_path",
    "content_category",
    "content_subcategory",
    "index_no",
    "doc_no",
    "publish_date",
    "effective_date",
    "is_effective",
    "expiry_date",
    "topic_words",
    "open_category",
    "metadata_json",
    "has_attachment",
    "status",
    "fetch_strategy",
    "target_id",
    "dept_id",
}


def test_article_has_every_contract_field():
    """Every storage-bound field on CrawlItem must exist on Article."""
    declared = set(CrawlItem.model_fields)
    # site_id appears on both sides but with different types (str on the
    # contract, int FK in the DB) — only presence-by-name is checked here.
    storage_bound = declared & _CONTRACT_TO_ARTICLE
    missing = [name for name in storage_bound if not hasattr(Article, name)]
    assert not missing, f"Article is missing contract fields: {missing}"


def test_article_rejects_content_simhash():
    """§5.5 explicitly excludes content_simhash — DB column MUST NOT exist."""
    assert hasattr(Article, "content_simhash") is False


# ---------------------------------------------------------------------------
# 2. dept_binding CHECK constraint
# ---------------------------------------------------------------------------
def _mk_site(session: Session, code: str = "qingcheng_fgw") -> CrawlSite:
    """Upsert a CrawlSite by code and flush so its PK is assigned."""
    site = sites.upsert_by_code(session, code, yaml_path=f"sites/{code}.yaml")
    session.flush()
    return site


def test_dept_binding_values_are_canonical():
    """Binding enum is an ordered, closed set — any edit must be deliberate."""
    expected = ("pending", "mapped", "city_level", "cross_dept", "external_ref")
    assert DEPT_BINDINGS == expected


def test_bad_binding_rejected_by_db(session):
    """A dept_binding outside the enum must be refused by the DB CHECK."""
    site = _mk_site(session)
    bad_row = SiteDepartment(
        site_id=site.id,
        dept_path="qycsj",
        dept_binding="bogus",  # not in enum
    )
    session.add(bad_row)
    with pytest.raises(IntegrityError):
        session.flush()


def test_mapped_without_local_dept_rejected_by_db(session):
    """binding='mapped' with no local_dept_id must trip the CHECK."""
    site = _mk_site(session)
    row = SiteDepartment(
        site_id=site.id,
        dept_path="qycsj",
        dept_binding="mapped",
        # local_dept_id deliberately omitted
    )
    session.add(row)
    with pytest.raises(IntegrityError):
        session.flush()


def test_non_mapped_with_local_dept_rejected_by_db(session):
    """city_level/cross_dept/external_ref must NOT carry local_dept_id."""
    site = _mk_site(session)
    # A raw insert would hit the local_dept_id FK before the binding CHECK
    # (no local_department row exists), so exercise the guard through the
    # repo helper, which validates the binding/local_dept_id pairing itself.
    with pytest.raises(ValueError):
        depts.upsert(
            session,
            site_id=site.id,
            dept_path="xxgk",
            dept_binding="city_level",
            local_dept_id=999,  # city_level must have local_dept_id IS NULL
        )


def test_repo_rejects_mapped_without_local_dept(session):
    """The repo guard mirrors the DB CHECK: 'mapped' requires local_dept_id."""
    site = _mk_site(session)
    with pytest.raises(ValueError):
        depts.upsert(
            session, site_id=site.id, dept_path="qycsj", dept_binding="mapped"
        )


# ---------------------------------------------------------------------------
# 3. FK chain — site → site_department → crawl_target → article
# ---------------------------------------------------------------------------
def test_full_chain_insert_and_cascade(session):
    """Insert across the whole FK chain, then verify ON DELETE CASCADE."""
    site = _mk_site(session, "qingcheng_fgw")

    department = depts.upsert(
        session,
        site_id=site.id,
        dept_path="qycsj",
        dept_binding="pending",  # no local_dept_id yet — the common bootstrap state
    )
    session.flush()

    target = targets.upsert_by_code(
        session,
        target_code="qingcheng_fgw__qycsj__1234",
        site_id=site.id,
        site_department_id=department.id,
        target_name="财政预决算",
        entry_url="http://www.qingcheng.gov.cn/qycsj/gkmlpt/index/1234",
    )
    session.flush()

    article = Article(
        site_id=site.id,
        target_id=target.id,
        native_post_id="987654",
        url="http://www.qingcheng.gov.cn/content/987/987654/post_987654.html",
        url_hash="a" * 64,
        title="test post",
        publish_time=datetime(2026, 4, 1, tzinfo=timezone.utc),
        status="raw",
    )
    session.add(article)
    session.flush()

    # Round-trip sanity: the row is queryable and carries the right FKs.
    stored = session.scalar(select(Article).where(Article.url_hash == "a" * 64))
    assert stored is not None
    assert stored.target_id == target.id
    assert stored.site_id == site.id

    # article.site_id is RESTRICT, so the article goes first (this mirrors the
    # Phase-B "safe unsubscribe" path: deactivate articles, then drop site).
    # Deleting the site should then CASCADE through dept + target via their
    # site_id FKs.
    session.delete(article)
    session.flush()
    session.delete(site)
    session.flush()
    dept_left = session.scalar(
        select(SiteDepartment).where(SiteDepartment.id == department.id)
    )
    target_left = session.scalar(
        select(CrawlTarget).where(CrawlTarget.id == target.id)
    )
    assert dept_left is None
    assert target_left is None


def test_low_level_article_insert_sanitizes_long_text(session):
    """insert_article must truncate oversized content_text, not reject it."""
    site = _mk_site(session, "gd_wjk")
    oversized = "广东省人民政府" * 5000  # 35 000 chars, well past the cap

    article = insert_article(
        session,
        site_id=site.id,
        url="https://www.gd.gov.cn/zwgk/wjk/qbwj/yfh/content/post_3923090.html",
        url_hash="d" * 64,
        title="test",
        content_text=oversized,
        status="ready",
    )
    session.flush()

    assert article.content_text is not None
    assert len(article.content_text) == 14000


def test_site_delete_blocked_when_article_exists(session):
    """article.site_id ON DELETE RESTRICT protects us from losing data accidentally."""
    site = _mk_site(session, "qingyuan_gov")
    session.flush()
    blocker = Article(
        site_id=site.id,
        url="http://x/y",
        url_hash="b" * 64,
        title="t",
        status="raw",
    )
    session.add(blocker)
    session.flush()
    session.delete(site)
    with pytest.raises(IntegrityError):
        session.flush()


def test_article_url_hash_unique(session):
    """Two articles sharing a url_hash must violate the unique constraint."""
    site = _mk_site(session)
    session.flush()
    digest = "c" * 64
    session.add(Article(site_id=site.id, url="http://x/1", url_hash=digest, title="a"))
    session.flush()
    session.add(Article(site_id=site.id, url="http://x/2", url_hash=digest, title="b"))
    with pytest.raises(IntegrityError):
        session.flush()


def test_partial_unique_native_post_id(session):
    """Two articles in same site with SAME native_post_id → conflict.
    Two articles with native_post_id=NULL → both allowed."""
    site = _mk_site(session)
    session.flush()

    original = Article(
        site_id=site.id, native_post_id="42",
        url="http://x/1", url_hash="d" * 64, title="a",
    )
    session.add(original)
    session.flush()

    duplicate = Article(
        site_id=site.id, native_post_id="42",
        url="http://x/2", url_hash="e" * 64, title="b",
    )
    session.add(duplicate)
    with pytest.raises(IntegrityError):
        session.flush()
    session.rollback()

    # rollback wiped the site — re-create before the NULL-coexistence probe
    site = _mk_site(session)
    session.flush()

    # NULLs are fine side-by-side (on SQLite/PG NULLs are distinct)
    for idx, digest in enumerate(("f" * 64, "0" * 64)):
        session.add(
            Article(
                site_id=site.id, native_post_id=None,
                url=f"http://x/null{idx}", url_hash=digest, title="t",
            )
        )
    session.flush()  # must not raise


def test_crawl_site_adapter_xor_yaml(session):
    """Per §5.1: exactly one of cms_adapter / yaml_path must be set."""
    # Both set, then neither set — each must trip the XOR CHECK.
    violations = (
        CrawlSite(
            site_code="both", cms_adapter="gkmlpt", yaml_path="sites/both.yaml"
        ),
        CrawlSite(site_code="neither"),
    )
    for bad in violations:
        session.add(bad)
        with pytest.raises(IntegrityError):
            session.flush()
        session.rollback()

    # Exactly one side set → accepted.
    session.add(CrawlSite(site_code="yaml_only", yaml_path="sites/a.yaml"))
    session.flush()
    session.add(CrawlSite(site_code="adapter_only", cms_adapter="gkmlpt"))
    session.flush()


# ---------------------------------------------------------------------------
# repositories smoke tests
# ---------------------------------------------------------------------------
def test_repo_round_trip(session):
    """Smoke test: sites/depts/targets repos agree on upsert → get → list."""
    site = sites.upsert_by_code(
        session, "qingcheng_fgw", yaml_path="sites/qingcheng_fgw.yaml",
        site_name="清城区发改局",
    )
    session.flush()
    assert sites.get_by_code(session, "qingcheng_fgw").id == site.id

    dept_row = depts.upsert(
        session, site_id=site.id, dept_path="qycsj", dept_binding="pending"
    )
    session.flush()
    assert depts.get(session, site.id, "qycsj").id == dept_row.id

    # rebind pending → city_level and read the new binding back
    depts.rebind(
        session, site_id=site.id, dept_path="qycsj",
        dept_binding="city_level", local_dept_id=None,
    )
    session.flush()
    assert depts.get(session, site.id, "qycsj").dept_binding == "city_level"

    target_row = targets.upsert_by_code(
        session, target_code="qingcheng_fgw__qycsj__9",
        site_id=site.id, site_department_id=dept_row.id,
    )
    session.flush()
    assert targets.list_for_site(session, site.id)[0].id == target_row.id
    assert targets.list_for_dept(session, dept_row.id)[0].id == target_row.id