"""End-to-end smoke test: config/sites_v2/qingcheng_fgw.yaml → DB → crawl_target.

This exercises the full v2 wiring in one go:

  1. seed local_department(dept_id=301), the single dept the YAML marks as `mapped`
  2. `config.sync.sync_file` imports the YAML into crawl_site / site_department
     / crawl_target (3 depts × 1 column = 3 targets)
  3. `pipeline.crawl_target(target_code)` is driven against a mocked
     `fetch_html` so the list-API JSON and detail HTML are synthetic
  4. assert the DB now holds an Article row plus a successful CrawlLog row,
     both joined back to the right site/target/dept

Nothing hits the network. `fetch_html` is URL-dispatched: the gkmlpt list URL
returns synthetic JSON in the shape `parse_list_response` expects; the detail
URL returns HTML that satisfies `DEFAULT_DETAIL_SELECTORS`.
"""
from __future__ import annotations

import json
from datetime import datetime, timezone
from pathlib import Path

import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from govcrawler import pipeline
from govcrawler.config.sync import sync_file
from govcrawler.fetcher.browser import FetchResult
from govcrawler.models import (
    Article,
    Base,
    CrawlLog,
    CrawlSite,
    CrawlTarget,
    LocalDepartment,
    SiteDepartment,
)


YAML_PATH = (
    Path(__file__).resolve().parent.parent
    / "config" / "sites_v2" / "qingcheng_fgw.yaml"
)
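# NOTE: `parent.parent` assumes this file sits one level below the repo root
# (e.g. <root>/tests/), so the config/ tree resolves against the root.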


# ----------------------------------------------------------------------------
# Synthetic responses (list JSON + detail HTML) matching the gkmlpt adapter
# ----------------------------------------------------------------------------
# One realistic article: id=12345, bucket=12, so verify_bucket_invariant passes
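# (Assumption: the `/content/12/12345/` segment of the detail URL suggests the
# invariant is bucket == id // 1000, i.e. 12345 // 1000 == 12.)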
_FAKE_LIST_PAYLOAD = {
    "articles": [
        {
            "id": 12345,
            "url": "http://fgw.qingcheng.gov.cn/qycsj/gkmlpt/content/12/12345/post_12345.html",
            "title": "2026 年 Q1 财政预算执行情况说明",
            "first_publish_time": int(datetime(2026, 4, 1, 9, 0, tzinfo=timezone.utc).timestamp()),
            "publisher": "清城区发改局",
            "document_number": "清发改〔2026〕12号",
            "identifier": "QFG-2026-0012",
        }
    ],
    "total": 1,
}


_FAKE_DETAIL_HTML = """<html><body>
<h1 class="article-title">2026 年 Q1 财政预算执行情况说明</h1>
<div class="article-info">
  <span class="time">2026-04-01 09:00:00</span>
  <span class="source">清城区发改局</span>
</div>
<div class="article-content">
  <p>一、总体情况：一季度全区一般公共预算收入 X 亿元，完成年初预算的 Y%。
  支出聚焦民生、产业升级与基础设施建设，重点项目进度良好。</p>
  <p>二、分科目：税收收入、非税收入、上级补助收入均按序时进度达成目标。</p>
  <p>三、下阶段安排：坚持过紧日子、兜牢基层三保、加快项目支出进度。</p>
</div>
</body></html>
"""


def _dispatch_fetch_html(url: str) -> FetchResult:
    """URL-routing fake of `fetcher.chain.fetch_html`."""
    if "/gkmlpt/api/all/" in url:
        # list-API JSON
        return FetchResult(
            url=url, final_url=url, status=200,
            html=json.dumps(_FAKE_LIST_PAYLOAD, ensure_ascii=False),
            fetched_at=0.0, duration_ms=42,
            is_challenge=False, strategy="httpx",
        )
    # detail page
    return FetchResult(
        url=url, final_url=url, status=200, html=_FAKE_DETAIL_HTML,
        fetched_at=0.0, duration_ms=58,
        is_challenge=False, strategy="httpx",
    )


# ----------------------------------------------------------------------------
# Fixture: tmp-file SQLite + bound sessionmaker + wire all the pipeline seams
# ----------------------------------------------------------------------------
@pytest.fixture
def db(tmp_path, monkeypatch):
    monkeypatch.setenv("DB_URL", f"sqlite:///{tmp_path}/smoke.db")
    monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
    monkeypatch.setenv("USER_AGENT", "GovCrawlerSmoke/1.0")

    engine = create_engine(f"sqlite:///{tmp_path}/smoke.db", future=True)
    Base.metadata.create_all(engine)
    SM = sessionmaker(bind=engine, expire_on_commit=False)

    # Seed the local_department row the YAML's mapped dept references.
    with SM() as s:
        s.add(LocalDepartment(dept_id=301, dept_name="统计局", full_name="清城区统计局"))
        s.commit()

    # Point pipeline's session factory at our SM
    monkeypatch.setattr(pipeline, "get_sessionmaker", lambda: SM)

    # Neutralise throttle + robots + attachments (no HTTP)
    monkeypatch.setattr(pipeline.HostThrottle, "wait", lambda self, url, **kw: 0.0)
    monkeypatch.setattr(pipeline, "is_allowed", lambda url, ua: True)
    monkeypatch.setattr(
        pipeline, "download_attachment",
        lambda *a, **kw: (_ for _ in ()).throw(AssertionError("no attachments in this fixture HTML")),
    )

    # URL-dispatched fetch
    monkeypatch.setattr(pipeline, "fetch_html", _dispatch_fetch_html)

    return SM


# ----------------------------------------------------------------------------
# The smoke test
# ----------------------------------------------------------------------------
def test_sync_qingcheng_fgw_yaml_then_crawl_target(db):
    SM = db

    # ---------- 1. yaml-sync ----------
    with SM() as s:
        report = sync_file(s, YAML_PATH)
        s.commit()

    # 1 site + 3 depts (qycsj, xxgk, tjgb) + 3 targets, no warnings
    assert report.sites_created == 1 and report.sites_updated == 0
    assert report.depts_created == 3 and report.depts_disabled == 0
    assert report.targets_created == 3 and report.targets_disabled == 0
    assert report.warnings == []

    with SM() as s:
        assert s.query(CrawlSite).count() == 1
        assert s.query(SiteDepartment).count() == 3
        assert s.query(CrawlTarget).count() == 3

        site = s.query(CrawlSite).filter_by(site_code="qingcheng_fgw").one()
        assert site.cms_adapter == "gkmlpt"
        assert site.yaml_path is None  # adapter XOR yaml_path: adapter-bound sites carry none

        # Target codes follow {site_code}__{dept_path}__{column_id}
        codes = {t.target_code for t in s.query(CrawlTarget)}
        assert codes == {
            "qingcheng_fgw__qycsj__1234",
            "qingcheng_fgw__xxgk__5678",
            "qingcheng_fgw__tjgb__9012",
        }

    # ---------- 2. crawl one target end-to-end ----------
    target_code = "qingcheng_fgw__qycsj__1234"
    result = pipeline.crawl_target(target_code, max_items=2)
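    # max_items=2 leaves headroom; the fake payload only carries one article,
    # hence items_seen == 1 below.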

    # Adapter path should have completed cleanly
    assert result["status"] == "ok", result
    assert result["items_seen"] == 1
    assert result["items_new"] == 1
    assert result["items_failed"] == 0

    # ---------- 3. DB inspection ----------
    with SM() as s:
        articles = s.query(Article).all()
        assert len(articles) == 1
        a = articles[0]
        assert a.title == "2026 年 Q1 财政预算执行情况说明"
        assert a.status == "ready"
        assert a.url_hash and len(a.url_hash) == 64
        assert a.has_attachment is False

        # FKs land where we expect
        assert a.site_id == s.query(CrawlSite).one().id
        tgt = s.query(CrawlTarget).filter_by(target_code=target_code).one()
        assert a.target_id == tgt.id
        # mapped dept → dept_id threaded onto article
        assert a.dept_id == 301

        # crawl_log: exactly one success row for this run, nothing else
        logs = s.query(CrawlLog).all()
        assert len(logs) == 1
        log = logs[0]
        assert log.success is True
        assert log.http_status == 200
        assert log.site_id == a.site_id
        assert log.target_id == tgt.id
        assert log.article_url.endswith("post_12345.html")


def test_sync_is_idempotent(db):
    """Running sync twice must produce the same DB state with no warnings."""
    SM = db
    with SM() as s:
        r1 = sync_file(s, YAML_PATH)
        s.commit()
        r2 = sync_file(s, YAML_PATH)
        s.commit()

    # First pass creates everything, second pass only updates — no
    # duplicate rows, no accidental disables.
    assert (r1.sites_created, r1.depts_created, r1.targets_created) == (1, 3, 3)
    assert (r2.sites_created, r2.depts_created, r2.targets_created) == (0, 0, 0)
    assert (r2.sites_updated, r2.depts_updated, r2.targets_updated) == (1, 3, 3)
    assert r2.targets_disabled == 0 and r2.depts_disabled == 0 and r2.sites_disabled == 0

    with SM() as s:
        assert s.query(CrawlSite).count() == 1
        assert s.query(SiteDepartment).count() == 3
        assert s.query(CrawlTarget).count() == 3
