"""crawl_target list-iteration tests — v2 schema aligned.

(File name kept for git history; tests exercise `pipeline.crawl_target`.)

We mock the list-fetch helpers `_list_via_yaml` and `_list_via_adapter` to
return a scripted list of entries, so the tests focus on the iterate/dedup/early-stop
logic rather than on list-page HTML parsing.
"""
from __future__ import annotations

import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from govcrawler import pipeline
from govcrawler.fetcher.browser import FetchResult
from govcrawler.models import Base

from tests._v2fixtures import make_site_and_target


def _ok_list_fr(list_url: str = "https://example.com/list") -> FetchResult:
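    """Build a successful (status 200) list-page FetchResult for the scripted list URL."""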
    return FetchResult(
        url=list_url, final_url=list_url, status=200, html="<ok>",
        fetched_at=0.0, duration_ms=10, is_challenge=False, strategy="httpx",
    )


def _entries(urls: list[str]) -> list[pipeline._ListEntry]:
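    """Wrap plain URLs into pipeline._ListEntry objects with no per-item metadata."""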
    return [pipeline._ListEntry(url=u, item=None) for u in urls]


@pytest.fixture
def db(tmp_path, monkeypatch):
    monkeypatch.setenv("DB_URL", "sqlite:///" + str(tmp_path / "c.db"))
    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    monkeypatch.setenv("USER_AGENT", "TestBot/1.0")
    # No real waiting or network-side gatekeeping: stub out the host throttle,
    # the fetch-safety check, and the robots.txt allow check.
    monkeypatch.setattr(pipeline.HostThrottle, "wait", lambda self, url, **kw: 0.0)
    monkeypatch.setattr(pipeline, "is_safe_to_fetch", lambda url: True)
    monkeypatch.setattr(pipeline, "is_allowed", lambda url, ua: True)

    engine = create_engine("sqlite:///" + str(tmp_path / "c.db"), future=True)
    Base.metadata.create_all(engine)
    SM = sessionmaker(bind=engine, expire_on_commit=False)
    with SM() as s:
        _, target = make_site_and_target(s, site_code="gdqy", column_id="szfwj")
        s.commit()
        target_code = target.target_code

    monkeypatch.setattr(pipeline, "get_sessionmaker", lambda: SM)
    return {"target_code": target_code}


# ---------- unit tests on _derive_article_key ----------

def test_derive_article_key_standard():
    assert (
        pipeline._derive_article_key("https://x.com/a/b/post_2136593.html")
        == "post_2136593"
    )


def test_derive_article_key_no_ext():
    assert pipeline._derive_article_key("https://x.com/a/b/abc") == "abc"


def test_derive_article_key_query_stripped():
    assert pipeline._derive_article_key("https://x.com/p.html?q=1") == "p"


# ---------- crawl_target iteration logic ----------

def _install_entries(monkeypatch, urls: list[str]):
    """Patch the list helpers to return scripted entries."""
    list_url = "https://example.com/list"
    entries = _entries(urls)

    def _fake_list(rt, **_kw):
        return list_url, entries, _ok_list_fr(list_url)

    monkeypatch.setattr(pipeline, "_list_via_yaml", _fake_list)
    monkeypatch.setattr(pipeline, "_list_via_adapter", _fake_list)


def test_crawl_target_stops_on_duplicate(monkeypatch, db):
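    """A same-target duplicate (duplicate_url_hash) marks the crawl's historical
    boundary, so the loop must stop before attempting the remaining entries."""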
    calls: list[str] = []

    def _fake_fetch(*, target_code, url, list_item=None, throttle=None, **_kw):
        calls.append(url)
        if url.endswith("post_1002.html"):
            # Self-dedup → must trigger early-stop
            return {"status": "skipped", "reason": "duplicate_url_hash", "article_id": 42}
        return {"status": "ready", "article_id": len(calls)}

    monkeypatch.setattr(pipeline, "fetch_and_store", _fake_fetch)
    _install_entries(monkeypatch, [
        "https://example.com/a/content/post_1001.html",
        "https://example.com/a/content/post_1002.html",
        "https://example.com/a/content/post_1003.html",
    ])

    r = pipeline.crawl_target(db["target_code"])
    assert r["status"] == "ok"
    assert r["items_seen"] == 3
    # First item ready, second duplicate → stop; third never attempted
    assert len(calls) == 2
    assert r["items_new"] == 1
    assert r["items_skipped"] == 1


def test_crawl_target_cross_target_dedup_does_not_early_stop(monkeypatch, db):
    """When a duplicate hit comes from a different target (e.g. 网易 world
    sees a URL already filed under domestic), it carries no signal about
    this target's historical boundary. The loop must keep scanning instead
    of terminating after the first cross-target hit."""
    calls: list[str] = []

    def _fake_fetch(*, target_code, url, list_item=None, throttle=None, **_kw):
        calls.append(url)
        if url.endswith("post_1001.html"):
            # Cross-target dedup → keep scanning
            return {"status": "skipped", "reason": "duplicate_other_target", "article_id": 99}
        return {"status": "ready", "article_id": len(calls)}

    monkeypatch.setattr(pipeline, "fetch_and_store", _fake_fetch)
    _install_entries(monkeypatch, [
        "https://example.com/a/content/post_1001.html",
        "https://example.com/a/content/post_1002.html",
        "https://example.com/a/content/post_1003.html",
    ])

    r = pipeline.crawl_target(db["target_code"])
    assert len(calls) == 3                    # all three attempted
    assert r["items_new"] == 2
    assert r["items_skipped"] == 1


def test_crawl_target_no_stop_continues(monkeypatch, db):
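    """With stop_on_duplicate=False, a skipped item must not end the scan."""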
    calls: list[str] = []

    def _fake_fetch(*, target_code, url, list_item=None, throttle=None, **_kw):
        calls.append(url)
        if url.endswith("post_1002.html"):
            return {"status": "skipped", "article_id": 42}
        return {"status": "ready", "article_id": len(calls)}

    monkeypatch.setattr(pipeline, "fetch_and_store", _fake_fetch)
    _install_entries(monkeypatch, [
        "https://example.com/a/content/post_1001.html",
        "https://example.com/a/content/post_1002.html",
        "https://example.com/a/content/post_1003.html",
    ])

    r = pipeline.crawl_target(db["target_code"], stop_on_duplicate=False)
    assert len(calls) == 3
    assert r["items_new"] == 2
    assert r["items_skipped"] == 1


def test_crawl_target_list_fetch_error(monkeypatch, db):
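    """A failed list fetch (status 0 with an error set) must surface as list_failed."""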
    def _fail_list(rt, **_kw):
        fr = FetchResult(
            url="https://example.com/list", final_url="https://example.com/list", status=0, html="",
            fetched_at=0.0, duration_ms=1, is_challenge=False,
            error="ConnectError: boom", strategy="httpx",
        )
        return "https://example.com/list", [], fr

    monkeypatch.setattr(pipeline, "_list_via_yaml", _fail_list)
    monkeypatch.setattr(pipeline, "_list_via_adapter", _fail_list)
    r = pipeline.crawl_target(db["target_code"])
    assert r["status"] == "list_failed"


def test_crawl_target_unknown_raises(db):
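    """An unknown target_code should raise ValueError rather than return a result."""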
    with pytest.raises(ValueError):
        pipeline.crawl_target("does_not_exist")
