"""pipeline.fetch_and_store tests — v2 schema aligned.

Instead of a fake session, these tests use a real throwaway SQLite database
(file-backed under pytest's tmp_path) and seed a crawl_site + crawl_target
whose parser_override_json["detail"] selectors match the test HTML. This
exercises the same code paths as production while keeping the tests hermetic.
"""
from __future__ import annotations

import hashlib
from contextlib import contextmanager

import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from govcrawler import pipeline
from govcrawler.fetcher.browser import FetchResult
from govcrawler.models import Article, Attachment, Base, CrawlLog

from tests._v2fixtures import make_site, make_target


_DETAIL_SELECTORS = {
    "title": "h1.article-title::text",
    "publish_time": "span.time::text",
    "source": "",
    "content": "div.article-content",
    "attachment_css": "div.article-content a[href]",
}


@pytest.fixture
def db(tmp_path, monkeypatch):
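    """Set up a hermetic environment for pipeline.fetch_and_store.

    Sets DB_URL/DATA_DIR/USER_AGENT, creates a file-backed SQLite DB under
    tmp_path, seeds one crawl_site + crawl_target carrying the detail
    selectors, and points pipeline.get_sessionmaker at the test sessionmaker.
    """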
    monkeypatch.setenv("DB_URL", "sqlite:///" + str(tmp_path / "pipe.db"))
    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    monkeypatch.setenv("USER_AGENT", "TestBot/1.0")

    engine = create_engine("sqlite:///" + str(tmp_path / "pipe.db"), future=True)
    Base.metadata.create_all(engine)
    SM = sessionmaker(bind=engine, expire_on_commit=False)

    with SM() as s:
        site = make_site(s, site_code="gdqy", yaml_path="gdqy.yaml")
        target = make_target(
            s, site=site, column_id="szfwj",
            target_name="市政府文件",
            entry_url="https://www.gdqy.gov.cn/szfwj/",
        )
        # Install per-target detail selectors so pipeline doesn't need YAML/config
        target.parser_override_json = {"detail": _DETAIL_SELECTORS}
        s.commit()
        target_code = target.target_code

    # Point pipeline.get_sessionmaker at our SM
    monkeypatch.setattr(pipeline, "get_sessionmaker", lambda: SM)

    return {"SM": SM, "target_code": target_code}


def _ok_fetch(html: str, *, status: int = 200):
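    """Return a fetch_html stand-in producing a successful FetchResult for any URL."""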
    return lambda url: FetchResult(
        url=url, final_url=url, status=status, html=html,
        fetched_at=0.0, duration_ms=100, is_challenge=False,
    )


def test_happy_path_writes_article_and_log(monkeypatch, db):
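    """Happy path: a parsable detail page yields one ready Article and one success CrawlLog."""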
    html = (
        '<html><body><h1 class="article-title">测试标题</h1>'
        '<span class="time">2026-04-10 16:34:22</span>'
        '<div class="article-content"><p>正文内容，足够长，足够长，足够长，足够长，足够长，足够长。</p></div>'
        "</body></html>"
    )
    monkeypatch.setattr(pipeline, "fetch_html", _ok_fetch(html))

    r = pipeline.fetch_and_store(
        target_code=db["target_code"],
        url="https://www.gdqy.gov.cn/x/post_test.html",
    )
    assert r["status"] == "ready"

    with db["SM"]() as s:
        arts = s.query(Article).all()
        logs = s.query(CrawlLog).all()
        assert len(arts) == 1
        assert arts[0].title == "测试标题"
        assert arts[0].status == "ready"
        assert len(logs) == 1
        assert logs[0].success is True


def test_challenge_page_logged_as_failure(monkeypatch, db):
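    """A challenge/interstitial page must create no Article and be logged as a failure."""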
    monkeypatch.setattr(
        pipeline, "fetch_html",
        lambda url: FetchResult(
            url=url, final_url=url, status=412,
            html="<html><title>请稍候</title></html>",
            fetched_at=0.0, duration_ms=50, is_challenge=True,
        ),
    )
    r = pipeline.fetch_and_store(
        target_code=db["target_code"],
        url="https://www.gdqy.gov.cn/x/post_x.html",
    )
    assert r["status"] == "failed"

    with db["SM"]() as s:
        assert s.query(Article).count() == 0
        logs = s.query(CrawlLog).all()
        assert len(logs) == 1
        assert logs[0].success is False
        assert "challenge" in (logs[0].error_msg or "").lower()


def test_attachment_downloaded_and_recorded(monkeypatch, db):
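    """An attachment link in the content is downloaded via a faked httpx stream and recorded."""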
    from govcrawler.storage import attachments as att_mod

    html = (
        '<html><body><h1 class="article-title">带附件</h1>'
        '<span class="time">2026-04-10 10:00:00</span>'
        '<div class="article-content"><p>'
        + "正文内容足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长。"
        + '</p>'
        '<a href="https://www.gdqy.gov.cn/attach/a.pdf">公告全文.pdf</a></div></body></html>'
    )
    monkeypatch.setattr(pipeline, "fetch_html", _ok_fetch(html))

    FAKE_BYTES = b"%PDF-1.4\n<fake pdf bytes for Path B test>\n" * 32
    EXPECTED_SHA = hashlib.sha256(FAKE_BYTES).hexdigest()
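    # The Attachment row is expected to record the sha256 of the streamed bytes (asserted below).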

    class _FakeResp:
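        """Minimal stand-in for an httpx streaming response: only what the downloader touches."""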
        headers = {"content-disposition": 'attachment; filename="公告全文.pdf"'}

        def raise_for_status(self):
            pass

        def iter_bytes(self, chunk_size=65536):
            mid = len(FAKE_BYTES) // 2
            yield FAKE_BYTES[:mid]
            yield FAKE_BYTES[mid:]

    @contextmanager
    def fake_stream(method, url, **kw):
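        """Drop-in for httpx.stream(): ignore method/url/kwargs and yield the fake response."""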
        yield _FakeResp()

    monkeypatch.setattr(att_mod.httpx, "stream", fake_stream)

    r = pipeline.fetch_and_store(
        target_code=db["target_code"],
        url="https://www.gdqy.gov.cn/x/post_test.html",
    )
    assert r["status"] == "ready"
    assert r["attachments_downloaded"] == 1

    with db["SM"]() as s:
        atts = s.query(Attachment).all()
        assert len(atts) == 1
        att = atts[0]
        assert att.file_ext == "pdf"
        assert att.file_hash == EXPECTED_SHA
        assert att.size_bytes == len(FAKE_BYTES)


def test_duplicate_url_hash_skips(monkeypatch, db):
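    """A URL whose hash already exists is skipped without calling fetch_html."""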
    # Seed an existing article with the same url_hash → fetch must be skipped
    from govcrawler.utils.url_norm import url_hash as compute_url_hash

    target_url = "https://www.gdqy.gov.cn/x/post_dup.html"
    dup_hash = compute_url_hash(target_url)

    with db["SM"]() as s:
        assert s.query(Article).count() == 0  # sanity: no articles exist yet
        from govcrawler.models import CrawlTarget
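        # Look up the seeded target so its site_id / id can serve as FKs on the seeded article.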
        t = s.query(CrawlTarget).filter_by(target_code=db["target_code"]).one()
        s.add(Article(
            site_id=t.site_id, target_id=t.id,
            url=target_url, url_hash=dup_hash,
            title="existing", status="ready", has_attachment=False,
        ))
        s.commit()

    def _boom(url):
        raise AssertionError("fetch_html should not be called on duplicate")

    monkeypatch.setattr(pipeline, "fetch_html", _boom)

    r = pipeline.fetch_and_store(target_code=db["target_code"], url=target_url)
    assert r["status"] == "skipped"
    assert r["article_id"] >= 1
