"""pipeline.fetch_and_store tests — v2 schema aligned.

Instead of a fake session, these tests use a real throwaway SQLite database
(file-backed under pytest's tmp_path) and seed a crawl_site + crawl_target
whose parser_override_json["detail"] selectors match the test HTML. This
exercises the same code paths as production while keeping the tests hermetic.
"""
from __future__ import annotations

import hashlib
from contextlib import contextmanager

import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from govcrawler import pipeline
from govcrawler.fetcher.browser import FetchResult
from govcrawler.models import Article, Attachment, Base, CrawlLog

from tests._v2fixtures import make_site, make_target


_DETAIL_SELECTORS = {
    "title": "h1.article-title::text",
    "publish_time": "span.time::text",
    "source": "",
    "content": "div.article-content",
    "attachment_css": "div.article-content a[href]",
}


@pytest.fixture
def db(tmp_path, monkeypatch):
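    """Set up a hermetic environment for pipeline.fetch_and_store.

    Sets DB_URL/DATA_DIR/USER_AGENT, creates a file-backed SQLite DB under
    tmp_path, seeds one crawl_site + crawl_target carrying the detail
    selectors, and points pipeline.get_sessionmaker at the test sessionmaker.
    """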
    monkeypatch.setenv("DB_URL", "sqlite:///" + str(tmp_path / "pipe.db"))
    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    monkeypatch.setenv("USER_AGENT", "TestBot/1.0")

    engine = create_engine("sqlite:///" + str(tmp_path / "pipe.db"), future=True)
    Base.metadata.create_all(engine)
    SM = sessionmaker(bind=engine, expire_on_commit=False)

    with SM() as s:
        site = make_site(s, site_code="gdqy", yaml_path="gdqy.yaml")
        target = make_target(
            s, site=site, column_id="szfwj",
            target_name="市政府文件",
            entry_url="https://www.gdqy.gov.cn/szfwj/",
        )
        # Install per-target detail selectors so pipeline doesn't need YAML/config
        target.parser_override_json = {"detail": _DETAIL_SELECTORS}
        s.commit()
        target_code = target.target_code

    # Point pipeline.get_sessionmaker at our SM
    monkeypatch.setattr(pipeline, "get_sessionmaker", lambda: SM)

    return {"SM": SM, "target_code": target_code}


def _ok_fetch(html: str, *, status: int = 200):
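    """Return a fetch_html stand-in producing a successful FetchResult for any URL."""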
    return lambda url: FetchResult(
        url=url, final_url=url, status=status, html=html,
        fetched_at=0.0, duration_ms=100, is_challenge=False,
    )


def test_happy_path_writes_article_and_log(monkeypatch, db):
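    """Happy path: a parsable detail page yields one ready Article and one success CrawlLog."""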
    html = (
        '<html><body><h1 class="article-title">测试标题</h1>'
        '<span class="time">2026-04-10 16:34:22</span>'
        '<div class="article-content"><p>正文内容，足够长，足够长，足够长，足够长，足够长，足够长。</p></div>'
        "</body></html>"
    )
    monkeypatch.setattr(pipeline, "fetch_html", _ok_fetch(html))

    r = pipeline.fetch_and_store(
        target_code=db["target_code"],
        url="https://www.gdqy.gov.cn/x/post_test.html",
    )
    assert r["status"] == "ready"

    with db["SM"]() as s:
        arts = s.query(Article).all()
        logs = s.query(CrawlLog).all()
        assert len(arts) == 1
        assert arts[0].title == "测试标题"
        assert arts[0].status == "ready"
        assert len(logs) == 1
        assert logs[0].success is True


def test_challenge_page_logged_as_failure(monkeypatch, db):
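    """A challenge/interstitial page must create no Article and be logged as a failure."""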
    monkeypatch.setattr(
        pipeline, "fetch_html",
        lambda url: FetchResult(
            url=url, final_url=url, status=412,
            html="<html><title>请稍候</title></html>",
            fetched_at=0.0, duration_ms=50, is_challenge=True,
        ),
    )
    r = pipeline.fetch_and_store(
        target_code=db["target_code"],
        url="https://www.gdqy.gov.cn/x/post_x.html",
    )
    assert r["status"] == "failed"

    with db["SM"]() as s:
        assert s.query(Article).count() == 0
        logs = s.query(CrawlLog).all()
        assert len(logs) == 1
        assert logs[0].success is False
        assert "challenge" in (logs[0].error_msg or "").lower()


def test_attachment_downloaded_and_recorded(monkeypatch, db):
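    """An attachment link in the content is downloaded via a faked httpx stream and recorded."""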
    from govcrawler.storage import attachments as att_mod

    html = (
        '<html><body><h1 class="article-title">带附件</h1>'
        '<span class="time">2026-04-10 10:00:00</span>'
        '<div class="article-content"><p>'
        + "正文内容足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长足够长。"
        + '</p>'
        '<a href="https://www.gdqy.gov.cn/attach/a.pdf">公告全文.pdf</a></div></body></html>'
    )
    monkeypatch.setattr(pipeline, "fetch_html", _ok_fetch(html))

    FAKE_BYTES = b"%PDF-1.4\n<fake pdf bytes for Path B test>\n" * 32
    EXPECTED_SHA = hashlib.sha256(FAKE_BYTES).hexdigest()
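    # The Attachment row is expected to record the sha256 of the streamed bytes (asserted below).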

    class _FakeResp:
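        """Minimal stand-in for an httpx streaming response: only what the downloader touches."""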
        headers = {"content-disposition": 'attachment; filename="公告全文.pdf"'}

        def raise_for_status(self):
            pass

        def iter_bytes(self, chunk_size=65536):
            mid = len(FAKE_BYTES) // 2
            yield FAKE_BYTES[:mid]
            yield FAKE_BYTES[mid:]

    @contextmanager
    def fake_stream(method, url, **kw):
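        """Drop-in for httpx.stream(): ignore method/url/kwargs and yield the fake response."""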
        yield _FakeResp()

    monkeypatch.setattr(att_mod.httpx, "stream", fake_stream)

    r = pipeline.fetch_and_store(
        target_code=db["target_code"],
        url="https://www.gdqy.gov.cn/x/post_test.html",
    )
    assert r["status"] == "ready"
    assert r["attachments_downloaded"] == 1

    with db["SM"]() as s:
        atts = s.query(Attachment).all()
        assert len(atts) == 1
        att = atts[0]
        assert att.file_ext == "pdf"
        assert att.file_hash == EXPECTED_SHA
        assert att.size_bytes == len(FAKE_BYTES)


def test_duplicate_url_hash_skips(monkeypatch, db):
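    """A URL whose hash already exists is skipped without calling fetch_html."""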
    # Seed an existing article with the same url_hash → fetch must be skipped
    from govcrawler.utils.url_norm import url_hash as compute_url_hash

    target_url = "https://www.gdqy.gov.cn/x/post_dup.html"
    dup_hash = compute_url_hash(target_url)

    with db["SM"]() as s:
        assert s.query(Article).count() == 0  # sanity: no articles exist yet
        from govcrawler.models import CrawlTarget
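        # Look up the seeded target so its site_id / id can serve as FKs on the seeded article.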
        t = s.query(CrawlTarget).filter_by(target_code=db["target_code"]).one()
        s.add(Article(
            site_id=t.site_id, target_id=t.id,
            url=target_url, url_hash=dup_hash,
            title="existing", status="ready", has_attachment=False,
        ))
        s.commit()

    def _boom(url):
        raise AssertionError("fetch_html should not be called on duplicate")

    monkeypatch.setattr(pipeline, "fetch_html", _boom)

    r = pipeline.fetch_and_store(target_code=db["target_code"], url=target_url)
    assert r["status"] == "skipped"
    assert r["article_id"] >= 1
