from __future__ import annotations

import json
from types import SimpleNamespace

from govcrawler.adapters import xinhua_xjp


class _Resp:
    status_code = 200

    def __init__(self, text: str, url: str):
        self.text = text
        self.url = url

    def json(self):
        return json.loads(self.text)

    def raise_for_status(self):
        pass


# Minimal entry-page HTML: the <ul class="xpage-content-list"> carries the
# load-more hints the adapter parses — "datasource:abc123" (datasource id)
# and preview="ds_" (JSON filename prefix). The test below expects these to
# combine into a ".../ds_abc123.json" URL, so keep the markup in sync with
# those assertions.
LIST_HTML = """
<html><body>
<ul class="xpage-content-list" data="datasource:abc123" preview="ds_">
  <li><div class="tit"><a href="/first.html">首屏</a></div></li>
</ul>
</body></html>
"""


def _rt() -> SimpleNamespace:
    return SimpleNamespace(
        site=SimpleNamespace(site_code="xinhua_xjp", base_url="https://www.news.cn"),
        target=SimpleNamespace(
            id=175,
            dept_id=None,
            entry_url="https://www.news.cn/politics/leaders/xijinping/jhqw.htm",
            channel_name="讲话全文",
            channel_path="新华网·习近平报道集/讲话全文",
            content_category="时政",
            content_subcategory="讲话全文",
        ),
    )


def test_fetch_list_page_reads_load_more_datasource(monkeypatch):
    """fetch_list_page should discover the load-more datasource from the entry
    HTML, fetch the derived ``ds_<id>.json`` URL with paging params, and map
    the JSON rows into list items carrying the target's channel metadata."""
    ds_payload = {
        "datasource": [
            {
                "contentId": "row-1",
                "showTitle": "<a href='https://www.news.cn/a.html'>第一篇</a>",
                "publishUrl": "/politics/leaders/202601/a/c.html",
                "publishTime": "2026-01-01 08:00:00",
            },
            {
                "contentId": "row-2",
                "title": "第二篇",
                "publishUrl": "https://www.news.cn/politics/leaders/202601/b/c.html",
                "publishTime": "2026-01-02 08:00:00",
            },
            {
                "contentId": "row-3",
                "showTitle": "<a href='https://www.news.cn/c.html'>第三篇</a>",
                "publishTime": "2026-01-03 08:00:00",
            },
        ]
    }
    requested_urls = []

    def _fake_get(url, **_kw):
        # Serve the entry page for the .htm URL, the datasource JSON otherwise.
        target_url = str(url)
        requested_urls.append(target_url)
        if target_url.endswith("jhqw.htm"):
            body = LIST_HTML
        else:
            body = json.dumps(ds_payload, ensure_ascii=False)
        return _Resp(body, target_url)

    monkeypatch.setattr(xinhua_xjp.httpx, "get", _fake_get)

    list_url, items, fr = xinhua_xjp.fetch_list_page(
        _rt(),
        page_num=2,
        params={"page_size": 2},
    )

    assert fr.status == 200
    # Second request must be the datasource JSON derived from the HTML hints.
    assert requested_urls[1] == "https://www.news.cn/politics/leaders/xijinping/ds_abc123.json"
    assert list_url.endswith("ds_abc123.json?page=2&pageSize=2")
    assert len(items) == 1
    parsed = items[0]
    assert parsed.native_post_id == "row-3"
    assert parsed.title == "第三篇"
    assert parsed.url == "https://www.news.cn/c.html"
    assert parsed.publish_date.isoformat() == "2026-01-03"
    assert parsed.channel_path == "新华网·习近平报道集/讲话全文"
    assert parsed.metadata_json["datasource_id"] == "abc123"


def test_extract_datasource_url_requires_datasource_marker():
    """_extract_datasource_url must reject HTML lacking the datasource hint."""
    try:
        xinhua_xjp._extract_datasource_url("<html></html>", "https://www.news.cn/a/b.htm")
    except ValueError as exc:
        message = str(exc)
    else:
        raise AssertionError("expected ValueError")
    # The error should name the missing marker so the failure is diagnosable.
    assert "datasource" in message
