from govcrawler.parser.detail_parser import parse_detail
from govcrawler.sites.gdqy import DETAIL_SELECTORS

# Shared fixture for the gdqy detail-page tests below.
# NOTE(review): as it appears here the fixture holds plain text only (no tags
# and no href attributes). test_attachment_absolute expects one attachment URL
# ending in "notice.pdf", which does not occur in this text - confirm the
# fixture markup against the repository copy before relying on these tests.
SAMPLE_HTML = """

关于无人驾驶航空器安全管控的公告

2026-04-10 16:34:22 清远市人民政府

第一段：为保障 2026 年第四届全国轻型飞机锦标赛航空嘉年华活动期间空域安全，现就相关事项公告如下。

第二段：管控时间自 2026 年 5 月 1 日起至 5 月 10 日止。

第三段：管控区域为清远市清城区、清新区以北半径 30 公里内空域。

附件：公告全文.pdf

"""


def test_main_xpath_path():
    """Parsing SAMPLE_HTML yields title, time, source and body without fallback."""
    r = parse_detail(
        SAMPLE_HTML,
        "https://www.gdqy.gov.cn/x/post_1.html",
        DETAIL_SELECTORS,
    )
    assert "无人驾驶航空器安全管控" in r.title
    assert "2026-04-10" in r.publish_time_raw
    assert "清远市人民政府" in r.source
    assert "第一段" in r.content_text
    # The configured selectors must be sufficient on their own.
    assert r.used_fallback is False
    assert r.fallback_engine is None


def test_attachment_absolute():
    """Exactly one attachment is reported, as an absolute gdqy.gov.cn URL."""
    r = parse_detail(
        SAMPLE_HTML,
        "https://www.gdqy.gov.cn/x/post_1.html",
        DETAIL_SELECTORS,
    )
    assert len(r.attachment_urls) == 1
    assert r.attachment_urls[0].startswith("https://www.gdqy.gov.cn/")
    assert r.attachment_urls[0].endswith("notice.pdf")


def test_content_text_no_html_tags():
    """content_text keeps line breaks but not the literal asserted absent below."""
    r = parse_detail(
        SAMPLE_HTML,
        "https://www.gdqy.gov.cn/x/post_1.html",
        DETAIL_SELECTORS,
    )
    # NOTE(review): in the visible source this literal spanned a blank line,
    # i.e. it contained exactly two newlines. Given the test name it presumably
    # held an HTML tag originally - confirm and restore the intended value.
    assert "\n\n" not in r.content_text
    assert "\n" in r.content_text  # paragraph preserved


def test_source_prefix_is_normalized():
    """With a custom source selector, r.source is "新华社" without the "文章来源：" prefix."""
    html = """

标题

2026-04-10 16:34:22 文章来源：新华社

正文内容足够长，足够长，足够长，足够长，足够长，足够长。

"""
    # Copy so the shared DETAIL_SELECTORS mapping is not mutated by this test.
    selectors = dict(DETAIL_SELECTORS)
    selectors["source"] = "span.source::text"
    r = parse_detail(html, "https://www.news.cn/x.html", selectors)
    assert r.source == "新华社"


def test_extracts_government_public_metadata_from_label_table():
    """Label/value metadata (索引号, 分类, 发布机构, ...) is mapped onto result fields."""
    html = """

广东省人民政府关于印发规划纲要的通知

2026-04-28 10:00

索引号：006939748/2026-00100

分类：国民经济管理、国有资产监管

发布机构：广东省人民政府

成文日期：2026-03-26

施行日期：2026-07-01

效力状态：现行有效

废止日期：2031-06-30

文号：粤府〔2026〕24号

发布日期：2026-04-28

正文内容足够长，足够长，足够长，足够长，足够长，足够长。

"""
    r = parse_detail(
        html,
        "https://www.gd.gov.cn/zwgk/wjk/qbwj/yf/content/post_1.html",
        DETAIL_SELECTORS,
    )
    assert r.index_no == "006939748/2026-00100"
    assert r.publisher == "广东省人民政府"
    assert r.doc_no == "粤府〔2026〕24号"
    # publish_date is asserted against the 成文日期 value, not 发布日期.
    assert r.publish_date.isoformat() == "2026-03-26"
    assert r.effective_date.isoformat() == "2026-07-01"
    assert r.is_effective is True
    assert r.expiry_date.isoformat() == "2031-06-30"
    # The 分类 value is split on "、" into category and subcategory, while
    # open_category keeps the full value.
    assert r.content_category == "国民经济管理"
    assert r.content_subcategory == "国有资产监管"
    assert r.open_category == "国民经济管理、国有资产监管"
    assert r.public_meta["索引号"] == "006939748/2026-00100"


def test_extracts_government_public_metadata_from_gov_cn_table_aliases():
    """Alias labels (发文机关, 发文字号, 主题分类, 有效期) map to the same result fields."""
    html = """ 国务院关于试点实施方案的批复 2025-09-11 15:00

索引号：	000014349/2025-00075	主题分类：	国民经济管理、国有资产监管\\经济体制改革
发文机关：	国务院	成文日期：	2025年09月08日
发文字号：	国函〔2025〕86号	发布日期：	2025年09月11日
有效期：	2025年10月01日至2030年09月30日	效力状态：	已失效

正文内容足够长，足够长，足够长，足够长，足够长，足够长。

"""
    r = parse_detail(html, "https://www.gov.cn/zhengce/content/x.htm", DETAIL_SELECTORS)
    assert r.index_no == "000014349/2025-00075"
    assert r.publisher == "国务院"
    assert r.doc_no == "国函〔2025〕86号"
    # Dates written as YYYY年MM月DD日 are expected back in ISO form.
    assert r.publish_date.isoformat() == "2025-09-08"
    # The 有效期 range supplies both the effective and the expiry date.
    assert r.effective_date.isoformat() == "2025-10-01"
    assert r.is_effective is False
    assert r.expiry_date.isoformat() == "2030-09-30"
    # Here the backslash separates category from subcategory; the "、" inside
    # the category part is kept.
    assert r.content_category == "国民经济管理、国有资产监管"
    assert r.content_subcategory == "经济体制改革"
    assert r.open_category == "国民经济管理、国有资产监管、经济体制改革"


def test_public_metadata_does_not_treat_next_label_as_empty_doc_no():
    """An empty 文号 followed by another label leaves doc_no as None."""
    html = """

文号：

发布日期：

2026-04-28

正文内容足够长，足够长，足够长，足够长，足够长，足够长。

"""
    r = parse_detail(html, "https://example.com/a.html", DETAIL_SELECTORS)
    assert r.doc_no is None