from govcrawler.parser.cleaner import html_to_text

# NOTE(review): the HTML fixtures below were reconstructed from a copy of this
# file whose markup had been stripped out of the string literals. The visible
# text and every assertion are preserved; the exact tags are a best-effort
# reconstruction and should be confirmed against version control.


def test_empty() -> None:
    """Empty and whitespace-only input both produce an empty string."""
    assert html_to_text("") == ""
    assert html_to_text(" ") == ""


def test_paragraph_newlines() -> None:
    """Separate paragraphs keep their text and are split by a newline."""
    h = "<div><p>第一段</p><p>第二段</p></div>"
    out = html_to_text(h)
    assert "第一段" in out and "第二段" in out
    assert "\n" in out  # paragraph separator


def test_strip_tags() -> None:
    """Inline markup does not leak into the extracted text."""
    assert "<b>" not in html_to_text("<p>hello <b>world</b></p>")


def test_drop_script() -> None:
    """Script bodies are discarded while surrounding content is kept."""
    h = "<div><script>alert(1)</script><p>keep</p></div>"
    out = html_to_text(h)
    assert "keep" in out
    assert "alert" not in out


def test_drop_style() -> None:
    """Style bodies are discarded while surrounding content is kept."""
    h = "<div><style>.x{color:red}</style><p>kept</p></div>"
    out = html_to_text(h)
    assert "kept" in out
    assert "color:red" not in out


def test_entity_decoded() -> None:
    """HTML entities are decoded rather than passed through verbatim."""
    assert "&nbsp;" not in html_to_text("<p>a&nbsp;b</p>")


def test_multi_column_table_stays_markdown() -> None:
    """A real data table (several columns) is rendered as a Markdown table."""
    out = html_to_text(
        """
        <table>
          <tr><th>名称</th><th>数量</th></tr>
          <tr><td>项目A</td><td>10</td></tr>
        </table>
        """
    )
    assert "| 名称 | 数量 |" in out
    assert "| --- | --- |" in out


def test_single_column_layout_table_is_plain_text() -> None:
    """A one-column table used purely for layout is flattened to plain text."""
    out = html_to_text(
        """
        <table>
          <tr><td>广东省人民政府关于印发规划的通知</td></tr>
          <tr><td>各地级以上市人民政府：</td></tr>
          <tr><td>现将规划印发给你们，请认真贯彻执行。</td></tr>
        </table>
        """
    )
    assert "| 广东省人民政府" not in out
    assert "广东省人民政府关于印发规划的通知" in out
    assert "各地级以上市人民政府：" in out
    assert "现将规划印发给你们，请认真贯彻执行。" in out


def test_sparse_signature_table_is_plain_text() -> None:
    """A mostly-empty signature table is flattened instead of becoming Markdown."""
    # NOTE(review): the empty leading cells are an assumption about what made
    # the original fixture "sparse" -- confirm against the original file.
    out = html_to_text(
        """
        <table>
          <tr><td></td><td>省长</td></tr>
          <tr><td></td><td>2021年1月30日</td></tr>
        </table>
        """
    )
    assert "| 省长 |" not in out
    assert "| --- |" not in out
    assert "省长" in out
    assert "2021年1月30日" in out