"""Tests for ``govcrawler.parser.cleaner.html_to_text``.

NOTE(review): the HTML fixtures below were restored after the markup inside
the string literals had been stripped from this file (which left the module
unparsable). The visible text content is unchanged; the exact tags are a
best-effort reconstruction -- confirm against the original fixtures.
"""
from govcrawler.parser.cleaner import html_to_text


def test_empty():
    """Empty and whitespace-only input both yield an empty string."""
    assert html_to_text("") == ""
    assert html_to_text(" ") == ""


def test_paragraph_newlines():
    """Text of every paragraph is kept and paragraphs are newline-separated."""
    h = "<div><p>第一段</p><p>第二段</p></div>"
    out = html_to_text(h)
    # Separate asserts so a failure reports which paragraph went missing.
    assert "第一段" in out
    assert "第二段" in out
    assert "\n" in out  # paragraph separator


def test_strip_tags():
    """Inline markup does not leak into the output."""
    # The needle must be non-empty: `"" not in s` is False for every string.
    assert "<b>" not in html_to_text("<p>hello <b>world</b></p>")


def test_drop_script():
    """Script contents are dropped while surrounding text is kept."""
    h = "<script>alert(1)</script><p>keep</p>"
    out = html_to_text(h)
    assert "keep" in out
    assert "alert" not in out


def test_drop_style():
    """Style contents are dropped while surrounding text is kept."""
    h = "<style>p{color:red}</style><p>kept</p>"
    out = html_to_text(h)
    assert "kept" in out
    assert "color:red" not in out


def test_entity_decoded():
    """Character entities do not survive verbatim in the output."""
    assert "&nbsp;" not in html_to_text("<p>a&nbsp;b</p>")