from govcrawler.parser.cleaner import html_to_text


def test_empty():
    """Empty and whitespace-only input must both yield an empty string."""
    for blank in ("", "   "):
        assert html_to_text(blank) == ""


def test_paragraph_newlines():
    """Text of sibling <p> elements is kept and the output contains a newline."""
    markup = "<div><p>第一段</p><p>第二段</p></div>"
    text = html_to_text(markup)
    # Both paragraphs' text must be present in the result.
    assert "第一段" in text
    assert "第二段" in text
    # A newline is expected somewhere in the output (paragraph separator).
    assert "\n" in text


def test_strip_tags():
    """Inline markup such as <b> must not appear in the extracted text."""
    text = html_to_text("<p>hello <b>world</b></p>")
    assert "<b>" not in text


def test_drop_script():
    """Content of a <script> element is dropped while sibling text is kept."""
    markup = "<div><p>keep</p><script>alert(1)</script></div>"
    text = html_to_text(markup)
    # Visible paragraph text is retained.
    assert "keep" in text
    # The script body must not leak into the output.
    assert "alert" not in text


def test_drop_style():
    """Content of a <style> element is dropped while sibling text is kept."""
    markup = "<div><style>.x{color:red}</style><p>kept</p></div>"
    text = html_to_text(markup)
    # Visible paragraph text is retained.
    assert "kept" in text
    # The CSS rule must not leak into the output.
    assert "color:red" not in text


def test_entity_decoded():
    """The literal ``&nbsp;`` entity must not survive in the extracted text."""
    text = html_to_text("<p>a&nbsp;b</p>")
    assert "&nbsp;" not in text
