from govcrawler.parser.cleaner import html_to_text


def test_empty():
    """Empty and whitespace-only input both produce an empty string."""
    for blank in ("", "   "):
        assert html_to_text(blank) == ""


def test_paragraph_newlines():
    """Both paragraph texts survive and the output contains a newline."""
    text = html_to_text("<div><p>第一段</p><p>第二段</p></div>")
    assert "第一段" in text
    assert "第二段" in text
    assert "\n" in text  # paragraph separator


def test_strip_tags():
    """The literal ``<b>`` tag must not appear in the converted text."""
    text = html_to_text("<p>hello <b>world</b></p>")
    assert "<b>" not in text


def test_drop_script():
    """Paragraph text is kept while ``<script>`` contents are removed."""
    markup = "<div><p>keep</p><script>alert(1)</script></div>"
    text = html_to_text(markup)
    assert "keep" in text
    assert "alert" not in text


def test_drop_style():
    """Paragraph text is kept while ``<style>`` contents are removed."""
    markup = "<div><style>.x{color:red}</style><p>kept</p></div>"
    text = html_to_text(markup)
    assert "kept" in text
    assert "color:red" not in text


def test_entity_decoded():
    """The raw ``&nbsp;`` entity must not be left in the output."""
    text = html_to_text("<p>a&nbsp;b</p>")
    assert "&nbsp;" not in text


def test_multi_column_table_stays_markdown():
    """A two-column table yields a pipe-delimited row and a ``---`` separator."""
    markup = """
        <table>
          <tr><td>名称</td><td>数量</td></tr>
          <tr><td>项目A</td><td>10</td></tr>
        </table>
        """
    text = html_to_text(markup)

    # First table row and the Markdown separator row must both be present.
    for fragment in ("| 名称 | 数量 |", "| --- | --- |"):
        assert fragment in text


def test_single_column_layout_table_is_plain_text():
    """A one-cell table wrapping paragraphs is not emitted with pipe syntax."""
    markup = """
        <table>
          <tr><td>
            <p>广东省人民政府关于印发规划的通知</p>
            <p>各地级以上市人民政府：</p>
            <p>现将规划印发给你们，请认真贯彻执行。</p>
          </td></tr>
        </table>
        """
    text = html_to_text(markup)

    # No Markdown cell prefix in front of the first paragraph.
    assert "| 广东省人民政府" not in text

    # Every paragraph inside the cell must still be present in the output.
    expected_paragraphs = (
        "广东省人民政府关于印发规划的通知",
        "各地级以上市人民政府：",
        "现将规划印发给你们，请认真贯彻执行。",
    )
    for paragraph in expected_paragraphs:
        assert paragraph in text


def test_sparse_signature_table_is_plain_text():
    """A table whose second column is empty is not emitted with pipe syntax."""
    markup = """
        <table>
          <tr><td>省长</td><td></td></tr>
          <tr><td>2021年1月30日</td><td></td></tr>
        </table>
        """
    text = html_to_text(markup)

    # Neither a Markdown cell row nor a separator row may appear.
    for forbidden in ("| 省长 |", "| --- |"):
        assert forbidden not in text

    # The non-empty cell contents must still be present.
    for kept in ("省长", "2021年1月30日"):
        assert kept in text
