"""HTML sub-column discovery heuristics — pure unit tests, no network."""
from __future__ import annotations

from govcrawler.api.admin.discover_html import (
    _column_id_from_path,
    _has_article_links,
    discover_html_sub_columns,
)


def test_column_id_from_path():
    """Check the column id derived from a handful of representative paths."""
    expected_ids = {
        # A trailing index.html is not itself the id; the directory above is.
        "/zwgk/zfwj/gfxwj/index.html": "gfxwj",
        "/zwgk/qxyw/": "qxyw",
        "/zwgk/": "zwgk",
        # A hyphen in the segment is expected back as an underscore.
        "/foo-bar/": "foo_bar",
    }
    for path, column_id in expected_ids.items():
        assert _column_id_from_path(path) == column_id

    # The bare root is expected to yield no id at all.
    assert _column_id_from_path("/") is None


def test_has_article_links_threshold():
    """A page with three post_N.html links passes by default, fails at 10.

    An empty document is expected to be reported as having no article links.
    """
    page = """
        <a href="/x/post_1.html">a</a>
        <a href="/x/post_2.html">b</a>
        <a href="/x/post_3.html">c</a>
        <a href="/about.html">about</a>
    """
    with_default_threshold = _has_article_links(page)
    assert with_default_threshold is True

    with_high_threshold = _has_article_links(page, threshold=10)
    assert with_high_threshold is False

    for_empty_page = _has_article_links("")
    assert for_empty_page is False


def test_discover_html_sub_columns_includes_index_html_children():
    """Children linked as <dir>/index.html or as a bare <dir>/ are all found."""
    listing = """
    <html><body>
      <a href="/zwgk/zfwj/index.html">政府文件</a>
      <a href="/zwgk/tzgg/index.html">通知公告</a>
      <a href="/zwgk/qxyw/">政务要闻</a>
    </body></html>
    """
    columns = discover_html_sub_columns("https://x.example/zwgk/", listing)

    # Sort so the comparison does not depend on the order of discovery.
    found_ids = [column["column_id"] for column in columns]
    found_ids.sort()
    assert found_ids == ["qxyw", "tzgg", "zfwj"]


def test_discover_html_sub_columns_rejects_external_and_articles_and_self():
    """Only the single legitimate child column survives a page of decoys.

    The fixture mixes in a link to another host, an article page, a link back
    to the portal itself, a repeated link, a "more" link and a non-index
    .html leaf; none of these may show up in the result.
    """
    portal_url = "https://x.example/zwgk/"
    listing = """
    <html><body>
      <a href="https://other.example/zwgk/foo/">external</a>
      <a href="/zwgk/zfwj/content/post_1.html">article</a>
      <a href="/zwgk/">self</a>
      <a href="/zwgk/legit/index.html">legit</a>
      <a href="/zwgk/legit/index.html">duplicate</a>
      <a href="/zwgk/more/index.html">更多</a>
      <a href="/zwgk/x/leaf.html">non-index html leaf</a>
    </body></html>
    """
    columns = discover_html_sub_columns(portal_url, listing)
    found_ids = [column["column_id"] for column in columns]

    # Unsorted on purpose: exactly one entry is expected, so the repeated
    # "legit" link must have been collapsed and every decoy dropped.
    assert found_ids == ["legit"]


def test_discover_html_sub_columns_rejects_outside_subtree():
    """Only links under the portal path are kept; /about/ is outside /zwgk/."""
    portal_url = "https://x.example/zwgk/"
    listing = """
    <a href="/about/index.html">关于</a>
    <a href="/zwgk/inside/index.html">inside</a>
    """
    columns = discover_html_sub_columns(portal_url, listing)
    found_ids = [column["column_id"] for column in columns]
    assert found_ids == ["inside"]
