import pytest
from govcrawler.utils.url_norm import normalize_url, url_hash, sha256_bytes


class TestNormalize:
    """Checks on the canonical form produced by ``normalize_url``."""

    def test_idempotent(self):
        """A second normalization pass must not change the result."""
        raw = "HTTP://Www.Gdqy.Gov.CN/a/b/?b=2&a=1#frag"
        first_pass = normalize_url(raw)
        second_pass = normalize_url(first_pass)
        assert second_pass == first_pass

    def test_scheme_preserved_by_default(self):
        """Without options, an http URL stays http."""
        # The original scheme is kept by default (government sites: http
        # stays http, which helps with sites whose certificates have a
        # hostname mismatch).
        result = normalize_url("http://www.gdqy.gov.cn/")
        assert result.startswith("http://")

    def test_scheme_force_https_opt_in(self):
        """``force_https=True`` upgrades an http URL to https."""
        # The upgrade only happens when force_https=True; it exists so that
        # url_hash can de-duplicate across protocols.
        result = normalize_url("http://www.gdqy.gov.cn/", force_https=True)
        assert result.startswith("https://")

    def test_host_lowered(self):
        """An upper-case host comes back in lower case."""
        result = normalize_url("HTTPS://WWW.GDQY.GOV.CN/x")
        assert "www.gdqy.gov.cn" in result

    def test_trailing_slash_stripped(self):
        """A trailing slash on a non-root path is removed."""
        expected = "https://a.com/x"
        assert normalize_url("https://a.com/x/") == expected

    def test_root_slash_preserved(self):
        """The lone slash of a root path is kept."""
        root = "https://a.com/"
        assert normalize_url(root) == root

    def test_fragment_dropped(self):
        """The ``#fragment`` part does not survive normalization."""
        result = normalize_url("https://a.com/x#frag")
        assert "#" not in result

    def test_query_sorted(self):
        """Query parameter order does not affect the normalized URL."""
        unsorted_form = normalize_url("https://a.com/?b=2&a=1")
        sorted_form = normalize_url("https://a.com/?a=1&b=2")
        assert unsorted_form == sorted_form

    def test_refuse_file_scheme(self):
        """``file:`` URLs are rejected with ``ValueError``."""
        target = "file:///etc/passwd"
        with pytest.raises(ValueError):
            normalize_url(target)

    def test_refuse_javascript_scheme(self):
        """``javascript:`` URLs are rejected with ``ValueError``."""
        target = "javascript:alert(1)"
        with pytest.raises(ValueError):
            normalize_url(target)

    def test_refuse_empty_host(self):
        """A URL with no host is rejected with ``ValueError``."""
        target = "https:///x"
        with pytest.raises(ValueError):
            normalize_url(target)


class TestUrlHash:
    """Checks on the digest string returned by ``url_hash``."""

    def test_64_hex(self):
        """The digest is 64 characters drawn from lower-case hex digits."""
        digest = url_hash("https://www.gdqy.gov.cn/a/b")
        assert len(digest) == 64
        # Every character must belong to the lower-case hex alphabet.
        assert set(digest) <= set("0123456789abcdef")

    def test_same_after_normalization(self):
        """URLs differing in scheme, host case, slash and query order hash alike."""
        messy = "http://Www.GDQY.gov.cn/a/b/?b=2&a=1"
        tidy = "https://www.gdqy.gov.cn/a/b?a=1&b=2"
        assert url_hash(messy) == url_hash(tidy)


class TestSha256Bytes:
    """Checks ``sha256_bytes`` against a fixed reference digest."""

    def test_known_vector(self):
        """Hashing empty bytes yields the expected hex digest."""
        # sha256("") = e3b0c44...
        empty_digest = (
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        )
        assert sha256_bytes(b"") == empty_digest
