"""Compliance-gate tests: path blacklist (COMP-03), robots.txt cache (COMP-02),
and their integration with pipeline.crawl_target."""

import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from govcrawler.compliance import robots as robots_mod
from govcrawler.compliance.paths import is_public_path
from govcrawler.compliance.robots import RobotsCache
from govcrawler.models import Base

from tests._v2fixtures import make_site_and_target


# ---------- path blacklist (COMP-03) ----------

def test_public_path_ok():
    assert is_public_path("https://www.gdqy.gov.cn/gdqy/newxxgk/fgwj/szfwj/") is True


@pytest.mark.parametrize(
    "bad",
    [
        "https://x/admin/",
        "https://x/api/internal/users",
        "https://x/internal/dash",
        "https://x/login?next=/home",
        "https://x/user/42",
        "https://x/account/settings",
    ],
)
def test_non_public_paths_rejected(bad):
    assert is_public_path(bad) is False
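

# Sketch: COMP-03 is described as a blacklist, so paths outside the blocked
# segments should pass. The paths below are illustrative assumptions, not
# entries taken from a production fixture.
@pytest.mark.parametrize(
    "ok",
    [
        "https://x/news/2024/item.html",
        "https://x/zwgk/fgwj/doc-1.html",
        "https://x/news/index?page=2",
    ],
)
def test_public_paths_accepted(ok):
    assert is_public_path(ok) is True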


# ---------- robots.txt (COMP-02) ----------

class _FakeResponse:
    """Minimal stand-in for the context manager urllib.request.urlopen returns."""

    def __init__(self, body: bytes):
        self._body = body

    def read(self):
        return self._body

    def __enter__(self):
        return self

    def __exit__(self, *a):
        return False


def test_robots_allows_when_no_rule(monkeypatch):
    body = b"User-agent: *\nAllow: /\n"
    monkeypatch.setattr(
        robots_mod.urllib.request, "urlopen",
        lambda url, timeout=10: _FakeResponse(body),
    )
    cache = RobotsCache(ttl_s=60)
    assert cache.is_allowed("https://a.com/page", "GovCrawlerBot/1.0") is True


def test_robots_disallow_specific_path(monkeypatch):
    body = b"User-agent: *\nDisallow: /admin/\n"
    monkeypatch.setattr(
        robots_mod.urllib.request, "urlopen",
        lambda url, timeout=10: _FakeResponse(body),
    )
    cache = RobotsCache(ttl_s=60)
    assert cache.is_allowed("https://a.com/admin/secret", "X") is False
    assert cache.is_allowed("https://a.com/news/p1", "X") is True


def test_robots_fetch_error_defaults_allow(monkeypatch):
    def _boom(url, timeout=10):
        raise OSError("dns_fail")
    monkeypatch.setattr(robots_mod.urllib.request, "urlopen", _boom)
    cache = RobotsCache(ttl_s=60)
    assert cache.is_allowed("https://a.com/page", "X") is True


def test_robots_cache_reuses_parsed(monkeypatch):
    body = b"User-agent: *\nDisallow: /x/\n"
    calls = {"n": 0}

    def _fake(url, timeout=10):
        calls["n"] += 1
        return _FakeResponse(body)

    monkeypatch.setattr(robots_mod.urllib.request, "urlopen", _fake)
    cache = RobotsCache(ttl_s=60)
    cache.is_allowed("https://a.com/page", "X")
    cache.is_allowed("https://a.com/x/", "X")
    cache.is_allowed("https://a.com/page2", "X")
    assert calls["n"] == 1   # only one fetch per host within TTL


# ---------- integration with crawl_target ----------

@pytest.fixture
def db(tmp_path, monkeypatch):
    """Temp sqlite DB with one site/target; host throttling disabled for speed."""
    db_url = "sqlite:///" + str(tmp_path / "comp.db")
    monkeypatch.setenv("DB_URL", db_url)
    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    monkeypatch.setenv("USER_AGENT", "TestBot/1.0")

    from govcrawler import pipeline as pl
    monkeypatch.setattr(pl.HostThrottle, "wait", lambda self, url, **kw: 0.0)

    engine = create_engine(db_url, future=True)
    Base.metadata.create_all(engine)
    SM = sessionmaker(bind=engine, expire_on_commit=False)
    with SM() as s:
        _, target = make_site_and_target(s, site_code="x", column_id="c")
        s.commit()
        target_code = target.target_code
    monkeypatch.setattr(pl, "get_sessionmaker", lambda: SM)
    return {"target_code": target_code}


def test_crawl_target_rejects_non_public_list_url(monkeypatch, db):
    from govcrawler import pipeline as pl
    from govcrawler.fetcher.browser import FetchResult

    def _fake_list(rt):
        fr = FetchResult(
            url="https://x/admin/dashboard", final_url="https://x/admin/dashboard",
            status=200, html="<ok>", fetched_at=0.0, duration_ms=1,
            is_challenge=False, strategy="httpx",
        )
        return "https://x/admin/dashboard", [], fr

    monkeypatch.setattr(pl, "_list_via_yaml", _fake_list)

    with pytest.raises(ValueError, match="non-public"):
        pl.crawl_target(db["target_code"])


def test_crawl_target_respects_robots_toggle(monkeypatch, db, tmp_path):
    """respect_robots=True blocks when disallowed; False bypasses the gate."""
    from govcrawler import pipeline as pl
    from govcrawler.fetcher.browser import FetchResult
    from govcrawler.models import CrawlSite

    def _fake_list(rt):
        fr = FetchResult(
            url="https://x/public/list", final_url="https://x/public/list",
            status=200, html="<ok>", fetched_at=0.0, duration_ms=1,
            is_challenge=False, strategy="httpx",
        )
        return "https://x/public/list", [], fr

    monkeypatch.setattr(pl, "_list_via_yaml", _fake_list)
    # Force robots to deny
    monkeypatch.setattr(pl, "is_allowed", lambda url, ua: False)

    r = pl.crawl_target(db["target_code"])
    assert r["status"] == "robots_blocked"

    # Flip respect_robots=False on the site row → gate bypassed
    SM = pl.get_sessionmaker()
    with SM() as s:
        site = s.query(CrawlSite).filter_by(site_code="x").one()
        site.respect_robots = False
        s.commit()

    r2 = pl.crawl_target(db["target_code"])
    assert r2["status"] in ("ok", "aborted")
    assert r2["items_seen"] == 0
