from pathlib import Path

import pytest

from govcrawler.config import loader, registry


# Path to the gdqy site definition: two levels up from this file (the parent
# of this file's directory), then config/sites/gdqy.yaml.
GDQY_YAML = Path(__file__).resolve().parents[1].joinpath("config", "sites", "gdqy.yaml")


def test_load_gdqy_yaml_parses():
    """The gdqy.yaml at GDQY_YAML loads and exposes the expected site and column fields."""
    site = loader.load_site(GDQY_YAML)

    # Site-level fields.
    assert site.site_id == "gdqy"
    assert site.base_url.startswith("https://")
    assert site.default_strategy == "playwright"

    # The "szfwj" column must resolve and carry its detail/pagination settings.
    column = site.get_column("szfwj")
    assert column is not None
    assert "article-content" in column.detail.content
    assert column.pagination.type == "page_param"


def test_load_sites_dir_picks_gdqy():
    """Loading the directory that holds gdqy.yaml yields a 'gdqy' entry
    whose first column is 'szfwj'."""
    loaded = loader.load_sites_dir(GDQY_YAML.parent)
    assert "gdqy" in loaded
    first_column = loaded["gdqy"].columns[0]
    assert first_column.column_id == "szfwj"


def test_invalid_strategy_rejected(tmp_path):
    """A site file declaring default_strategy 'banana' must fail to load."""
    yaml_text = (
        "site_id: bogus\nsite_name: x\nbase_url: https://x\n"
        "default_strategy: banana\ncolumns: []\n"
    )
    bogus_file = tmp_path / "bogus.yaml"
    bogus_file.write_text(yaml_text, encoding="utf-8")
    # Only pins that loading raises; the concrete exception class is not
    # asserted here.
    with pytest.raises(Exception):
        loader.load_site(bogus_file)


def test_filename_stem_must_match_site_id(tmp_path):
    """load_sites_dir raises ValueError for a file named wrong.yaml that
    declares site_id 'gdqy'."""
    mismatched = tmp_path / "wrong.yaml"
    mismatched.write_text(
        "site_id: gdqy\nsite_name: x\nbase_url: https://x\ncolumns: []\n",
        encoding="utf-8",
    )
    with pytest.raises(ValueError):
        loader.load_sites_dir(tmp_path)


def test_registry_returns_yaml_selectors_for_gdqy():
    """After a reload, the registry's gdqy/szfwj detail selectors match the
    hardcoded DETAIL_SELECTORS in govcrawler.sites.gdqy for content and title."""
    registry.reload()
    selectors = registry.get_detail_selectors("gdqy", "szfwj")
    assert selectors is not None

    # Parity check against the hardcoded selectors in sites/gdqy.py.
    from govcrawler.sites.gdqy import DETAIL_SELECTORS
    for key in ("content", "title"):
        assert selectors[key] == DETAIL_SELECTORS[key]


def test_default_column_falls_back_for_unknown_column_id(tmp_path):
    """With `default_column` set and no enumerated columns, get_column()
    returns a column for an arbitrary column_id.

    The returned column carries the requested column_id, an empty list_url,
    and the list/detail selectors declared under `default_column`.
    """
    yaml_text = """
site_id: wy
site_name: WY
base_url: https://x.example
default_strategy: httpx
default_column:
  list_selector: { row: ".cm li" }
  pagination: { type: none }
  detail:
    title: "h1::text"
    publish_time: "p::text"
    source: ""
    content: "div.body"
columns: []
"""
    site_file = tmp_path / "wy.yaml"
    site_file.write_text(yaml_text, encoding="utf-8")

    site = loader.load_site(site_file)
    # "anything" is not listed under `columns:`, so this exercises the fallback.
    synthesized = site.get_column("anything")
    assert synthesized is not None
    assert synthesized.column_id == "anything"
    assert synthesized.list_url == ""
    assert synthesized.list_selector.row == ".cm li"
    assert synthesized.detail.content == "div.body"


def test_default_column_explicit_overrides_default(tmp_path):
    """An explicitly listed column keeps its own selectors; any other
    column_id gets the `default_column` selectors."""
    yaml_text = """
site_id: wy
site_name: WY
base_url: https://x.example
default_strategy: httpx
default_column:
  list_selector: { row: ".default li" }
  detail: { title: "h1::text", publish_time: "p::text", source: "", content: "div.def" }
columns:
  - column_id: special
    name: 特殊栏目
    list_url: https://x.example/special/
    list_selector: { row: ".special li" }
    detail: { title: "h2::text", publish_time: "time::text", source: "", content: "div.spec" }
"""
    site_file = tmp_path / "wy.yaml"
    site_file.write_text(yaml_text, encoding="utf-8")
    site = loader.load_site(site_file)

    # Enumerated column: its own values win.
    explicit = site.get_column("special")
    assert explicit.list_selector.row == ".special li"
    assert explicit.detail.content == "div.spec"

    # Non-enumerated column: values come from default_column.
    fallback = site.get_column("other")
    assert fallback.list_selector.row == ".default li"
    assert fallback.detail.content == "div.def"


def test_default_column_schedule_propagates(tmp_path):
    """A schedule set under `default_column` is carried onto columns
    resolved through the default, so a yaml-driven site can set its own
    cron without enumerating every column."""
    yaml_text = """
site_id: wy
site_name: WY
base_url: https://x.example
default_strategy: httpx
default_column:
  schedule: "*/30 * * * *"
  list_selector: { row: ".cm li" }
  detail: { title: "h1::text", publish_time: "p::text", source: "", content: "div.body" }
columns: []
"""
    site_file = tmp_path / "wy.yaml"
    site_file.write_text(yaml_text, encoding="utf-8")

    site = loader.load_site(site_file)
    synthesized = site.get_column("anything")
    assert synthesized is not None
    assert synthesized.schedule == "*/30 * * * *"


def test_default_column_schedule_unset_keeps_per_column_default(tmp_path):
    """When `default_column` omits schedule, the column resolved through
    the default has schedule None."""
    yaml_text = """
site_id: wy
site_name: WY
base_url: https://x.example
default_strategy: httpx
default_column:
  list_selector: { row: ".cm li" }
  detail: { title: "h1::text", publish_time: "p::text", source: "", content: "div.body" }
columns: []
"""
    site_file = tmp_path / "wy.yaml"
    site_file.write_text(yaml_text, encoding="utf-8")

    site = loader.load_site(site_file)
    synthesized = site.get_column("anything")
    # ColumnConfig.schedule's class default was changed from '0 2 * * *' to
    # None so that, for yaml sites with nothing set explicitly, the cron
    # resolver can fall through to crawl_site.schedule_cron.
    assert synthesized.schedule is None


def test_no_default_no_match_returns_none(tmp_path):
    """With no `default_column`, get_column() returns None for a column_id
    that is not listed, while a listed one still resolves."""
    yaml_text = """
site_id: wy
site_name: WY
base_url: https://x.example
default_strategy: httpx
columns:
  - column_id: a
    name: A
    list_url: https://x.example/a/
    list_selector: { row: ".cm li" }
    detail: { title: "h1::text", publish_time: "p::text", source: "", content: "div.body" }
"""
    site_file = tmp_path / "wy.yaml"
    site_file.write_text(yaml_text, encoding="utf-8")

    site = loader.load_site(site_file)
    known = site.get_column("a")
    assert known is not None
    unknown = site.get_column("missing")
    assert unknown is None


def test_alias_of_inherits_selectors_pagination_detail(tmp_path):
    """A column declaring only `alias_of` (plus its own id/name/list_url)
    takes list_selector, pagination and detail from the named column while
    keeping its own list_url."""
    yaml_text = """
site_id: wy
site_name: WY
base_url: https://x.example
default_strategy: httpx
columns:
  - column_id: domestic
    name: 国内
    list_url: https://x.example/domestic/
    list_selector: { row: ".cm li" }
    pagination: { type: none }
    detail:
      title: "h1::text"
      publish_time: "div.info::text"
      source: "div.info a::text"
      content: "div.body"
  - column_id: world
    name: 国际
    list_url: https://x.example/world/
    alias_of: domestic
"""
    site_file = tmp_path / "wy.yaml"
    site_file.write_text(yaml_text, encoding="utf-8")

    site = loader.load_site(site_file)
    alias = site.get_column("world")
    assert alias is not None
    # Declared on the alias itself.
    assert alias.list_url == "https://x.example/world/"
    # Everything below is only declared on "domestic".
    assert alias.list_selector.row == ".cm li"
    assert alias.pagination.type == "none"
    assert alias.detail.content == "div.body"


def test_alias_of_local_override_wins(tmp_path):
    """A field set directly on an alias column takes precedence over the
    aliased column's value; unset fields are still inherited."""
    yaml_text = """
site_id: wy
site_name: WY
base_url: https://x.example
default_strategy: httpx
columns:
  - column_id: domestic
    name: 国内
    list_url: https://x.example/domestic/
    list_selector: { row: ".cm li" }
    detail: { title: "h1::text", publish_time: "p::text", source: "", content: "div.body" }
  - column_id: world
    name: 国际
    list_url: https://x.example/world/
    alias_of: domestic
    list_selector: { row: ".world-list li" }
"""
    site_file = tmp_path / "wy.yaml"
    site_file.write_text(yaml_text, encoding="utf-8")

    site = loader.load_site(site_file)
    alias = site.get_column("world")
    # list_selector is set on the alias, so the alias's value is used.
    assert alias.list_selector.row == ".world-list li"
    # detail is not set on the alias, so it comes from "domestic".
    assert alias.detail.content == "div.body"


def test_alias_of_unknown_target_rejected(tmp_path):
    """Loading fails, with a message mentioning 'alias_of', when a column
    aliases a column_id that is not defined in the file."""
    yaml_text = """
site_id: wy
site_name: WY
base_url: https://x.example
default_strategy: httpx
columns:
  - column_id: a
    name: A
    list_url: https://x.example/a/
    alias_of: nonexistent
"""
    site_file = tmp_path / "wy.yaml"
    site_file.write_text(yaml_text, encoding="utf-8")

    with pytest.raises(Exception, match="alias_of"):
        loader.load_site(site_file)


def test_alias_of_cycle_rejected(tmp_path):
    """Loading fails, with a message mentioning 'cycle', when two columns
    alias each other."""
    yaml_text = """
site_id: wy
site_name: WY
base_url: https://x.example
default_strategy: httpx
columns:
  - column_id: a
    name: A
    list_url: https://x.example/a/
    alias_of: b
  - column_id: b
    name: B
    list_url: https://x.example/b/
    alias_of: a
"""
    site_file = tmp_path / "wy.yaml"
    site_file.write_text(yaml_text, encoding="utf-8")

    with pytest.raises(Exception, match="cycle"):
        loader.load_site(site_file)


def test_missing_detail_without_alias_rejected(tmp_path):
    """Loading fails, with a message mentioning 'detail', when a column has
    neither a `detail` section nor an `alias_of`."""
    yaml_text = """
site_id: wy
site_name: WY
base_url: https://x.example
default_strategy: httpx
columns:
  - column_id: a
    name: A
    list_url: https://x.example/a/
"""
    site_file = tmp_path / "wy.yaml"
    site_file.write_text(yaml_text, encoding="utf-8")

    with pytest.raises(Exception, match="detail"):
        loader.load_site(site_file)


def test_registry_returns_none_for_unknown_site():
    """After a reload, asking the registry for detail selectors of a site id
    it does not know returns None."""
    registry.reload()
    selectors = registry.get_detail_selectors("unknown_site", "x")
    assert selectors is None


def test_registry_fallback_when_no_yaml(monkeypatch, tmp_path):
    """With the registry pointed at an empty config dir, gdqy/szfwj detail
    selectors are still returned (legacy fallback to sites/gdqy.py).

    The patch on ``registry.DEFAULT_CONFIG_DIR`` is scoped with
    ``monkeypatch.context()`` so it is undone *before* the final
    ``registry.reload()``. Reloading while the patch is still active (the
    fixture itself only undoes it at teardown) would re-read the empty
    directory and leave that state behind for later tests. The ``finally``
    also guarantees the restoring reload runs when the assertion fails.
    """
    try:
        with monkeypatch.context() as patched:
            # Point the registry at an empty dir: the gdqy YAML won't load,
            # but the legacy fallback kicks in.
            patched.setattr(registry, "DEFAULT_CONFIG_DIR", tmp_path)
            registry.reload()
            sel = registry.get_detail_selectors("gdqy", "szfwj")
            assert sel is not None  # fallback to sites/gdqy.py
    finally:
        # DEFAULT_CONFIG_DIR is back to its original value here, so this
        # reload restores the registry for other tests.
        registry.reload()
