"""YAML site/column configuration loader.

Schema matches design doc §4.3. A site YAML lives at `config/sites/<site_id>.yaml`
and is the single source of truth for selectors, schedule, throttle, and columns.
Hardcoded Python site modules (e.g. sites/gdqy.py) become a fallback for
development and can be retired once all 20–30 sites are YAML-defined.
"""
from __future__ import annotations

from pathlib import Path
from typing import Any

import yaml
from pydantic import BaseModel, Field, field_validator


class DetailSelectors(BaseModel):
    """Selector strings for the ``detail`` section of a column.

    ``title``, ``publish_time`` and ``content`` must be supplied; ``source``
    and ``attachment_css`` fall back to the defaults declared below.
    """

    title: str
    publish_time: str
    source: str = Field(default="")
    content: str
    # Default: anchors whose href ends in one of the listed file suffixes.
    attachment_css: str = Field(
        default=(
            "a[href$='.pdf'], "
            "a[href$='.doc'], "
            "a[href$='.docx'], "
            "a[href$='.xls'], "
            "a[href$='.xlsx'], "
            "a[href$='.zip']"
        )
    )


class ListSelectors(BaseModel):
    """Selector strings for a column's list page.

    Only ``row`` is mandatory; the remaining selectors have defaults.
    NOTE(review): the ``::attr(...)`` / ``::text`` defaults look like
    parsel/Scrapy-style CSS extensions -- confirm against the extractor.
    """

    row: str
    href: str = Field(default="a::attr(href)")
    title: str = Field(default="a::text")
    date: str = Field(default="")


class Pagination(BaseModel):
    """Pagination settings for a column's list page.

    ``type`` is restricted to the documented values ``"none"`` and
    ``"page_param"``; any other value is rejected during validation so that
    a typo in a site YAML fails at load time instead of being carried along
    silently. This mirrors how ``SiteConfig.default_strategy`` is checked.

    Raises:
        pydantic.ValidationError: if ``type`` is not a known pagination type.
    """

    type: str = "none"          # "none" | "page_param"
    param: str = "page"
    max_pages: int = 5

    @field_validator("type")
    @classmethod
    def _type_known(cls, v: str) -> str:
        """Reject pagination types outside the documented set."""
        if v not in ("none", "page_param"):
            raise ValueError(f"pagination type must be none|page_param, got {v!r}")
        return v


class ColumnConfig(BaseModel):
    """Configuration for a single column of a site.

    ``column_id``, ``name``, ``list_url`` and ``detail`` are required; every
    other field has a default. ``pagination`` gets a fresh ``Pagination``
    instance per column when omitted.
    """

    column_id: str
    name: str
    category: str = Field(default="")
    list_url: str
    list_selector: ListSelectors | None = Field(default=None)
    pagination: Pagination = Field(default_factory=Pagination)
    detail: DetailSelectors
    schedule: str = Field(default="0 2 * * *")  # default: daily 02:00
    enabled: bool = Field(default=True)


class SiteConfig(BaseModel):
    """Top-level model for one site's YAML document.

    Holds the site identity, fetch settings and the list of columns.
    ``default_strategy`` is validated against the supported strategy names.
    """

    site_id: str
    site_name: str
    base_url: str
    # Must be one of "httpx", "playwright" or "drission" (see validator).
    default_strategy: str = "httpx"
    concurrency: int = 1
    interval_sec: float = 5.0
    enabled: bool = True
    # COMP-02 opt-out; the default is to obey robots.txt.
    respect_robots: bool = True
    columns: list[ColumnConfig]

    @field_validator("default_strategy")
    @classmethod
    def _strategy_known(cls, v: str) -> str:
        """Accept only the supported strategy names; raise ValueError otherwise."""
        if v in {"httpx", "playwright", "drission"}:
            return v
        raise ValueError(f"default_strategy must be httpx|playwright|drission, got {v!r}")

    def get_column(self, column_id: str) -> ColumnConfig | None:
        """Return the first column whose ``column_id`` matches, else ``None``."""
        matching = (col for col in self.columns if col.column_id == column_id)
        return next(matching, None)


def load_site(path: str | Path) -> SiteConfig:
    """Load a single YAML file into a validated SiteConfig.

    Args:
        path: Location of the site YAML file; read as UTF-8.

    Returns:
        The ``SiteConfig`` built from the file's top-level mapping.

    Raises:
        ValueError: if the YAML root is not a mapping (this includes an empty
            file, which parses to ``None``), or if a top-level key is not a
            string. Model validation errors from ``SiteConfig`` propagate
            unchanged.
    """
    data: Any = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ValueError(f"site YAML root must be a mapping: {path}")
    # PyYAML can produce non-string keys (e.g. an unquoted `on:` or `yes:`
    # parses as a boolean). Passing those through `**data` would surface as a
    # bare TypeError with no file context, so report them explicitly.
    bad_keys = [key for key in data if not isinstance(key, str)]
    if bad_keys:
        raise ValueError(
            f"site YAML top-level keys must be strings, got {bad_keys!r}: {path}"
        )
    return SiteConfig(**data)


def load_sites_dir(directory: str | Path) -> dict[str, SiteConfig]:
    """Load every *.yaml / *.yml file in a directory, keyed by site_id.

    Entries are visited in sorted order. Only regular files (or symlinks to
    them) with a ``.yaml`` / ``.yml`` suffix (case-insensitive) are loaded;
    sub-directories that happen to carry such a suffix are skipped.

    Args:
        directory: Directory to scan. A missing or non-directory path yields
            an empty mapping.

    Returns:
        Mapping of ``site_id`` to its loaded ``SiteConfig``.

    Raises:
        ValueError: if a file's stem differs from the ``site_id`` it declares,
            or if two files (e.g. ``foo.yaml`` and ``foo.yml``) declare the
            same ``site_id``.
    """
    root = Path(directory)
    sites: dict[str, SiteConfig] = {}
    if not root.is_dir():
        return sites
    # Remember which file supplied each site_id so a clash can name both files.
    origin: dict[str, Path] = {}
    for path in sorted(root.iterdir()):
        if path.suffix.lower() not in (".yaml", ".yml"):
            continue
        if not path.is_file():
            # e.g. a directory named "foo.yaml"; reading it would fail.
            continue
        cfg = load_site(path)
        if cfg.site_id != path.stem:
            raise ValueError(
                f"YAML filename stem {path.stem!r} must equal site_id {cfg.site_id!r}"
            )
        if cfg.site_id in sites:
            # Both files pass the stem check, so without this the later one
            # would silently replace the earlier one.
            raise ValueError(
                f"duplicate site_id {cfg.site_id!r}: defined in both "
                f"{origin[cfg.site_id].name!r} and {path.name!r}"
            )
        origin[cfg.site_id] = path
        sites[cfg.site_id] = cfg
    return sites
