"""YAML site/column configuration loader.

Schema matches design doc §4.3. A site YAML lives at `config/sites/<site_id>.yaml`
and is the single source of truth for selectors, schedule, throttle, and columns.
Hardcoded Python site modules (e.g. sites/gdqy.py) become a fallback for
development and can be retired once all 20–30 sites are YAML-defined.
"""
from __future__ import annotations

from pathlib import Path
from typing import Any

import yaml
from pydantic import BaseModel, Field, field_validator, model_validator


class DetailSelectors(BaseModel):
    """Selectors for extracting fields from a single article (detail) page."""
    # Required: selector for the article title.
    title: str
    # Required: selector for the publish timestamp.
    publish_time: str
    # Optional: selector for the originating source/agency; "" = not extracted.
    source: str = ""
    # Required: selector for the main body content.
    content: str
    # Anchor selector matching downloadable attachments by file extension.
    attachment_css: str = (
        "a[href$='.pdf'], a[href$='.doc'], a[href$='.docx'], "
        "a[href$='.xls'], a[href$='.xlsx'], a[href$='.zip']"
    )


class ListSelectors(BaseModel):
    """Selectors for extracting entries from a list (index) page."""
    # Selector matching one list row/item; href/title/date below are
    # resolved relative to each matched row.
    row: str
    # Defaults target a plain <a> inside the row (Scrapy-style ::attr/::text).
    href: str = "a::attr(href)"
    title: str = "a::text"
    # Optional: selector for the row's date text; "" = date not on the list page.
    date: str = ""


class Pagination(BaseModel):
    """How to walk past page 1 of a list URL. Pipeline reads this in
    _list_via_yaml and accumulates entries across pages.

    Supported types:
      • none          → only the canonical list_url (page 1) is fetched
      • page_param    → append `?<param>=<n>` (or merge into existing query)
                         to list_url for pages 2..max_pages
      • path_pattern  → substitute `pattern` (with {page}) into the last
                         path segment of list_url. e.g. /zfwj/index.html
                         + pattern='index_{page}.html' → /zfwj/index_2.html
                         All Qingyuan NF-CMS sites use this style.

    Page 1 is always the original list_url; paginated pages start at
    `start` (default 2). max_pages is a hard cap to keep one pass bounded.
    """
    # One of: "none" | "page_param" | "path_pattern" (see docstring).
    type: str = "none"
    # Query parameter name used by the page_param style.
    param: str = "page"
    # Pattern containing '{page}', used by the path_pattern style.
    pattern: str | None = None
    # First paginated page number (page 1 is always list_url itself).
    start: int = 2
    # Hard upper bound on pages fetched in one pass.
    max_pages: int = 5


class CrawlOrder(BaseModel):
    """Optional per-column detail fetch order.

    `ordered` preserves publish-time/list order. `paged_batch` fetches and
    processes a small batch of list pages before moving on, which limits lost
    in-memory URL discovery on restart. `random_batch` does the same but
    shuffles detail URLs inside each batch for WAF-sensitive long-tail targets.
    """
    # One of: "ordered" | "paged_batch" | "random_batch" (see docstring).
    mode: str = "ordered"
    # List pages fetched per batch when mode is paged_batch/random_batch.
    batch_pages: int = 1


class ColumnConfig(BaseModel):
    """One crawlable column (sub-section) of a site: where its list pages
    live and how to extract entries and detail pages from them."""
    # Stable identifier unique within the site (also the alias_of key).
    column_id: str
    # Human-readable column name for display/logging.
    name: str
    category: str = ""
    # Canonical page-1 URL of the column's list; "" when the pipeline
    # resolves it from crawl_target.entry_url instead (see ColumnDefaults).
    list_url: str = ""
    list_selector: ListSelectors | None = None
    pagination: Pagination = Field(default_factory=Pagination)
    # Optional: inherit selectors/pagination/detail from another column in the
    # same site. Per-column overrides (anything explicitly set here) win.
    # Resolved at SiteConfig validation time — see _resolve_aliases below.
    alias_of: str | None = None
    # `detail` is required, but when alias_of is set we let it be filled in
    # post-validation, so accept None at parse time and fail later if it's
    # still missing.
    detail: DetailSelectors | None = None
    # None means 'inherit from upper level'. The cron resolver order is:
    #   target.schedule_cron → THIS → default_column.schedule →
    #   crawl_site.schedule_cron → DEFAULT_SCHEDULE.
    # If we kept '0 2 * * *' as the class default, every yaml column would
    # silently win the resolution race and crawl_site.schedule_cron could
    # never apply to yaml sites — defeating the whole site-level knob.
    schedule: str | None = None
    enabled: bool = True
    crawl_order: CrawlOrder = Field(default_factory=CrawlOrder)


class ColumnDefaults(BaseModel):
    """Site-level fallback column shape: holds the inheritable selectors but
    no column_id / name / list_url. Used by `SiteConfig.get_column()` to
    synthesize a ColumnConfig on the fly when a target asks for a column_id
    that wasn't enumerated in the yaml. The pipeline pulls list_url from
    `crawl_target.entry_url` instead of yaml in that case, so the operator
    can keep adding new sub-columns via the admin UI without editing yaml.

    `schedule` here is the site-level cron default: when neither
    `crawl_target.schedule_cron` nor an explicit `column.schedule` is set,
    the scheduled-jobs resolver falls through to this. None preserves the
    legacy DEFAULT_SCHEDULE behavior.
    """
    # Inheritable selector/pagination shape applied to synthesized columns.
    list_selector: ListSelectors | None = None
    pagination: Pagination = Field(default_factory=Pagination)
    # Must be non-None for the fallback to activate (see get_column()).
    detail: DetailSelectors | None = None
    # Site-level cron default; None = fall through to legacy DEFAULT_SCHEDULE.
    schedule: str | None = None
    crawl_order: CrawlOrder = Field(default_factory=CrawlOrder)


class SiteConfig(BaseModel):
    """Validated configuration for one site — the in-memory form of a single
    `config/sites/<site_id>.yaml` file."""
    site_id: str
    site_name: str
    base_url: str
    default_strategy: str = "httpx"   # "httpx" | "playwright" | "drission"
    concurrency: int = 1              # max simultaneous fetches for this site
    interval_sec: float = 5.0         # polite delay between requests
    enabled: bool = True
    respect_robots: bool = True   # COMP-02 opt-out (default: obey robots.txt)
    columns: list[ColumnConfig] = Field(default_factory=list)
    # Fallback shape for any column_id not explicitly listed in `columns`.
    # When set, `get_column(<unknown>)` synthesizes a ColumnConfig from it
    # rather than returning None — pipeline then uses target.entry_url as
    # the list_url. See ColumnDefaults docstring.
    default_column: ColumnDefaults | None = None

    @field_validator("default_strategy")
    @classmethod
    def _strategy_known(cls, v: str) -> str:
        """Reject unknown fetch strategies at parse time, not at crawl time."""
        if v not in ("httpx", "playwright", "drission"):
            raise ValueError(f"default_strategy must be httpx|playwright|drission, got {v!r}")
        return v

    @model_validator(mode="after")
    def _resolve_aliases(self) -> "SiteConfig":
        """Apply `alias_of` inheritance: a column with `alias_of=foo` borrows
        list_selector / pagination / detail from column `foo` for any field
        the alias didn't set explicitly. Per-column overrides always win.

        Errors out on:
          - alias_of pointing at a missing column_id
          - alias_of forming a cycle (a→b→a)
          - column with alias_of=None still missing required `detail`
          - aliased column whose `detail` is still None after inheritance
            (e.g. an explicit `detail: null` override in the yaml)
        """
        by_id = {c.column_id: c for c in self.columns}
        resolved: set[str] = set()

        def _resolve(c: ColumnConfig, chain: list[str]) -> None:
            if c.column_id in resolved:
                return
            if c.alias_of is None:
                if c.detail is None:
                    raise ValueError(
                        f"column {c.column_id!r} has no `detail` and no `alias_of` to inherit from"
                    )
                resolved.add(c.column_id)
                return
            if c.alias_of in chain:
                raise ValueError(
                    f"alias_of cycle: {' → '.join(chain + [c.alias_of])}"
                )
            parent = by_id.get(c.alias_of)
            if parent is None:
                raise ValueError(
                    f"column {c.column_id!r} alias_of={c.alias_of!r} not found in this site"
                )
            # Extend the chain with the *parent* id (not the child again) so
            # a cycle error reads as the actual reference chain, e.g.
            # "a → b → a" rather than "a → a → b → a".
            _resolve(parent, chain + [c.alias_of])
            # Inherit only when the alias didn't set the field. We compare
            # against fields_set (Pydantic v2) so a value that happens to
            # equal the default is still treated as explicit.
            if "list_selector" not in c.model_fields_set:
                c.list_selector = parent.list_selector
            if "pagination" not in c.model_fields_set:
                c.pagination = parent.pagination
            if "detail" not in c.model_fields_set:
                c.detail = parent.detail
            # An explicit `detail: null` in the yaml counts as "set" above and
            # would otherwise slip through with no detail selectors at all.
            if c.detail is None:
                raise ValueError(
                    f"column {c.column_id!r} alias_of={c.alias_of!r} still has no `detail` after inheritance"
                )
            resolved.add(c.column_id)

        for col in self.columns:
            _resolve(col, [col.column_id])
        return self

    def get_column(self, column_id: str) -> ColumnConfig | None:
        """Return the column named `column_id`, synthesizing one from
        `default_column` when it isn't enumerated; None if neither exists."""
        for c in self.columns:
            if c.column_id == column_id:
                return c
        # Fallback: synthesize a ColumnConfig from default_column so newly
        # discovered sub-columns work without yaml edits. list_url stays
        # empty — pipeline resolves it from crawl_target.entry_url.
        if self.default_column is not None and self.default_column.detail is not None:
            kwargs = dict(
                column_id=column_id,
                name=column_id,
                list_url="",
                list_selector=self.default_column.list_selector,
                pagination=self.default_column.pagination,
                detail=self.default_column.detail,
            )
            # If the site sets default_column.schedule, surface it as the
            # column's schedule so the cron resolver picks it up. Otherwise
            # schedule stays None and the resolver falls through to the
            # site-level / legacy defaults (ColumnConfig.schedule defaults
            # to None precisely to allow that — see its field comment).
            if self.default_column.schedule:
                kwargs["schedule"] = self.default_column.schedule
            if "crawl_order" in self.default_column.model_fields_set:
                kwargs["crawl_order"] = self.default_column.crawl_order
            return ColumnConfig(**kwargs)
        return None


def load_site(path: str | Path) -> SiteConfig:
    """Read one YAML file and return it as a validated SiteConfig.

    Raises ValueError when the YAML root is not a mapping; pydantic raises
    ValidationError on schema violations.
    """
    raw = Path(path).read_text(encoding="utf-8")
    parsed: Any = yaml.safe_load(raw)
    if isinstance(parsed, dict):
        return SiteConfig(**parsed)
    raise ValueError(f"site YAML root must be a mapping: {path}")


def load_sites_dir(directory: str | Path) -> dict[str, SiteConfig]:
    """Load every *.yaml / *.yml file in `directory`, keyed by site_id.

    Returns {} when the directory doesn't exist. The filename stem must
    equal the site_id declared inside the file, so `config/sites/foo.yaml`
    can only define site 'foo' — keeping lookups unambiguous.

    Raises:
        ValueError: on a stem/site_id mismatch or an invalid site YAML.
    """
    d = Path(directory)
    out: dict[str, SiteConfig] = {}
    if not d.is_dir():
        return out
    for p in sorted(d.iterdir()):
        # Skip non-files (a stray subdirectory named 'x.yaml' would crash
        # read_text with IsADirectoryError) and non-YAML entries.
        if not p.is_file() or p.suffix.lower() not in (".yaml", ".yml"):
            continue
        cfg = load_site(p)
        if cfg.site_id != p.stem:
            raise ValueError(
                f"YAML filename stem {p.stem!r} must equal site_id {cfg.site_id!r}"
            )
        out[cfg.site_id] = cfg
    return out
