"""Selector validator (Phase 2 站点联调辅助).

Runs the fetch+parse stack end-to-end against a site's YAML selectors without
touching the database or local filesystem. Intended for the operator who just
wrote a new `config/sites/<site>.yaml` and wants a tight feedback loop while
tweaking XPath/CSS:

    uv run python -m govcrawler validate gdqy szfwj --limit 3
    uv run python -m govcrawler validate gdqy szfwj --url https://.../post_123.html

Modes:
  - list mode (no --url): fetch list page, parse rows, fetch first N details
  - detail mode (with --url): skip list page, parse the single detail URL

Output is a dict designed for humans AND tests: the CLI pretty-prints it,
tests assert on its structure.
"""
from __future__ import annotations

from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

from govcrawler.config.registry import get_detail_selectors, get_site_config
from govcrawler.fetcher.browser import FetchResult
from govcrawler.fetcher.chain import fetch_html
from govcrawler.parser.detail_parser import parse_detail
from govcrawler.parser.list_parser import parse_list

# Maximum number of characters of extracted body text copied into a detail
# block's "content_text_preview" field.
PREVIEW_CHARS = 240
# Row-count threshold consulted by the list-mode warning hint in `validate`.
LIST_PARSE_WARN_BELOW = 1  # <1 row likely means bad selector


@dataclass
class ValidationHint:
    """One diagnostic note produced while checking a parsed detail page."""

    # Severity tag; the hints built in this module use only these two values.
    level: str        # "warn" | "info"
    # Text of the hint; the report renderer prints it verbatim after the level.
    message: str


def _fetch_summary(fr: FetchResult, max_html_preview: int = 0) -> dict[str, Any]:
    """Condense a fetch result into a plain dict of its reportable fields.

    Args:
        fr: The fetch result to summarise. Its ``strategy``, ``status``,
            ``duration_ms``, ``html``, ``is_challenge``, ``error`` and
            ``final_url`` attributes are read.
        max_html_preview: When non-zero and HTML is present, the first
            ``max_html_preview`` characters of the HTML are included under
            ``"html_preview"``. The default of 0 omits the preview.

    Returns:
        A dict with the copied attributes plus ``"html_size"`` (length of the
        HTML, or 0 when it is empty or missing).
    """
    body = fr.html

    # Keys are inserted in a fixed order so printed summaries stay stable.
    summary: dict[str, Any] = {
        name: getattr(fr, name) for name in ("strategy", "status", "duration_ms")
    }
    summary["html_size"] = len(body) if body else 0
    for name in ("is_challenge", "error", "final_url"):
        summary[name] = getattr(fr, name)

    if body and max_html_preview:
        summary["html_preview"] = body[:max_html_preview]
    return summary


def _collect_detail_hints(parsed: dict[str, Any]) -> list[ValidationHint]:
    """Derive operator hints from a parsed-detail dict.

    Args:
        parsed: A detail block; the keys ``title``, ``publish_time_raw``,
            ``content_text_length``, ``used_fallback``, ``fallback_engine``
            and ``source`` are consulted. Missing keys count as empty.

    Returns:
        Hints in a fixed order: empty title, empty publish time, body text
        shorter than 50 characters, fallback extraction used, empty source.
        The first three are ``"warn"``, the last two ``"info"``.
    """
    engine = parsed.get("fallback_engine") or "?"

    # (condition, level, message) — evaluated top to bottom; order is the output order.
    checks: tuple[tuple[bool, str, str], ...] = (
        (
            not parsed.get("title"),
            "warn",
            "title 为空 — 检查 detail.title selector",
        ),
        (
            not parsed.get("publish_time_raw"),
            "warn",
            "publish_time 为空 — 检查 detail.publish_time selector",
        ),
        (
            parsed.get("content_text_length", 0) < 50,
            "warn",
            "正文 < 50 字 — detail.content selector 可能选错了节点",
        ),
        (
            bool(parsed.get("used_fallback")),
            "info",
            f"触发了 {engine} 兜底抽取 — 主 XPath 也许只抓到头部；运行没问题但可以优化",
        ),
        (
            not parsed.get("source"),
            "info",
            "source（发布主体）为空 — 该字段非必填",
        ),
    )
    return [ValidationHint(level, text) for triggered, level, text in checks if triggered]


def _detail_block(url: str, html: str, selectors: dict) -> dict[str, Any]:
    """Parse one detail page and package the result for the validation report.

    Args:
        url: URL the HTML was fetched from; passed to the parser and echoed
            back in the block.
        html: Raw HTML of the detail page.
        selectors: Detail selectors handed to ``parse_detail``.

    Returns:
        A dict with the parsed fields, the body-text length, a preview of at
        most ``PREVIEW_CHARS`` characters, and a ``"hints"`` list of
        ``{"level", "message"}`` dicts derived from those fields.
    """
    fields = parse_detail(html, url, selectors)
    # Normalise once: the parser may yield no body text at all, and both the
    # length and the preview must treat that as an empty string rather than
    # raising on ``len(None)``.
    content_text = fields.content_text or ""
    block: dict[str, Any] = {
        "url": url,
        "title": fields.title,
        "publish_time_raw": fields.publish_time_raw,
        "source": fields.source,
        "content_text_length": len(content_text),
        "content_text_preview": content_text[:PREVIEW_CHARS],
        "attachment_urls": list(fields.attachment_urls),
        "used_fallback": fields.used_fallback,
        "fallback_engine": fields.fallback_engine,
    }
    # Hints are computed from the finished block so they see the same values
    # the operator will see in the report.
    block["hints"] = [{"level": h.level, "message": h.message}
                      for h in _collect_detail_hints(block)]
    return block


def validate(
    site_id: str,
    column_id: str,
    *,
    url: str | None = None,
    max_detail: int = 3,
    fetch: Callable[[str], FetchResult] | None = None,
) -> dict[str, Any]:
    """Dry-run the fetch+parse stack against YAML selectors.

    `fetch` lets tests inject a deterministic stub; production calls
    `fetcher.chain.fetch_html`.

    Args:
        site_id: Site whose YAML config is looked up.
        column_id: Column within that site.
        url: When given, run in detail-only mode against this single URL and
            skip the list page entirely.
        max_detail: In list mode, how many parsed rows to preview and fetch as
            detail samples. Negative values are treated as 0.
        fetch: Callable taking a URL and returning a fetch result; defaults to
            `fetch_html`.

    Returns:
        A result dict. ``"ok"`` is always present; ``"error"`` is set when the
        config lookup fails, the list selector is missing, or the primary
        fetch (the detail URL in detail mode, the list page in list mode)
        yields an error or no HTML. Detail mode adds ``"detail_fetch"`` and
        ``"detail"``; list mode adds ``"list_fetch"``, ``"list_items_parsed"``,
        ``"list_items_preview"``, ``"list_hints"`` and ``"detail_samples"``.
        Failures of individual detail samples are recorded on the sample and
        do not flip ``"ok"``.
    """
    fetch = fetch or fetch_html
    # A negative count would turn ``items[:max_detail]`` into "all but the last
    # N rows"; interpret it as "no samples" instead.
    max_detail = max(max_detail, 0)

    site = get_site_config(site_id)
    if site is None:
        return {"ok": False, "error": f"no YAML config for site {site_id!r}"}
    col = site.get_column(column_id)
    if col is None:
        return {"ok": False, "error": f"no column {column_id!r} in site {site_id!r}"}

    selectors = get_detail_selectors(site_id, column_id) or {}
    result: dict[str, Any] = {
        "ok": True,
        "site_id": site_id,
        "column_id": column_id,
        "default_strategy": site.default_strategy,
        "selectors": selectors,
    }

    # ----- Detail-only mode -----
    if url:
        fr = fetch(url)
        result["detail_fetch"] = _fetch_summary(fr)
        if not fr.html or fr.error:
            result["ok"] = False
            result["error"] = fr.error or "empty_html"
            return result
        result["detail"] = _detail_block(url, fr.html, selectors)
        return result

    # ----- List mode -----
    if col.list_selector is None:
        result["ok"] = False
        result["error"] = f"list_selector missing on column {column_id!r}"
        return result

    list_fr = fetch(col.list_url)
    result["list_fetch"] = _fetch_summary(list_fr)
    if not list_fr.html or list_fr.error:
        result["ok"] = False
        result["error"] = list_fr.error or "empty_html on list page"
        return result

    list_sel = {
        "row": col.list_selector.row,
        "href": col.list_selector.href,
        "title": col.list_selector.title,
        # The date selector is optional in the config; fall back to a generic one.
        "date": col.list_selector.date or "span::text",
    }
    items = parse_list(list_fr.html, col.list_url, list_sel)
    result["list_items_parsed"] = len(items)
    result["list_items_preview"] = [
        {"title": it.title, "url": it.url, "date": it.publish_time_raw}
        for it in items[:max_detail]
    ]

    list_hints: list[dict[str, str]] = []
    # Let the module-level threshold drive the warning. With the current value
    # of 1 this fires exactly when no rows were parsed.
    if len(items) < LIST_PARSE_WARN_BELOW:
        list_hints.append({
            "level": "warn",
            "message": f"{len(items)} rows parsed — list_selector.row 很可能选错了节点",
        })
    result["list_hints"] = list_hints

    # Try first few detail URLs
    details: list[dict[str, Any]] = []
    for it in items[:max_detail]:
        dfr = fetch(it.url)
        block: dict[str, Any] = {"fetch": _fetch_summary(dfr)}
        if dfr.html and not dfr.error:
            block.update(_detail_block(it.url, dfr.html, selectors))
        else:
            # Record the URL on failures too, so the report can say which
            # sample could not be fetched.
            block["url"] = it.url
            block["error"] = dfr.error or "empty_html"
        details.append(block)
    result["detail_samples"] = details
    return result


def render_human(result: dict[str, Any]) -> str:
    """Compact, operator-friendly text output. The CLI prints this.

    Args:
        result: A result dict as returned by `validate`.

    Returns:
        A multi-line string. The first line is an ``[OK]``/``[FAIL]`` header
        with the site and column. If the result carries an ``"error"``, only
        the header and the error line are returned. Otherwise a list-page
        section follows when ``"list_fetch"`` is present, then one section per
        detail (``"detail_samples"``, or the single ``"detail"``).
    """
    lines: list[str] = []
    header_ok = "OK" if result.get("ok") else "FAIL"
    lines.append(f"[{header_ok}] site={result.get('site_id')} column={result.get('column_id')}")
    if result.get("error"):
        lines.append(f"  error: {result['error']}")
        return "\n".join(lines)

    if "list_fetch" in result:
        lf = result["list_fetch"]
        lines.append(
            f"  list page: strategy={lf['strategy']} status={lf['status']} "
            f"html={lf['html_size']}B duration={lf['duration_ms']}ms"
        )
        lines.append(
            f"  list parse: {result.get('list_items_parsed', 0)} rows "
            f"(showing first {len(result.get('list_items_preview', []))})"
        )
        for it in result.get("list_items_preview", []):
            # A row may lack a title or a date when a selector is wrong, which
            # is the situation this report exists to diagnose, so render an
            # empty value rather than crash on ``None``.
            title = (it["title"] or "")[:40]
            lines.append(f"    - {it['date'] or '—':<12}  {title}  {it['url']}")
        for h in result.get("list_hints", []):
            lines.append(f"  [{h['level']}] {h['message']}")

    # List mode yields several samples; detail-only mode yields a single block.
    details = result.get("detail_samples") or ([result["detail"]] if result.get("detail") else [])
    for i, d in enumerate(details):
        lines.append(f"  --- detail #{i + 1}: {d.get('url', '')} ---")
        if d.get("error"):
            lines.append(f"    error: {d['error']}")
            continue
        lines.append(f"    title        : {d.get('title', '')!r}")
        lines.append(f"    publish_time : {d.get('publish_time_raw', '')!r}")
        lines.append(f"    source       : {d.get('source', '')!r}")
        lines.append(f"    content_text : {d.get('content_text_length', 0)} chars")
        # Flatten newlines so the preview stays on one terminal line.
        preview = (d.get("content_text_preview") or "").replace("\n", " ⏎ ")[:200]
        lines.append(f"      preview     : {preview}")
        atts = d.get("attachment_urls") or []
        lines.append(f"    attachments  : {len(atts)}" +
                     ("  " + ", ".join(atts[:3]) if atts else ""))
        if d.get("used_fallback"):
            lines.append(f"    fallback     : {d.get('fallback_engine')}")
        for h in d.get("hints", []):
            lines.append(f"    [{h['level']}] {h['message']}")
    return "\n".join(lines)
