"""Crawl targets — CRUD, toggle, manual run, parser override, category, dry-run."""
from __future__ import annotations

import json
import logging
import re
from typing import Any
from urllib.parse import urlparse

from fastapi import BackgroundTasks, Body, Depends, HTTPException, Query
from sqlalchemy.orm import Session

from govcrawler.models import LocalDepartment, SiteDepartment
from govcrawler.repositories import sites as sites_repo
from govcrawler.repositories import targets as targets_repo

from ._common import (
    _infer_legacy_validator_key,
    _normalize_str,
    _serialize_target,
    _session,
    _validate_target_payload,
    router,
)

log = logging.getLogger(__name__)


@router.post("/api/targets/{target_code}/toggle")
def toggle_target(
    target_code: str, enabled: bool = Query(...),
    s: Session = Depends(_session),
) -> dict[str, Any]:
    row = targets_repo.set_enabled(s, target_code, enabled)
    if row is None:
        raise HTTPException(404, f"target not found: {target_code}")
    s.commit()
    return {"target_code": target_code, "enabled": enabled}


@router.delete("/api/targets/{target_code}")
def delete_target(
    target_code: str, s: Session = Depends(_session),
) -> dict[str, Any]:
    row = targets_repo.get_by_code(s, target_code)
    if row is None:
        raise HTTPException(404, f"target not found: {target_code}")
    # Article.target_id and CrawlLog.target_id are ondelete=SET NULL, so
    # historical rows survive with a NULL target_id. Safe to hard-delete.
    s.delete(row)
    s.commit()
    return {"deleted": True, "target_code": target_code}


@router.post("/api/targets/{target_code}/run")
def run_target(
    target_code: str, background: BackgroundTasks,
    max_items: int | None = Query(None, ge=1),
    s: Session = Depends(_session),
) -> dict[str, Any]:
    """Fire-and-forget manual crawl of one target. Status shows up in /api/logs."""
    t = targets_repo.get_by_code(s, target_code)
    if t is None:
        raise HTTPException(404, f"target not found: {target_code}")

    try:
        from govcrawler.pipeline import crawl_target  # type: ignore
    except ImportError as e:
        raise HTTPException(
            501, f"pipeline.crawl_target not wired yet for v2: {e}"
        ) from e

    def _go():
        try:
            crawl_target(target_code, max_items=max_items)
        except Exception:
            log.exception("manual crawl failed target=%s", target_code)

    background.add_task(_go)
    return {"queued": True, "target_code": target_code, "max_items": max_items}


@router.post("/api/targets")
def create_target(
    payload: dict[str, Any] = Body(...),
    s: Session = Depends(_session),
) -> dict[str, Any]:
    fields = _validate_target_payload(payload, partial=False)
    site_code = fields.pop("site_code")
    target_code = fields.pop("target_code")
    assert site_code is not None and target_code is not None
    if targets_repo.get_by_code(s, target_code) is not None:
        raise HTTPException(409, f"target already exists: {target_code}")
    site = sites_repo.get_by_code(s, site_code)
    if site is None:
        raise HTTPException(404, f"site not found: {site_code}")
    dept_id = fields.get("dept_id")
    if dept_id is not None and s.get(LocalDepartment, dept_id) is None:
        raise HTTPException(404, f"local_department not found: {dept_id}")
    site_department_id = fields.pop("site_department_id", None)
    if site_department_id is not None and s.get(SiteDepartment, site_department_id) is None:
        raise HTTPException(404, f"site_department not found: {site_department_id}")
    row = targets_repo.upsert_by_code(
        s,
        target_code=target_code,
        site_id=site.id,
        site_department_id=site_department_id,
        **fields,
    )
    s.commit()
    s.refresh(row)
    return {"target": _serialize_target(row), "site_code": site.site_code}


@router.put("/api/targets/{target_code}")
def update_target(
    target_code: str,
    payload: dict[str, Any] = Body(...),
    s: Session = Depends(_session),
) -> dict[str, Any]:
    if "target_code" in payload and payload["target_code"] != target_code:
        raise HTTPException(400, "target_code in path/body must match")
    row = targets_repo.get_by_code(s, target_code)
    if row is None:
        raise HTTPException(404, f"target not found: {target_code}")
    fields = _validate_target_payload(payload, partial=True)
    site_code = fields.pop("site_code", None)
    if site_code is not None:
        site = sites_repo.get_by_code(s, site_code)
        if site is None:
            raise HTTPException(404, f"site not found: {site_code}")
        row.site_id = site.id
    dept_id = fields.get("dept_id")
    if dept_id is not None and s.get(LocalDepartment, dept_id) is None:
        raise HTTPException(404, f"local_department not found: {dept_id}")
    if "site_department_id" in fields:
        site_department_id = fields.pop("site_department_id")
        if site_department_id is not None and s.get(SiteDepartment, site_department_id) is None:
            raise HTTPException(404, f"site_department not found: {site_department_id}")
        row.site_department_id = site_department_id
    for key, value in fields.items():
        if key not in {"site_code", "target_code"}:
            setattr(row, key, value)
    s.commit()
    s.refresh(row)
    return {"target": _serialize_target(row), "site_code": row.site.site_code if row.site else None}


# ---------- Parser override (crawl_target.parser_override_json) ----------

@router.get("/api/targets/{target_code}/parser")
def get_target_parser(
    target_code: str, s: Session = Depends(_session),
) -> dict[str, Any]:
    t = targets_repo.get_by_code(s, target_code)
    if t is None:
        raise HTTPException(404, f"target not found: {target_code}")
    return {
        "target_code": t.target_code,
        "parser_override": t.parser_override_json or {},
    }


@router.put("/api/targets/{target_code}/parser")
def put_target_parser(
    target_code: str,
    payload: dict[str, Any] = Body(..., description='{"detail": {"title": "...", ...}}'),
    s: Session = Depends(_session),
) -> dict[str, Any]:
    """Overwrite `parser_override_json`. Pass {} to clear (stores NULL)."""
    if not isinstance(payload, dict):
        raise HTTPException(400, "body must be a JSON object")
    if payload and "detail" in payload and not isinstance(payload["detail"], dict):
        raise HTTPException(400, '"detail" must be an object if present')

    t = targets_repo.get_by_code(s, target_code)
    if t is None:
        raise HTTPException(404, f"target not found: {target_code}")
    t.parser_override_json = payload or None
    s.commit()
    return {
        "target_code": target_code,
        "parser_override": t.parser_override_json or {},
    }
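
# Example override payloads (the CSS-selector value is illustrative; the
# handler above only checks that "detail" is an object when present):
#   PUT /api/targets/<code>/parser  {"detail": {"title": "h1.article-title"}}
#   PUT /api/targets/<code>/parser  {}    # clears the override (stores NULL)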


@router.put("/api/targets/{target_code}/category")
def put_target_category(
    target_code: str,
    payload: dict[str, Any] = Body(
        ..., description='{"content_category": "...", "content_subcategory": "..."}'
    ),
    s: Session = Depends(_session),
) -> dict[str, Any]:
    """Update a target's content_category / content_subcategory."""
    allowed = {"content_category", "content_subcategory"}
    unknown = set(payload.keys()) - allowed
    if unknown:
        raise HTTPException(400, f"unknown keys: {sorted(unknown)}")

    t = targets_repo.get_by_code(s, target_code)
    if t is None:
        raise HTTPException(404, f"target not found: {target_code}")

    for k in allowed:
        if k in payload:
            v = payload[k]
            if v is not None and not isinstance(v, str):
                raise HTTPException(400, f"{k} must be string or null")
            setattr(t, k, v or None)
    s.commit()
    return {
        "target_code": target_code,
        "content_category": t.content_category,
        "content_subcategory": t.content_subcategory,
    }
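
# Example category update, mirroring the Body description above:
#   PUT /api/targets/<code>/category
#   {"content_category": "...", "content_subcategory": null}
#   Empty strings are coerced to NULL by the `v or None` above.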


@router.post("/api/targets/{target_code}/dry-run")
def dry_run_target(
    target_code: str,
    payload: dict[str, Any] = Body(default={}),
    s: Session = Depends(_session),
) -> dict[str, Any]:
    """Best-effort validation for a target.

    If the target_code still maps cleanly to legacy YAML `(site_id, column_id)`,
    reuse `govcrawler.validator.validate`. Otherwise fall back to lightweight
    field checks so the admin UI has a meaningful pre-save sanity gate.
    """
    t = targets_repo.get_by_code(s, target_code)
    if t is None:
        raise HTTPException(404, f"target not found: {target_code}")

    sample_url = (
        _normalize_str(payload.get("sample_article_url"))
        if "sample_article_url" in payload
        else t.sample_article_url
    )
    entry_url = (
        _normalize_str(payload.get("entry_url"))
        if "entry_url" in payload
        else t.entry_url
    )
    result: dict[str, Any] = {
        "target_code": t.target_code,
        "entry_url": entry_url,
        "sample_article_url": sample_url,
        "mode": "basic",
        "ok": True,
        "checks": [],
    }
    if not entry_url:
        result["ok"] = False
        result["checks"].append({"level": "error", "message": "entry_url is empty"})
    else:
        result["checks"].append({"level": "info", "message": "entry_url is set"})
    if sample_url:
        result["checks"].append({"level": "info", "message": "sample_article_url is set"})
    else:
        result["checks"].append({"level": "warn", "message": "no sample_article_url provided; running basic checks only"})

    inferred = _infer_legacy_validator_key(target_code)
    if inferred:
        from govcrawler.validator import validate

        site_id, column_id = inferred
        if sample_url:
            v = validate(site_id, column_id, url=sample_url)
        else:
            v = validate(site_id, column_id, max_detail=1)
        result["validator"] = v
        result["mode"] = "validator"
        result["ok"] = bool(v.get("ok")) and result["ok"]
        if v.get("error"):
            result["checks"].append({"level": "error", "message": str(v["error"])})
        else:
            if "detail" in v:
                detail = v["detail"]
                if detail.get("title"):
                    result["checks"].append({"level": "info", "message": "title matched"})
                if detail.get("content_text_length", 0) >= 50:
                    result["checks"].append({"level": "info", "message": "body length OK"})
                else:
                    result["checks"].append({"level": "warn", "message": "body text is short"})
            if "list_items_parsed" in v:
                result["checks"].append({"level": "info", "message": f"list parsed: {v['list_items_parsed']} items"})
    else:
        result["checks"].append({"level": "warn", "message": "target_code does not map to the legacy validator; only basic field checks performed"})
    return result
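
# Sketch of a basic-mode dry-run response (shape follows the result dict built
# above; the target_code is hypothetical):
#   POST /api/targets/fogang__2396/dry-run  {}
#   -> {"target_code": "fogang__2396", "mode": "basic", "ok": true,
#       "checks": [{"level": "info", "message": "entry_url is set"}, ...]}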


# ---------- Column discovery + bulk create (gkmlpt dept index → targets) ----------

_TREE_RE = re.compile(r"TREE\s*:\s*(\[.*?\])\s*,\s*\n", re.DOTALL)
_SID_RE = re.compile(r"SID\s*:\s*'([^']+)'")
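# These regexes pull the CMS's inline JS config out of the index HTML. A rough
# sketch of the shape they expect (illustrative, not verbatim CMS output):
#   var config = {
#       SID: '43',
#       TREE: [{"id": 2396, "name": "机构信息", "parent": 0, "children": []}],
#   };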


def _flatten_tree(nodes: list[dict[str, Any]], depth: int = 0,
                  ancestors: list[str] | None = None) -> list[dict[str, Any]]:
    """Walk the gkmlpt TREE tree, returning a flat list of columns.

    Skips entries with non-empty jump_url (external links — not crawlable).
    Carries the ancestor name chain so each column can store a human-readable
    `path` like "机构信息｜内设机构".
    """
    ancestors = ancestors or []
    out: list[dict[str, Any]] = []
    for n in nodes or []:
        jump = (n.get("jump_url") or "").strip()
        is_external = bool(jump)
        name = (n.get("name") or "").strip()
        path_chain = ancestors + [name] if name else ancestors
        out.append({
            "id": str(n.get("id")),
            "name": name,
            "path": "｜".join(path_chain),
            "parent": n.get("parent") or 0,
            "depth": depth,
            "is_external": is_external,
            "has_children": bool(n.get("children")),
        })
        if n.get("children"):
            out.extend(_flatten_tree(n["children"], depth + 1, path_chain))
    return out
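
# Illustrative input/output for _flatten_tree (node data is hypothetical):
#   _flatten_tree([{"id": 1, "name": "机构信息",
#                   "children": [{"id": 2, "name": "内设机构"}]}])
#   -> [{"id": "1", "name": "机构信息", "path": "机构信息", "depth": 0,
#        "is_external": False, "has_children": True, ...},
#       {"id": "2", "name": "内设机构", "path": "机构信息｜内设机构", "depth": 1, ...}]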


@router.post("/api/targets/discover")
def discover_columns(payload: dict[str, Any] = Body(...)) -> dict[str, Any]:
    """Fetch a gkmlpt dept index page and return all columns under it.

    Accepts a URL like `http://<host>/<dept_path>/gkmlpt/index` (no column
    anchor). Reads the inline `TREE: […]` JSON + `SID` config shipped by the
    CMS and returns a flat, depth-annotated column list so the UI can show a
    picker and bulk-create crawl_targets.
    """
    entry_url = _normalize_str(payload.get("entry_url"))
    if not entry_url:
        raise HTTPException(400, "entry_url is required")
    try:
        parsed = urlparse(entry_url)
    except Exception:
        raise HTTPException(400, f"malformed URL: {entry_url}") from None
    if not parsed.scheme or not parsed.netloc:
        raise HTTPException(400, f"URL must include scheme + host: {entry_url}")
    segs = [p for p in parsed.path.split("/") if p]
    if "gkmlpt" not in segs:
        raise HTTPException(400, "URL 不是 gkmlpt 公开目录平台的栏目入口")
    dept_path = segs[0] if segs and segs[0] != "gkmlpt" else ""

    import httpx
    try:
        r = httpx.get(
            entry_url,
            headers={"User-Agent": "Mozilla/5.0 GovCrawlerAdmin"},
            follow_redirects=True,
            timeout=15.0,
        )
        r.raise_for_status()
    except Exception as e:
        raise HTTPException(502, f"fetch index failed: {e}") from e
    html = r.text

    tree_m = _TREE_RE.search(html)
    if not tree_m:
        raise HTTPException(400, "页面未找到 TREE 结构 — 该 URL 可能不是 gkmlpt 入口")
    try:
        tree = json.loads(tree_m.group(1))
    except json.JSONDecodeError as e:
        raise HTTPException(500, f"TREE JSON parse failed: {e}") from e
    sid_m = _SID_RE.search(html)
    sid = sid_m.group(1) if sid_m else None

    columns = _flatten_tree(tree)
    return {
        "entry_url": entry_url,
        "dept_path": dept_path,
        "sid": sid,
        "count": len(columns),
        "columns": columns,
    }
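
# Example discovery call, reusing the dept index URL shown in the bulk-create
# docstring below:
#   POST /api/targets/discover
#   {"entry_url": "http://www.fogang.gov.cn/qyfgczj/gkmlpt/index"}
#   -> {"dept_path": "qyfgczj", "sid": "...", "count": ..., "columns": [...]}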


@router.post("/api/targets/bulk-create")
def bulk_create_targets(
    payload: dict[str, Any] = Body(...),
    s: Session = Depends(_session),
) -> dict[str, Any]:
    """Create one crawl_target per column_id picked from discover.

    Body shape:
      {
        "site_code": "fogang",
        "dept_path": "qyfgczj",
        "entry_url_base": "http://www.fogang.gov.cn/qyfgczj/gkmlpt/index",
        "columns": [{"id":"2396","name":"机构信息"}, ...],
        "dept_id": 123 | null,
        "expected_cadence_days": 30,
        "interval_sec": 30,        # per-request delay (politeness)
        "enabled": true
      }
    Idempotent: existing target_code is skipped; returns created + skipped lists.
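
    Derived fields (see the loop below): target_code is
    "<site_code>__<dept_path>__<col_id>" with the dept_path segment omitted
    when empty, e.g. "fogang__qyfgczj__2396"; entry_url is
    "<entry_url_base>#<col_id>".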
    """
    site_code = _normalize_str(payload.get("site_code"))
    dept_path = _normalize_str(payload.get("dept_path")) or ""
    entry_url_base = _normalize_str(payload.get("entry_url_base"))
    columns = payload.get("columns") or []
    if not site_code or not entry_url_base or not columns:
        raise HTTPException(400, "site_code, entry_url_base, columns are required")
    if not isinstance(columns, list):
        raise HTTPException(400, "columns must be an array")
    if len(columns) > 500:
        raise HTTPException(400, "columns must be <= 500")

    site = sites_repo.get_by_code(s, site_code)
    if site is None:
        raise HTTPException(404, f"site not found: {site_code}")

    dept_id = payload.get("dept_id")
    dept_row = None
    if dept_id is not None:
        if not isinstance(dept_id, int) or dept_id <= 0:
            raise HTTPException(400, "dept_id must be positive int or null")
        dept_row = s.get(LocalDepartment, dept_id)
        if dept_row is None:
            raise HTTPException(404, f"local_department not found: {dept_id}")

    cadence = payload.get("expected_cadence_days")
    # bool is a subclass of int, so JSON true/false must be excluded explicitly.
    if not (isinstance(cadence, int) and not isinstance(cadence, bool) and cadence > 0):
        cadence = 30
    interval = payload.get("interval_sec")
    if not (isinstance(interval, int) and not isinstance(interval, bool) and interval >= 0):
        interval = 30
    enabled = payload.get("enabled")
    if not isinstance(enabled, bool):
        enabled = True

    # Short Chinese label for the dept, used in the human-readable target_name.
    # Prefer short_name, then dept_name, then full_name.
    dept_label = ""
    if dept_row is not None:
        dept_label = (dept_row.short_name or dept_row.dept_name or dept_row.full_name or "").strip()

    base = entry_url_base.split("#", 1)[0]  # everything before any column anchor
    created: list[dict[str, Any]] = []
    skipped: list[dict[str, Any]] = []
    for col in columns:
        col_id = str(col.get("id") or "").strip()
        col_name = _normalize_str(col.get("name"))
        col_path = _normalize_str(col.get("path")) or col_name
        if not col_id:
            continue
        parts = [site_code]
        if dept_path:
            parts.append(dept_path)
        parts.append(col_id)
        target_code = "__".join(parts)
        if targets_repo.get_by_code(s, target_code) is not None:
            skipped.append({"target_code": target_code, "reason": "exists"})
            continue
        # Human-readable name: "佛冈县政数局-机构信息" when dept is bound;
        # falls back to just the column name otherwise.
        human_name = (
            f"{dept_label}-{col_name}" if dept_label and col_name
            else (col_name or None)
        )
        row = targets_repo.upsert_by_code(
            s,
            target_code=target_code,
            site_id=site.id,
            target_name=human_name,
            entry_url=f"{base}#{col_id}",
            dept_id=dept_id,
            channel_name=col_name,
            channel_path=col_path,
            expected_cadence_days=cadence,
            interval_sec=interval,
            enabled=enabled,
        )
        s.flush()
        created.append({"target_code": row.target_code, "target_name": row.target_name})
    s.commit()
    return {
        "created_count": len(created),
        "skipped_count": len(skipped),
        "created": created,
        "skipped": skipped,
    }
