from __future__ import annotations
import argparse
from dataclasses import asdict
import json
import logging
import sys

from govcrawler.pipeline import fetch_and_store
from govcrawler.sites.gdqy import (
    COLUMN_ID as GDQY_COLUMN_ID,
    SITE_ID as GDQY_SITE_ID,
    TARGET_ARTICLE_KEY as GDQY_TARGET_KEY,
    TARGET_ARTICLE_URL as GDQY_TARGET_URL,
)

log = logging.getLogger(__name__)


def _build_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(prog="govcrawler", description="Government website crawler (PoC)")
    sub = p.add_subparsers(dest="cmd", required=True)

    fetch = sub.add_parser("fetch", help="抓取一篇文章")
    fetch.add_argument("site", help="site_id, e.g. gdqy")
    fetch.add_argument("column", help="column_id, e.g. szfwj")
    fetch.add_argument("article_key", help="article key, e.g. post_2136593")
    fetch.add_argument(
        "--url",
        default=None,
        help="目标 URL；未给时使用该 site+column 硬编码的 TARGET_ARTICLE_URL",
    )

    crawl = sub.add_parser("crawl", help="批量抓取一个采集目标（列表页 → 详情页）")
    crawl.add_argument(
        "target",
        help="target_code, e.g. gdqy__szfwj；兼容旧用法时这里传 site_id",
    )
    crawl.add_argument(
        "column",
        nargs="?",
        default=None,
        help="旧用法 column_id；提供时会拼成 <site>__<column>",
    )
    crawl.add_argument(
        "--max-items", type=int, default=None, help="最多抓取条数（默认按列表页全量）"
    )
    crawl.add_argument(
        "--no-stop-on-duplicate",
        action="store_true",
        help="禁用 INC-03 遇已采即停；会继续翻扫整页",
    )

    sub.add_parser("schedule", help="启动 APScheduler 守护进程（01:00-05:00 错峰跑批）")

    serve = sub.add_parser("serve", help="启动 REST API（给 RAG 侧消费）")
    serve.add_argument("--host", default="0.0.0.0")
    serve.add_argument("--port", type=int, default=8787)

    rag = sub.add_parser("rag-export", help="把已采集的公开文章和附件推送到 zm-rag")
    rag.add_argument("--limit", type=int, default=None, help="本次最多导出文章数")
    rag.add_argument("--article-id", type=int, default=None, help="只导出指定文章 ID")
    rag.add_argument("--target-code", default=None, help="只导出指定采集目标下的文章")
    rag.add_argument("--dry-run", action="store_true", help="只生成导出计划，不调用 zm-rag")

    alerts = sub.add_parser(
        "check-alerts",
        help="跑一遍阈值检查并 POST 到飞书/企微 webhook（OBS-03）",
    )
    alerts.add_argument("--dry-run", action="store_true", help="只打印，不发送")

    sub.add_parser("list-sites", help="打印所有站点/栏目及 enabled 状态")

    toggle = sub.add_parser("toggle", help="启停站点或栏目（写回 YAML 并热重载）")
    toggle.add_argument("site", help="site_id")
    toggle.add_argument("column", nargs="?", default=None, help="column_id（省略即操作站点）")
    g = toggle.add_mutually_exclusive_group(required=True)
    g.add_argument("--enable", action="store_true")
    g.add_argument("--disable", action="store_true")

    validate = sub.add_parser(
        "validate",
        help="对 YAML selectors 做端到端试抓诊断（联调辅助，不写 DB/磁盘）",
    )
    validate.add_argument("site", help="site_id")
    validate.add_argument("column", help="column_id")
    validate.add_argument(
        "--url", default=None,
        help="直接传一个详情页 URL 只跑 detail 解析；省略则从列表页走完",
    )
    validate.add_argument(
        "--limit", type=int, default=3,
        help="列表模式下最多抽样抓取几篇详情（默认 3）",
    )
    validate.add_argument(
        "--json", action="store_true",
        help="输出结构化 JSON（默认 human-friendly 文本）",
    )

    sync = sub.add_parser(
        "sync-yaml",
        help="把 config/sites_v2/ 的 v2 YAML 同步进 DB（§7.5.3 yaml→DB 单向）",
    )
    sync.add_argument(
        "--dir", default="config/sites_v2",
        help="v2 YAML 目录（默认 config/sites_v2）",
    )
    sync.add_argument(
        "--file", default=None,
        help="只同步单个 YAML 文件；与 --dir 互斥",
    )
    sync.add_argument(
        "--dry-run", action="store_true",
        help="跑解析 + 报告生成，不 commit",
    )
    return p


def _resolve_url(site: str, column: str, article_key: str, explicit: str | None) -> str:
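    """Resolve the article URL for ``fetch``.

    An explicit ``--url`` always wins. Otherwise only the single hardcoded
    gdqy PoC article is known; any other combination exits with an error.
    """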
    if explicit:
        return explicit
    if site == GDQY_SITE_ID and column == GDQY_COLUMN_ID and article_key == GDQY_TARGET_KEY:
        return GDQY_TARGET_URL
    raise SystemExit(
        f"no hardcoded URL for {site}/{column}/{article_key}; pass --url explicitly"
    )


def main(argv: list[str] | None = None) -> int:
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    args = _build_parser().parse_args(argv)
    if args.cmd == "fetch":
        url = _resolve_url(args.site, args.column, args.article_key, args.url)
        result = fetch_and_store(target_code=f"{args.site}__{args.column}", url=url)
        if result.get("status") == "skipped":
            print(f"[skip] duplicate url_hash article_id={result['article_id']}")
            return 0
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 0 if result.get("status") == "ready" else 2
    if args.cmd == "crawl":
        from govcrawler.pipeline import crawl_target

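        # Legacy two-arg form: "crawl <site> <column>" is joined into "<site>__<column>".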
        target_code = args.target if args.column is None else f"{args.target}__{args.column}"
        result = crawl_target(
            target_code,
            max_items=args.max_items,
            stop_on_duplicate=not args.no_stop_on_duplicate,
        )
        if result.get("status") == "ok":
            try:
                from govcrawler.settings import get_settings

                if get_settings().rag_export_after_crawl_enabled:
                    from govcrawler.rag.exporter import export_pending_to_rag

                    rag_result = export_pending_to_rag(target_code=target_code)
                    result["rag_export"] = {
                        "status": "completed" if rag_result.failed == 0 else "partial_failed",
                        "target_code": target_code,
                        **asdict(rag_result),
                    }
            except Exception as exc:
                log.exception("rag export after crawl failed target=%s", target_code)
                result["rag_export"] = {
                    "status": "failed",
                    "target_code": target_code,
                    "error": str(exc),
                }
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 0 if result.get("status") == "ok" else 2
    if args.cmd == "schedule":
        from govcrawler.scheduler import run_forever
        run_forever()
        return 0
    if args.cmd == "serve":
        import uvicorn
        uvicorn.run("govcrawler.api:app", host=args.host, port=args.port, reload=False)
        return 0
    if args.cmd == "rag-export":
        from govcrawler.rag.exporter import export_pending_to_rag

        result = export_pending_to_rag(
            limit=args.limit,
            article_id=args.article_id,
            target_code=args.target_code,
            dry_run=args.dry_run,
        )
        print(json.dumps(asdict(result), ensure_ascii=False, indent=2))
        return 0 if result.failed == 0 else 2
    if args.cmd == "list-sites":
        from govcrawler.config import list_sites
        for s in list_sites():
            tag = "✓" if s.get("enabled") else "✗"
            print(f"{tag} {s['site_id']:<12} {s.get('site_name','')}")
            for c in s.get("columns", []):
                ctag = "✓" if c["enabled"] else "✗"
                print(f"    {ctag} {c['column_id']:<16} {c.get('name','')}  "
                      f"schedule={c.get('schedule','')}")
        return 0
    if args.cmd == "toggle":
        from govcrawler.config import set_enabled
        enabled = bool(args.enable)
        try:
            path = set_enabled(args.site, column_id=args.column, enabled=enabled)
        except (FileNotFoundError, KeyError) as e:
            print(f"error: {e}")
            return 2
        scope = f"column={args.column}" if args.column else "site"
        print(f"[toggle] {args.site} {scope} → enabled={enabled} (wrote {path})")
        return 0
    if args.cmd == "check-alerts":
        from govcrawler.alerting import run_checks, send_alert
        rules = run_checks()
        if not rules:
            print("[ok] no alert threshold breached")
            return 0
        sent = 0
        for r in rules:
            print(r.message)
            if not args.dry_run:
                if send_alert(r.message):
                    sent += 1
        print(f"[done] {len(rules)} rule(s) triggered, {sent} posted to webhook")
        return 0
    if args.cmd == "sync-yaml":
        from govcrawler.config.sync import sync_dir, sync_file
        from govcrawler.db import get_sessionmaker

        Session = get_sessionmaker()
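        # One session for the whole sync, so --dry-run can roll every write
        # back atomically; the report is printed either way.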
        with Session() as session:
            if args.file:
                report = sync_file(session, args.file)
            else:
                report = sync_dir(session, args.dir)
            if args.dry_run:
                session.rollback()
                print("[dry-run] rolled back; no DB writes committed")
            else:
                session.commit()
        print(json.dumps(report.as_dict(), ensure_ascii=False, indent=2))
        return 0
    if args.cmd == "validate":
        from govcrawler.validator import render_human, validate
        result = validate(
            args.site, args.column, url=args.url, max_detail=args.limit,
        )
        if args.json:
            print(json.dumps(result, ensure_ascii=False, indent=2, default=str))
        else:
            print(render_human(result))
        return 0 if result.get("ok") else 2
    return 1


if __name__ == "__main__":
    sys.exit(main())
