"""Collect read-only GovCrawler diagnostics into a single JSON or ZIP bundle.

Gathers the selected crawl job, its target and site, recent jobs, logs,
articles, attachments, raw HTML samples, git metadata, and (optionally)
docker container logs.
"""
from __future__ import annotations

import argparse
import json
import subprocess
import zipfile
from datetime import date, datetime, timedelta
from pathlib import Path
from typing import Any

from sqlalchemy import desc, or_, select
from sqlalchemy.orm import Session

from govcrawler.db import get_sessionmaker
from govcrawler.models import (
    Article,
    Attachment,
    CrawlJob,
    CrawlLog,
    CrawlSite,
    CrawlTarget,
)
from govcrawler.settings import get_settings


def _json_default(value: Any) -> Any:
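    """json.dumps fallback: ISO-format datetimes and dates, ``str()`` for the rest."""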
    if isinstance(value, datetime):
        return value.isoformat(sep=" ")
    if isinstance(value, date):
        return value.isoformat()
    return str(value)


def _row_dict(obj: Any, *, max_text: int = 4000) -> dict[str, Any] | None:
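    """Map an ORM row to a plain dict of its column values, truncating long strings."""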
    if obj is None:
        return None
    out: dict[str, Any] = {}
    for col in obj.__table__.columns:
        value = getattr(obj, col.name)
        if isinstance(value, str) and len(value) > max_text:
            value = value[:max_text] + "...<truncated>"
        out[col.name] = value
    return out


def _run(cmd: list[str], *, timeout_s: int = 20) -> dict[str, Any]:
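    """Run a command and capture its return code plus the tail of stdout/stderr.

    Never raises: timeouts, missing binaries, and other failures are reported
    in the returned dict instead.
    """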
    try:
        p = subprocess.run(
            cmd,
            text=True,
            capture_output=True,
            timeout=timeout_s,
            check=False,
        )
        return {
            "cmd": cmd,
            "returncode": p.returncode,
            "stdout": p.stdout[-20000:],
            "stderr": p.stderr[-20000:],
        }
    except Exception as e:
        return {"cmd": cmd, "error": f"{type(e).__name__}: {e}"}


def _read_sample(rel_path: str | None, *, limit: int) -> dict[str, Any] | None:
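    """Read up to ``limit`` bytes of a file stored under the data directory."""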
    if not rel_path:
        return None
    settings = get_settings()
    base = Path(settings.data_dir).resolve()
    path = (base / rel_path).resolve()
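    # Refuse paths that resolve outside the configured data directory.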
    try:
        path.relative_to(base)
    except Exception:
        return {"path": rel_path, "error": "outside_data_dir"}
    if not path.is_file():
        return {"path": rel_path, "exists": False}
    data = path.read_bytes()[:limit]
    return {
        "path": rel_path,
        "exists": True,
        "size_bytes": path.stat().st_size,
        "sample": data.decode("utf-8", errors="replace"),
    }


def _target_from_args(
    session: Session, job_id: str | None, target_code: str | None
) -> CrawlTarget | None:
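    """Resolve the CrawlTarget from an explicit target code or from the job's target code."""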
    if target_code:
        return session.scalar(select(CrawlTarget).where(CrawlTarget.target_code == target_code))
    if not job_id:
        return None
    job = session.get(CrawlJob, job_id)
    if job is None:
        return None
    return session.scalar(select(CrawlTarget).where(CrawlTarget.target_code == job.target_code))


def collect(args: argparse.Namespace) -> dict[str, Any]:
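    """Query the database read-only and assemble the diagnostics payload."""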
    session_maker = get_sessionmaker()
    since = datetime.now() - timedelta(hours=args.since_hours)
    data: dict[str, Any] = {
        "generated_at": datetime.now(),
        "input": {
            "job_id": args.job_id,
            "target_code": args.target_code,
            "since_hours": args.since_hours,
        },
        "git": _run(["git", "rev-parse", "HEAD"]),
        "git_status": _run(["git", "status", "--short", "--branch"]),
    }

    with session_maker() as session:
        target = _target_from_args(session, args.job_id, args.target_code)
        site = session.get(CrawlSite, target.site_id) if target is not None else None
        job = session.get(CrawlJob, args.job_id) if args.job_id else None

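        # Recent jobs, scoped to the resolved target (or to the single requested job).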
        jobs_stmt = select(CrawlJob)
        if target is not None:
            jobs_stmt = jobs_stmt.where(CrawlJob.target_code == target.target_code)
        elif args.job_id:
            jobs_stmt = jobs_stmt.where(CrawlJob.job_id == args.job_id)
        jobs = session.scalars(
            jobs_stmt.order_by(desc(CrawlJob.enqueued_at)).limit(args.job_limit)
        ).all()

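        # Crawl logs for the target within the --since-hours window.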
        logs_stmt = select(CrawlLog)
        if target is not None:
            logs_stmt = logs_stmt.where(CrawlLog.target_id == target.id)
        logs_stmt = logs_stmt.where(CrawlLog.occurred_at >= since)
        logs = session.scalars(
            logs_stmt.order_by(desc(CrawlLog.occurred_at)).limit(args.log_limit)
        ).all()

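        # Articles referenced by those logs, plus anything created or updated since the cutoff.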
        article_urls = [row.article_url for row in logs if row.article_url]
        articles_stmt = select(Article)
        if target is not None:
            articles_stmt = articles_stmt.where(Article.target_id == target.id)
        recent = or_(Article.created_at >= since, Article.updated_at >= since)
        if article_urls:
            articles_stmt = articles_stmt.where(or_(Article.url.in_(article_urls), recent))
        else:
            articles_stmt = articles_stmt.where(recent)
        articles = session.scalars(
            articles_stmt.order_by(desc(Article.updated_at)).limit(args.article_limit)
        ).all()
        article_ids = [a.id for a in articles]
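        # Attachments belonging to the selected articles, newest first.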
        attachments = []
        if article_ids:
            attachments = session.scalars(
                select(Attachment)
                .where(Attachment.article_id.in_(article_ids))
                .order_by(desc(Attachment.downloaded_at))
                .limit(args.attachment_limit)
            ).all()

        data.update({
            "selected_job": _row_dict(job),
            "target": _row_dict(target),
            "site": _row_dict(site),
            "recent_jobs": [_row_dict(x) for x in jobs],
            "recent_logs": [_row_dict(x) for x in logs],
            "recent_articles": [_row_dict(x) for x in articles],
            "recent_attachments": [_row_dict(x) for x in attachments],
            "raw_html_samples": [
                _read_sample(a.raw_html_path, limit=args.sample_bytes)
                for a in articles[: args.sample_files]
                if a.raw_html_path
            ],
        })

    if args.include_docker_logs:
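        # Container names assume compose-style "<project>-<service>-<index>" naming with project "docker".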
        data["docker"] = {
            "ps": _run(["docker", "ps", "--format", "{{.Names}} {{.Status}} {{.Image}}"]),
            "api_logs": _run(["docker", "logs", "--tail", str(args.docker_tail), "docker-api-1"]),
            "scheduler_logs": _run(["docker", "logs", "--tail", str(args.docker_tail), "docker-scheduler-1"]),
        }

    return data


def write_output(data: dict[str, Any], output: Path) -> Path:
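    """Write the payload as pretty-printed JSON.

    A ``.json`` output path gets the raw JSON file; any other suffix gets a
    ZIP archive containing ``diagnostics.json``.
    """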
    output.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, ensure_ascii=False, indent=2, default=_json_default)
    if output.suffix.lower() == ".json":
        output.write_text(payload, encoding="utf-8")
    else:
        with zipfile.ZipFile(output, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            zf.writestr("diagnostics.json", payload)
    return output


def main() -> None:
    parser = argparse.ArgumentParser(description="Collect read-only GovCrawler diagnostics.")
    parser.add_argument("--job-id")
    parser.add_argument("--target-code")
    parser.add_argument("--since-hours", type=float, default=24)
    parser.add_argument("--output", default="")
    parser.add_argument("--job-limit", type=int, default=20)
    parser.add_argument("--log-limit", type=int, default=200)
    parser.add_argument("--article-limit", type=int, default=100)
    parser.add_argument("--attachment-limit", type=int, default=200)
    parser.add_argument("--sample-files", type=int, default=5)
    parser.add_argument("--sample-bytes", type=int, default=12000)
    parser.add_argument("--include-docker-logs", action="store_true")
    parser.add_argument("--docker-tail", type=int, default=300)
    args = parser.parse_args()

    if not args.job_id and not args.target_code:
        parser.error("one of --job-id or --target-code is required")

    stem = args.job_id or args.target_code or "govcrawler"
    output = Path(args.output or f"diagnostics_{stem}_{datetime.now():%Y%m%d_%H%M%S}.zip")
    path = write_output(collect(args), output)
    print(path)


if __name__ == "__main__":
    main()
