"""FastAPI app exposing the RAG consumer contract (API-01..04) — v2 schema.

Endpoints:
  GET  /api/articles?since=<ISO>&site=<code>&target=<code>&limit=&only_unexported=
  GET  /api/articles/{id}                         -> detail incl. content + attachments
  GET  /api/articles/{id}/attachments/{aid}       -> file stream
  GET  /api/articles/{id}/raw-html                -> saved HTML snapshot (file stream)
  GET  /api/articles/{id}/text                    -> extracted plain text (file stream)
  POST /api/articles/{id}/ack                     -> set exported_to_rag_at=now()
  GET  /health                                    -> liveness
  GET  /metrics                                   -> Prometheus

Schema change notes (v2 vs 1.0):
  * `Article.site_id` is now int FK. Public API still surfaces `site_code` (str).
  * `Article.column_id` / `.category` / `.source` removed — replaced by
    `target_id` FK + `channel_name` + `content_category` + `source_raw`.
  * Filtering by `site=<code>` joins crawl_site; by `target=<code>` joins crawl_target.
"""
from __future__ import annotations

from datetime import datetime
from pathlib import Path, PurePosixPath
from typing import Any

from fastapi import Depends, FastAPI, HTTPException, Query
from fastapi.responses import FileResponse, Response
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
from sqlalchemy import select, update
from sqlalchemy.orm import Session

from govcrawler.db import get_sessionmaker
from govcrawler.models import Article, Attachment, CrawlSite, CrawlTarget
from govcrawler.settings import get_settings
from govcrawler.storage.paths import to_os_path


def _session() -> Session:
    """FastAPI dependency that yields one SQLAlchemy session per request.

    This is a generator: the session is created from the project's
    sessionmaker, handed to the route, and closed by the ``with`` block once
    the request has finished with it.
    """
    factory = get_sessionmaker()
    with factory() as db:
        yield db


def _article_to_dict(a: Article) -> dict[str, Any]:
    """Build the summary projection of an article.

    The result carries metadata only (no body text, no attachment list) and
    is shared by the list endpoint and the detail endpoint. ``site_code`` and
    ``target_code`` are filled in only when the corresponding foreign key is
    set AND the relationship is already loaded, so this function never
    triggers a lazy load by itself.
    """

    def _iso(value):
        # Date/datetime columns are nullable; falsy values serialize as None.
        return value.isoformat() if value else None

    site_code = None
    if a.site_id and _loaded(a, "site"):
        site_code = a.site.site_code

    target_code = None
    if a.target_id and _loaded(a, "target"):
        target_code = a.target.target_code

    summary = {
        "id": a.id,
        "site_code": site_code,
        "site_id": a.site_id,
        "target_code": target_code,
        "target_id": a.target_id,
        "dept_id": a.dept_id,
        "native_post_id": a.native_post_id,
        "channel_name": a.channel_name,
        "channel_path": a.channel_path,
        "content_category": a.content_category,
        "content_subcategory": a.content_subcategory,
        "url": a.url,
        "url_hash": a.url_hash,
        "title": a.title,
        "publish_time": _iso(a.publish_time),
        "publish_date": _iso(a.publish_date),
        "effective_date": _iso(a.effective_date),
        "is_effective": a.is_effective,
        "expiry_date": _iso(a.expiry_date),
        "publisher": a.publisher,
        "source_raw": a.source_raw,
        "doc_no": a.doc_no,
        "index_no": a.index_no,
        "topic_words": a.topic_words,
        "open_category": a.open_category,
        "has_attachment": a.has_attachment,
        "status": a.status,
        "fetch_strategy": a.fetch_strategy,
        "fetched_at": _iso(a.fetched_at),
        "exported_to_rag_at": _iso(a.exported_to_rag_at),
    }
    return summary


def _loaded(obj, rel_name: str) -> bool:
    """Report whether attribute ``rel_name`` of ``obj`` is already loaded.

    Looks the name up in the ``unloaded`` set of the object's SQLAlchemy
    inspection state, so callers can skip relationships whose access would
    otherwise trigger a lazy load (which fails on a detached instance).
    """
    from sqlalchemy import inspect as sa_inspect

    pending = sa_inspect(obj).unloaded
    return not (rel_name in pending)


# The ASGI application object; the middleware, router, static mount and route
# handlers defined further down in this module all attach to it.
app = FastAPI(title="GovCrawler RAG API", version="2.0.0")

# ---------- Admin Basic-Auth gate ----------
# Settings.admin_user / admin_password gate the entire /admin/** namespace
# (HTML shell, static assets, and API endpoints). Empty admin_user disables
# the gate (dev convenience). Compares with hmac.compare_digest to avoid
# timing leaks on the password.
import base64 as _base64  # noqa: E402
import hmac as _hmac  # noqa: E402

from starlette.middleware.base import BaseHTTPMiddleware  # noqa: E402
from starlette.responses import Response as _Resp  # noqa: E402


def _standard_downloader_allowed(path: str, method: str) -> bool:
    if path in ("/admin", "/admin/"):
        return True
    if path.startswith("/admin/static/"):
        return True
    if path == "/admin/api/me" and method == "GET":
        return True
    if path == "/admin/api/standard-attachments/pending" and method == "GET":
        return True
    if (
        path.startswith("/admin/api/articles/")
        and path.endswith("/openstd-download/start")
        and method == "POST"
    ):
        return True
    if (
        path.startswith("/admin/api/openstd-download-sessions/")
        and path.endswith("/captcha")
        and method == "GET"
    ):
        return True
    if (
        path.startswith("/admin/api/openstd-download-sessions/")
        and path.endswith("/submit")
        and method == "POST"
    ):
        return True
    return False


class _AdminBasicAuth(BaseHTTPMiddleware):
    """HTTP Basic-Auth gate for every path starting with ``/admin``.

    Two credential pairs are recognised:

    * ``admin_user`` / ``admin_password`` — full access, role ``"admin"``.
    * ``standard_downloader_user`` / ``standard_downloader_password`` —
      role ``"standard_downloader"``, limited to the paths accepted by
      ``_standard_downloader_allowed``; anything else answers 403.

    On success the role and user name are stored on ``request.state``.
    An empty ``admin_user`` setting disables the gate entirely.
    """

    @staticmethod
    def _parse_basic(header: str):
        """Decode a ``Basic`` Authorization header into ``(user, password)``.

        Returns None when the header is not Basic auth or its payload is not
        valid base64.
        """
        if not header.lower().startswith("basic "):
            return None
        try:
            # b64decode signals malformed input with binascii.Error, which is
            # a ValueError subclass; the "replace" decode itself never raises.
            raw = _base64.b64decode(header.split(" ", 1)[1]).decode("utf-8", "replace")
        except ValueError:
            return None
        user, _, pw = raw.partition(":")
        return user, pw

    @staticmethod
    def _matches(supplied: str, expected) -> bool:
        """Constant-time equality that also works for non-ASCII text.

        ``hmac.compare_digest`` rejects non-ASCII ``str`` arguments with a
        TypeError, so both sides are compared as UTF-8 bytes. A non-string
        ``expected`` (e.g. an unset password) never matches.
        """
        if not isinstance(expected, str):
            return False
        return _hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8"))

    async def dispatch(self, request, call_next):
        """Authenticate ``/admin`` requests; pass everything else through."""
        if not request.url.path.startswith("/admin"):
            return await call_next(request)
        cfg = get_settings()
        # Empty user => auth disabled
        if not cfg.admin_user:
            return await call_next(request)

        # Only credential parsing is guarded against errors. The downstream
        # call is deliberately NOT wrapped in a try/except, so a failure
        # inside an admin handler surfaces as a server error instead of being
        # reported to the client as a 401 login challenge.
        creds = self._parse_basic(request.headers.get("authorization", ""))
        if creds is not None:
            user, pw = creds
            if self._matches(user, cfg.admin_user) and self._matches(
                pw, cfg.admin_password
            ):
                request.state.admin_role = "admin"
                request.state.admin_user = user
                return await call_next(request)
            if (
                cfg.standard_downloader_user
                and self._matches(user, cfg.standard_downloader_user)
                and self._matches(pw, cfg.standard_downloader_password)
            ):
                if _standard_downloader_allowed(request.url.path, request.method.upper()):
                    request.state.admin_role = "standard_downloader"
                    request.state.admin_user = user
                    return await call_next(request)
                # Valid downloader credentials, but the route is off-limits.
                return _Resp(status_code=403, content="Forbidden")
        return _Resp(
            status_code=401,
            headers={"WWW-Authenticate": 'Basic realm="GovCrawler Admin"'},
            content="Authentication required",
        )


# Install the /admin Basic-Auth gate on the application.
app.add_middleware(_AdminBasicAuth)


# ---------- RAG consumer Bearer-token gate ----------
# Reviewer P1: /api/articles**, /api/raw-html/**, /api/text/**, the
# attachment streamer and the ack endpoint were all unauthenticated.
# Anyone hitting :8787/api/articles could pull article bodies and POST
# /ack to mark them as exported. We now require a Bearer token matching
# settings.rag_api_token; empty token disables the check (dev only).
class _RagTokenGate(BaseHTTPMiddleware):
    """Credential gate for the RAG consumer endpoints.

    Requests whose path starts with one of ``_GATED_PREFIXES`` must present
    either a Bearer token equal to ``settings.rag_api_token`` or Basic-Auth
    credentials equal to the admin user/password. Everything else — and every
    request when ``rag_api_token`` is empty — passes through untouched.
    """

    # Paths that need the gate. We don't gate /admin (that has Basic Auth),
    # /health (k8s probes), or /metrics (Prometheus inside the cluster).
    _GATED_PREFIXES = (
        "/api/articles",
        "/api/raw-html",
        "/api/text",
    )

    @staticmethod
    def _matches(supplied: str, expected) -> bool:
        """Constant-time equality that also works for non-ASCII text.

        ``hmac.compare_digest`` raises TypeError for non-ASCII ``str``
        arguments, which would turn a junk token into a server error. Both
        sides are therefore compared as UTF-8 bytes; a non-string
        ``expected`` never matches.
        """
        if not isinstance(expected, str):
            return False
        return _hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8"))

    async def dispatch(self, request, call_next):
        """Let the request through only with valid Bearer or admin credentials."""
        path = request.url.path
        # str.startswith accepts a tuple of candidate prefixes.
        if not path.startswith(self._GATED_PREFIXES):
            return await call_next(request)
        cfg = get_settings()
        if not cfg.rag_api_token:
            # Token unset → dev mode, bypass.
            return await call_next(request)

        auth = request.headers.get("authorization", "")
        scheme = auth.lower()
        # Accept EITHER (a) a Bearer token matching rag_api_token, or
        # (b) Basic Auth matching the admin credentials. The admin UI calls
        # the article detail / raw-html / text / attachment endpoints with
        # the browser's Basic credentials, so accepting them here lets the UI
        # work without separate token plumbing.
        authorized = False
        if scheme.startswith("bearer "):
            sent = auth.split(" ", 1)[1].strip()
            authorized = self._matches(sent, cfg.rag_api_token)
        elif scheme.startswith("basic ") and cfg.admin_user:
            try:
                # Malformed base64 raises binascii.Error (a ValueError).
                raw = _base64.b64decode(auth.split(" ", 1)[1]).decode(
                    "utf-8", "replace",
                )
            except ValueError:
                raw = None
            if raw is not None:
                user, _, pw = raw.partition(":")
                authorized = self._matches(user, cfg.admin_user) and self._matches(
                    pw, cfg.admin_password
                )

        if authorized:
            # Called outside any try/except so handler failures are not
            # disguised as authentication failures.
            return await call_next(request)
        return _Resp(
            status_code=401,
            headers={"WWW-Authenticate": 'Bearer realm="GovCrawler RAG"'},
            content="Authentication required",
        )


# Install the RAG consumer credential gate on the application.
app.add_middleware(_RagTokenGate)


# ---------- Operation audit log (v2.1.1) ----------
# Every state-changing call ([POST, PUT, DELETE, PATCH] on /admin/api/* +
# POST on /api/articles/.../ack) is recorded in admin_audit_log so we can
# answer "who did what, when, from where" after the fact.
from govcrawler.api.audit import AuditMiddleware  # noqa: E402

# Install the audit-log middleware imported from govcrawler.api.audit.
app.add_middleware(AuditMiddleware)


@app.on_event("startup")
async def _restore_durable_jobs() -> None:
    """Startup hook: ask the task queue to rebuild its state from the DB.

    Delegates to ``get_queue().restore_from_db()`` and, when any counter in
    the returned summary is non-zero, logs how many jobs were recovered,
    re-queued and permanently failed. The recovery policy itself (re-queueing
    orphaned 'running' rows, giving up after repeated restarts) lives in the
    queue implementation, not here.

    Any failure is logged and swallowed on purpose: a broken restore must
    not prevent the API from starting.
    """
    import logging

    log = logging.getLogger("govcrawler.api")
    try:
        from govcrawler.api.task_queue import get_queue

        queue = get_queue()
        summary = await queue.restore_from_db()
        if not any(summary.values()):
            # Nothing was restored — stay quiet.
            return
        log.info(
            "task_queue restored from DB: running→requeued=%d, "
            "queued_requeued=%d, permanently_failed=%d",
            summary["recovered"],
            summary["requeued"] - summary["recovered"],
            summary["permanently_failed"],
        )
    except Exception:
        log.exception(
            "task_queue restore failed — continuing without recovery",
        )


@app.on_event("startup")
def _warn_unsecured_rag_endpoints() -> None:
    """Startup guardrail for a missing ``rag_api_token``.

    * Token set: nothing to do.
    * Token empty and ``admin_user`` set (treated as a production
      deployment): log an error and terminate the process with exit code 1,
      so the misconfiguration blocks the release instead of being ignored.
    * Token empty and ``admin_user`` empty (dev box): log a warning only.
    """
    import logging
    import sys

    cfg = get_settings()
    log = logging.getLogger("govcrawler.api")

    if cfg.rag_api_token:
        return

    if cfg.admin_user:
        log.error(
            "FATAL: ADMIN_USER is set (production mode) but RAG_API_TOKEN is "
            "missing. /api/articles** and /api/articles/.../ack would be "
            "publicly accessible. Set RAG_API_TOKEN in .env (e.g. via "
            "`openssl rand -hex 24`) and restart. Aborting startup.",
        )
        # Exit non-zero so a process supervisor reports the misconfiguration
        # as a crash and the deployer notices immediately.
        sys.exit(1)

    log.warning(
        "RAG_API_TOKEN is not set — /api/articles** is publicly readable "
        "and /api/articles/.../ack is publicly writable. This is OK in "
        "dev (ADMIN_USER also unset) but REFUSE before going to prod.",
    )

# Admin dashboard router at /admin
from pathlib import Path as _Path  # noqa: E402
from fastapi.staticfiles import StaticFiles  # noqa: E402

from govcrawler.api.admin import router as _admin_router  # noqa: E402

# Register the admin dashboard routes defined in govcrawler.api.admin.
app.include_router(_admin_router)
# Serve /admin/static/** (css + js) alongside the dashboard HTML.
# The files live in the "static" directory next to this module.
_ADMIN_STATIC = _Path(__file__).parent / "static"
app.mount("/admin/static", StaticFiles(directory=str(_ADMIN_STATIC)), name="admin-static")


@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe: returns a constant payload and touches no resources."""
    payload = {"status": "ok"}
    return payload


@app.get("/metrics")
def metrics() -> Response:
    """Expose the default Prometheus registry in the text exposition format."""
    # Imported for its side effects only; presumably this registers the
    # project's collectors — confirm in govcrawler.observability.
    import govcrawler.observability  # noqa: F401

    body = generate_latest()
    return Response(content=body, media_type=CONTENT_TYPE_LATEST)


# ---------------------------------------------------------------------------
# articles
# ---------------------------------------------------------------------------
def _apply_filters(
    stmt, *, site: str | None, target: str | None
):
    """Join crawl_site/crawl_target only when the caller actually filters on them."""
    if site:
        stmt = stmt.join(CrawlSite, CrawlSite.id == Article.site_id).where(
            CrawlSite.site_code == site
        )
    if target:
        stmt = stmt.join(CrawlTarget, CrawlTarget.id == Article.target_id).where(
            CrawlTarget.target_code == target
        )
    return stmt


@app.get("/api/articles")
def list_articles(
    since: datetime | None = Query(None, description="fetched_at floor, ISO-8601"),
    site: str | None = Query(None, description="crawl_site.site_code"),
    target: str | None = Query(None, description="crawl_target.target_code"),
    limit: int = Query(100, ge=1, le=1000),
    only_unexported: bool = Query(
        True, description="默认只返回 exported_to_rag_at IS NULL 的文章"
    ),
    s: Session = Depends(_session),
) -> dict[str, Any]:
    """List ``ready`` articles, oldest ``fetched_at`` first.

    Optional filters: ``since`` (inclusive ``fetched_at`` floor), ``site`` and
    ``target`` codes, and ``only_unexported`` (on by default) which keeps only
    rows whose ``exported_to_rag_at`` is NULL. At most ``limit`` rows are
    returned, as ``{"count": <n>, "items": [<summary>, ...]}``.
    """
    base = select(Article).where(Article.status == "ready")
    query = _apply_filters(base, site=site, target=target)
    if since is not None:
        query = query.where(Article.fetched_at >= since)
    if only_unexported:
        query = query.where(Article.exported_to_rag_at.is_(None))
    query = query.order_by(Article.fetched_at.asc())
    query = query.limit(limit)

    articles = s.execute(query).scalars().all()
    # Touch both relationships while the session is still open so that the
    # summary projection finds them loaded and can emit site/target codes.
    for article in articles:
        _ = article.site, article.target

    items = [_article_to_dict(article) for article in articles]
    return {"count": len(articles), "items": items}


@app.get("/api/articles/{article_id}")
def get_article(article_id: int, s: Session = Depends(_session)) -> dict[str, Any]:
    """Return the full detail view of one article.

    The payload is the summary projection extended with the body text, the
    stored file paths, the metadata JSON and a list of attachment
    descriptors. Answers 404 when no article has the given id.
    """

    def _attachment_summary(att) -> dict[str, Any]:
        return {
            "id": att.id,
            "file_name": att.file_name,
            "file_ext": att.file_ext,
            "size_bytes": att.size_bytes,
            "file_hash": att.file_hash,
            "source_url": att.source_url,
        }

    article = s.get(Article, article_id)
    if article is None:
        raise HTTPException(404, "article not found")

    # Load site/target inside the session so their codes appear in the summary.
    _ = article.site, article.target

    detail = _article_to_dict(article)
    detail.update(
        content_text=article.content_text,
        raw_html_path=article.raw_html_path,
        text_path=article.text_path,
        metadata_json=article.metadata_json,
    )
    detail["attachments"] = [_attachment_summary(att) for att in article.attachments]
    return detail


@app.get("/api/articles/{article_id}/attachments/{attachment_id}")
def download_attachment(
    article_id: int, attachment_id: int, s: Session = Depends(_session)
) -> FileResponse:
    """Stream one attachment file that belongs to the given article.

    Raises:
        HTTPException 404: the attachment does not exist, belongs to another
            article, has no ``file_path``, or its path is not a regular file
            on disk.
        HTTPException 400: the stored path resolves outside ``data_dir``.
    """
    att = s.get(Attachment, attachment_id)
    if att is None or att.article_id != article_id:
        raise HTTPException(404, "attachment not found")
    if not att.file_path:
        raise HTTPException(404, "attachment file_path empty")

    data_dir = Path(get_settings().data_dir)
    abs_path = to_os_path(data_dir, PurePosixPath(att.file_path))

    # Path-traversal guard (defensive — file_path came from our own downloader).
    # relative_to raises ValueError when the resolved path escapes data_dir;
    # resolve itself can fail with OSError / RuntimeError (e.g. symlink loops).
    try:
        abs_path.resolve().relative_to(data_dir.resolve())
    except (ValueError, OSError, RuntimeError):
        raise HTTPException(400, "invalid path") from None

    # is_file() rather than exists(): a directory would pass exists() and then
    # blow up inside FileResponse instead of producing a clean 404.
    if not abs_path.is_file():
        raise HTTPException(404, "file missing on disk")
    return FileResponse(
        path=str(abs_path),
        filename=att.file_name or abs_path.name,
        media_type="application/octet-stream",
    )


def _serve_article_file(abs_rel_path: str | None, media_type: str, download_name_hint: str):
    """Stream a file stored under ``data_dir``; shared by /raw-html and /text.

    Args:
        abs_rel_path: path as stored on the article (interpreted relative to
            ``data_dir`` by ``to_os_path``); may be None or empty.
        media_type: Content-Type to send.
        download_name_hint: download file name; falls back to the file's own
            name when empty.

    Raises:
        HTTPException 404: the path is unset or is not a regular file on disk.
        HTTPException 400: the path resolves outside ``data_dir``.
    """
    if not abs_rel_path:
        raise HTTPException(404, "path not set on article")
    data_dir = Path(get_settings().data_dir)
    abs_path = to_os_path(data_dir, PurePosixPath(abs_rel_path))
    # Path-traversal guard: relative_to raises ValueError when the resolved
    # path escapes data_dir; resolve can fail with OSError / RuntimeError.
    try:
        abs_path.resolve().relative_to(data_dir.resolve())
    except (ValueError, OSError, RuntimeError):
        raise HTTPException(400, "invalid path") from None
    # is_file() rather than exists(): a directory would pass exists() and then
    # fail inside FileResponse instead of producing a clean 404.
    if not abs_path.is_file():
        raise HTTPException(404, "file missing on disk")
    return FileResponse(
        path=str(abs_path),
        filename=download_name_hint or abs_path.name,
        media_type=media_type,
    )


@app.get("/api/articles/{article_id}/raw-html")
def get_article_raw_html(article_id: int, s: Session = Depends(_session)) -> FileResponse:
    """Serve the article's saved HTML snapshot (``raw_html_path``) as a file.

    Answers 404 when the article does not exist; path validation and the
    remaining error cases are handled by ``_serve_article_file``.
    """
    article = s.get(Article, article_id)
    if article is None:
        raise HTTPException(404, "article not found")
    download_name = f"article-{article_id}.html"
    return _serve_article_file(
        article.raw_html_path,
        "text/html; charset=utf-8",
        download_name,
    )


@app.get("/api/articles/{article_id}/text")
def get_article_text(article_id: int, s: Session = Depends(_session)) -> FileResponse:
    """Serve the article's extracted plain text (``text_path``) as a file.

    Answers 404 when the article does not exist; path validation and the
    remaining error cases are handled by ``_serve_article_file``.
    """
    article = s.get(Article, article_id)
    if article is None:
        raise HTTPException(404, "article not found")
    download_name = f"article-{article_id}.txt"
    return _serve_article_file(
        article.text_path,
        "text/plain; charset=utf-8",
        download_name,
    )


@app.post("/api/articles/{article_id}/ack")
def ack_article(article_id: int, s: Session = Depends(_session)) -> dict[str, Any]:
    """Mark an article as exported by stamping ``exported_to_rag_at``.

    Answers 404 when the article does not exist. Otherwise writes the current
    UTC time (as a naive datetime) and commits. Acknowledging an article
    again overwrites the previous timestamp.

    Returns:
        ``{"id": <article_id>, "exported_to_rag_at": <ISO-8601 string>}``.
    """
    from datetime import timezone

    a = s.get(Article, article_id)
    if a is None:
        raise HTTPException(404, "article not found")
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop the tzinfo so the stored value and the ISO string in the
    # response stay naive-UTC, exactly as before.
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    s.execute(
        update(Article).where(Article.id == article_id).values(exported_to_rag_at=now)
    )
    s.commit()
    return {"id": article_id, "exported_to_rag_at": now.isoformat()}