"""Canonical adapter output contract — §7.5.7 of 2.0 design doc.

**Iron rule**: no matter which adapter or fetch strategy produced the data,
the objects returned to the pipeline MUST be these exact Pydantic models.
The SQLAlchemy tables in `govcrawler.models` are a projection of this
contract (Phase-A schema rebuild will reconcile them 1:1).

Today we define the contract in code so:
  1. Adapter authors have a single source of truth for field names/types.
  2. `tests/test_adapter_contract.py` can validate any adapter's output
     on real sample data without touching the DB.
  3. When Phase-A lands, the SQLAlchemy models become a mechanical
     translation of these Pydantic classes — no semantic re-work.
"""
from __future__ import annotations

import re
from datetime import date, datetime, timedelta, timezone
from enum import Enum
from typing import Any

from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator


class Status(str, Enum):
    """Article lifecycle. RAG only consumes `READY`."""
    RAW = "raw"        # default adapter output; artifacts may still be pending
    READY = "ready"    # fully persisted (requires raw_html_path/text_path/content_text)
    FAILED = "failed"  # processing failed; presumably terminal — confirm with pipeline


class FetchStrategy(str, Enum):
    """Which tier actually fetched this item. Must match CrawlLog.strategy."""
    HTTPX = "httpx"            # plain HTTP client tier
    PLAYWRIGHT = "playwright"  # browser-rendered tier


class ContractViolation(ValueError):
    """Raised when an adapter output fails canonical-contract validation.

    Subclasses ValueError so that, when raised inside a Pydantic validator,
    it is surfaced as a normal ValidationError while still being catchable
    as this specific type in adapter unit-tests.
    """


# ---------------------------------------------------------------------------
# Attachment
# ---------------------------------------------------------------------------
class AttachmentItem(BaseModel):
    """Per §5.5 attachments table."""
    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    file_name: str | None = None
    file_ext: str | None = Field(default=None, max_length=16)
    size_bytes: int | None = Field(default=None, ge=0)
    file_path: str = Field(..., description="On-disk/object-store path, relative to storage root")
    file_hash: str = Field(..., description="sha256 hex of raw bytes")
    source_url: str = Field(..., description="Original download URL (pre-normalization OK)")

    @field_validator("file_hash")
    @classmethod
    def _check_sha256(cls, value: str) -> str:
        # A sha256 digest rendered as hex is exactly 64 lowercase nibbles;
        # anything else means the adapter hashed (or encoded) incorrectly.
        if re.fullmatch(r"[0-9a-f]{64}", value) is None:
            raise ContractViolation(f"file_hash must be 64-char lowercase hex sha256, got {value!r}")
        return value


# ---------------------------------------------------------------------------
# Article (the big one)
# ---------------------------------------------------------------------------
class CrawlItem(BaseModel):
    """Canonical output of an adapter's detail-parse step.

    Field naming mirrors §5.4 `article` table. Nullable where §5.4 allows null.
    **No `content_simhash`** — §5.4 explicitly excludes it (RAG-side dedup).
    """
    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # ----- provenance -----
    site_id: str = Field(..., min_length=1, max_length=64)
    target_id: int | None = Field(
        default=None,
        description="crawl_target.id; may be None during adapter unit-tests",
    )
    dept_id: int | None = None

    # ----- identity & de-dup -----
    native_post_id: str | None = Field(
        default=None,
        max_length=64,
        description=(
            "CMS-native stable id if the adapter found one (e.g. gkmlpt "
            "articles[*].id). Leave None if the CMS has no such key — "
            "**never fabricate**. Dedup then falls back to url_hash."
        ),
    )
    url: str = Field(..., min_length=1)
    url_hash: str = Field(..., description="sha256 hex of normalized URL")

    # ----- core content -----
    title: str = Field(..., min_length=1)
    publish_time: datetime | None = None  # UTC-aware; normalized by _require_utc_and_sane
    source_raw: str | None = Field(default=None, max_length=500)
    publisher: str | None = Field(default=None, max_length=500)
    content_text: str | None = None
    raw_html_path: str | None = None
    text_path: str | None = None

    # ----- channel/classification snapshots (inherited from crawl_target) -----
    channel_name: str | None = Field(default=None, max_length=200)
    channel_path: str | None = Field(default=None, max_length=1000)
    content_category: str | None = Field(default=None, max_length=100)
    content_subcategory: str | None = Field(default=None, max_length=100)

    # ----- §6 public attributes -----
    index_no: str | None = Field(default=None, max_length=200)
    doc_no: str | None = Field(default=None, max_length=200)
    publish_date: date | None = None
    effective_date: date | None = None
    is_effective: bool | None = None
    expiry_date: date | None = None
    topic_words: str | None = Field(default=None, max_length=500)
    open_category: str | None = Field(default=None, max_length=200)
    metadata_json: dict[str, Any] | None = None

    # ----- attachments -----
    has_attachment: bool = False
    attachments: list[AttachmentItem] = Field(default_factory=list)

    # ----- lifecycle -----
    status: Status = Status.RAW
    fetch_strategy: FetchStrategy | None = None

    # ----- validators -----
    @field_validator("url_hash")
    @classmethod
    def _check_url_hash(cls, v: str) -> str:
        """Require a 64-char lowercase-hex sha256 (same contract as AttachmentItem.file_hash)."""
        if not re.fullmatch(r"[0-9a-f]{64}", v):
            # The regex has always rejected uppercase; the message now says so,
            # matching the wording used by AttachmentItem._check_sha256.
            raise ContractViolation(f"url_hash must be 64-char lowercase hex sha256, got {v!r}")
        return v

    @field_validator("publish_time")
    @classmethod
    def _require_utc_and_sane(cls, v: datetime | None) -> datetime | None:
        """Normalize publish_time to UTC and reject implausible values.

        Raises ContractViolation for naive datetimes (adapters must attach a
        timezone before handing data to the contract) and for timestamps
        outside the plausible range [1949, now + ~1 year].
        """
        if v is None:
            return v
        if v.tzinfo is None:
            raise ContractViolation(
                "publish_time must be timezone-aware; adapters should convert to UTC"
            )
        # coerce to UTC for uniform storage
        v = v.astimezone(timezone.utc)
        now = datetime.now(timezone.utc)
        # Gov sites sometimes post future-dated items (announcement effective
        # dates) — allow up to ~1 year ahead, but reject absurd values (likely
        # a unit bug, e.g. ms-vs-s epoch confusion).
        # NOTE: timedelta(days=366) replaces now.replace(year=now.year + 1),
        # which raised ValueError whenever `now` fell on Feb 29.
        if v.year < 1949 or v > now + timedelta(days=366):
            raise ContractViolation(f"publish_time {v} out of plausible range [1949, now+1y]")
        return v

    @model_validator(mode="after")
    def _has_attachment_consistent(self) -> "CrawlItem":
        """Keep the has_attachment flag and the attachments list in lockstep."""
        if self.has_attachment and not self.attachments:
            raise ContractViolation("has_attachment=True but attachments list is empty")
        if not self.has_attachment and self.attachments:
            raise ContractViolation("attachments present but has_attachment=False")
        return self

    @model_validator(mode="after")
    def _ready_requires_paths(self) -> "CrawlItem":
        """READY items must have all persisted artifacts; otherwise stay RAW."""
        if self.status == Status.READY:
            missing = [f for f in ("raw_html_path", "text_path", "content_text")
                       if not getattr(self, f)]
            if missing:
                raise ContractViolation(
                    f"status=ready requires non-null {missing}; leave status=raw if still pending"
                )
        return self


# ---------------------------------------------------------------------------
# CrawlLog — per-fetch audit record. Must align with Article.fetch_strategy.
# ---------------------------------------------------------------------------
class CrawlLogItem(BaseModel):
    """Per §5.6 crawl_log table."""
    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    site_id: str = Field(..., min_length=1, max_length=64)
    target_id: int | None = None
    article_url: str | None = None
    strategy: FetchStrategy
    http_status: int | None = Field(default=None, ge=100, le=599)
    duration_ms: int | None = Field(default=None, ge=0)
    success: bool
    error_msg: str | None = None
    occurred_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))

    @field_validator("occurred_at")
    @classmethod
    def _require_utc(cls, value: datetime) -> datetime:
        # Naive timestamps are ambiguous across hosts; insist on tz-aware,
        # then normalize to UTC for uniform storage.
        if value.tzinfo is None:
            raise ContractViolation("occurred_at must be timezone-aware UTC")
        return value.astimezone(timezone.utc)

    @model_validator(mode="after")
    def _consistency(self) -> "CrawlLogItem":
        # A successful fetch must not carry an error message...
        if self.error_msg and self.success:
            raise ContractViolation("success=True but error_msg is set")
        # ...and a failure must explain itself via a status code or a message.
        if not (self.success or self.http_status is not None or self.error_msg):
            raise ContractViolation("failed log must have either http_status or error_msg")
        return self
