"""SQLAlchemy 2.0 ORM — projection of the canonical contract in
`govcrawler.adapters.contract`.

Schema layout follows §5 of the 2.0 design doc:

    crawl_site (domain-level, 9 Qingyuan sites today)
        └── site_department (dept_path-level; Qingyuan cluster increment)
                └── crawl_target (column_id-level)
                        └── article / crawl_log
    local_department (OA snapshot, independent table)
    attachment (child of article)
    master_column_registry (read-only view over adapter classify calls)

Design guarantees enforced at the DB layer (matches contract.py one-to-one):
- **`dept_binding` on site_department** is a string enum with CHECK constraint:
  `pending / mapped / city_level / cross_dept / external_ref`
- **`crawl_target.site_department_id`** is nullable — YAML-driven sites and
  external-reference sites leave it empty; adapter-driven sites must point
  to a site_department row
- **`article.native_post_id`** is nullable; UNIQUE index is conditional so
  rows without a native id don't collide. On PG the partial index is native;
  MySQL would need a sentinel — we stay PG-friendly here
- **No `content_simhash`** — §5.5 (old §5.4) explicitly excluded it; RAG owns
  content de-dup. Adapter contract rejects it too.
"""
from __future__ import annotations

from datetime import date, datetime, time
from typing import Any

from sqlalchemy import (
    BigInteger,
    Boolean,
    CHAR,
    CheckConstraint,
    Date,
    DateTime,
    ForeignKey,
    Index,
    Integer,
    JSON,
    String,
    Text,
    Time,
    UniqueConstraint,
    column,
    func,
    text,
)
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship

# Autoincrement-friendly BigInt PK type, reused for every surrogate-key
# model in this module:
# - PG/MySQL get BIGINT (full range)
# - SQLite gets INTEGER (aliases rowid → autoincrement "just works" in tests)
_PK_BIG = BigInteger().with_variant(Integer(), "sqlite")


class Base(DeclarativeBase):
    """Shared SQLAlchemy 2.0 declarative base for all models in this module."""

    pass


# ---------------------------------------------------------------------------
# §5.1 crawl_site — domain-level
# ---------------------------------------------------------------------------
class CrawlSite(Base):
    """§5.1 domain-level site registry (9 Qingyuan sites today).

    Config-source invariant (enforced by ``ck_crawl_site_adapter_xor_yaml``):
    exactly one of ``cms_adapter`` / ``yaml_path`` is non-NULL — a site is
    either adapter-driven or YAML-driven, never both.
    """

    __tablename__ = "crawl_site"

    id: Mapped[int] = mapped_column(_PK_BIG, primary_key=True, autoincrement=True)
    site_code: Mapped[str] = mapped_column(String(50), unique=True, nullable=False)
    site_name: Mapped[str | None] = mapped_column(String(200))
    base_url: Mapped[str | None] = mapped_column(String(500))
    site_role: Mapped[str | None] = mapped_column(String(50))  # qingyuan_local / county_local / province_ref / nation_ref
    cms_adapter: Mapped[str | None] = mapped_column(String(50))  # Qingyuan enhancement: gkmlpt / trs_wcm / ...
    adapter_params_json: Mapped[dict[str, Any] | None] = mapped_column(JSON)
    default_fetch_strategy: Mapped[str | None] = mapped_column(String(50))  # httpx / playwright
    strategy_override_reason: Mapped[str | None] = mapped_column(String(200))
    respect_robots: Mapped[bool] = mapped_column(Boolean, server_default=text("true"))
    yaml_path: Mapped[str | None] = mapped_column(String(500))
    # §5.1 [2.0 addendum] managed_by: 'yaml' (default, owned by sync_dir) or
    # 'ui' (owned by the admin console; sync_dir never touches such rows).
    # Lets ops mix both config sources: YAML rows still follow the sync iron
    # rule, while UI rows are created/updated/deleted directly in the DB.
    managed_by: Mapped[str] = mapped_column(String(10), server_default="yaml", nullable=False)
    enabled: Mapped[bool] = mapped_column(Boolean, server_default=text("true"))
    remark: Mapped[str | None] = mapped_column(String(500))
    # Daily crawl plan — see migration 0005. NULL means "no constraint".
    crawl_window_start: Mapped[time | None] = mapped_column(Time)
    crawl_window_end: Mapped[time | None] = mapped_column(Time)
    daily_max_requests: Mapped[int | None] = mapped_column(Integer)
    weekend_enabled: Mapped[bool] = mapped_column(Boolean, server_default=text("true"))
    backoff_min_sec: Mapped[int | None] = mapped_column(Integer)
    backoff_max_sec: Mapped[int | None] = mapped_column(Integer)
    # Site-level cron default — applies when no per-target override and no
    # yaml column / default_column schedule is set. Lets gkmlpt-adapter
    # sites (which have no yaml file) opt out of DEFAULT_SCHEDULE without
    # touching every target row. See migration 0006.
    schedule_cron: Mapped[str | None] = mapped_column(String(100))
    created_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())
    # NOTE(review): no onupdate=func.now() — updated_at keeps its insert-time
    # value unless application code sets it explicitly; confirm intended.
    updated_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())

    # Relationships
    site_departments: Mapped[list["SiteDepartment"]] = relationship(
        back_populates="site", cascade="all"
    )
    targets: Mapped[list["CrawlTarget"]] = relationship(
        back_populates="site", cascade="all"
    )

    __table_args__ = (
        # §5.1 constraint: exactly one of cms_adapter / yaml_path is non-NULL.
        # Baseline sites default to the yaml_path route.
        CheckConstraint(
            "(cms_adapter IS NULL) <> (yaml_path IS NULL)",
            name="ck_crawl_site_adapter_xor_yaml",
        ),
        CheckConstraint(
            "managed_by IN ('yaml','ui')",
            name="ck_crawl_site_managed_by",
        ),
    )


# ---------------------------------------------------------------------------
# §5.2 site_department — dept_path level [Qingyuan 专项增强]
# ---------------------------------------------------------------------------
# Allowed values for SiteDepartment.dept_binding. Must stay in sync with the
# DB-level CHECK constraint ck_site_department_binding.
DEPT_BINDINGS: tuple[str, ...] = (
    "pending",
    "mapped",
    "city_level",
    "cross_dept",
    "external_ref",
)


class SiteDepartment(Base):
    """§5.2 dept_path-level node under a crawl_site [Qingyuan enhancement].

    ``dept_binding`` is restricted to DEPT_BINDINGS (CHECK-enforced); when
    the binding is ``'mapped'`` the row must reference a local_department,
    and for every other binding ``local_dept_id`` must be NULL.
    """

    __tablename__ = "site_department"

    id: Mapped[int] = mapped_column(_PK_BIG, primary_key=True, autoincrement=True)
    site_id: Mapped[int] = mapped_column(
        BigInteger, ForeignKey("crawl_site.id", ondelete="CASCADE"), nullable=False
    )
    dept_path: Mapped[str] = mapped_column(String(100), nullable=False)
    local_dept_id: Mapped[int | None] = mapped_column(
        BigInteger, ForeignKey("local_department.dept_id", ondelete="SET NULL")
    )
    dept_binding: Mapped[str] = mapped_column(
        String(20), nullable=False, server_default="pending"
    )
    dept_display_name: Mapped[str | None] = mapped_column(String(200))
    detect_status: Mapped[str] = mapped_column(String(20), server_default="pending")  # pending / ok / failed
    last_probed_at: Mapped[datetime | None] = mapped_column(DateTime)
    last_classify_json: Mapped[dict[str, Any] | None] = mapped_column(JSON)
    enabled: Mapped[bool] = mapped_column(Boolean, server_default=text("true"))
    created_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())
    updated_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())

    site: Mapped[CrawlSite] = relationship(back_populates="site_departments")
    local_dept: Mapped["LocalDepartment | None"] = relationship()
    targets: Mapped[list["CrawlTarget"]] = relationship(back_populates="site_department")

    __table_args__ = (
        UniqueConstraint("site_id", "dept_path", name="uq_site_department_path"),
        # Single source of truth: derive the IN-list from DEPT_BINDINGS so the
        # Python enum and the DB CHECK can never drift apart. The rendered SQL
        # is byte-identical to the previous hard-coded literal:
        #   dept_binding IN ('pending','mapped','city_level','cross_dept','external_ref')
        CheckConstraint(
            "dept_binding IN ('" + "','".join(DEPT_BINDINGS) + "')",
            name="ck_site_department_binding",
        ),
        # 'mapped' ⇒ local_dept_id required; the other four bindings ⇒ NULL.
        # NOTE(review): local_dept_id uses ondelete='SET NULL', which this
        # CHECK would reject for 'mapped' rows — deleting a referenced
        # local_department will therefore fail at the DB layer. Confirm that
        # is the intended behavior.
        CheckConstraint(
            "(dept_binding = 'mapped' AND local_dept_id IS NOT NULL) OR "
            "(dept_binding <> 'mapped' AND local_dept_id IS NULL)",
            name="ck_site_department_binding_local_dept",
        ),
    )


# ---------------------------------------------------------------------------
# §5.3 crawl_target — column_id level
# ---------------------------------------------------------------------------
class CrawlTarget(Base):
    """§5.3 column_id-level crawl unit (one list column per row).

    ``site_department_id`` is nullable by design: YAML-driven and external
    reference sites leave it empty; adapter-driven sites must point to a
    site_department row.
    """

    __tablename__ = "crawl_target"

    id: Mapped[int] = mapped_column(_PK_BIG, primary_key=True, autoincrement=True)
    site_id: Mapped[int] = mapped_column(
        BigInteger, ForeignKey("crawl_site.id", ondelete="CASCADE"), nullable=False
    )
    # Nullable — see class docstring. CASCADE: deleting a dept node removes its targets.
    site_department_id: Mapped[int | None] = mapped_column(
        BigInteger, ForeignKey("site_department.id", ondelete="CASCADE")
    )
    target_code: Mapped[str] = mapped_column(String(100), unique=True, nullable=False)
    target_name: Mapped[str | None] = mapped_column(String(200))
    entry_url: Mapped[str | None] = mapped_column(String(1000))
    sample_article_url: Mapped[str | None] = mapped_column(String(1000))
    dept_id: Mapped[int | None] = mapped_column(
        BigInteger, ForeignKey("local_department.dept_id", ondelete="SET NULL")
    )
    parser_override_json: Mapped[dict[str, Any] | None] = mapped_column(JSON)
    channel_name: Mapped[str | None] = mapped_column(String(200))
    channel_path: Mapped[str | None] = mapped_column(String(1000))
    content_category: Mapped[str | None] = mapped_column(String(100))
    content_subcategory: Mapped[str | None] = mapped_column(String(100))
    # Per-target cron override; CrawlSite.schedule_cron is the site-level default.
    schedule_cron: Mapped[str | None] = mapped_column(String(100))
    expected_cadence_days: Mapped[int] = mapped_column(Integer, server_default="30")
    # Request pacing knobs — presumably seconds between requests plus random
    # jitter; NULL defers to defaults. TODO confirm against the pipeline.
    interval_sec: Mapped[int | None] = mapped_column(Integer)
    interval_jitter_sec: Mapped[int | None] = mapped_column(Integer)
    enabled: Mapped[bool] = mapped_column(Boolean, server_default=text("true"))
    # Opt-in page-level checkpoint. When True, crawl_target() records
    # progress in crawl_job.last_completed_page after every fully-completed
    # list page; a restart resumes from page+1 instead of re-walking the
    # whole list. Default False — only valuable for big WAF-sensitive
    # targets (gd_wjk__qbwj full backfill) where list re-walk costs minutes.
    track_checkpoint: Mapped[bool] = mapped_column(
        Boolean, server_default=text("false"),
    )
    last_crawled_at: Mapped[datetime | None] = mapped_column(DateTime)
    last_article_time: Mapped[datetime | None] = mapped_column(DateTime)
    created_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())
    updated_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())

    site: Mapped[CrawlSite] = relationship(back_populates="targets")
    site_department: Mapped[SiteDepartment | None] = relationship(back_populates="targets")
    local_dept: Mapped["LocalDepartment | None"] = relationship()


# ---------------------------------------------------------------------------
# §5.4 local_department — OA snapshot
# ---------------------------------------------------------------------------
class LocalDepartment(Base):
    """§5.4 snapshot of the OA department tree (independent table).

    The primary key ``dept_id`` is supplied by the OA system — note the
    absence of ``autoincrement`` — and nothing here cascades into the
    crawl hierarchy.
    """

    __tablename__ = "local_department"

    dept_id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    dept_name: Mapped[str | None] = mapped_column(String(250))
    parent_dept_id: Mapped[int | None] = mapped_column(BigInteger)  # tree link; intentionally not a DB-level FK
    dept_level: Mapped[int | None] = mapped_column(Integer)
    dept_code: Mapped[str | None] = mapped_column(String(250))
    short_name: Mapped[str | None] = mapped_column(String(50))
    full_name: Mapped[str | None] = mapped_column(String(500))
    state: Mapped[int | None] = mapped_column(Integer)  # NOTE(review): OA status code — semantics not visible from this file
    order_id: Mapped[int | None] = mapped_column(Integer)
    updated_at: Mapped[datetime | None] = mapped_column(DateTime)  # mirrors the OA timestamp; no server default
    region: Mapped[str | None] = mapped_column(String(50), index=True)


# ---------------------------------------------------------------------------
# §5.5 article
# ---------------------------------------------------------------------------
class Article(Base):
    """§5.5 crawled article.

    De-dup contract at the DB layer:
      1. ``url_hash`` is NOT NULL + UNIQUE on every row;
      2. ``(site_id, native_post_id)`` is unique only where native_post_id
         is non-NULL (PostgreSQL partial index);
      3. no ``content_simhash`` column — content-level de-dup belongs to RAG
         (see module docstring).
    """

    __tablename__ = "article"

    id: Mapped[int] = mapped_column(_PK_BIG, primary_key=True, autoincrement=True)
    # RESTRICT: a site with articles cannot be deleted out from under them.
    site_id: Mapped[int] = mapped_column(
        BigInteger, ForeignKey("crawl_site.id", ondelete="RESTRICT"), nullable=False
    )
    target_id: Mapped[int | None] = mapped_column(
        BigInteger, ForeignKey("crawl_target.id", ondelete="SET NULL")
    )
    dept_id: Mapped[int | None] = mapped_column(
        BigInteger, ForeignKey("local_department.dept_id", ondelete="SET NULL")
    )
    # Nullable: not every CMS exposes a native post id (see module docstring).
    native_post_id: Mapped[str | None] = mapped_column(String(64))
    url: Mapped[str] = mapped_column(Text, nullable=False)
    url_hash: Mapped[str] = mapped_column(CHAR(64), nullable=False)  # presumably sha256 hex of url — confirm
    title: Mapped[str | None] = mapped_column(Text)
    publish_time: Mapped[datetime | None] = mapped_column(DateTime)
    source_raw: Mapped[str | None] = mapped_column(String(500))
    channel_name: Mapped[str | None] = mapped_column(String(200))
    channel_path: Mapped[str | None] = mapped_column(String(1000))
    content_category: Mapped[str | None] = mapped_column(String(100))
    content_subcategory: Mapped[str | None] = mapped_column(String(100))
    index_no: Mapped[str | None] = mapped_column(String(200))
    doc_no: Mapped[str | None] = mapped_column(String(200))
    publisher: Mapped[str | None] = mapped_column(String(500))
    publish_date: Mapped[date | None] = mapped_column(Date)
    effective_date: Mapped[date | None] = mapped_column(Date)
    is_effective: Mapped[bool | None] = mapped_column(Boolean)
    expiry_date: Mapped[date | None] = mapped_column(Date)
    topic_words: Mapped[str | None] = mapped_column(String(500))
    open_category: Mapped[str | None] = mapped_column(String(200))
    metadata_json: Mapped[dict[str, Any] | None] = mapped_column(JSON)
    content_text: Mapped[str | None] = mapped_column(Text)
    raw_html_path: Mapped[str | None] = mapped_column(Text)
    text_path: Mapped[str | None] = mapped_column(Text)
    has_attachment: Mapped[bool] = mapped_column(Boolean, server_default=text("false"))
    status: Mapped[str] = mapped_column(String(20), server_default="raw")  # raw / ready / failed
    fetch_strategy: Mapped[str | None] = mapped_column(String(50))
    fetched_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())
    # RAG export bookkeeping.
    exported_to_rag_at: Mapped[datetime | None] = mapped_column(DateTime)
    rag_export_status: Mapped[str | None] = mapped_column(String(20))
    rag_export_started_at: Mapped[datetime | None] = mapped_column(DateTime)
    rag_export_finished_at: Mapped[datetime | None] = mapped_column(DateTime)
    rag_export_error: Mapped[str | None] = mapped_column(Text)
    rag_export_task_ids: Mapped[list[str] | None] = mapped_column(JSON)
    created_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())
    updated_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())

    attachments: Mapped[list["Attachment"]] = relationship(
        back_populates="article", cascade="all, delete-orphan"
    )
    site: Mapped[CrawlSite] = relationship(foreign_keys=[site_id])
    target: Mapped["CrawlTarget | None"] = relationship(foreign_keys=[target_id])

    __table_args__ = (
        # §5.5 — the three de-dup iron rules.
        # 1) url_hash is always NOT NULL + UNIQUE (DB-level backstop, applies
        #    to every row).
        UniqueConstraint("url_hash", name="uq_article_url_hash"),
        # 2) (site_id, native_post_id) is unique when native_post_id is
        #    non-NULL — expressed as a PG partial index.
        #    NOTE(review): postgresql_where is dialect-specific; on other
        #    backends (e.g. SQLite tests) this becomes a plain unique index —
        #    NULL-collision behavior then depends on the backend. Confirm ok.
        Index(
            "uq_article_site_native",
            "site_id",
            "native_post_id",
            unique=True,
            postgresql_where=column("native_post_id").isnot(None),
        ),
        Index("ix_article_site_pub", "site_id", "publish_time"),
        Index("ix_article_target_pub", "target_id", "publish_time"),
        Index("ix_article_exported", "exported_to_rag_at"),
        Index("ix_article_rag_export_status", "rag_export_status"),
        Index("ix_article_status", "status"),
        Index("ix_article_is_effective", "is_effective"),
        Index("ix_article_expiry_date", "expiry_date"),
        CheckConstraint(
            "status IN ('raw','ready','failed')",
            name="ck_article_status",
        ),
    )


# ---------------------------------------------------------------------------
# §5.6 attachment
# ---------------------------------------------------------------------------
class Attachment(Base):
    """§5.6 downloaded file belonging to an article.

    Rows are deleted with their parent article (CASCADE FK here plus
    ``delete-orphan`` on Article.attachments).
    """

    __tablename__ = "attachment"

    id: Mapped[int] = mapped_column(_PK_BIG, primary_key=True, autoincrement=True)
    article_id: Mapped[int] = mapped_column(
        BigInteger, ForeignKey("article.id", ondelete="CASCADE"), nullable=False
    )
    file_name: Mapped[str | None] = mapped_column(Text)
    file_ext: Mapped[str | None] = mapped_column(String(20))
    size_bytes: Mapped[int | None] = mapped_column(BigInteger)
    file_path: Mapped[str | None] = mapped_column(Text)
    # Presumably sha256 hex (64 chars, same width as article.url_hash) —
    # indexed for cross-article duplicate-file lookups. TODO confirm.
    file_hash: Mapped[str | None] = mapped_column(CHAR(64))
    downloaded_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())

    article: Mapped[Article] = relationship(back_populates="attachments")

    __table_args__ = (Index("ix_attachment_file_hash", "file_hash"),)


# ---------------------------------------------------------------------------
# §5.7 crawl_log
# ---------------------------------------------------------------------------
class CrawlLog(Base):
    """§5.7 per-fetch log: one row per fetch attempt, success or failure.

    Both FKs are SET NULL so log history outlives deleted sites/targets.
    """

    __tablename__ = "crawl_log"

    id: Mapped[int] = mapped_column(_PK_BIG, primary_key=True, autoincrement=True)
    site_id: Mapped[int | None] = mapped_column(
        BigInteger, ForeignKey("crawl_site.id", ondelete="SET NULL")
    )
    target_id: Mapped[int | None] = mapped_column(
        BigInteger, ForeignKey("crawl_target.id", ondelete="SET NULL")
    )
    article_url: Mapped[str | None] = mapped_column(Text)
    strategy: Mapped[str | None] = mapped_column(String(50))
    http_status: Mapped[int | None] = mapped_column(Integer)
    duration_ms: Mapped[int | None] = mapped_column(Integer)
    success: Mapped[bool] = mapped_column(Boolean, nullable=False)
    error_msg: Mapped[str | None] = mapped_column(Text)
    occurred_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())

    __table_args__ = (
        # Timeline queries scoped by site or by target.
        Index("ix_crawl_log_site_occurred", "site_id", "occurred_at"),
        Index("ix_crawl_log_target_occurred", "target_id", "occurred_at"),
    )


# ---------------------------------------------------------------------------
# §5.8 master_column_registry [Qingyuan 专项增强]
# ---------------------------------------------------------------------------
class MasterColumnRegistry(Base):
    """§5.8 registry of columns discovered by adapter classify calls
    [Qingyuan enhancement]; read-only from the crawler's perspective.

    Natural key is (adapter_id, site_id, column_id); ``subscribed_target_id``
    links a discovered column to the crawl_target that subscribes to it,
    NULL while unsubscribed.
    """

    __tablename__ = "master_column_registry"

    id: Mapped[int] = mapped_column(_PK_BIG, primary_key=True, autoincrement=True)
    adapter_id: Mapped[str] = mapped_column(String(50), nullable=False)
    site_id: Mapped[int] = mapped_column(
        BigInteger, ForeignKey("crawl_site.id", ondelete="CASCADE"), nullable=False
    )
    column_id: Mapped[str] = mapped_column(String(64), nullable=False)
    column_name: Mapped[str | None] = mapped_column(String(500))
    column_path: Mapped[str | None] = mapped_column(String(1000))
    admin_level: Mapped[str | None] = mapped_column(String(20))  # city / county / township
    topic_tags: Mapped[list[str] | None] = mapped_column(JSON)
    post_count: Mapped[int | None] = mapped_column(Integer)
    last_seen_at: Mapped[datetime | None] = mapped_column(DateTime)
    subscribed_target_id: Mapped[int | None] = mapped_column(
        BigInteger, ForeignKey("crawl_target.id", ondelete="SET NULL")
    )
    active: Mapped[bool] = mapped_column(Boolean, server_default=text("true"))
    created_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())
    updated_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())

    __table_args__ = (
        UniqueConstraint(
            "adapter_id", "site_id", "column_id", name="uq_mcr_adapter_site_col"
        ),
    )


class CrawlJob(Base):
    """Durable mirror of the in-process TaskQueue job.

    The runtime queue lives in `govcrawler.api.task_queue.TaskQueue` (asyncio
    Queues + worker tasks) and is fast for the hot path, but in-memory only.
    A container restart used to lose every queued/running job — bad for
    full-history backfills and long batch runs.

    This row mirrors the JobInfo state machine to the database so:
      • on api startup, orphaned status='running' rows can be marked
        'failed' (reason='restart') and any leftover 'queued' rows can
        be re-enqueued into the in-memory FIFO;
      • the admin UI can show full history beyond HISTORY_KEEP=200;
      • cancel / status query work even after a restart.
    """

    __tablename__ = "crawl_job"

    job_id: Mapped[str] = mapped_column(String(32), primary_key=True)
    # Host derived from site.base_url — used for per-host serialization.
    host: Mapped[str] = mapped_column(String(255), nullable=False, index=True)
    site_code: Mapped[str] = mapped_column(String(64), nullable=False, index=True)
    target_code: Mapped[str] = mapped_column(String(128), nullable=False, index=True)
    source: Mapped[str] = mapped_column(String(20), nullable=False)  # manual/schedule/retry
    # NOTE: client-side default (unlike the server_default pattern used
    # elsewhere in this module) — rows inserted outside the ORM get no
    # default status.
    status: Mapped[str] = mapped_column(
        String(20), nullable=False, default="queued", index=True,
    )
    force: Mapped[bool] = mapped_column(Boolean, server_default=text("false"))
    stop_requested: Mapped[bool] = mapped_column(
        Boolean, server_default=text("false"),
    )
    attempt_count: Mapped[int] = mapped_column(Integer, server_default=text("0"))
    # Last fully-processed list page (1-based). Only meaningful when the
    # owning target has track_checkpoint=True. 0 means "nothing completed
    # yet"; pipeline resumes from page (last_completed_page + 1) on
    # restart. Operator may PATCH this value via API to override.
    last_completed_page: Mapped[int] = mapped_column(
        Integer, server_default=text("0"),
    )

    enqueued_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now())
    started_at: Mapped[datetime | None] = mapped_column(DateTime)
    finished_at: Mapped[datetime | None] = mapped_column(DateTime)

    error_msg: Mapped[str | None] = mapped_column(Text)
    result_json: Mapped[dict[str, Any] | None] = mapped_column(JSON)

    __table_args__ = (
        # Common UI query: list jobs ordered by enqueue time. The composite
        # index on (status, enqueued_at) covers the "show queued jobs in
        # FIFO order" lookup at startup. (This Index was previously only
        # described in a comment — __table_args__ was an empty tuple and the
        # index never materialized.)
        Index("ix_crawl_job_status_enqueued", "status", "enqueued_at"),
    )


class AdminAuditLog(Base):
    """Append-only operation audit trail.

    Captured by a FastAPI middleware for every state-changing admin call:
        POST / PUT / DELETE on /admin/api/*
        POST on /api/articles/.../ack
    Read calls (GET) are NOT logged — would dominate volume + don't
    change state.

    Body bytes are NEVER stored verbatim — only sha256(body)[:16] —
    so passwords / tokens / large json never end up in the DB. The
    digest is enough to (a) prove "the operator submitted this exact
    payload" if asked, and (b) cluster identical bulk-deletes.

    Retention: 1 year (cleanup task TBD). Indexed on (created_at) for
    timeline queries and on (actor, created_at) for "what did user X do".
    NOTE(review): the code below actually declares two SINGLE-column
    indexes (created_at, actor) — there is no (actor, created_at)
    composite; reconcile docstring and DDL.
    """

    __tablename__ = "admin_audit_log"

    id: Mapped[int] = mapped_column(_PK_BIG, primary_key=True, autoincrement=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, server_default=func.now(), index=True,
    )
    actor: Mapped[str | None] = mapped_column(String(64), index=True)
    actor_ip: Mapped[str | None] = mapped_column(String(64))
    method: Mapped[str] = mapped_column(String(8), nullable=False)  # HTTP verb
    path: Mapped[str] = mapped_column(String(500), nullable=False)  # request path
    status_code: Mapped[int | None] = mapped_column(Integer)  # response status
    duration_ms: Mapped[int | None] = mapped_column(Integer)
    # 16-char prefix of sha256 of request body bytes. Empty string for
    # GET / DELETE without body. NEVER plaintext.
    payload_digest: Mapped[str | None] = mapped_column(String(32))
    # Free-text classifier filled by the middleware (e.g. "site.create",
    # "target.run", "article.bulk_delete") — derived from method + path.
    action: Mapped[str | None] = mapped_column(String(64), index=True)
    # Optional: parsed resource id for foreign-key-ish lookups
    # (target_code, site_code, article_id …) without full join.
    resource_type: Mapped[str | None] = mapped_column(String(32))
    resource_id: Mapped[str | None] = mapped_column(String(128))


# Public re-export surface: every ORM model plus the dept_binding enum values.
__all__ = [
    "Base",
    "CrawlSite",
    "SiteDepartment",
    "CrawlTarget",
    "LocalDepartment",
    "Article",
    "Attachment",
    "CrawlLog",
    "CrawlJob",
    "AdminAuditLog",
    "MasterColumnRegistry",
    "DEPT_BINDINGS",
]
