"""Phase-A schema rebuild — v2 of §5 design doc.

Revision ID: 0002_schema_v2_rebuild
Revises: 0001_initial
Create Date: 2026-04-23

This is a **clean-slate rebuild** (per §11 of the 2.0 design doc): the 1.0
schema (sites/columns/articles/attachments/crawl_logs) has no production data
to preserve, so we drop it wholesale and create the 2.0 schema in its place.

Upgrade:
  1. Drop all 1.0 tables in reverse dependency order.
  2. Create 8 v2 tables that mirror `govcrawler.models` one-to-one.

The partial unique index on `article(site_id, native_post_id)` is expressed
via `postgresql_where`: PG gets a real partial index, while SQLite/MySQL get
a regular composite unique index. The fallback is safe because rows without a
`native_post_id` could only collide if two of them shared both `site_id` and
a NULL `native_post_id`, and PG, SQLite and MySQL (InnoDB) all treat NULLs as
distinct in unique indexes, so such rows never conflict.
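
For reference, a minimal standalone sketch (not part of this migration, and
using throwaway names) showing how the same Index declaration compiles on the
two dialects we care about:

    import sqlalchemy as sa
    from sqlalchemy.dialects import postgresql, sqlite
    from sqlalchemy.schema import CreateIndex

    md = sa.MetaData()
    t = sa.Table(
        "article", md,
        sa.Column("site_id", sa.BigInteger),
        sa.Column("native_post_id", sa.String(64)),
    )
    idx = sa.Index(
        "uq_article_site_native", t.c.site_id, t.c.native_post_id,
        unique=True,
        postgresql_where=sa.text("native_post_id IS NOT NULL"),
    )
    # PG emits: CREATE UNIQUE INDEX ... WHERE native_post_id IS NOT NULL
    print(CreateIndex(idx).compile(dialect=postgresql.dialect()))
    # SQLite ignores the postgresql_* kwarg and emits a plain unique index.
    print(CreateIndex(idx).compile(dialect=sqlite.dialect()))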

Downgrade reverses the process by dropping v2 tables and recreating the
1.0 schema via the same DDL as 0001_initial.
"""
from alembic import op
import sqlalchemy as sa

revision = "0002_schema_v2_rebuild"
down_revision = "0001_initial"
branch_labels = None
depends_on = None

# Autoincrement-friendly BigInt PK — mirror of govcrawler.models._PK_BIG.
# On PG/MySQL we get BIGINT; on SQLite we get INTEGER, which aliases the rowid
# and auto-increments without needing AUTOINCREMENT.
_PK_BIG = sa.BigInteger().with_variant(sa.Integer(), "sqlite")


# ---------------------------------------------------------------------------
# upgrade
# ---------------------------------------------------------------------------
def upgrade() -> None:
    # --- 1. Drop 1.0 tables in reverse dependency order --------------------
    op.drop_table("crawl_logs")
    op.drop_index("ix_attachments_file_hash", table_name="attachments")
    op.drop_table("attachments")
    op.drop_index("ix_articles_exported", table_name="articles")
    op.drop_index("ix_articles_site_col_pub", table_name="articles")
    op.drop_index("ix_articles_url_hash", table_name="articles")
    op.drop_table("articles")
    op.drop_index("ix_columns_site_col", table_name="columns")
    op.drop_table("columns")
    op.drop_index("ix_sites_site_id", table_name="sites")
    op.drop_table("sites")

    # --- 2. Create 2.0 tables ---------------------------------------------
    # §5.1 crawl_site
    op.create_table(
        "crawl_site",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column("site_code", sa.String(50), nullable=False, unique=True),
        sa.Column("site_name", sa.String(200)),
        sa.Column("base_url", sa.String(500)),
        sa.Column("site_role", sa.String(50)),
        sa.Column("cms_adapter", sa.String(50)),
        sa.Column("adapter_params_json", sa.JSON),
        sa.Column("default_fetch_strategy", sa.String(50)),
        sa.Column("strategy_override_reason", sa.String(200)),
        sa.Column("respect_robots", sa.Boolean, server_default=sa.text("true")),
        sa.Column("yaml_path", sa.String(500)),
        sa.Column("enabled", sa.Boolean, server_default=sa.text("true")),
        sa.Column("remark", sa.String(500)),
        sa.Column("created_at", sa.DateTime, server_default=sa.func.now()),
        sa.Column("updated_at", sa.DateTime, server_default=sa.func.now()),
        sa.CheckConstraint(
            "(cms_adapter IS NULL) <> (yaml_path IS NULL)",
            name="ck_crawl_site_adapter_xor_yaml",
        ),
    )

    # §5.4 local_department (referenced by site_department + crawl_target + article)
    op.create_table(
        "local_department",
        sa.Column("dept_id", sa.BigInteger, primary_key=True),
        sa.Column("dept_name", sa.String(250)),
        sa.Column("parent_dept_id", sa.BigInteger),
        sa.Column("dept_level", sa.Integer),
        sa.Column("dept_code", sa.String(250)),
        sa.Column("short_name", sa.String(50)),
        sa.Column("full_name", sa.String(500)),
        sa.Column("state", sa.Integer),
        sa.Column("order_id", sa.Integer),
        sa.Column("updated_at", sa.DateTime),
    )

    # §5.2 site_department [Qingyuan-specific]
    op.create_table(
        "site_department",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column(
            "site_id",
            sa.BigInteger,
            sa.ForeignKey("crawl_site.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("dept_path", sa.String(100), nullable=False),
        sa.Column(
            "local_dept_id",
            sa.BigInteger,
            sa.ForeignKey("local_department.dept_id", ondelete="SET NULL"),
        ),
        sa.Column(
            "dept_binding",
            sa.String(20),
            nullable=False,
            server_default="pending",
        ),
        sa.Column("dept_display_name", sa.String(200)),
        sa.Column("detect_status", sa.String(20), server_default="pending"),
        sa.Column("last_probed_at", sa.DateTime),
        sa.Column("last_classify_json", sa.JSON),
        sa.Column("enabled", sa.Boolean, server_default=sa.text("true")),
        sa.Column("created_at", sa.DateTime, server_default=sa.func.now()),
        sa.Column("updated_at", sa.DateTime, server_default=sa.func.now()),
        sa.UniqueConstraint("site_id", "dept_path", name="uq_site_department_path"),
        sa.CheckConstraint(
            "dept_binding IN ('pending','mapped','city_level','cross_dept','external_ref')",
            name="ck_site_department_binding",
        ),
        sa.CheckConstraint(
            "(dept_binding = 'mapped' AND local_dept_id IS NOT NULL) OR "
            "(dept_binding <> 'mapped' AND local_dept_id IS NULL)",
            name="ck_site_department_binding_local_dept",
        ),
    )

    # §5.3 crawl_target
    op.create_table(
        "crawl_target",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column(
            "site_id",
            sa.BigInteger,
            sa.ForeignKey("crawl_site.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column(
            "site_department_id",
            sa.BigInteger,
            sa.ForeignKey("site_department.id", ondelete="CASCADE"),
        ),
        sa.Column("target_code", sa.String(100), nullable=False, unique=True),
        sa.Column("target_name", sa.String(200)),
        sa.Column("entry_url", sa.String(1000)),
        sa.Column("sample_article_url", sa.String(1000)),
        sa.Column(
            "dept_id",
            sa.BigInteger,
            sa.ForeignKey("local_department.dept_id", ondelete="SET NULL"),
        ),
        sa.Column("parser_override_json", sa.JSON),
        sa.Column("channel_name", sa.String(200)),
        sa.Column("channel_path", sa.String(1000)),
        sa.Column("content_category", sa.String(100)),
        sa.Column("content_subcategory", sa.String(100)),
        sa.Column("schedule_cron", sa.String(100)),
        sa.Column("expected_cadence_days", sa.Integer, server_default="30"),
        sa.Column("interval_sec", sa.Integer),
        sa.Column("enabled", sa.Boolean, server_default=sa.text("true")),
        sa.Column("last_crawled_at", sa.DateTime),
        sa.Column("last_article_time", sa.DateTime),
        sa.Column("created_at", sa.DateTime, server_default=sa.func.now()),
        sa.Column("updated_at", sa.DateTime, server_default=sa.func.now()),
    )

    # §5.5 article
    op.create_table(
        "article",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column(
            "site_id",
            sa.BigInteger,
            sa.ForeignKey("crawl_site.id", ondelete="RESTRICT"),
            nullable=False,
        ),
        sa.Column(
            "target_id",
            sa.BigInteger,
            sa.ForeignKey("crawl_target.id", ondelete="SET NULL"),
        ),
        sa.Column(
            "dept_id",
            sa.BigInteger,
            sa.ForeignKey("local_department.dept_id", ondelete="SET NULL"),
        ),
        sa.Column("native_post_id", sa.String(64)),
        sa.Column("url", sa.Text, nullable=False),
        sa.Column("url_hash", sa.CHAR(64), nullable=False),
        sa.Column("title", sa.Text),
        sa.Column("publish_time", sa.DateTime),
        sa.Column("source_raw", sa.String(500)),
        sa.Column("channel_name", sa.String(200)),
        sa.Column("channel_path", sa.String(1000)),
        sa.Column("content_category", sa.String(100)),
        sa.Column("content_subcategory", sa.String(100)),
        sa.Column("index_no", sa.String(200)),
        sa.Column("doc_no", sa.String(200)),
        sa.Column("publisher", sa.String(500)),
        sa.Column("publish_date", sa.Date),
        sa.Column("effective_date", sa.Date),
        sa.Column("topic_words", sa.String(500)),
        sa.Column("open_category", sa.String(200)),
        sa.Column("metadata_json", sa.JSON),
        sa.Column("content_text", sa.Text),
        sa.Column("raw_html_path", sa.Text),
        sa.Column("text_path", sa.Text),
        sa.Column("has_attachment", sa.Boolean, server_default=sa.text("false")),
        sa.Column("status", sa.String(20), server_default="raw"),
        sa.Column("fetch_strategy", sa.String(50)),
        sa.Column("fetched_at", sa.DateTime, server_default=sa.func.now()),
        sa.Column("exported_to_rag_at", sa.DateTime),
        sa.Column("created_at", sa.DateTime, server_default=sa.func.now()),
        sa.Column("updated_at", sa.DateTime, server_default=sa.func.now()),
        sa.UniqueConstraint("url_hash", name="uq_article_url_hash"),
        sa.CheckConstraint(
            "status IN ('raw','ready','failed')", name="ck_article_status"
        ),
    )
    # Partial unique index — PG honors the WHERE clause; other dialects fall
    # back to a plain composite unique index. NULLs are distinct in unique
    # indexes on PG/SQLite, so the invariant still holds without the filter.
    op.create_index(
        "uq_article_site_native",
        "article",
        ["site_id", "native_post_id"],
        unique=True,
        postgresql_where=sa.text("native_post_id IS NOT NULL"),
    )
    op.create_index(
        "ix_article_site_pub", "article", ["site_id", "publish_time"]
    )
    op.create_index(
        "ix_article_target_pub", "article", ["target_id", "publish_time"]
    )
    op.create_index("ix_article_exported", "article", ["exported_to_rag_at"])
    op.create_index("ix_article_status", "article", ["status"])

    # §5.6 attachment
    op.create_table(
        "attachment",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column(
            "article_id",
            sa.BigInteger,
            sa.ForeignKey("article.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("file_name", sa.Text),
        sa.Column("file_ext", sa.String(20)),
        sa.Column("size_bytes", sa.BigInteger),
        sa.Column("file_path", sa.Text),
        sa.Column("file_hash", sa.CHAR(64)),
        sa.Column("downloaded_at", sa.DateTime, server_default=sa.func.now()),
    )
    op.create_index("ix_attachment_file_hash", "attachment", ["file_hash"])

    # §5.7 crawl_log
    op.create_table(
        "crawl_log",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column(
            "site_id",
            sa.BigInteger,
            sa.ForeignKey("crawl_site.id", ondelete="SET NULL"),
        ),
        sa.Column(
            "target_id",
            sa.BigInteger,
            sa.ForeignKey("crawl_target.id", ondelete="SET NULL"),
        ),
        sa.Column("article_url", sa.Text),
        sa.Column("strategy", sa.String(50)),
        sa.Column("http_status", sa.Integer),
        sa.Column("duration_ms", sa.Integer),
        sa.Column("success", sa.Boolean, nullable=False),
        sa.Column("error_msg", sa.Text),
        sa.Column("occurred_at", sa.DateTime, server_default=sa.func.now()),
    )
    op.create_index(
        "ix_crawl_log_site_occurred", "crawl_log", ["site_id", "occurred_at"]
    )
    op.create_index(
        "ix_crawl_log_target_occurred", "crawl_log", ["target_id", "occurred_at"]
    )

    # §5.8 master_column_registry [Qingyuan-specific]
    op.create_table(
        "master_column_registry",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column("adapter_id", sa.String(50), nullable=False),
        sa.Column(
            "site_id",
            sa.BigInteger,
            sa.ForeignKey("crawl_site.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("column_id", sa.String(64), nullable=False),
        sa.Column("column_name", sa.String(500)),
        sa.Column("column_path", sa.String(1000)),
        sa.Column("admin_level", sa.String(20)),
        sa.Column("topic_tags", sa.JSON),
        sa.Column("post_count", sa.Integer),
        sa.Column("last_seen_at", sa.DateTime),
        sa.Column(
            "subscribed_target_id",
            sa.BigInteger,
            sa.ForeignKey("crawl_target.id", ondelete="SET NULL"),
        ),
        sa.Column("active", sa.Boolean, server_default=sa.text("true")),
        sa.Column("created_at", sa.DateTime, server_default=sa.func.now()),
        sa.Column("updated_at", sa.DateTime, server_default=sa.func.now()),
        sa.UniqueConstraint(
            "adapter_id", "site_id", "column_id", name="uq_mcr_adapter_site_col"
        ),
    )


# ---------------------------------------------------------------------------
# downgrade — drop v2, recreate 1.0 (mirror of 0001_initial.upgrade)
# ---------------------------------------------------------------------------
def downgrade() -> None:
    # Drop v2 (reverse dependency order)
    op.drop_table("master_column_registry")
    op.drop_index("ix_crawl_log_target_occurred", table_name="crawl_log")
    op.drop_index("ix_crawl_log_site_occurred", table_name="crawl_log")
    op.drop_table("crawl_log")
    op.drop_index("ix_attachment_file_hash", table_name="attachment")
    op.drop_table("attachment")
    op.drop_index("ix_article_status", table_name="article")
    op.drop_index("ix_article_exported", table_name="article")
    op.drop_index("ix_article_target_pub", table_name="article")
    op.drop_index("ix_article_site_pub", table_name="article")
    op.drop_index("uq_article_site_native", table_name="article")
    op.drop_table("article")
    op.drop_table("crawl_target")
    op.drop_table("site_department")
    op.drop_table("local_department")
    op.drop_table("crawl_site")

    # Recreate 1.0 (DDL copied from 0001_initial.upgrade for a faithful reversal)
    op.create_table(
        "sites",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column("site_id", sa.String(64), nullable=False),
        sa.Column("name", sa.Text),
        sa.Column("base_url", sa.Text),
        sa.Column("config_json", sa.JSON),
        sa.Column("enabled", sa.Boolean, server_default=sa.text("true")),
        sa.Column("created_at", sa.DateTime, server_default=sa.func.now()),
    )
    op.create_index("ix_sites_site_id", "sites", ["site_id"], unique=True)

    op.create_table(
        "columns",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column("site_id", sa.String(64), nullable=False),
        sa.Column("column_id", sa.String(64), nullable=False),
        sa.Column("name", sa.Text),
        sa.Column("category", sa.String(64)),
        sa.Column("list_url", sa.Text),
        sa.Column("config_json", sa.JSON),
        sa.Column("last_crawled_at", sa.DateTime),
        sa.Column("last_article_time", sa.DateTime),
        sa.Column("enabled", sa.Boolean, server_default=sa.text("true")),
    )
    op.create_index(
        "ix_columns_site_col", "columns", ["site_id", "column_id"], unique=True
    )

    op.create_table(
        "articles",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column("site_id", sa.String(64), nullable=False),
        sa.Column("column_id", sa.String(64), nullable=False),
        sa.Column("category", sa.String(64)),
        sa.Column("url", sa.Text, nullable=False),
        sa.Column("url_hash", sa.CHAR(64), nullable=False),
        sa.Column("content_simhash", sa.CHAR(16)),
        sa.Column("title", sa.Text),
        sa.Column("publish_time", sa.DateTime),
        sa.Column("source", sa.String(128)),
        sa.Column("content_text", sa.Text),
        sa.Column("raw_html_path", sa.Text),
        sa.Column("text_path", sa.Text),
        sa.Column("has_attachment", sa.Boolean, server_default=sa.text("false")),
        sa.Column("status", sa.String(16), server_default="raw"),
        sa.Column("fetch_strategy", sa.String(32)),
        sa.Column("fetched_at", sa.DateTime, server_default=sa.func.now()),
        sa.Column("exported_to_rag_at", sa.DateTime),
    )
    op.create_index("ix_articles_url_hash", "articles", ["url_hash"], unique=True)
    op.create_index(
        "ix_articles_site_col_pub",
        "articles",
        ["site_id", "column_id", "publish_time"],
    )
    op.create_index("ix_articles_exported", "articles", ["exported_to_rag_at"])

    op.create_table(
        "attachments",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column(
            "article_id",
            sa.BigInteger,
            sa.ForeignKey("articles.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("file_name", sa.Text),
        sa.Column("file_ext", sa.String(16)),
        sa.Column("size_bytes", sa.BigInteger),
        sa.Column("file_path", sa.Text),
        sa.Column("file_hash", sa.CHAR(64)),
        sa.Column("downloaded_at", sa.DateTime, server_default=sa.func.now()),
    )
    op.create_index("ix_attachments_file_hash", "attachments", ["file_hash"])

    op.create_table(
        "crawl_logs",
        sa.Column("id", _PK_BIG, primary_key=True, autoincrement=True),
        sa.Column("site_id", sa.String(64)),
        sa.Column("column_id", sa.String(64)),
        sa.Column("article_url", sa.Text),
        sa.Column("strategy", sa.String(32)),
        sa.Column("http_status", sa.Integer),
        sa.Column("duration_ms", sa.Integer),
        sa.Column("success", sa.Boolean),
        sa.Column("error_msg", sa.Text),
        sa.Column("occurred_at", sa.DateTime, server_default=sa.func.now()),
    )
