"""Article / Attachment / CrawlLog persistence helpers — v2 schema aligned.

This module is the thin layer between the crawl pipeline and the SQLAlchemy
tables. Adapters produce Pydantic `CrawlItem` (see `govcrawler.adapters.contract`);
this module projects those into rows.

Key changes from v1:
  * `Article.site_id` is now an **int FK** to `crawl_site.id` (was str).
    Callers pass the site_code (str) and we resolve via the repo.
  * `CrawlLog.column_id` is gone — replaced by `target_id` FK to `crawl_target.id`.
  * `Article.column_id` / `.category` / `.source` → replaced by `target_id`,
    `channel_name`, `content_category`, `source_raw`.
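
Typical write path (a sketch; `session` is an open SQLAlchemy Session, `item`
a `CrawlItem` produced by an adapter, and `attachment_records` an assumed
list of attachment dicts):

    row = insert_article_from_contract(session, item)   # resolves site FK by code
    insert_attachments(session, row.id, attachment_records)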
"""
from __future__ import annotations

from typing import Sequence

from sqlalchemy import select
from sqlalchemy.orm import Session

from govcrawler.adapters.contract import CrawlItem
from govcrawler.models import Article, Attachment, CrawlLog
from govcrawler.repositories import sites


# ---------------------------------------------------------------------------
# read side
# ---------------------------------------------------------------------------
def get_article_by_url_hash(session: Session, url_hash: str) -> Article | None:
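    """Look up an article by its `url_hash` (URL-level dedup key)."""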
    return session.scalar(select(Article).where(Article.url_hash == url_hash))


def get_article_by_native_id(
    session: Session, *, site_pk: int, native_post_id: str
) -> Article | None:
    """Look up by (site_id, native_post_id) — the primary CMS-side dedup key."""
    return session.scalar(
        select(Article).where(
            Article.site_id == site_pk,
            Article.native_post_id == native_post_id,
        )
    )
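
# Typical dedup-then-insert flow (a sketch; assumes `item.native_post_id`
# is populated and `pk` is the resolved crawl_site.id):
#
#     existing = get_article_by_native_id(
#         session, site_pk=pk, native_post_id=item.native_post_id
#     )
#     if existing is None:
#         insert_article_from_contract(session, item, site_pk=pk)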


# ---------------------------------------------------------------------------
# article insert — contract-driven
# ---------------------------------------------------------------------------
# fields that map 1:1 from CrawlItem → Article column
_CONTRACT_FIELDS: tuple[str, ...] = (
    "native_post_id",
    "url",
    "url_hash",
    "title",
    "publish_time",
    "source_raw",
    "publisher",
    "content_text",
    "raw_html_path",
    "text_path",
    "channel_name",
    "channel_path",
    "content_category",
    "content_subcategory",
    "index_no",
    "doc_no",
    "publish_date",
    "effective_date",
    "topic_words",
    "open_category",
    "metadata_json",
    "has_attachment",
)


def insert_article_from_contract(
    session: Session, item: CrawlItem, *, site_pk: int | None = None
) -> Article:
    """Project a Pydantic `CrawlItem` into an `Article` row and insert it.

    `site_pk` is the int FK target (crawl_site.id). If omitted we resolve it
    by looking up `item.site_id` (which is the site_code) in crawl_site.
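
    Example (a sketch; `site_row` stands for an assumed pre-fetched
    crawl_site row):

        insert_article_from_contract(session, item)                       # resolve FK by site_code
        insert_article_from_contract(session, item, site_pk=site_row.id)  # FK already resolved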
    """
    if site_pk is None:
        site_row = sites.get_by_code(session, item.site_id)
        if site_row is None:
            raise ValueError(
                f"crawl_site not found for site_code={item.site_id!r}; "
                "run yaml-sync or create the row first"
            )
        site_pk = site_row.id

    kwargs: dict = {name: getattr(item, name) for name in _CONTRACT_FIELDS}
    kwargs["site_id"] = site_pk
    kwargs["target_id"] = item.target_id
    kwargs["dept_id"] = item.dept_id
    kwargs["status"] = item.status.value
    kwargs["fetch_strategy"] = item.fetch_strategy.value if item.fetch_strategy else None

    row = Article(**kwargs)
    session.add(row)
    session.flush()
    return row


def insert_article(session: Session, **kwargs) -> Article:
    """Low-level insert — passes kwargs through to `Article(...)`.

    Prefer `insert_article_from_contract` in the pipeline; this stays for
    tests and edge callers that have already built a dict.
    """
    row = Article(**kwargs)
    session.add(row)
    session.flush()
    return row


# ---------------------------------------------------------------------------
# attachments
# ---------------------------------------------------------------------------
def insert_attachments(
    session: Session, article_id: int, records: Sequence[dict]
) -> list[Attachment]:
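    """Bulk-insert attachment rows for one article, flushing so PKs are set.

    Each record is a dict of `Attachment` column kwargs. A sketch (the field
    names here are illustrative; check the `Attachment` model for the real
    columns):

        insert_attachments(session, article.id, [
            {"file_name": "notice.pdf", "file_url": "https://example.gov/notice.pdf"},
        ])
    """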
    out: list[Attachment] = []
    for r in records:
        att = Attachment(article_id=article_id, **r)
        session.add(att)
        out.append(att)
    session.flush()
    return out


# ---------------------------------------------------------------------------
# crawl_log — now keyed by target_id, not (site_id, column_id)
# ---------------------------------------------------------------------------
def insert_crawl_log(
    session: Session,
    *,
    site_pk: int | None,
    target_id: int | None,
    article_url: str | None,
    strategy: str,
    http_status: int | None,
    duration_ms: int,
    success: bool,
    error_msg: str | None = None,
) -> CrawlLog:
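    """Append one `crawl_log` row; under the v2 schema it is keyed by
    `target_id` rather than (site_id, column_id).

    Both FKs are optional at this layer so failures that occur before a
    target (or site) has been resolved can still be recorded.

    Example (a sketch; the values are illustrative):

        insert_crawl_log(
            session,
            site_pk=site_row.id, target_id=target.id,
            article_url=url, strategy="httpx",
            http_status=503, duration_ms=1200,
            success=False, error_msg="upstream 503",
        )
    """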
    row = CrawlLog(
        site_id=site_pk,
        target_id=target_id,
        article_url=article_url,
        strategy=strategy,
        http_status=http_status,
        duration_ms=duration_ms,
        success=success,
        error_msg=error_msg,
    )
    session.add(row)
    session.flush()
    return row
