from __future__ import annotations
from dataclasses import dataclass, field
from urllib.parse import urljoin
from parsel import Selector

from govcrawler.parser.cleaner import html_to_text
from govcrawler.parser.extractor import gne_extract, trafilatura_extract

MIN_CONTENT_CHARS = 100  # primary-selector content_html shorter than this → trigger the GNE fallback
MIN_FALLBACK_CHARS = 50  # GNE/trafilatura content shorter than this → treated as a failed extraction


@dataclass
class DetailFields:
    """Structured fields extracted from one detail (article) page by ``parse_detail``."""

    # Stripped result of the ``title`` selector; filled from the fallback
    # extractor's "title" when the selector yields nothing and a fallback succeeds.
    title: str
    # Stripped result of the ``publish_time`` selector (or the fallback
    # extractor's "publish_time"); kept as a raw string, not parsed here.
    publish_time_raw: str
    # Stripped result of the optional ``source`` selector; "" when the selector
    # is not configured or matches nothing.
    source: str
    # First match of the ``content`` selector, or the "content" value returned
    # by the fallback engine that replaced it.
    content_html: str
    # ``html_to_text(content_html)``.
    content_text: str
    # Attachment links found under the ``content`` selector match, resolved
    # against the page's base URL and de-duplicated.
    attachment_urls: list[str]
    # True when GNE or trafilatura output replaced the selector-based content.
    used_fallback: bool
    fallback_engine: str | None  # "gne" | "trafilatura" | None


# Default attachment selector: links whose href ends in a common document/archive extension.
_DEFAULT_ATTACHMENT_CSS = (
    "a[href$='.pdf'], a[href$='.doc'], a[href$='.docx'], "
    "a[href$='.xls'], a[href$='.xlsx'], a[href$='.zip']"
)


def _collect_attachment_urls(content_root, attach_css: str, base_url: str) -> list[str]:
    """Return absolute, de-duplicated attachment URLs found under *content_root*.

    parsel cannot append ``::attr(href)`` to a comma-separated selector group —
    the pseudo-element only applies to the last clause — so the group is split
    on commas and each sub-selector is queried on its own. URLs are returned in
    sub-selector order, first occurrence wins.

    NOTE: the split is a plain ``str.split(",")``; a selector that contains a
    comma inside ``:is(...)``/``:not(...)`` or a quoted attribute value is not
    supported.
    """
    urls: list[str] = []
    if not content_root:
        return urls

    seen: set[str] = set()
    for sub in attach_css.split(","):
        sub = sub.strip()
        if not sub:
            continue
        for href in content_root.css(f"{sub}::attr(href)").getall():
            if not href:
                continue
            abs_u = urljoin(base_url, href)
            # Identical raw hrefs resolve to identical absolute URLs, so a
            # single de-duplication on the absolute form is sufficient.
            if abs_u not in seen:
                seen.add(abs_u)
                urls.append(abs_u)
    return urls


def _run_fallback_extractors(html: str, base_url: str) -> tuple[str | None, dict]:
    """Try GNE first, then trafilatura; return ``(engine_name, result)``.

    An engine counts as successful when its "content" value has at least
    ``MIN_FALLBACK_CHARS`` characters. Engines are tried lazily, so trafilatura
    only runs when GNE fails. Returns ``(None, {})`` when both fail.
    """
    # Built at call time so the extractor names are looked up on every call.
    engines = (("gne", gne_extract), ("trafilatura", trafilatura_extract))
    for engine, extract in engines:
        # Treat a None result like an empty one instead of crashing on .get().
        result = extract(html, base_url) or {}
        content = result.get("content", "") or ""
        if len(content) >= MIN_FALLBACK_CHARS:
            return engine, result
    return None, {}


def parse_detail(html: str, base_url: str, selectors: dict) -> DetailFields:
    """Parse a detail article page into structured fields.

    selectors keys (all CSS):
      title           : ::text
      publish_time    : ::text
      source          : ::text (optional)
      content         : root element of article body
      attachment_css  : CSS for <a href> elements inside content (optional;
                        defaults to links ending in .pdf/.doc/.docx/.xls/.xlsx/.zip)

    Args:
        html: Raw page HTML. ``None``/empty is treated as an empty document.
        base_url: URL used to resolve relative attachment links; also passed
            to the fallback extractors.
        selectors: Mapping with the keys listed above.

    Returns:
        A ``DetailFields``. When the selector-based content is shorter than
        ``MIN_CONTENT_CHARS`` or the title is empty, GNE and then trafilatura
        are tried; the first one returning at least ``MIN_FALLBACK_CHARS`` of
        content replaces ``content_html`` and fills a missing title / publish
        time. Attachments are always taken from the selector-based content
        root, never from fallback output.

    Raises:
        KeyError: if ``selectors`` lacks ``title``, ``publish_time`` or
            ``content``.
    """
    # Normalise once so the selector and the fallback extractors all receive
    # the same string (previously the extractors could be handed None).
    html = html or ""
    sel = Selector(text=html)

    title = (sel.css(selectors["title"]).get() or "").strip()
    pub = (sel.css(selectors["publish_time"]).get() or "").strip()

    source_sel = selectors.get("source") or ""
    source = (
        (sel.css(source_sel).get() if source_sel.strip() else "") or ""
    ).strip()

    content_root = sel.css(selectors["content"])
    content_html = content_root.get() or ""

    # Attachment extraction: look for links inside the matched content root.
    attach_css = selectors.get("attachment_css") or _DEFAULT_ATTACHMENT_CSS
    attachment_urls = _collect_attachment_urls(content_root, attach_css, base_url)

    used_fallback = False
    fallback_engine: str | None = None

    # Decision point 2 (RESEARCH §System Architecture Diagram):
    # primary-selector content too short, or title empty → GNE, and if GNE
    # also fails → trafilatura (second-level fallback per user locked decision).
    if len(content_html) < MIN_CONTENT_CHARS or not title:
        fallback_engine, extracted = _run_fallback_extractors(html, base_url)
        if fallback_engine is not None:
            used_fallback = True
            content_html = extracted.get("content", "") or ""
            # Only fill fields the primary selectors failed to provide.
            if not title:
                title = extracted.get("title", "") or title
            if not pub:
                pub = extracted.get("publish_time", "") or pub

    content_text = html_to_text(content_html)

    return DetailFields(
        title=title,
        publish_time_raw=pub,
        source=source,
        content_html=content_html,
        content_text=content_text,
        attachment_urls=attachment_urls,
        used_fallback=used_fallback,
        fallback_engine=fallback_engine,
    )
