from __future__ import annotations
from dataclasses import dataclass, field
from datetime import date
import re
from urllib.parse import urljoin
from parsel import Selector

from govcrawler.parser.cleaner import html_to_text
from govcrawler.parser.extractor import gne_extract, trafilatura_extract


def _robust_css(sel: Selector, css: str):
    """CSS query with two sister-features against parsel's defaults:

    1. Priority fallback chain (NOT union). yaml authors write
       `div.zw, div.TRS_Editor, div.content` meaning "use div.zw if
       present, else div.TRS_Editor, else div.content". parsel's
       sel.css(comma-list) instead returns ALL matches in DOM order,
       which on TRS-CMS pages picks the OUTER wrapper (div.content)
       before the INNER body (div.zw) — pulling in toolbars + sidebars.
       We split on commas and return the first sub-selector that hits.

    2. // xpath fallback for the libxml2 axis bug. On gov.cn
       /zhengce/content/* the body sits inside double-nested tables and
       descendant-or-self::div[has-class('foo')] from <html> finds
       nothing, while //div[has-class('foo')] does. Each sub-selector
       gets the // retry before we move on to the next.

    Returns a SelectorList. Empty list if nothing matched either pass.
    """
    # Local import: keeps the module-level import surface unchanged.
    from parsel import SelectorList

    # BUG FIX: the previous code returned sel.css("") here, but cssselect
    # rejects the empty selector with SelectorSyntaxError instead of
    # producing an empty SelectorList — build a genuinely empty one.
    if not css or not css.strip():
        return SelectorList()

    try:
        from cssselect import HTMLTranslator
        translator = HTMLTranslator()
    except Exception:
        # cssselect unavailable/broken → pass 2 is simply skipped.
        translator = None

    last_empty = None  # remember an empty SelectorList to return on miss

    for sub in css.split(","):
        sub = sub.strip()
        if not sub:
            continue
        # Pass 1: vanilla parsel CSS for this sub-selector alone.
        out = sel.css(sub)
        if out:
            return out
        last_empty = out
        # Pass 2: // xpath fallback for the libxml2 axis bug.
        if translator is not None:
            try:
                xp = translator.css_to_xpath(sub, prefix="//")
            except Exception:
                continue  # sub-selector not translatable → next candidate
            try:
                found = sel.xpath(xp)
            except Exception:
                found = None
            if found:
                return found
    return last_empty if last_empty is not None else SelectorList()

MIN_CONTENT_CHARS = 100  # content_html from the primary selector shorter than this → trigger GNE fallback
MIN_FALLBACK_CHARS = 50  # GNE/trafilatura body shorter than this → extraction considered failed


@dataclass
class DetailFields:
    """Structured result of parsing one article detail page (built by parse_detail)."""

    title: str  # from the title selector, <title> fallback, or extractor output
    publish_time_raw: str  # raw publish-time string as scraped; not normalized here
    source: str  # source text with "来源："-style label prefixes stripped ("" when absent)
    content_html: str  # article body HTML (selector root, or extractor output on fallback)
    content_text: str  # plain text rendering of content_html (via html_to_text)
    attachment_urls: list[str]  # absolutized, de-duplicated attachment links
    used_fallback: bool  # True when GNE/trafilatura supplied content_html
    fallback_engine: str | None  # "gne" | "trafilatura" | None
    # Count of inline <img> with non-empty src inside content_html. Lets the
    # pipeline mark image-only articles (gov notices with photos, no body text)
    # as ready instead of content_text_too_short.
    inline_image_count: int = 0
    # Fields below mirror the public-document metadata table when the page has
    # one (see _extract_gov_public_meta); they stay None/{} otherwise.
    index_no: str | None = None
    publisher: str | None = None
    doc_no: str | None = None
    publish_date: date | None = None
    effective_date: date | None = None
    is_effective: bool | None = None
    expiry_date: date | None = None
    topic_words: str | None = None
    open_category: str | None = None  # full "、"-joined category string
    content_category: str | None = None  # all category segments except the last
    content_subcategory: str | None = None  # last category segment
    public_meta: dict[str, str] = field(default_factory=dict)  # raw canonical-label → value pairs


def _clean_meta_text(v: str | None) -> str | None:
    """Collapse whitespace runs to single spaces and strip; None/blank → None."""
    if v is None:
        return None
    cleaned = re.sub(r"\s+", " ", v).strip()
    return cleaned or None


def _clean_source_text(v: str | None) -> str | None:
    """Normalize a source string, dropping common "来源："-style label prefixes."""
    cleaned = _clean_meta_text(v)
    if not cleaned:
        return None
    for prefix in ("文章来源：", "信息来源：", "来源：", "来源:", "发布机构："):
        if cleaned.startswith(prefix):
            # Re-clean: the remainder may begin with stray spaces.
            return _clean_meta_text(cleaned[len(prefix):])
    return cleaned


def _date_from_meta_text(v: str | None) -> date | None:
    """Parse the first date-like token (YYYY-MM-DD / YYYY年M月D日 / dotted) in *v*.

    Returns None when nothing matches — or when the matched digits do not
    form a valid calendar date (e.g. "2024年13月40日"). Previously that case
    raised ValueError out of the parser; it now degrades to None, consistent
    with _dates_from_meta_text below.
    """
    cleaned = _clean_meta_text(v)
    if not cleaned:
        return None
    m = re.search(r"(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})", cleaned)
    if not m:
        return None
    try:
        return date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
    except ValueError:
        return None


def _dates_from_meta_text(v: str | None) -> list[date]:
    """Collect every parseable date token (YYYY[-/.年]M[-/.月]D) in *v*, in order.

    Tokens whose digits do not form a real calendar date are skipped.
    Returns [] for None or blank input.
    """
    if v is None:
        return []
    text = re.sub(r"\s+", " ", v).strip()
    if not text:
        return []
    results: list[date] = []
    for y, mo, d in re.findall(r"(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})", text):
        try:
            results.append(date(int(y), int(mo), int(d)))
        except ValueError:  # e.g. month 13 or Feb 31
            continue
    return results


def _is_effective_from_meta_text(v: str | None) -> bool | None:
    """Interpret a validity-status label as True / False / unknown (None).

    Negative wording is tested first on purpose: "不再有效" contains the
    bare positive token "有效", so order decides correctness here.
    """
    if v is None:
        return None
    squeezed = re.sub(r"\s+", "", v)
    if not squeezed:
        return None
    negative = ("已废止", "废止", "已失效", "失效", "无效", "停止执行", "不再有效")
    positive = ("现行有效", "继续有效", "有效")
    if any(word in squeezed for word in negative):
        return False
    if any(word in squeezed for word in positive):
        return True
    return None


def _split_open_category(v: str | None) -> tuple[str | None, str | None, str | None]:
    """Split a "、"-joined open-category string into (top, sub, full).

    Backslashes act as separators too. With fewer than two non-empty
    segments the whole cleaned string doubles as both the top category and
    the full string, and sub is None.
    """
    if v is None:
        return None, None, None
    text = re.sub(r"\s+", " ", v).strip()
    if not text:
        return None, None, None
    text = text.replace("\\", "、")
    segments = [seg for seg in (part.strip() for part in text.split("、")) if seg]
    if len(segments) < 2:
        return text, None, text
    return "、".join(segments[:-1]), segments[-1], text


# Raw table label → canonical field name. Different gov CMS vendors spell
# the same field several ways; everything funnels into the canonical keys.
_PUBLIC_META_LABELS = {
    "索引号": "索引号",
    # taxonomy
    "主题分类": "分类",
    "分类": "分类",
    # issuing body
    "发布机构": "发布机构",
    "发文机关": "发布机构",
    # document number
    "文号": "文号",
    "发文字号": "文号",
    "成文日期": "成文日期",
    # effective-date aliases
    "生效日期": "生效日期",
    "施行日期": "生效日期",
    "实施日期": "生效日期",
    "执行日期": "生效日期",
    # validity-status aliases
    "效力状态": "效力状态",
    "时效状态": "效力状态",
    "有效性": "效力状态",
    # expiry aliases
    "失效日期": "失效日期",
    "废止日期": "失效日期",
    "有效期至": "失效日期",
    "有效期": "有效期",
    "发布日期": "发布日期",
    "主题词": "主题词",
}


def _canonical_public_label(v: str | None) -> str | None:
    """Map a raw table label (e.g. "发文字号：") to its canonical name, or None."""
    if v is None:
        return None
    # Drop all whitespace, then trailing half/full-width colons, then look up.
    key = re.sub(r"\s+", "", v).rstrip(":：")
    return _PUBLIC_META_LABELS.get(key)


def _extract_gov_public_meta(sel: Selector) -> dict[str, object]:
    """Pull the standard public-document metadata table off a gov page.

    Handles three markup shapes seen in the wild:
      A. gd.gov.cn/wjk:  <label>索引号：</label><span>...</span>
      B. gkmlpt tables:  label/value <td> pairs inside a <tr>
      C. mobile gov.cn:  <h2>索 引 号：</h2><p>...</p>
    Kept generic so YAML-based targets can enrich normalized Article fields
    without a bespoke adapter for every static TRS template.
    """
    meta: dict[str, str] = {}

    def text_of(node, query: str) -> str:
        return " ".join(node.xpath(query).getall())

    # Shape A — a later label/span pair overwrites an earlier one.
    for node in sel.xpath("//label"):
        label = _canonical_public_label(text_of(node, ".//text()"))
        if not label:
            continue
        value = _clean_meta_text(text_of(node, "following-sibling::span[1]//text()"))
        if value:
            meta[label] = value

    # Shape B — first hit wins (setdefault); a "value" cell that is itself a
    # known label is skipped as a mis-paired row.
    for row in sel.xpath("//tr"):
        texts = [_clean_meta_text(text_of(td, ".//text()")) for td in row.xpath("./td")]
        texts = [t for t in texts if t]
        for i in range(0, len(texts) - 1, 2):
            label = _canonical_public_label(texts[i])
            value = texts[i + 1]
            if label and value and _canonical_public_label(value) is None:
                meta.setdefault(label, value)

    # Shape C — first hit wins.
    for node in sel.xpath("//h2"):
        label = _canonical_public_label(text_of(node, ".//text()"))
        if not label:
            continue
        value = _clean_meta_text(text_of(node, "following-sibling::p[1]//text()"))
        if value:
            meta.setdefault(label, value)

    top_cat, sub_cat, full_cat = _split_open_category(meta.get("分类"))
    effective = (
        _dates_from_meta_text(meta.get("生效日期"))
        or _dates_from_meta_text(meta.get("有效期"))[:1]
    )
    expiry = (
        _dates_from_meta_text(meta.get("失效日期"))
        or _dates_from_meta_text(meta.get("有效期"))[-1:]
    )
    return {
        "index_no": meta.get("索引号"),
        "publisher": _clean_source_text(meta.get("发布机构")),
        "doc_no": meta.get("文号"),
        "publish_date": _date_from_meta_text(meta.get("成文日期")),
        "effective_date": effective[0] if effective else None,
        "is_effective": _is_effective_from_meta_text(meta.get("效力状态")),
        "expiry_date": expiry[0] if expiry else None,
        "topic_words": meta.get("主题词"),
        "open_category": full_cat,
        "content_category": top_cat,
        "content_subcategory": sub_cat,
        "public_meta": meta,
    }


def parse_detail(html: str, base_url: str, selectors: dict) -> DetailFields:
    """Parse a detail article page into structured fields.

    selectors keys (all CSS):
      title           : ::text  (required; KeyError if absent)
      publish_time    : ::text  (required; KeyError if absent)
      source          : ::text  (optional; missing/blank → source stays "")
      content         : root element of article body (required)
      attachment_css  : CSS for <a href> elements inside content
                        (optional; defaults to common document suffixes)

    Returns a DetailFields; used_fallback/fallback_engine record whether the
    body came from GNE or trafilatura instead of the configured selector.
    """
    sel = Selector(text=html or "")
    title = (_robust_css(sel, selectors["title"]).get() or "").strip()
    pub = (_robust_css(sel, selectors["publish_time"]).get() or "").strip()
    source_sel = selectors.get("source") or ""
    source = (
        (_robust_css(sel, source_sel).get() if source_sel.strip() else "") or ""
    ).strip()
    source = _clean_source_text(source) or ""
    content_root = _robust_css(sel, selectors["content"])
    content_html = content_root.get() or ""

    # Title backup: when the domain-specific title selector misses (e.g.
    # gov.cn TRS-CMS body has no h1.article_title — the doc title lives in
    # <title> only), fall back to <title> tag BEFORE the GNE/trafilatura
    # decision. Without this, missing title forced GNE/trafilatura even when
    # content_html was 6000+ chars of perfectly good body, and the fallback
    # would happily overwrite it with the page nav (62 chars).
    if not title:
        page_title = (sel.css("title::text").get() or "").strip()
        if page_title:
            # Strip trailing site name ("..._中国政府网", "..._宏观经济_中国政府网")
            for tail in ("_中国政府网", "-中国政府网"):
                if page_title.endswith(tail):
                    page_title = page_title[: -len(tail)].rstrip("_-")
            # If the title still has a trailing category (e.g. "_宏观经济"), keep it
            # — domain category info is mildly useful and easy to strip downstream.
            title = page_title

    # Attachment extraction: collect <a href> links inside the content root.
    # Note: parsel cannot append ::attr(href) to a comma-separated compound selector —
    # the pseudo-element only applies to the last clause. We split on commas and query
    # each sub-selector individually, then deduplicate (on raw href, order preserved).
    attach_css = selectors.get("attachment_css") or (
        "a[href$='.pdf'], a[href$='.doc'], a[href$='.docx'], "
        "a[href$='.xls'], a[href$='.xlsx'], a[href$='.zip']"
    )
    attach_nodes: list[str] = []
    if content_root:
        seen_raw: set[str] = set()
        for sub in attach_css.split(","):
            sub = sub.strip()
            if not sub:
                continue
            for href in content_root.css(f"{sub}::attr(href)").getall():
                if href and href not in seen_raw:
                    seen_raw.add(href)
                    attach_nodes.append(href)

    used_fallback = False
    fallback_engine: str | None = None

    # Decision point 2 (RESEARCH §System Architecture Diagram):
    # primary-selector content too short, or title empty → GNE fallback.
    # NOTE: the length test is on raw HTML (tags included), not visible text.
    if len(content_html) < MIN_CONTENT_CHARS or not title:
        gne = gne_extract(html, base_url)
        gne_content = gne.get("content", "") or ""
        if len(gne_content) >= MIN_FALLBACK_CHARS:
            content_html = gne_content
            used_fallback = True
            fallback_engine = "gne"
            if not title:
                title = gne.get("title", "") or title
            if not pub:
                pub = gne.get("publish_time", "") or pub
        else:
            # GNE failed too → trafilatura (second-tier fallback per user locked decision)
            tra = trafilatura_extract(html, base_url)
            tra_content = tra.get("content", "") or ""
            if len(tra_content) >= MIN_FALLBACK_CHARS:
                content_html = tra_content
                used_fallback = True
                fallback_engine = "trafilatura"
                if not title:
                    title = tra.get("title", "") or title
                if not pub:
                    pub = tra.get("publish_time", "") or pub

    content_text = html_to_text(content_html)

    # Count <img src="..."> inside the article body. Pure-photo announcements
    # (e.g. 4 inline notice images) have ~0 text but are real content; the
    # pipeline uses this to avoid flagging them as content_text_too_short.
    inline_image_count = 0
    if content_html:
        inline_image_count = len(
            Selector(text=content_html).css("img[src]").getall()
        )
    public_meta = _extract_gov_public_meta(sel)

    # Absolutize attachment hrefs against base_url and dedupe again on the
    # absolute form, preserving first-seen document order.
    attachment_urls: list[str] = []
    seen: set[str] = set()
    for a in attach_nodes:
        if not a:
            continue
        abs_u = urljoin(base_url, a)
        if abs_u in seen:
            continue
        seen.add(abs_u)
        attachment_urls.append(abs_u)

    return DetailFields(
        title=title,
        publish_time_raw=pub,
        source=source,
        content_html=content_html,
        content_text=content_text,
        attachment_urls=attachment_urls,
        used_fallback=used_fallback,
        fallback_engine=fallback_engine,
        inline_image_count=inline_image_count,
        index_no=public_meta.get("index_no"),
        publisher=public_meta.get("publisher"),
        doc_no=public_meta.get("doc_no"),
        publish_date=public_meta.get("publish_date"),
        effective_date=public_meta.get("effective_date"),
        is_effective=public_meta.get("is_effective"),
        expiry_date=public_meta.get("expiry_date"),
        topic_words=public_meta.get("topic_words"),
        open_category=public_meta.get("open_category"),
        content_category=public_meta.get("content_category"),
        content_subcategory=public_meta.get("content_subcategory"),
        public_meta=public_meta.get("public_meta") or {},
    )
