from __future__ import annotations
from typing import Any
import logging

log = logging.getLogger(__name__)

# GNE is loaded lazily: the import is expensive and the first call has to
# load model files. This slot caches the extractor built by _get_gne().
_GNE = None


def _get_gne():
    """Return the module-wide GeneralNewsExtractor, building it on first use.

    The ``gne`` import is deferred to the first call, and the instance is
    cached in the module-level ``_GNE`` so later calls reuse it.
    """
    global _GNE
    if _GNE is not None:
        return _GNE

    from gne import GeneralNewsExtractor

    _GNE = GeneralNewsExtractor()
    return _GNE


def gne_extract(html: str, base_url: str) -> dict[str, Any]:
    """Primary fallback: extract the article body with GNE.

    Args:
        html: Raw page markup. Falsy input short-circuits to ``{}``.
        base_url: Forwarded to the extractor as ``host``.

    Returns:
        ``{}`` when ``html`` is empty or anything raises during extraction.
        Otherwise a dict with keys ``title``, ``publish_time``, ``content``
        and ``author``. Missing or falsy values become ``""``; ``title``,
        ``publish_time`` and ``author`` are stripped, while ``content`` is
        passed through unstripped.
    """
    if not html:
        return {}

    def _clean(value):
        # Treat missing/None as empty, then trim surrounding whitespace.
        return (value or "").strip()

    try:
        extracted = _get_gne().extract(html, host=base_url)
        # Per the original author's note, GNE's result also carries an
        # 'images' entry; it is not forwarded here.
        title = _clean(extracted.get("title"))
        published = _clean(extracted.get("publish_time"))
        body = extracted.get("content") or ""
        author = _clean(extracted.get("author"))
        return {
            "title": title,
            "publish_time": published,
            "content": body,
            "author": author,
        }
    except Exception as exc:
        log.warning("gne_extract failed: %s", exc)
        return {}


def trafilatura_extract(html: str, base_url: str) -> dict[str, Any]:
    """Secondary fallback (per user additional_locked_decisions): trafilatura.

    ``trafilatura.extract`` is called without an explicit ``output_format``,
    so the library default applies; the result is treated as plain text.

    Args:
        html: Raw page markup. Falsy input short-circuits to ``{}``.
        base_url: Page URL, forwarded to ``trafilatura.extract`` as ``url``.

    Returns:
        ``{}`` when ``html`` is empty or body extraction raises (this
        includes trafilatura failing to import). Otherwise a dict with keys
        ``title``, ``publish_time``, ``content`` and ``author``. ``author``
        is always ``""``. Metadata is best-effort: if the metadata lookup
        fails, ``title`` / ``publish_time`` stay empty and the extracted
        body is still returned.
    """
    if not html:
        return {}

    try:
        import trafilatura

        txt = trafilatura.extract(
            html,
            url=base_url,
            include_comments=False,
            include_tables=True,
            favor_recall=True,
        )
    except Exception as e:
        log.warning("trafilatura_extract failed: %s", e)
        return {}

    # Metadata is handled in its own try block so that a failure here does
    # not throw away a body that was already extracted successfully.
    title = ""
    pub = ""
    try:
        # extract_metadata is looked up defensively; presumably some
        # trafilatura releases do not expose it.
        extract_metadata = getattr(trafilatura, "extract_metadata", None)
        meta = extract_metadata(html) if extract_metadata is not None else None
        if meta:
            title = (getattr(meta, "title", "") or "").strip()
            pub = (getattr(meta, "date", "") or "").strip()
    except Exception as e:
        log.warning("trafilatura_extract metadata failed: %s", e)

    return {
        "title": title,
        "publish_time": pub,
        "content": txt or "",  # treated as plain-text content
        "author": "",
    }
