"""Adapter for 新华网·习近平报道集.

The visible column pages render the first batch of rows in HTML and use a
``datasource:<id>`` marker plus ``ds_<id>.json`` for "加载更多".  The legacy
CSS scraper only sees the static first screen, so this adapter reads the JSON
datasource directly and exposes it as normal paginated CrawlItems.
"""
from __future__ import annotations

import html
import json
import re
import time
from datetime import datetime, timezone
from typing import Any
from urllib.parse import urljoin

import httpx

from govcrawler.adapters.contract import CrawlItem, FetchStrategy
from govcrawler.fetcher.browser import FetchResult
from govcrawler.utils.url_norm import url_hash as compute_url_hash

ADAPTER_ID = "xinhua_xjp"
DEFAULT_INTERVAL_SEC: float = 5.0

BASE_URL = "https://www.news.cn"

DEFAULT_DETAIL_SELECTORS = {
    "title": "h1::text, title::text",
    "publish_time": "meta[name='publishdate']::attr(content)",
    "source": "meta[name='source']::attr(content)",
    "content": "span#detailContent, div#detailContent",
    "attachment_css": "",
}

_HEADERS = {
    "User-Agent": "Mozilla/5.0 GovCrawler",
    "Accept": "text/html,application/json,application/xhtml+xml,*/*",
    "Referer": f"{BASE_URL}/politics/leaders/xijinping/",
}
_DATASOURCE_RE = re.compile(
    r"<ul\b(?=[^>]*\bxpage-content-list\b)(?P<attrs>[^>]*)>",
    re.IGNORECASE | re.DOTALL,
)
_ATTR_RE = re.compile(r"\b(?P<name>[a-zA-Z_:.-]+)\s*=\s*(['\"])(?P<value>.*?)\2", re.DOTALL)
_HTML_TAG_RE = re.compile(r"<[^>]+>")
_HREF_RE = re.compile(r"\bhref\s*=\s*(['\"])(?P<href>.*?)\1", re.IGNORECASE | re.DOTALL)
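
# Illustrative list-page markup that the patterns above target (shape only,
# not a verbatim copy of the live page):
#
#   <ul class="xpage-content-list" data="datasource:1234567" preview="ds_">
#
# _extract_datasource_url resolves that to ``ds_1234567.json`` relative to the
# list URL.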


def _make_fetch_result(url: str, status: int, body: str, *, t0: float) -> FetchResult:
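    """Wrap a response body and status into the FetchResult shape used by the fetcher layer."""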
    return FetchResult(
        url=url,
        final_url=url,
        status=status,
        html=body or "",
        fetched_at=time.time(),
        duration_ms=int((time.time() - t0) * 1000),
        is_challenge=False,
        strategy=FetchStrategy.HTTPX.value,
    )


def _clean_text(value: Any) -> str:
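    """Unescape HTML entities, strip tags, and collapse runs of whitespace."""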
    text = html.unescape(str(value or ""))
    text = _HTML_TAG_RE.sub("", text)
    return re.sub(r"\s+", " ", text).strip()


def _parse_publish_time(value: Any) -> datetime | None:
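    """Parse ``YYYY-MM-DD``/``YYYY/MM/DD`` style timestamps; parsed values are tagged as UTC."""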
    if not value:
        return None
    s = str(value).strip()
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y/%m/%d %H:%M:%S", "%Y/%m/%d"):
        try:
            dt = datetime.strptime(s, fmt)
            return dt.replace(tzinfo=timezone.utc)
        except ValueError:
            continue
    return None


def _attrs_from_tag(tag_attrs: str) -> dict[str, str]:
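    """Parse quoted ``name="value"`` attribute pairs into a dict with lowercase keys."""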
    return {
        m.group("name").lower(): html.unescape(m.group("value"))
        for m in _ATTR_RE.finditer(tag_attrs or "")
    }


def _extract_datasource_url(list_html: str, list_url: str) -> tuple[str, str | None]:
    """Return (json_url, datasource_id) from an Xinhua list page."""
    for m in _DATASOURCE_RE.finditer(list_html or ""):
        attrs = _attrs_from_tag(m.group("attrs"))
        data = attrs.get("data", "")
        if not data.startswith("datasource:"):
            continue
        datasource_id = data.split(":", 1)[1].strip()
        if not datasource_id:
            continue
        preview = attrs.get("preview") or "ds_"
        return urljoin(list_url, f"{preview}{datasource_id}.json"), datasource_id
    raise ValueError("xinhua_xjp list page has no xpage-content-list datasource")


def _extract_url(row: dict[str, Any], list_url: str) -> str | None:
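    """Pick the article URL from known row keys, falling back to an ``href`` inside the rich title."""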
    for key in ("publishUrl", "sourceLink", "url", "href"):
        raw = row.get(key)
        if raw:
            return urljoin(list_url, str(raw).strip())
    rich_title = str(row.get("showTitle") or row.get("title") or "")
    m = _HREF_RE.search(rich_title)
    if m:
        return urljoin(list_url, html.unescape(m.group("href")).strip())
    return None


def _row_to_item(rt, row: dict[str, Any], *, list_url: str, datasource_url: str, datasource_id: str | None) -> CrawlItem | None:
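    """Map one datasource row onto a CrawlItem; return None when the row lacks a usable URL or title.

    Rows look roughly like the following (keys taken from the lookups below,
    values made up for illustration)::

        {"contentId": "123456",
         "showTitle": "<a href='/20240101/abcdef/c.html'>…</a>",
         "publishTime": "2024-01-01 08:00:00",
         "source": "新华网"}
    """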
    url = _extract_url(row, list_url)
    if not url:
        return None
    title = _clean_text(row.get("showTitle") or row.get("title"))
    if not title:
        return None
    publish_time = _parse_publish_time(row.get("publishTime") or row.get("publish_time"))
    source = _clean_text(row.get("source") or row.get("sourceName") or row.get("sourceText"))
    native_post_id = str(row.get("contentId") or row.get("id") or "").strip() or None
    return CrawlItem(
        site_id=rt.site.site_code if rt is not None else ADAPTER_ID,
        target_id=rt.target.id if rt is not None else None,
        dept_id=rt.target.dept_id if rt is not None else None,
        native_post_id=native_post_id,
        url=url,
        url_hash=compute_url_hash(url),
        title=title,
        publish_time=publish_time,
        source_raw=source or "新华网",
        publisher=source or "新华网",
        publish_date=publish_time.date() if publish_time else None,
        channel_name=rt.target.channel_name if rt is not None else None,
        channel_path=rt.target.channel_path if rt is not None else None,
        content_category=rt.target.content_category if rt is not None else None,
        content_subcategory=rt.target.content_subcategory if rt is not None else None,
        metadata_json={
            "raw": row,
            "datasource_id": datasource_id,
            "datasource_url": datasource_url,
        },
        fetch_strategy=FetchStrategy.HTTPX,
    )


def fetch_list_page(
    rt,
    *,
    page_num: int,
    params: dict[str, Any] | None = None,
    page_size: int | None = None,
    interval_sec: float | None = None,
) -> tuple[str, list[CrawlItem], FetchResult]:
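    """Fetch one logical page of the report-collection datasource.

    The list page is fetched to locate the ``ds_<id>.json`` datasource, the
    whole JSON file is downloaded, and rows are sliced locally by
    ``page_num``/``page_size``.  The returned FetchResult carries the raw JSON
    body so the payload that produced the items is preserved.
    """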
    params = params or {}
    size = int(page_size or params.get("page_size") or 20)
    list_url = rt.target.entry_url or rt.site.base_url or BASE_URL

    t0 = time.time()
    page_resp = httpx.get(list_url, headers=_HEADERS, timeout=20, follow_redirects=True)
    page_resp.raise_for_status()
    datasource_url, datasource_id = _extract_datasource_url(page_resp.text, str(page_resp.url or list_url))

    json_resp = httpx.get(
        datasource_url,
        headers={**_HEADERS, "Accept": "application/json,text/plain,*/*", "Referer": list_url},
        timeout=20,
        follow_redirects=True,
    )
    json_resp.raise_for_status()
    payload = json_resp.json()
    rows = payload.get("datasource") if isinstance(payload, dict) else None
    if rows is None and isinstance(payload, list):
        rows = payload
    if not isinstance(rows, list):
        raise ValueError("xinhua_xjp datasource payload has no datasource list")

    start = max(0, (int(page_num) - 1) * size)
    page_rows = rows[start:start + size]
    items: list[CrawlItem] = []
    for row in page_rows:
        if not isinstance(row, dict):
            continue
        item = _row_to_item(rt, row, list_url=list_url, datasource_url=datasource_url, datasource_id=datasource_id)
        if item is not None:
            items.append(item)
    result_url = f"{datasource_url}?page={page_num}&pageSize={size}"
    body = json.dumps(payload, ensure_ascii=False)
    return result_url, items, _make_fetch_result(result_url, json_resp.status_code, body, t0=t0)
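

if __name__ == "__main__":  # pragma: no cover
    # Ad-hoc smoke test (illustrative only): builds a minimal stand-in for the
    # runtime object with just the attributes this module reads.  The real
    # ``rt`` comes from the crawler runtime, and running this performs live
    # HTTP requests against news.cn.
    from types import SimpleNamespace

    _rt = SimpleNamespace(
        site=SimpleNamespace(site_code=ADAPTER_ID, base_url=BASE_URL),
        target=SimpleNamespace(
            id=None,
            dept_id=None,
            entry_url=f"{BASE_URL}/politics/leaders/xijinping/",
            channel_name=None,
            channel_path=None,
            content_category=None,
            content_subcategory=None,
        ),
    )
    _url, _items, _result = fetch_list_page(_rt, page_num=1)
    print(_url, _result.status, len(_items))
    for _item in _items[:3]:
        print(_item.publish_time, _item.title, _item.url)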
