"""Adapter for 国家法律法规数据库 (flk.npc.gov.cn).

The site is a Vue SPA. Its visible pages are thin shells over JSON APIs:

  * POST /law-search/search/list       -> paginated law rows
  * GET  /law-search/search/flfgDetails -> detail metadata + OFD/DOCX paths
  * GET  /law-search/amazonFile/previewLink + flkofd reader/text -> body text

This adapter intentionally crawls the API rather than scraping the SPA DOM.
"""
from __future__ import annotations

import json
import logging
import re
import time
import zipfile
from io import BytesIO
from datetime import date, datetime, timezone
from typing import Any
from urllib.parse import parse_qs, urlencode, urlparse
from xml.etree import ElementTree as ET

import httpx

from govcrawler.adapters.contract import CrawlItem, FetchStrategy
from govcrawler.fetcher.browser import FetchResult
from govcrawler.parser.detail_parser import DetailFields
from govcrawler.utils.url_norm import url_hash as compute_url_hash

log = logging.getLogger(__name__)

ADAPTER_ID = "flk_npc"
DEFAULT_INTERVAL_SEC: float = 5.0

BASE_URL = "https://flk.npc.gov.cn"
LIST_URL = f"{BASE_URL}/law-search/search/list"
DETAIL_URL = f"{BASE_URL}/law-search/search/flfgDetails"
DOWNLOAD_PC_URL = f"{BASE_URL}/law-search/download/pc"
PREVIEW_URL = f"{BASE_URL}/law-search/amazonFile/previewLink"
OFD_READER_BASE = "https://flkofd.npc.gov.cn"
MIN_USEFUL_BODY_CHARS = 120
DOCX_MAX_BYTES = 10 * 1024 * 1024
_WORD_NS = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

# Whole "法律" top-level section, including law interpretations, decisions,
# amendments, and amendment/repeal decisions. Use only 110..170 for narrow
# sectoral laws.
LAW_SECTION_CODE_IDS = [110, 120, 130, 140, 150, 155, 160, 170, 180, 190, 195, 200]
ADMIN_REGULATION_CODE_IDS = [210, 215]
SUPERVISION_REGULATION_CODE_IDS = [220]
JUDICIAL_INTERPRETATION_CODE_IDS = [320, 330, 340, 350]
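
# Example (illustrative): a crawl limited to administrative regulations would
# pass the IDs through the params dict read by fetch_list_page, e.g.
# {"flfg_code_ids": ADMIN_REGULATION_CODE_IDS, "page_size": 50}.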

_HEADERS = {
    "User-Agent": "Mozilla/5.0 GovCrawler",
    "Accept": "application/json, text/plain, */*",
    "Origin": BASE_URL,
    "Referer": f"{BASE_URL}/search",
}


def _make_fetch_result(url: str, status: int, body: str, *, t0: float) -> FetchResult:
    return FetchResult(
        url=url,
        final_url=url,
        status=status,
        html=body or "",
        fetched_at=time.time(),
        duration_ms=int((time.time() - t0) * 1000),
        is_challenge=False,
        strategy=FetchStrategy.HTTPX.value,
    )


def _parse_date(value: Any) -> date | None:
    if not value:
        return None
    s = str(value).strip()
    for fmt in ("%Y-%m-%d", "%Y.%m.%d", "%Y/%m/%d"):
        try:
            return datetime.strptime(s, fmt).date()
        except ValueError:
            continue
    return None


def _parse_datetime(value: Any) -> datetime | None:
    d = _parse_date(value)
    if d is None:
        return None
    return datetime(d.year, d.month, d.day, tzinfo=timezone.utc)


VALIDITY_STATE_LABELS = {
    1: "已废止",
    2: "已修改",
    3: "有效",
    4: "尚未生效",
}


def _validity_state_code(sxx: Any) -> int | None:
    try:
        return int(sxx)
    except Exception:
        return None


def _validity_state_label(sxx: Any) -> str | None:
    state = _validity_state_code(sxx)
    if state is None:
        return None
    return VALIDITY_STATE_LABELS.get(state)


def _is_effective(sxx: Any) -> bool | None:
    """Map FLK validity state.

    Known state codes (cf. VALIDITY_STATE_LABELS):
      1 -> 已废止
      2 -> 已修改
      3 -> 有效
      4 -> 尚未生效
    "已修改" means the text has been amended; it is still a current text
    unless the database marks it as 已废止.
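
    Examples (state codes as defined in VALIDITY_STATE_LABELS):

    >>> _is_effective(3)
    True
    >>> _is_effective("1")
    False
    >>> _is_effective(None) is None
    True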
    """
    state = _validity_state_code(sxx)
    if state is None:
        return None
    if state in (2, 3):
        return True
    if state in (1, 4):
        return False
    return None


def _detail_page_url(bbbs: str, title: str | None = None) -> str:
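    """Build the public SPA detail-page URL (the JSON detail API is DETAIL_URL).

    >>> _detail_page_url("abc123")
    'https://flk.npc.gov.cn/detail?id=abc123&fileId=&type='
    """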
    params = {"id": bbbs, "fileId": "", "type": ""}
    if title:
        params["title"] = title
    return f"{BASE_URL}/detail?{urlencode(params)}"


def _list_body(
    page_num: int,
    page_size: int,
    code_ids: list[int],
    zdjg_code_ids: list[int] | None = None,
) -> dict[str, Any]:
    return {
        "searchRange": 1,
        "sxrq": [],
        "gbrq": [],
        "searchType": 2,
        "sxx": [],
        "gbrqYear": [],
        "flfgCodeId": code_ids,
        "zdjgCodeId": zdjg_code_ids or [],
        "searchContent": "",
        "orderByParam": {"order": "-1", "sort": ""},
        "pageNum": page_num,
        "pageSize": page_size,
    }


def _row_to_item(row: dict[str, Any]) -> CrawlItem:
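    """Map one list-API row onto a CrawlItem.

    The field names below are the ones this adapter reads; the values are
    illustrative only:

        {"bbbs": "…", "title": "中华人民共和国××法", "gbrq": "2021-06-10",
         "sxrq": "2021-09-01", "sxx": 3, "flfgCodeId": 110, "zdjgCodeId": 100,
         "zdjgName": "全国人民代表大会常务委员会", "flxz": "法律"}
    """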
    bbbs = str(row.get("bbbs") or "").strip()
    title = str(row.get("title") or "").strip() or "(无标题)"
    url = _detail_page_url(bbbs, title)
    publish_dt = _parse_datetime(row.get("gbrq"))
    effective_date = _parse_date(row.get("sxrq"))
    validity_state = _validity_state_code(row.get("sxx"))
    validity_label = _validity_state_label(row.get("sxx"))
    return CrawlItem(
        site_id="flk_npc",
        native_post_id=bbbs or None,
        url=url,
        url_hash=compute_url_hash(url),
        title=title,
        publish_time=publish_dt,
        source_raw=row.get("zdjgName") or None,
        publisher=row.get("zdjgName") or None,
        publish_date=_parse_date(row.get("gbrq")),
        effective_date=effective_date,
        is_effective=_is_effective(row.get("sxx")),
        content_category=row.get("flxz") or "法律",
        content_subcategory=str(row.get("flfgCodeId")) if row.get("flfgCodeId") else None,
        open_category=row.get("flxz") or None,
        metadata_json={
            "raw": row,
            "flk_bbbs": bbbs,
            "flfgCodeId": row.get("flfgCodeId"),
            "zdjgCodeId": row.get("zdjgCodeId"),
            "sxx": row.get("sxx"),
            "validity_state_code": validity_state,
            "validity_state_label": validity_label,
            "publish_date": str(row.get("gbrq") or ""),
            "effective_date": str(row.get("sxrq") or ""),
        },
    )


def fetch_list_page(
    rt,
    *,
    page_num: int,
    params: dict[str, Any] | None = None,
    page_size: int | None = None,
    interval_sec: float | None = None,
) -> tuple[str, list[CrawlItem], FetchResult]:
    params = params or {}
    code_ids = params.get("flfg_code_ids") or LAW_SECTION_CODE_IDS
    code_ids = [int(x) for x in code_ids]
    zdjg_code_ids = [int(x) for x in (params.get("zdjg_code_ids") or [])]
    size = int(page_size or params.get("page_size") or 20)
    body = _list_body(page_num, size, code_ids, zdjg_code_ids=zdjg_code_ids)

    t0 = time.time()
    r = httpx.post(LIST_URL, json=body, headers=_HEADERS, timeout=20)
    r.raise_for_status()
    payload = r.json()
    if payload.get("code") != 200:
        raise ValueError(f"flk list failed: {payload.get('msg') or payload}")
    rows = payload.get("rows") or []
    items = [_row_to_item(row) for row in rows if row.get("bbbs")]
    query = f"pageNum={page_num}&pageSize={size}&flfgCodeId={','.join(map(str, code_ids))}"
    if zdjg_code_ids:
        query += f"&zdjgCodeId={','.join(map(str, zdjg_code_ids))}"
    list_url = f"{LIST_URL}?{query}"
    return list_url, items, _make_fetch_result(list_url, r.status_code, r.text, t0=t0)


def _preview_file_param(file_path: str) -> str | None:
    r = httpx.get(PREVIEW_URL, params={"filePath": file_path}, headers=_HEADERS, timeout=20)
    r.raise_for_status()
    payload = r.json()
    if payload.get("code") != 200:
        return None
    reader_url = (payload.get("data") or {}).get("url") or ""
    parsed = urlparse(reader_url)
    return (parse_qs(parsed.query).get("file") or [None])[0]


def _extract_page_text(payload: dict[str, Any]) -> str:
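    """Flatten one flkofd /reader/text page payload (areas -> lines -> chars).

    A minimal illustrative payload:

    >>> _extract_page_text(
    ...     {"areas": [{"lines": [{"chars": [{"char": "第"}, {"char": "一"}, {"char": "条"}]}]}]}
    ... )
    '第一条'
    """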
    lines: list[str] = []
    for area in payload.get("areas") or []:
        for line in area.get("lines") or []:
            chars = line.get("chars") or []
            text = "".join(str(ch.get("char") or "") for ch in chars)
            if text.strip():
                lines.append(text.rstrip())
    return "\n".join(lines)


_STRUCTURAL_LINE_RE = re.compile(
    r"^("
    r"第[一二三四五六七八九十百千万零〇两0-9]+[编章节条款项目]"
    r"|[一二三四五六七八九十]+、"
    r"|[(（][一二三四五六七八九十0-9]+[)）]"
    r")$"
)
_PAGE_NUMBER_RE = re.compile(r"^[—\-－–]\s*\d+\s*[—\-－–]$")
_SPLIT_HEADING_CHARS = set("目録录序言总總則则附")


def _is_structural_line(line: str) -> bool:
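    """True for lines that must stay standalone: markers, headings, page numbers.

    >>> _is_structural_line("第三十一条")
    True
    >>> _is_structural_line("附则")
    True
    >>> _is_structural_line("县级以上人民政府应当加强数据安全工作。")
    False
    """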
    s = line.strip()
    if not s:
        return True
    if _PAGE_NUMBER_RE.match(s):
        return True
    if _STRUCTURAL_LINE_RE.match(s):
        return True
    # Common top-level headings in laws, such as "总则" or "附则", are usually
    # short standalone lines without punctuation.
    if len(s) <= 12 and not re.search(r"[，。；：、,.!?！？]$", s):
        if re.search(r"(总则|附则|目录|说明|序言|分则)$", s):
            return True
    return False


def _should_join_ofd_lines(prev: str, cur: str) -> bool:
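    """Heuristic: True when `cur` looks like a hard-wrapped continuation of `prev`.

    >>> _should_join_ofd_lines("本法所称数据，", "是指任何以电子或者其他方式对信息的记录。")
    True
    >>> _should_join_ofd_lines("第一章", "总则")
    False
    """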
    prev_s = prev.strip()
    cur_s = cur.strip()
    if not prev_s or not cur_s:
        return False
    if _is_structural_line(prev_s) or _is_structural_line(cur_s):
        return False
    if _PAGE_NUMBER_RE.match(prev_s) or _PAGE_NUMBER_RE.match(cur_s):
        return False
    # Chinese OFD text is returned by visual line. For legal text, most visual
    # line breaks are hard wraps inside the same paragraph, so merge them
    # unless either side is a structural marker.
    if re.search(r"^[\u4e00-\u9fffA-Za-z0-9（(〔《【〈]", cur_s):
        return True
    return False


def _merge_split_heading_lines(lines: list[str]) -> list[str]:
    """Merge headings that the OFD reader splits into one character per line."""
    out: list[str] = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if (
            len(line) == 1
            and line in _SPLIT_HEADING_CHARS
            and i + 1 < len(lines)
            and len(lines[i + 1].strip()) == 1
            and lines[i + 1].strip() in _SPLIT_HEADING_CHARS
        ):
            merged = line + lines[i + 1].strip()
            if merged in {"目录", "目録", "序言", "总则", "總則", "附则", "附則"}:
                out.append(
                    merged.replace("目録", "目录")
                    .replace("總則", "总则")
                    .replace("附則", "附则")
                )
                i += 2
                continue
        out.append(line)
        i += 1
    return out


def _drop_table_of_contents(lines: list[str]) -> list[str]:
    """Remove OFD table-of-contents blocks when present.

    Some FLK OFD files expose the visual TOC before the actual law text. It is
    noisy for article preview/RAG and often contains split one-character
    headings. Keep the real body starting at "序言" or "第一条".
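
    Illustrative:

    >>> _drop_table_of_contents(["目录", "第一章 总则", "第二章 附则", "第一条 为了……"])
    ['第一条 为了……']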
    """
    try:
        start = lines.index("目录")
    except ValueError:
        return lines

    stop: int | None = None
    for idx in range(start + 1, min(len(lines), start + 160)):
        line = lines[idx].strip()
        if line == "序言":
            next_line = ""
            for next_idx in range(idx + 1, len(lines)):
                next_line = lines[next_idx].strip()
                if next_line:
                    break
            if not next_line or _is_structural_line(next_line):
                continue
            stop = idx
            break
        if re.match(r"^第一条(?:\s|$)", line):
            stop = idx
            break
    if stop is None:
        return lines
    return lines[:start] + lines[stop:]


def _normalize_ofd_text(text: str) -> str:
    """Normalize OFD reader text by removing visual hard-wrap newlines.

    flkofd.npc.gov.cn returns text as page layout lines, not semantic
    paragraphs. Without this, article previews show one visual line per
    paragraph. Keep legal structure markers such as "第一条" on their own
    lines, remove page numbers, and merge wrapped body lines.
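
    For example (illustrative), the reader lines

        第一条
        为了规范数据处理活动，
        保障数据安全，制定本法。
        — 1 —

    become two output lines: "第一条" and
    "为了规范数据处理活动，保障数据安全，制定本法。".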
    """
    cleaned_lines: list[str] = []
    for raw in (text or "").splitlines():
        line = re.sub(r"\s+", " ", raw).strip()
        if not line:
            continue
        if _PAGE_NUMBER_RE.match(line):
            continue
        cleaned_lines.append(line)

    cleaned_lines = _drop_table_of_contents(_merge_split_heading_lines(cleaned_lines))

    out: list[str] = []
    for line in cleaned_lines:
        if out and out[-1] and _should_join_ofd_lines(out[-1], line):
            out[-1] = out[-1].rstrip() + line
        else:
            out.append(line)

    # Collapse excessive blanks but keep intentional paragraph/page breaks.
    normalized: list[str] = []
    for line in out:
        if line == "" and (not normalized or normalized[-1] == ""):
            continue
        normalized.append(line)
    while normalized and normalized[-1] == "":
        normalized.pop()
    return "\n".join(normalized).strip()


def _fetch_ofd_text(file_path: str, *, max_pages: int = 300) -> str:
    file_param = _preview_file_param(file_path)
    if not file_param:
        return ""
    common = {"file": file_param, "_b": "3.2.0", "_v": "1"}
    t = str(int(time.time() * 1000))
    info = httpx.get(
        f"{OFD_READER_BASE}/reader/info",
        params={**common, "_": t},
        timeout=20,
    )
    info.raise_for_status()
    info_payload = info.json()
    page_count = len(info_payload.get("area") or [])
    if page_count <= 0:
        return ""
    page_count = min(page_count, max_pages)
    chunks: list[str] = []
    for idx in range(page_count):
        r = httpx.get(
            f"{OFD_READER_BASE}/reader/text",
            params={**common, "_": str(int(time.time() * 1000)), "_i": idx},
            timeout=20,
        )
        r.raise_for_status()
        text = _extract_page_text(r.json())
        if text:
            chunks.append(text)
    return _normalize_ofd_text("\n\n".join(chunks))


def _extract_docx_text(data: bytes) -> str:
    """Extract semantic paragraphs from a DOCX attachment.

    FLK exposes many documents as OFD previews plus DOCX downloads. Some OFD
    previews do not return useful page text even though the DOCX contains the
    full legal text. Keep this parser dependency-free so the crawler can use it
    as a narrow fallback without adding a heavyweight document stack.
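
    A minimal illustrative round trip with a synthetic one-paragraph DOCX:

    >>> import io, zipfile
    >>> buf = io.BytesIO()
    >>> with zipfile.ZipFile(buf, "w") as zf:
    ...     zf.writestr(
    ...         "word/document.xml",
    ...         '<w:document xmlns:w="http://schemas.openxmlformats.org/'
    ...         'wordprocessingml/2006/main"><w:body><w:p><w:r>'
    ...         '<w:t>第一条</w:t></w:r></w:p></w:body></w:document>',
    ...     )
    >>> _extract_docx_text(buf.getvalue())
    '第一条'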
    """
    if not data:
        return ""
    try:
        with zipfile.ZipFile(BytesIO(data)) as zf:
            xml_bytes = zf.read("word/document.xml")
    except Exception:
        return ""
    try:
        root = ET.fromstring(xml_bytes)
    except ET.ParseError:
        return ""

    paragraphs: list[str] = []
    for para in root.findall(f".//{_WORD_NS}body/{_WORD_NS}p"):
        parts: list[str] = []
        for node in para.iter():
            if node.tag == f"{_WORD_NS}t":
                parts.append(node.text or "")
            elif node.tag == f"{_WORD_NS}tab":
                parts.append(" ")
            elif node.tag == f"{_WORD_NS}br":
                parts.append("\n")
        text = re.sub(r"[ \t]+", " ", "".join(parts)).strip()
        if text:
            paragraphs.append(text)
    return "\n".join(paragraphs).strip()


def _fetch_docx_text(url: str) -> str:
    r = httpx.get(url, headers=_HEADERS, timeout=30, follow_redirects=True)
    r.raise_for_status()
    data = r.content or b""
    if len(data) > DOCX_MAX_BYTES:
        log.warning("flk docx text fallback skipped: file too large bytes=%s url=%s", len(data), url)
        return ""
    return _extract_docx_text(data)


def _first_docx_url(urls: list[str]) -> str | None:
    for url in urls:
        if ".docx" in urlparse(url).path.lower():
            return url
    return None


def _drop_leading_duplicate_title(text: str, title: str) -> str:
    lines = [line.strip() for line in (text or "").splitlines() if line.strip()]
    if not lines:
        return ""
    norm_title = re.sub(r"\s+", "", title or "")
    if norm_title and re.sub(r"\s+", "", lines[0]) == norm_title:
        lines = lines[1:]
    return "\n".join(lines).strip()


def _resolve_download_url(format_: str, bbbs: str, file_id: str | None = None) -> str | None:
    params = {"format": format_, "bbbs": bbbs, "fileId": file_id or ""}
    r = httpx.get(DOWNLOAD_PC_URL, params=params, headers=_HEADERS, timeout=20)
    r.raise_for_status()
    payload = r.json()
    if payload.get("code") != 200:
        return None
    data = payload.get("data") or {}
    return data.get("url") or None


def _attachment_urls(bbbs: str, data: dict[str, Any], oss: dict[str, Any]) -> list[str]:
    urls: list[str] = []
    file_id = str(data.get("fileId") or "")
    formats = [
        ("docx", "ossWordPath"),
        ("pdf", "ossPdfPath"),
    ]
    for format_, path_key in formats:
        if not oss.get(path_key):
            continue
        try:
            url = _resolve_download_url(format_, bbbs, file_id)
        except Exception as e:
            log.warning(
                "flk attachment url resolve failed bbbs=%s format=%s err=%s",
                bbbs,
                format_,
                e,
            )
            continue
        if url and url not in urls:
            urls.append(url)
    return urls


def normalize_content_text(text: str) -> str:
    """Public helper for backfilling already-stored FLK article text."""
    if "\n\n" in (text or ""):
        header, body = text.split("\n\n", 1)
        return header.strip() + "\n\n" + _normalize_ofd_text(body)
    return _normalize_ofd_text(text)


def _bbbs_from_url(url: str) -> str | None:
    parsed = urlparse(url)
    return (parse_qs(parsed.query).get("id") or [None])[0]


def fetch_detail(rt, *, url: str, list_item: CrawlItem | None = None) -> tuple[FetchResult, DetailFields]:
    bbbs = (list_item.native_post_id if list_item else None) or _bbbs_from_url(url)
    if not bbbs:
        raise ValueError(f"missing flk bbbs in url={url!r}")

    t0 = time.time()
    r = httpx.get(DETAIL_URL, params={"bbbs": bbbs}, headers=_HEADERS, timeout=20)
    r.raise_for_status()
    payload = r.json()
    if payload.get("code") != 200:
        raise ValueError(f"flk detail failed: {payload.get('msg') or payload}")
    data = payload.get("data") or {}
    oss = data.get("ossFile") or {}
    attachment_urls = _attachment_urls(str(bbbs), data, oss)

    title = data.get("title") or (list_item.title if list_item else "") or "(无标题)"
    detail_url = _detail_page_url(bbbs, title)
    ofd_path = oss.get("ossWordOfdPath") or oss.get("ossPdfOfdPath")
    content_text = ""
    if ofd_path:
        try:
            content_text = _fetch_ofd_text(str(ofd_path))
        except Exception as e:
            log.warning("flk ofd text fetch failed bbbs=%s path=%s err=%s", bbbs, ofd_path, e)

    if len(content_text or "") < MIN_USEFUL_BODY_CHARS:
        docx_url = _first_docx_url(attachment_urls)
        if docx_url:
            try:
                docx_text = _fetch_docx_text(docx_url)
                if len(docx_text) > len(content_text or ""):
                    content_text = docx_text
            except Exception as e:
                log.warning("flk docx text fallback failed bbbs=%s err=%s", bbbs, e)
    content_text = _drop_leading_duplicate_title(content_text, str(title))

    header = [
        str(title),
        f"制定机关：{data.get('zdjgName') or ''}",
        f"法律法规分类：{data.get('flxz') or ''}",
        f"时效性：{_validity_state_label(data.get('sxx')) or ''}",
        f"公布日期：{data.get('gbrq') or ''}",
        f"施行日期：{data.get('sxrq') or ''}",
    ]
    full_text = "\n".join(x for x in header if x.strip("："))
    if content_text:
        full_text += "\n\n" + content_text

    raw_html = json.dumps(payload, ensure_ascii=False)
    fr = _make_fetch_result(detail_url, r.status_code, raw_html, t0=t0)
    fields = DetailFields(
        title=str(title),
        publish_time_raw=str(data.get("gbrq") or ""),
        source=str(data.get("zdjgName") or ""),
        content_html=raw_html,
        content_text=full_text,
        attachment_urls=attachment_urls,
        used_fallback=False,
        fallback_engine=None,
        publisher=data.get("zdjgName") or None,
        publish_date=_parse_date(data.get("gbrq")),
        effective_date=_parse_date(data.get("sxrq")),
        is_effective=_is_effective(data.get("sxx")),
        open_category=data.get("flxz") or None,
        content_category=data.get("flxz") or "法律",
        content_subcategory=str(data.get("flfgCodeId")) if data.get("flfgCodeId") else None,
        public_meta={
            "flk_bbbs": str(bbbs),
            "flxz": str(data.get("flxz") or ""),
            "zdjgName": str(data.get("zdjgName") or ""),
            "gbrq": str(data.get("gbrq") or ""),
            "sxrq": str(data.get("sxrq") or ""),
            "sxx": str(data.get("sxx") or ""),
            "validity_state_code": str(_validity_state_code(data.get("sxx")) or ""),
            "validity_state_label": str(_validity_state_label(data.get("sxx")) or ""),
            "publish_date": str(data.get("gbrq") or ""),
            "effective_date": str(data.get("sxrq") or ""),
            "ossWordPath": str(oss.get("ossWordPath") or ""),
            "ossWordOfdPath": str(oss.get("ossWordOfdPath") or ""),
            "ossPdfPath": str(oss.get("ossPdfPath") or ""),
            "ossPdfOfdPath": str(oss.get("ossPdfOfdPath") or ""),
        },
    )
    return fr, fields


def build_list_url(site, column_id: str, page: int = 1) -> str:
    return f"{LIST_URL}?pageNum={page}"


def parse_list_response(site, column_id: str, body: str) -> list[CrawlItem]:
    payload = json.loads(body)
    return [_row_to_item(row) for row in (payload.get("rows") or []) if row.get("bbbs")]
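

if __name__ == "__main__":  # pragma: no cover
    # Minimal manual smoke test of the crawl flow sketched in the module
    # docstring (list API -> detail API -> OFD/DOCX text). It talks to the
    # live site, so treat it as a local debugging aid only. The `rt` runtime
    # argument is unused by these functions, hence None.
    _, demo_items, _ = fetch_list_page(None, page_num=1, params={"page_size": 3})
    for demo_item in demo_items[:1]:
        _, demo_fields = fetch_detail(None, url=demo_item.url, list_item=demo_item)
        print(demo_item.title, len(demo_fields.content_text or ""))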
