"""HTML sub-column discovery — heuristic counterpart to gkmlpt's TREE-based
discover. Given a portal page URL, scrape the HTML and surface anchors that
look like sub-column index pages (政府文件 / 通知公告 / …) so the operator
can bulk-create crawl_targets without copy-pasting URLs one by one.

Most 政务公开 portals nest 2–3 levels (zwgk → 政府文件 → 规范性文件 / 其他文件),
so the endpoint does bounded BFS into the subtree and tags each node with
`has_articles` so the UI can mark leaf lists vs intermediate portals.
"""
from __future__ import annotations

import logging
import re
from typing import Any
from urllib.parse import urljoin, urlparse

import httpx
from fastapi import Body, HTTPException
from parsel import Selector

from ._common import _normalize_str, router

log = logging.getLogger(__name__)

# Anchors whose text matches these get rejected — they tend to be "more"
# pagers / pagination links, not new sub-columns.
_REJECT_TEXTS: tuple[str, ...] = ("更多", "more", "More", ">>", ">>>")
# Path markers that identify article (leaf content) URLs, not column index pages.
_ARTICLE_MARKERS: tuple[str, ...] = ("/post_", "/content/", "/article/")
# Heuristic regex for "this URL looks like an article" — used to detect
# whether a page itself is a leaf list (has many such links) or a portal.
_ARTICLE_HREF_RE = re.compile(r"/(?:post_|content/post_|article/)\d+", re.I)
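# (Illustrative matches, paths made up: "/zwgk/post_12345.html" and
#  "/xxgk/article/2024.html" count as article-shaped; "/zwgk/zfwj/" does not.)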
# Detection thresholds
_HAS_ARTICLES_MIN = 3
# BFS bounds — keep responses under ~20s on slow gov hosts
DEFAULT_MAX_DEPTH = 3
DEFAULT_MAX_NODES = 80


def _column_id_from_path(path: str) -> str | None:
    """Pull a yaml-friendly column_id from the last meaningful segment.

    /zwgk/zfwj/gfxwj/index.html → 'gfxwj'
    /zwgk/qxyw/                 → 'qxyw'
    /zwgk/                      → 'zwgk'
    /                           → None
    """
    segs = [s for s in path.split("/") if s and s != "index.html"]
    if not segs:
        return None
    last = segs[-1].replace(".html", "")
    return last.replace("-", "_") if last else None


def discover_html_sub_columns(url: str, html: str) -> list[dict[str, Any]]:
    """One-level child discovery. Returns candidates strictly *under* the
    URL's path, filtering out external links, article URLs, anchors, and
    'more' pagers. Used as the BFS expansion step."""
    sel = Selector(text=html or "")
    base = urlparse(url)
    base_path = base.path if base.path.endswith("/") else base.path.rsplit("/", 1)[0] + "/"

    out: dict[str, dict[str, Any]] = {}
    for a in sel.css("a"):
        href = a.css("::attr(href)").get() or ""
        text = (a.css("::text").get() or "").strip()
        if not href or not text:
            continue
        if not (2 <= len(text) <= 30):
            continue
        if any(rj in text for rj in _REJECT_TEXTS):
            continue
        full = urljoin(url, href)
        u = urlparse(full)
        full_clean = f"{u.scheme}://{u.netloc}{u.path}"
        if u.netloc and u.netloc != base.netloc:
            continue
        if not u.path.startswith(base_path):
            continue
        if u.path == base.path:
            continue
        if any(m in u.path for m in _ARTICLE_MARKERS):
            continue
        if u.path.endswith(".html") and not u.path.endswith("/index.html"):
            continue
        cid = _column_id_from_path(u.path)
        if not cid:
            continue
        if full_clean in out:
            continue
        # Depth under base_path; ignore a trailing index.html so '.../zfwj/'
        # and '.../zfwj/index.html' report the same depth.
        rel_segs = [s for s in u.path[len(base_path):].split("/") if s and s != "index.html"]
        depth = len(rel_segs)
        out[full_clean] = {
            "column_id": cid,
            "name": text,
            "list_url": full_clean,
            "depth_in_parent": depth,
        }
    return list(out.values())
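
# Illustrative (hypothetical host): with url="https://example.gov.cn/zwgk/" and
# an anchor <a href="zfwj/">政府文件</a> on that page, the candidate comes out as
#   {"column_id": "zfwj", "name": "政府文件",
#    "list_url": "https://example.gov.cn/zwgk/zfwj/", "depth_in_parent": 1}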


def _has_article_links(html: str, threshold: int = _HAS_ARTICLES_MIN) -> bool:
    """Quick check: does the HTML contain at least `threshold` href values
    that look like article URLs? Used to mark leaf list pages."""
    if not html:
        return False
    return len(_ARTICLE_HREF_RE.findall(html)) >= threshold


def discover_html_tree(
    entry_url: str,
    *,
    max_depth: int = DEFAULT_MAX_DEPTH,
    max_nodes: int = DEFAULT_MAX_NODES,
    timeout: float = 12.0,
) -> list[dict[str, Any]]:
    """Bounded BFS from entry_url. Each node carries:
        - column_id, name, list_url, depth (0=entry)
        - parent_url (None for entry)
        - has_articles: True when the page itself contains >=3 article-shaped
          hrefs → it's a leaf the user can crawl
        - has_children: True when its sub_columns list isn't empty →
          intermediate portal
    """
    seen_url: set[str] = set()
    queue: list[tuple[str, int, str | None, str]] = [
        (entry_url, 0, None, urlparse(entry_url).path.rstrip("/").split("/")[-1] or "root")
    ]
    nodes: list[dict[str, Any]] = []

    # Use the fetcher chain (httpx → playwright fallback) so ctct-shielded
    # hosts (gdqy.gov.cn, and similar prefecture sites added later) get
    # unlocked before we try to parse anchors. Cookies persist across the
    # chain's calls, so the BFS naturally reuses them on child pages without
    # us having to wire up a Client manually.
    from govcrawler.fetcher.chain import fetch_html as _fetch_html_chain
    import time as _time
    # Polite spacing between BFS iterations so an 80-node tree doesn't fire
    # 80 requests within seconds against a single host (provincial-level /
    # ctct sites tend to slap IP rate-limits on bursts). 1.5s is a reasonable
    # default for an admin-triggered probe: operators can tolerate the wait,
    # and it keeps us well under any plausible threshold.
    BFS_DELAY_SEC = 1.5
    first = True
    while queue and len(nodes) < max_nodes:
        url, depth, parent_url, link_text = queue.pop(0)
        if url in seen_url:
            continue
        seen_url.add(url)
        if not first:
            _time.sleep(BFS_DELAY_SEC)
        first = False
        fr = _fetch_html_chain(url)
        if fr.error or not fr.html or fr.is_challenge:
            err = fr.error or ("ctct_challenge" if fr.is_challenge else "empty")
            log.warning("discover_html fetch failed url=%s err=%s", url, err)
            if depth == 0:
                raise RuntimeError(err)
            continue
        html = fr.html

        sub_cols = discover_html_sub_columns(url, html)
        has_articles = _has_article_links(html)
        cid = _column_id_from_path(urlparse(url).path) or "root"
        nodes.append({
            "column_id": cid,
            "name": link_text,
            "list_url": url,
            "depth": depth,
            "parent_url": parent_url,
            "has_articles": has_articles,
            "has_children": bool(sub_cols),
        })

        if depth < max_depth:
            for sc in sub_cols:
                if sc["list_url"] in seen_url:
                    continue
                queue.append((sc["list_url"], depth + 1, url, sc["name"]))

    return nodes
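
# A minimal usage sketch (hypothetical host; assumes the fetcher chain can
# reach it):
#
#   nodes = discover_html_tree("https://example.gov.cn/zwgk/", max_depth=2)
#   # leaf lists that can be turned into crawl_targets directly:
#   leaves = [n for n in nodes if n["has_articles"] and not n["has_children"]]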


@router.post("/api/targets/discover-html")
def discover_html_endpoint(payload: dict[str, Any] = Body(...)) -> dict[str, Any]:
    """Fetch a portal URL, recursively discover sub-column index pages
    (max_depth levels), and return the flattened tree the UI can render
    as a checkbox picker. Counterpart to gkmlpt's `/api/targets/discover`
    for non-adapter (yaml-path) sites."""
    entry_url = _normalize_str(payload.get("entry_url"))
    if not entry_url:
        raise HTTPException(400, "entry_url is required")
    parsed = urlparse(entry_url)
    if not parsed.scheme or not parsed.netloc:
        raise HTTPException(400, f"URL must include scheme + host: {entry_url}")

    max_depth = int(payload.get("max_depth") or DEFAULT_MAX_DEPTH)
    max_nodes = int(payload.get("max_nodes") or DEFAULT_MAX_NODES)
    max_depth = max(1, min(max_depth, 5))
    max_nodes = max(5, min(max_nodes, 200))

    try:
        nodes = discover_html_tree(
            entry_url, max_depth=max_depth, max_nodes=max_nodes
        )
    except Exception as e:
        raise HTTPException(502, f"fetch portal failed: {e}")

    return {
        "entry_url": entry_url,
        "max_depth": max_depth,
        "count": len(nodes),
        "nodes": nodes,
    }
