from __future__ import annotations
from lxml import html as lxml_html

# Tags that html_to_text treats as line-breaking when flattening markup.
BLOCK_TAGS = {
    # headings
    "h1", "h2", "h3", "h4", "h5", "h6",
    # paragraphs and generic containers
    "p", "div", "section", "article", "blockquote",
    # list items, table rows, explicit line breaks
    "li", "tr", "br",
}

# Tags whose entire subtree html_to_text discards before extracting text.
REMOVE_TAGS = {
    "script",
    "style",
    "noscript",
    "iframe",
}


def html_to_text(html_str: str) -> str:
    """Convert an HTML fragment to paragraph-preserving plain text.

    Processing steps:

    - Drop every ``REMOVE_TAGS`` subtree (script/style/noscript/iframe)
      while keeping the text that follows the dropped element.
    - Place a line break at the start and at the end of every
      ``BLOCK_TAGS`` element so neighbouring text is never glued to it.
    - Strip each resulting line and discard the empty ones.

    Returns ``""`` when the input is empty or whitespace-only.
    """
    if not html_str or not html_str.strip():
        return ""
    try:
        doc = lxml_html.fragment_fromstring(html_str, create_parent="div")
    except Exception:
        # Deliberate best-effort fallback: whatever the fragment parser
        # rejected is retried wrapped in an explicit <div>.
        doc = lxml_html.fromstring(f"<div>{html_str}</div>")

    # Snapshot the elements first: the tree is mutated during the walk.
    for el in list(doc.iter()):
        if el.tag in REMOVE_TAGS and el.getparent() is not None:
            # drop_tree() merges el.tail into the preceding text, whereas
            # parent.remove(el) would silently discard the text that
            # follows the removed element.
            el.drop_tree()

    for el in doc.iter():
        if el.tag not in BLOCK_TAGS:
            continue
        if el.tag != "br":
            # Break *before* the block's content so text that directly
            # precedes the element is not joined to it. <br> is a void
            # element and only needs the trailing break.
            el.text = "\n" + (el.text or "")
        # Break *after* the block closes, ahead of whatever follows it.
        el.tail = "\n" + (el.tail or "")

    stripped = (ln.strip() for ln in doc.text_content().splitlines())
    return "\n".join(ln for ln in stripped if ln)