from govcrawler.parser.detail_parser import parse_detail
from govcrawler.sites.gdqy import DETAIL_SELECTORS

# Minimal article-detail page used as the shared fixture for the tests below.
# Layout the tests depend on:
#   - <h1 class="article-title">  : the article title
#   - <div class="info">          : <span class="time"> (timestamp) and
#                                   <span class="source"> (publishing body)
#   - <div class="article-content">: three body paragraphs plus one paragraph
#                                   holding a single attachment link whose href
#                                   is site-relative ("/attach/notice.pdf")
SAMPLE_HTML = """
<html><body>
  <h1 class="article-title">关于无人驾驶航空器安全管控的公告</h1>
  <div class="info">
    <span class="time">2026-04-10 16:34:22</span>
    <span class="source">清远市人民政府</span>
  </div>
  <div class="article-content">
    <p>第一段：为保障 2026 年第四届全国轻型飞机锦标赛航空嘉年华活动期间空域安全，现就相关事项公告如下。</p>
    <p>第二段：管控时间自 2026 年 5 月 1 日起至 5 月 10 日止。</p>
    <p>第三段：管控区域为清远市清城区、清新区以北半径 30 公里内空域。</p>
    <p>附件：<a href="/attach/notice.pdf">公告全文.pdf</a></p>
  </div>
</body></html>
"""


def test_main_xpath_path():
    """Parse the sample page with the site selectors and check every field.

    Expects the title, raw publish time, source and body text to be
    extracted, and the result to report that no fallback was used.
    """
    page_url = "https://www.gdqy.gov.cn/x/post_1.html"
    parsed = parse_detail(SAMPLE_HTML, page_url, DETAIL_SELECTORS)

    # Extracted fields, checked by substring so surrounding whitespace
    # in the result does not matter.
    assert "无人驾驶航空器安全管控" in parsed.title
    assert "2026-04-10" in parsed.publish_time_raw
    assert "清远市人民政府" in parsed.source
    assert "第一段" in parsed.content_text

    # The configured selectors must have matched directly.
    assert parsed.used_fallback is False
    assert parsed.fallback_engine is None


def test_attachment_absolute():
    """Check the single site-relative attachment link is returned absolute."""
    page_url = "https://www.gdqy.gov.cn/x/post_1.html"
    parsed = parse_detail(SAMPLE_HTML, page_url, DETAIL_SELECTORS)

    attachments = parsed.attachment_urls
    assert len(attachments) == 1

    # The fixture's href is "/attach/notice.pdf"; it should come back
    # prefixed with the page's scheme and host.
    attachment = attachments[0]
    assert attachment.startswith("https://www.gdqy.gov.cn/")
    assert attachment.endswith("notice.pdf")


def test_content_text_no_html_tags():
    """Check the body text has no <p> markup but keeps line breaks."""
    page_url = "https://www.gdqy.gov.cn/x/post_1.html"
    parsed = parse_detail(SAMPLE_HTML, page_url, DETAIL_SELECTORS)

    body_text = parsed.content_text
    assert "<p>" not in body_text
    assert "\n" in body_text  # paragraph preserved
