from govcrawler.parser.detail_parser import parse_detail
from govcrawler.sites.gdqy import DETAIL_SELECTORS

# Shared detail-page fixture used by the first three tests below.
# It contains an <h1 class="article-title"> title, an "info" block with
# "time" and "source" spans, and an "article-content" div holding three body
# paragraphs plus one paragraph with a relative attachment link
# (/attach/notice.pdf).
SAMPLE_HTML = """
<html><body>
  <h1 class="article-title">关于无人驾驶航空器安全管控的公告</h1>
  <div class="info">
    <span class="time">2026-04-10 16:34:22</span>
    <span class="source">清远市人民政府</span>
  </div>
  <div class="article-content">
    <p>第一段：为保障 2026 年第四届全国轻型飞机锦标赛航空嘉年华活动期间空域安全，现就相关事项公告如下。</p>
    <p>第二段：管控时间自 2026 年 5 月 1 日起至 5 月 10 日止。</p>
    <p>第三段：管控区域为清远市清城区、清新区以北半径 30 公里内空域。</p>
    <p>附件：<a href="/attach/notice.pdf">公告全文.pdf</a></p>
  </div>
</body></html>
"""


def test_main_xpath_path():
    """Parse SAMPLE_HTML with DETAIL_SELECTORS and check the extracted fields.

    Also checks that the result reports no fallback: ``used_fallback`` is
    False and ``fallback_engine`` is None.
    """
    page_url = "https://www.gdqy.gov.cn/x/post_1.html"
    parsed = parse_detail(SAMPLE_HTML, page_url, DETAIL_SELECTORS)

    # Title, publish time, source and body text taken from the fixture.
    assert "无人驾驶航空器安全管控" in parsed.title
    assert "2026-04-10" in parsed.publish_time_raw
    assert "清远市人民政府" in parsed.source
    assert "第一段" in parsed.content_text

    # No fallback should be reported for this page.
    assert parsed.used_fallback is False
    assert parsed.fallback_engine is None


def test_attachment_absolute():
    """The fixture's single relative attachment link comes back as an absolute URL.

    The expected URL starts with the page's scheme and host and still ends
    with the original file name.
    """
    page_url = "https://www.gdqy.gov.cn/x/post_1.html"
    parsed = parse_detail(SAMPLE_HTML, page_url, DETAIL_SELECTORS)

    attachments = parsed.attachment_urls
    assert len(attachments) == 1

    first_attachment = attachments[0]
    assert first_attachment.startswith("https://www.gdqy.gov.cn/")
    assert first_attachment.endswith("notice.pdf")


def test_content_text_no_html_tags():
    """``content_text`` has no ``<p>`` markup but keeps a newline between paragraphs."""
    page_url = "https://www.gdqy.gov.cn/x/post_1.html"
    body_text = parse_detail(SAMPLE_HTML, page_url, DETAIL_SELECTORS).content_text

    assert "<p>" not in body_text
    # A newline in the text shows the paragraph boundaries were preserved.
    assert "\n" in body_text


def test_source_prefix_is_normalized():
    """A source of "文章来源：新华社" is expected to be reported as just "新华社".

    The ``source`` selector is overridden on a copy of DETAIL_SELECTORS so the
    shared selector mapping is not mutated.
    """
    page = """
    <html><body>
      <h1 class="article-title">标题</h1>
      <span class="time">2026-04-10 16:34:22</span>
      <span class="source">文章来源：新华社</span>
      <div class="article-content"><p>正文内容足够长，足够长，足够长，足够长，足够长，足够长。</p></div>
    </body></html>
    """
    # Copy the shared selectors and override only the "source" entry.
    overridden = dict(DETAIL_SELECTORS, source="span.source::text")

    parsed = parse_detail(page, "https://www.news.cn/x.html", overridden)

    assert parsed.source == "新华社"


def test_extracts_government_public_metadata_from_label_table():
    """Public-disclosure metadata is read from ``<label>``/``<span>`` pairs.

    The fixture lists each field as a ``<div>`` holding a label and a value
    inside ``div.introduce``. Note that ``publish_date`` is asserted against
    the 成文日期 value (2026-03-26), not the 发布日期 value (2026-04-28), and
    that the 分类 value is expected to be split on "、" into category and
    subcategory while ``open_category`` keeps the full string.
    """
    page = """
    <html><body>
      <h1 class="article-title">广东省人民政府关于印发规划纲要的通知</h1>
      <span class="time">2026-04-28 10:00</span>
      <div class="introduce">
        <div><label>索引号：</label><span>006939748/2026-00100</span></div>
        <div><label>分类：</label><span>国民经济管理、国有资产监管</span></div>
        <div><label>发布机构：</label><span>广东省人民政府</span></div>
        <div><label>成文日期：</label><span>2026-03-26</span></div>
        <div><label>施行日期：</label><span>2026-07-01</span></div>
        <div><label>效力状态：</label><span>现行有效</span></div>
        <div><label>废止日期：</label><span>2031-06-30</span></div>
        <div><label>文号：</label><span>粤府〔2026〕24号</span></div>
        <div><label>发布日期：</label><span>2026-04-28</span></div>
      </div>
      <div class="article-content"><p>正文内容足够长，足够长，足够长，足够长，足够长，足够长。</p></div>
    </body></html>
    """
    page_url = "https://www.gd.gov.cn/zwgk/wjk/qbwj/yf/content/post_1.html"

    meta = parse_detail(page, page_url, DETAIL_SELECTORS)

    # Identification fields.
    assert meta.index_no == "006939748/2026-00100"
    assert meta.publisher == "广东省人民政府"
    assert meta.doc_no == "粤府〔2026〕24号"

    # Dates and validity.
    assert meta.publish_date.isoformat() == "2026-03-26"
    assert meta.effective_date.isoformat() == "2026-07-01"
    assert meta.is_effective is True
    assert meta.expiry_date.isoformat() == "2031-06-30"

    # Classification.
    assert meta.content_category == "国民经济管理"
    assert meta.content_subcategory == "国有资产监管"
    assert meta.open_category == "国民经济管理、国有资产监管"

    # The raw label/value mapping is exposed as well.
    assert meta.public_meta["索引号"] == "006939748/2026-00100"


def test_extracts_government_public_metadata_from_gov_cn_table_aliases():
    """Public-disclosure metadata is read from a ``<table>`` with alias labels.

    The fixture uses label variants such as "索 引 号", "主题分类", "发文机关"
    and "发文字号", dates written as "YYYY年MM月DD日", a 有效期 range that is
    expected to yield both ``effective_date`` and ``expiry_date``, and a
    backslash-separated category that is expected to be split into category
    and subcategory. ``publish_date`` is asserted against the 成文日期 value.
    """
    page = """
    <html><body>
      <title>国务院关于试点实施方案的批复</title>
      <span class="time">2025-09-11 15:00</span>
      <table>
        <tr>
          <td><b>索 引 号：</b></td><td>000014349/2025-00075</td>
          <td><b>主题分类：</b></td><td>国民经济管理、国有资产监管\\经济体制改革</td>
        </tr>
        <tr>
          <td><b>发文机关：</b></td><td>国务院</td>
          <td><b>成文日期：</b></td><td>2025年09月08日</td>
        </tr>
        <tr>
          <td><b>发文字号：</b></td><td>国函〔2025〕86号</td>
          <td><b>发布日期：</b></td><td>2025年09月11日</td>
        </tr>
        <tr>
          <td><b>有效期：</b></td><td>2025年10月01日至2030年09月30日</td>
          <td><b>效力状态：</b></td><td>已失效</td>
        </tr>
      </table>
      <div class="article-content"><p>正文内容足够长，足够长，足够长，足够长，足够长，足够长。</p></div>
    </body></html>
    """
    page_url = "https://www.gov.cn/zhengce/content/x.htm"

    meta = parse_detail(page, page_url, DETAIL_SELECTORS)

    # Identification fields.
    assert meta.index_no == "000014349/2025-00075"
    assert meta.publisher == "国务院"
    assert meta.doc_no == "国函〔2025〕86号"

    # Dates and validity ("已失效" is expected to map to is_effective False).
    assert meta.publish_date.isoformat() == "2025-09-08"
    assert meta.effective_date.isoformat() == "2025-10-01"
    assert meta.is_effective is False
    assert meta.expiry_date.isoformat() == "2030-09-30"

    # Classification.
    assert meta.content_category == "国民经济管理、国有资产监管"
    assert meta.content_subcategory == "经济体制改革"
    assert meta.open_category == "国民经济管理、国有资产监管、经济体制改革"


def test_public_metadata_does_not_treat_next_label_as_empty_doc_no():
    """A 文号 label followed directly by another label leaves ``doc_no`` as None.

    In the fixture the cell after "文号：" is the "发布日期：" label rather
    than a value, so no document number should be reported.
    """
    page = """
    <html><body>
      <table>
        <tr>
          <td><b>文号：</b></td>
          <td><b>发布日期：</b></td>
          <td>2026-04-28</td>
        </tr>
      </table>
      <div class="article-content"><p>正文内容足够长，足够长，足够长，足够长，足够长，足够长。</p></div>
    </body></html>
    """
    page_url = "https://example.com/a.html"

    parsed = parse_detail(page, page_url, DETAIL_SELECTORS)

    assert parsed.doc_no is None
