from datetime import datetime
from pathlib import Path

from govcrawler.models import Article
from scripts.backfill_article_detail_metadata import (
    _public_values_from_html,
    apply_values,
    values_for_article,
)


def test_public_values_from_html_normalizes_gd_wjk_labels():
    """Labelled ``<label>``/``<span>`` pairs are mapped to normalized keys.

    The fixture holds five Chinese-labelled fields. The test expects each one
    under an English key, the combined "分类" value exposed as separate
    ``content_category`` / ``content_subcategory`` entries, the date returned
    as an object with ``isoformat()``, and the unsplit label text kept under
    ``metadata_json["public_meta"]``.
    """
    markup = """
    <label>索引号：</label><span>006939748/2026-00100</span>
    <label>分类：</label><span>国民经济管理、国有资产监管</span>
    <label>发布机构：</label><span>广东省人民政府</span>
    <label>成文日期：</label><span>2026-03-26</span>
    <label>文号：</label><span>粤府〔2026〕24号</span>
    """
    # Plain string fields that can be compared in one shot.
    expected_text_fields = {
        "index_no": "006939748/2026-00100",
        "publisher": "广东省人民政府",
        "doc_no": "粤府〔2026〕24号",
        "content_category": "国民经济管理",
        "content_subcategory": "国有资产监管",
    }

    parsed = _public_values_from_html(markup)

    actual_text_fields = {key: parsed[key] for key in expected_text_fields}
    assert actual_text_fields == expected_text_fields

    # The date is compared through its ISO rendering rather than by type.
    assert parsed["publish_date"].isoformat() == "2026-03-26"

    # The original, unsplit category text must survive in the raw metadata.
    public_meta = parsed["metadata_json"]["public_meta"]
    assert public_meta["分类"] == "国民经济管理、国有资产监管"


def test_values_for_article_falls_back_to_source_and_publish_time(tmp_path: Path):
    """Publisher and publish date are derived from the article's own fields.

    ``data_dir`` is pytest's per-test temporary directory. The test expects
    ``publisher`` to be the ``source_raw`` text without its "文章来源：" prefix
    and ``publish_date`` to be the calendar date of ``publish_time``.
    """
    published_at = datetime(2026, 4, 10, 16, 34, 22)
    article_fields = {
        "site_id": 1,
        "url": "https://example.com/a.html",
        "url_hash": "h",
        "source_raw": "文章来源：新华社",
        "publish_time": published_at,
    }
    article = Article(**article_fields)

    derived = values_for_article(article, data_dir=tmp_path)

    publisher = derived["publisher"]
    assert publisher == "新华社"

    publish_date = derived["publish_date"]
    assert publish_date.isoformat() == "2026-04-10"


def test_apply_values_merges_metadata_json():
    """``apply_values`` merges into existing ``metadata_json`` and reports changes.

    The article starts with a ``raw`` entry in ``metadata_json``. After
    applying a payload that carries a ``public_meta`` entry and a publisher
    with ``only_missing=False``, the test expects both top-level entries to be
    present, the publisher to be set, and the returned list to name the two
    changed fields in that order.
    """
    article = Article(
        site_id=1,
        url="https://example.com/a.html",
        url_hash="h",
        metadata_json={"raw": {"id": 1}},
    )
    incoming = {
        "metadata_json": {"public_meta": {"索引号": "idx"}},
        "publisher": "国务院",
    }

    changed_fields = apply_values(article, incoming, only_missing=False)

    assert changed_fields == ["metadata_json", "publisher"]
    # The pre-existing "raw" entry must be kept alongside the new one.
    merged_metadata = {"raw": {"id": 1}, "public_meta": {"索引号": "idx"}}
    assert article.metadata_json == merged_metadata
    assert article.publisher == "国务院"
