"""One-shot: backfill crawl_target.channel_path from entry_url for rows
created via the yaml-site bulk-create flow before the channel_path fix.

Affected rows: those with channel_path IS NULL but entry_url SET.
We lift the URL path (e.g. /zwgk/msjs/jzkw/index.html) and store it as
channel_path, so the admin 采集目标管理 list shows '栏目路径' instead of '-'.

Idempotent — running again skips rows that already have a value.
"""
from __future__ import annotations

import sys
from pathlib import Path
from urllib.parse import urlparse

# Make `govcrawler` importable when running as a script
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from govcrawler.db import get_sessionmaker  # noqa: E402
from govcrawler.models import CrawlTarget  # noqa: E402


def main() -> int:
    """Backfill ``crawl_target.channel_path`` from ``entry_url``.

    Selects rows where ``channel_path`` is NULL but ``entry_url`` is set,
    extracts the URL path component, and stores it as ``channel_path``.
    Idempotent: rows that already have a value are never selected.

    Returns:
        Process exit code (0 on success).
    """
    Session = get_sessionmaker()
    n_updated = 0
    n_skipped = 0
    with Session() as s:
        rows = (
            s.query(CrawlTarget)
            .filter(CrawlTarget.channel_path.is_(None))
            .filter(CrawlTarget.entry_url.isnot(None))
            .all()
        )
        for t in rows:
            # isnot(None) does not exclude empty strings — count those as
            # skipped (previously they were silently ignored, so the
            # reported skip count was too low).
            if not t.entry_url:
                n_skipped += 1
                continue
            try:
                # Prefer the URL path; for non-URL values (e.g. a bare
                # relative path) urlparse yields an empty path, so fall
                # back to the raw stored value.
                p = urlparse(t.entry_url).path or t.entry_url
            except ValueError:
                # urlparse raises ValueError on malformed URLs (e.g. bad
                # port); best-effort fallback to the raw value.
                p = t.entry_url
            # `p` is always truthy here: t.entry_url is truthy and the
            # `or` fallback guarantees a non-empty result, so no extra
            # emptiness check is needed.
            t.channel_path = p
            n_updated += 1
        s.commit()

    print(f"updated {n_updated} crawl_target rows; skipped {n_skipped}.")
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
