"""robots.txt gate (COMP-02).

Per-origin RobotFileParser with a TTL cache. Network failures default to ALLOW
so that transient robots.txt errors don't silently kill the crawl, but a
clear `Disallow: /` always wins. Policy is loose but explicit: we log whenever
a URL is disallowed so that callers can skip it.
"""
from __future__ import annotations

import logging
import time
import urllib.error
import urllib.request
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

log = logging.getLogger(__name__)

CACHE_TTL_S = 3600  # 1 hour
FETCH_TIMEOUT_S = 10.0


class RobotsCache:
    """Per-host robots.txt cache. Thread-unsafe — one cache per process is fine."""

    def __init__(self, ttl_s: float = CACHE_TTL_S):
        self.ttl_s = ttl_s
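        # Maps "scheme://host" -> (parser, fetch time); parser is None when the fetch failed.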
        self._cache: dict[str, tuple[RobotFileParser | None, float]] = {}

    def _fetch(self, scheme: str, host: str) -> RobotFileParser | None:
        url = f"{scheme}://{host}/robots.txt"
        rp = RobotFileParser()
        rp.set_url(url)
        try:
            with urllib.request.urlopen(url, timeout=FETCH_TIMEOUT_S) as resp:
                content = resp.read().decode("utf-8", errors="replace")
            rp.parse(content.splitlines())
            return rp
        except (urllib.error.URLError, TimeoutError, OSError) as e:
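            # HTTPError is a subclass of URLError, so a missing robots.txt (e.g. a 404)
            # also lands here and falls through to the default-allow policy.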
            log.info("robots.txt fetch failed host=%s err=%s — default allow", host, e)
            return None

    def get(self, url: str) -> RobotFileParser | None:
        """Return the parsed robots.txt for the URL's origin, or None if it couldn't be fetched."""
        p = urlparse(url)
        if not p.netloc:
            return None
        host = p.netloc.lower()
        scheme = (p.scheme or "https").lower()
        # Key by scheme + host: http and https may serve different robots.txt files.
        key = f"{scheme}://{host}"
        now = time.time()
        cached = self._cache.get(key)
        if cached and (now - cached[1]) < self.ttl_s:
            return cached[0]
        rp = self._fetch(scheme, host)
        # Failed fetches are cached as None too, so an unreachable host isn't re-hit on every call.
        self._cache[key] = (rp, now)
        return rp

    def is_allowed(self, url: str, user_agent: str) -> bool:
        """True if crawling is allowed; default ALLOW when robots.txt is unreachable."""
        rp = self.get(url)
        if rp is None:
            return True
        allowed = rp.can_fetch(user_agent, url)
        if not allowed:
            log.info("robots.txt disallows url=%s ua=%s", url, user_agent)
        return allowed


default_cache = RobotsCache()


def is_allowed(url: str, user_agent: str) -> bool:
    """Convenience: use the module-level default cache."""
    return default_cache.is_allowed(url, user_agent)
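

# Example usage, kept as a minimal sketch: check a URL before fetching it.
# The URL and user-agent string below are placeholders, not real crawler config.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    demo_url = "https://example.com/some/page"  # hypothetical target URL
    demo_ua = "ExampleCrawler/0.1"  # hypothetical user-agent token
    if is_allowed(demo_url, demo_ua):
        print(f"allowed: {demo_url}")
    else:
        print(f"disallowed by robots.txt: {demo_url}")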
