"""
Azure AVA (Cognitive Services) Text-to-Speech transformation

Maps OpenAI TTS spec to Azure Cognitive Services TTS API
"""

from typing import TYPE_CHECKING, Any, Coroutine, Dict, Optional, Tuple, Union
from urllib.parse import urlparse

import httpx

import litellm
from litellm.llms.base_llm.text_to_speech.transformation import (
    BaseTextToSpeechConfig,
    TextToSpeechRequestData,
)
from litellm.secret_managers.main import get_secret_str

if TYPE_CHECKING:
    from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
    from litellm.types.llms.openai import HttpxBinaryResponseContent
else:
    LiteLLMLoggingObj = Any
    HttpxBinaryResponseContent = Any


class AzureAVATextToSpeechConfig(BaseTextToSpeechConfig):
    """
    Configuration for Azure AVA (Cognitive Services) Text-to-Speech.

    Translates the OpenAI TTS request shape (voice / response_format / speed)
    into Azure's SSML-based REST API.

    Reference: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech
    """

    # Default Azure neural voice used when the caller supplies none
    DEFAULT_VOICE = "en-US-AriaNeural"

    # Azure endpoint domains
    COGNITIVE_SERVICES_DOMAIN = "api.cognitive.microsoft.com"
    TTS_SPEECH_DOMAIN = "tts.speech.microsoft.com"
    TTS_ENDPOINT_PATH = "/cognitiveservices/v1"

    # Default Azure output format (MP3) used when no response_format is given
    DEFAULT_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"

    # Voice name mappings from OpenAI voices to Azure voices
    VOICE_MAPPINGS = {
        "alloy": "en-US-JennyNeural",
        "echo": "en-US-GuyNeural",
        "fable": "en-GB-RyanNeural",
        "onyx": "en-US-DavisNeural",
        "nova": "en-US-AmberNeural",
        "shimmer": "en-US-AriaNeural",
    }

    # Response format mappings from OpenAI to Azure
    FORMAT_MAPPINGS = {
        "mp3": DEFAULT_OUTPUT_FORMAT,
        "opus": "ogg-48khz-16bit-mono-opus",
        "aac": DEFAULT_OUTPUT_FORMAT,  # Azure doesn't have AAC, use MP3
        "flac": DEFAULT_OUTPUT_FORMAT,  # Azure doesn't have FLAC, use MP3
        "wav": "riff-24khz-16bit-mono-pcm",
        "pcm": "raw-24khz-16bit-mono-pcm",
    }

    def dispatch_text_to_speech(
        self,
        model: str,
        input: str,
        voice: Optional[Union[str, Dict]],
        optional_params: Dict,
        litellm_params_dict: Dict,
        logging_obj: "LiteLLMLoggingObj",
        timeout: Union[float, httpx.Timeout],
        extra_headers: Optional[Dict[str, Any]],
        base_llm_http_handler: Any,
        aspeech: bool,
        api_base: Optional[str],
        api_key: Optional[str],
        **kwargs: Any,
    ) -> Union[
        "HttpxBinaryResponseContent",
        Coroutine[Any, Any, "HttpxBinaryResponseContent"],
    ]:
        """
        Dispatch an Azure AVA TTS request through the shared HTTP handler.

        Encapsulates Azure-specific credential resolution: api_base/api_key are
        resolved from explicit args, then litellm params, then module-level
        settings, then environment secrets — first non-empty value wins.

        Args:
            base_llm_http_handler: The BaseLLMHTTPHandler instance from main.py
            aspeech: When True, the handler is invoked asynchronously and a
                coroutine is returned instead of the response.

        Returns:
            The binary audio response, or a coroutine resolving to it when
            ``aspeech`` is True.
        """
        # Resolve api_base: explicit arg > litellm params > module setting > env
        api_base = (
            api_base
            or litellm_params_dict.get("api_base")
            or litellm.api_base
            or get_secret_str("AZURE_API_BASE")
        )

        # Resolve api_key from multiple sources (Azure-specific env vars last)
        api_key = (
            api_key
            or litellm_params_dict.get("api_key")
            or litellm.api_key
            or litellm.azure_key
            or get_secret_str("AZURE_OPENAI_API_KEY")
            or get_secret_str("AZURE_API_KEY")
        )

        # Azure AVA needs the voice as a plain string; dict-shaped voices are
        # expected to carry the voice name under "name".
        voice_str: Optional[str] = None
        if isinstance(voice, str):
            voice_str = voice
        elif isinstance(voice, dict):
            # .get returns None for a missing key (and for an empty dict)
            voice_str = voice.get("name")

        # Make the resolved credentials visible to the downstream handler
        litellm_params_dict.update(
            {
                "api_key": api_key,
                "api_base": api_base,
            }
        )
        # Delegate the actual HTTP call to the shared handler
        response = base_llm_http_handler.text_to_speech_handler(
            model=model,
            input=input,
            voice=voice_str,
            text_to_speech_provider_config=self,
            text_to_speech_optional_params=optional_params,
            custom_llm_provider="azure",
            litellm_params=litellm_params_dict,
            logging_obj=logging_obj,
            timeout=timeout,
            extra_headers=extra_headers,
            client=None,
            _is_async=aspeech,
        )

        return response

    def get_supported_openai_params(self, model: str) -> list:
        """
        Return the OpenAI TTS parameters supported by Azure AVA.

        Note: Azure also supports additional SSML-specific parameters
        (style, styledegree, role, lang) which can be passed via kwargs but
        are not part of the OpenAI spec.
        """
        return ["voice", "response_format", "speed"]

    def _convert_speed_to_azure_rate(self, speed: float) -> str:
        """
        Convert an OpenAI speed value to an Azure SSML prosody rate percentage.

        Args:
            speed: OpenAI speed value (0.25-4.0, default 1.0). No clamping is
                performed here; Azure validates the resulting rate.

        Returns:
            Signed Azure rate string (e.g., "+50%", "-50%", "+0%").

        Examples:
            speed=1.0 -> "+0%" (default)
            speed=2.0 -> "+100%"
            speed=0.5 -> "-50%"
        """
        # Signed relative percentage: (speed - 1) * 100, truncated to int
        rate_percentage = int((speed - 1.0) * 100)
        return f"{rate_percentage:+d}%"

    def _build_express_as_element(
        self,
        content: str,
        style: Optional[str] = None,
        styledegree: Optional[str] = None,
        role: Optional[str] = None,
    ) -> str:
        """
        Wrap content in an mstts:express-as element when any attribute is set.

        Args:
            content: The inner SSML content to wrap
            style: Speaking style (e.g., "cheerful", "sad", "angry")
            styledegree: Style intensity (0.01 to 2)
            role: Voice role (e.g., "Girl", "Boy", "SeniorFemale", "SeniorMale")

        Returns:
            Content wrapped in mstts:express-as if any attribute is provided,
            otherwise the raw content unchanged.
        """
        # No express-as attributes -> no wrapping needed
        if not (style or styledegree or role):
            return content

        express_as_attrs = []
        if style:
            express_as_attrs.append(f"style='{style}'")
        if styledegree:
            express_as_attrs.append(f"styledegree='{styledegree}'")
        if role:
            express_as_attrs.append(f"role='{role}'")

        express_as_attrs_str = " ".join(express_as_attrs)
        return f"<mstts:express-as {express_as_attrs_str}>{content}</mstts:express-as>"

    def _get_voice_language(
        self,
        voice_name: Optional[str],
        explicit_lang: Optional[str] = None,
    ) -> Optional[str]:
        """
        Get the language for the voice element's xml:lang attribute.

        Args:
            voice_name: The Azure voice name (e.g., "en-US-AriaNeural")
            explicit_lang: Explicitly provided language code (takes precedence)

        Returns:
            Language code if explicitly provided (e.g., "es-ES"), or None.

        Examples:
            - explicit_lang="es-ES" -> "es-ES" (explicit takes precedence)
            - voice_name="en-US-AriaNeural", explicit_lang=None -> None
            - voice_name="en-US-AvaMultilingualNeural", explicit_lang="fr-FR" -> "fr-FR"
        """
        # Explicit language wins (used with multilingual voices)
        if explicit_lang:
            return explicit_lang

        # Non-multilingual voice names already encode their language
        # (e.g., en-US-AriaNeural), so no xml:lang override is needed.
        return None

    def map_openai_params(
        self,
        model: str,
        optional_params: Dict,
        voice: Optional[Union[str, Dict]] = None,
        drop_params: bool = False,
        kwargs: Optional[Dict] = None,
    ) -> Tuple[Optional[str], Dict]:
        """
        Map OpenAI parameters to Azure AVA TTS parameters.

        Args:
            optional_params: OpenAI-spec params (response_format, speed)
            voice: OpenAI voice name, an Azure voice name, or None
            drop_params: Unused here; accepted for interface compatibility
            kwargs: Extra Azure-specific SSML params (style, styledegree,
                role, lang) passed through untouched

        Returns:
            Tuple of (mapped Azure voice name or None, mapped params dict)
        """
        # Avoid the shared mutable-default pitfall: default was previously {}
        if kwargs is None:
            kwargs = {}
        mapped_params: Dict = {}

        ##########################################################
        # Map voice
        # OpenAI uses voice as a required param, hence not in optional_params
        ##########################################################
        mapped_voice: Optional[str] = None
        if isinstance(voice, str):
            # Known OpenAI voices are translated; anything else is assumed to
            # already be an Azure voice name and is used directly.
            mapped_voice = self.VOICE_MAPPINGS.get(voice, voice)

        # Map response format; a missing (or None) value falls back to MP3
        format_name = optional_params.get("response_format")
        if format_name is not None:
            # Unknown names are passed through as raw Azure format strings
            mapped_params["output_format"] = self.FORMAT_MAPPINGS.get(
                format_name, format_name
            )
        else:
            mapped_params["output_format"] = self.DEFAULT_OUTPUT_FORMAT

        # Map speed (OpenAI: 0.25-4.0) to an SSML prosody rate percentage
        speed = optional_params.get("speed")
        if speed is not None:
            mapped_params["rate"] = self._convert_speed_to_azure_rate(speed=speed)

        # Pass through Azure-specific SSML parameters
        for azure_param in ("style", "styledegree", "role", "lang"):
            if azure_param in kwargs:
                mapped_params[azure_param] = kwargs[azure_param]
        return mapped_voice, mapped_params

    def validate_environment(
        self,
        headers: dict,
        model: str,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
    ) -> dict:
        """
        Validate the Azure environment and set up authentication headers.

        Azure AVA TTS accepts either an Ocp-Apim-Subscription-Key header or
        an Authorization: Bearer <token> header. When api_key is absent the
        bearer token is expected to be attached later by the token handler.

        Returns:
            A copy of ``headers`` with auth and content-type headers added;
            the input dict is not mutated.
        """
        validated_headers = headers.copy()

        if api_key:
            # Subscription key provided -> use it directly
            validated_headers["Ocp-Apim-Subscription-Key"] = api_key

        # Request body is SSML, not JSON
        validated_headers["Content-Type"] = "application/ssml+xml"

        # Identify the client to Azure
        validated_headers["User-Agent"] = "litellm"

        return validated_headers

    def get_complete_url(
        self,
        model: str,
        api_base: Optional[str],
        litellm_params: dict,
    ) -> str:
        """
        Build the complete URL for an Azure AVA TTS request.

        Azure TTS endpoint format:
        https://{region}.tts.speech.microsoft.com/cognitiveservices/v1

        Cognitive Services endpoints are rewritten to the regional TTS host;
        custom endpoints get the TTS path appended.

        Raises:
            ValueError: If api_base is None.
        """
        if api_base is None:
            raise ValueError(
                f"api_base is required for Azure AVA TTS. "
                f"Format: https://{{region}}.{self.COGNITIVE_SERVICES_DOMAIN} or "
                f"https://{{region}}.{self.TTS_SPEECH_DOMAIN}"
            )

        # Normalize and parse the base URL
        api_base = api_base.rstrip("/")
        parsed_url = urlparse(api_base)
        hostname = parsed_url.hostname or ""

        # Cognitive Services endpoint -> convert to the regional TTS endpoint
        if self._is_cognitive_services_endpoint(hostname=hostname):
            region = self._extract_region_from_hostname(
                hostname=hostname,
                domain=self.COGNITIVE_SERVICES_DOMAIN,
            )
            return self._build_tts_url(region=region)

        # Already a TTS endpoint -> ensure the API path is present
        if self._is_tts_endpoint(hostname=hostname):
            if not api_base.endswith(self.TTS_ENDPOINT_PATH):
                return f"{api_base}{self.TTS_ENDPOINT_PATH}"
            return api_base

        # Custom endpoint -> append the standard TTS path
        return f"{api_base}{self.TTS_ENDPOINT_PATH}"

    def _is_cognitive_services_endpoint(self, hostname: str) -> bool:
        """Return True if hostname is a Cognitive Services endpoint."""
        return (
            hostname == self.COGNITIVE_SERVICES_DOMAIN
            or hostname.endswith(f".{self.COGNITIVE_SERVICES_DOMAIN}")
        )

    def _is_tts_endpoint(self, hostname: str) -> bool:
        """Return True if hostname is a TTS endpoint."""
        return (
            hostname == self.TTS_SPEECH_DOMAIN
            or hostname.endswith(f".{self.TTS_SPEECH_DOMAIN}")
        )

    def _extract_region_from_hostname(self, hostname: str, domain: str) -> str:
        """
        Extract the region prefix from a hostname.

        Examples:
            eastus.api.cognitive.microsoft.com -> eastus
            api.cognitive.microsoft.com -> ""
        """
        suffix = f".{domain}"
        if hostname.endswith(suffix):
            return hostname[: -len(suffix)]
        return ""

    def _build_tts_url(self, region: str) -> str:
        """Build the complete TTS URL, with a region prefix when available."""
        if region:
            return f"https://{region}.{self.TTS_SPEECH_DOMAIN}{self.TTS_ENDPOINT_PATH}"
        return f"https://{self.TTS_SPEECH_DOMAIN}{self.TTS_ENDPOINT_PATH}"

    def is_ssml_input(self, input: str) -> bool:
        """
        Return True if input contains an SSML <speak> tag, False otherwise.

        Detection is intentionally lenient: any occurrence of "<speak>" or
        "<speak " anywhere in the input is treated as SSML, even though
        https://www.w3.org/TR/speech-synthesis/ requires SSML documents to
        be rooted at <speak>.
        """
        return "<speak>" in input or "<speak " in input

    def transform_text_to_speech_request(
        self,
        model: str,
        input: str,
        voice: Optional[str],
        optional_params: Dict,
        litellm_params: Dict,
        headers: dict,
    ) -> TextToSpeechRequestData:
        """
        Transform an OpenAI TTS request into Azure AVA SSML format.

        Note: optional_params should already be mapped via map_openai_params
        in main.py.

        Supports Azure-specific SSML features:
        - style: Speaking style (e.g., "cheerful", "sad", "angry")
        - styledegree: Style intensity (0.01 to 2)
        - role: Voice role (e.g., "Girl", "Boy", "SeniorFemale", "SeniorMale")
        - lang: Language code for multilingual voices (e.g., "es-ES", "fr-FR")

        Auto-detects SSML:
        - If input contains <speak>, it is passed through as-is.

        Returns:
            TextToSpeechRequestData: Contains SSML body and Azure-specific
            headers (including X-Microsoft-OutputFormat).
        """
        # Voice was already mapped in main.py; fall back to the default voice
        azure_voice = voice or self.DEFAULT_VOICE

        # Output format was already mapped in main.py; Azure reads it from
        # the X-Microsoft-OutputFormat header.
        output_format = optional_params.get(
            "output_format", self.DEFAULT_OUTPUT_FORMAT
        )
        headers["X-Microsoft-OutputFormat"] = output_format

        # SSML input is passed through untouched (similar to Vertex AI)
        if self.is_ssml_input(input=input):
            return TextToSpeechRequestData(
                ssml_body=input,
                headers=headers,
            )

        # Build SSML from plain text. The default rate "+0%" (no change)
        # matches what _convert_speed_to_azure_rate emits for speed=1.0;
        # an unsigned "0%" would be ambiguous in SSML prosody terms.
        rate = optional_params.get("rate", "+0%")
        style = optional_params.get("style")
        styledegree = optional_params.get("styledegree")
        role = optional_params.get("role")
        lang = optional_params.get("lang")

        # Escape XML special characters in the input text (& first, so the
        # later entities are not double-escaped)
        escaped_input = (
            input.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
            .replace("'", "&apos;")
        )

        # The mstts namespace is only required for the express-as element
        use_mstts = style or role or styledegree

        if use_mstts:
            xmlns = "xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts'"
        else:
            xmlns = "xmlns='http://www.w3.org/2001/10/synthesis'"

        # Inner content: text wrapped in a prosody element carrying the rate
        prosody_content = f"<prosody rate='{rate}'>{escaped_input}</prosody>"

        # Optionally wrap in mstts:express-as for style/role control
        voice_content = self._build_express_as_element(
            content=prosody_content,
            style=style,
            styledegree=styledegree,
            role=role,
        )

        # Optional xml:lang override on the voice element (multilingual voices)
        voice_lang = self._get_voice_language(
            voice_name=azure_voice,
            explicit_lang=lang,
        )
        voice_lang_attr = f" xml:lang='{voice_lang}'" if voice_lang else ""

        ssml_body = f"""<speak version='1.0' {xmlns} xml:lang='en-US'>
    <voice name='{azure_voice}'{voice_lang_attr}>
        {voice_content}
    </voice>
</speak>"""

        # Return the declared type (was previously a raw dict literal,
        # inconsistent with the SSML pass-through branch above)
        return TextToSpeechRequestData(
            ssml_body=ssml_body,
            headers=headers,
        )

    def transform_text_to_speech_response(
        self,
        model: str,
        raw_response: httpx.Response,
        logging_obj: "LiteLLMLoggingObj",
    ) -> "HttpxBinaryResponseContent":
        """
        Transform the Azure AVA TTS response into the standard return type.

        Azure returns the audio bytes directly in the response body; wrap the
        raw response in HttpxBinaryResponseContent for a consistent interface.
        """
        from litellm.types.llms.openai import HttpxBinaryResponseContent

        return HttpxBinaryResponseContent(raw_response)

