o
    0 i                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZ d dlmZmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z% d dl&Z&d dl'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9 ddl1m:Z: e8 rd dl;Z;d dl&m<Z<m=Z=m>Z>m?Z? ddl@mAZAmBZB e, rd dlCZCe0 rd dlDmEZE e. oe+ oe/ oe- ZFeFrd dlGZGd dlHmIZImJZJ d dlKmLZL d dlMmNZNmOZO d dlPmQZQ d dlRmSZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[ d dl\m]Z] d dl^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZm d dlnmoZo d d lpmqZqmrZrmsZs G d!d" d"eod#d$ZtG d%d& d&e]d#d$ZuG d'd( d(eSd#d$ZveretZwereuZxerevZyh d)Zzh d*Z{h d+Z|e9}e~Zd,d-d.d/iZee Zd0ZG d1d2 d2ejZd3efd4d5Zd6ed7d8d9d8fd:d;ZG d<d= d=ZG d>d? d?ZeG d@dA dAZG dBdC dCe:Ze~dDkre Ze  dS dS )E    N)ArgumentParser	Namespace)AsyncGenerator	GeneratorIterable)asynccontextmanager)	dataclassfield)BytesIO)Thread)Optional	TypedDictUnion
model_infoHF_HUB_OFFLINE)DecodeStream)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )
AutoConfigLogitsProcessorListPreTrainedTokenizerFastProcessorMixinTextIteratorStreamer)is_torch_availablelogging   )BaseTransformersCLICommand)AutoProcessorBitsAndBytesConfigGenerationConfigPreTrainedModel)ContinuousBatchingManagerRequestStatus)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionMessageParam)ChatCompletionChunkChoiceChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                   @      e Zd ZU dZeed< dS ))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        generation_configN__name__
__module____qualname____doc__str__annotations__ rX   rX   i/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/transformers/commands/serving.pyrO   {      
 rO   F)totalc                   @   rN   )+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with additional fields for the generation config (as a json string) and passing the request_id
        rP   NrQ   rX   rX   rX   rY   r\      rZ   r\   c                   @   s.   e Zd ZU dZeed< eed< dZeed< dS )%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        filerP   FstreamN)	rR   rS   rT   rU   bytesrW   rV   r_   boolrX   rX   rX   rY   r]      s
   
 r]   >   storeZmax_tool_callsuserprevious_response_id
backgroundincludeservice_tiertextZ	reasoningtop_logprobsZ
truncationprompttool_choice>   Zmax_completion_tokensnZ
modalitiesrk   stoprb   Zfunction_calllogprobsZpresence_penaltyrg   response_formatrc   Zreasoning_effortmetadataZstream_optionsZ	functionsparallel_tool_callsZ
predictionZweb_search_optionsri   Zaudio>   Ztimestamp_granularitiesrf   Zchunking_strategylanguagerj   ro   Zqwenz<tool_call>z</tool_call>)startendzx-request-idc                   @   s   e Zd ZdZdZdZdZdS )ModalityLLMVLMSTTTTSN)rR   rS   rT   rv   rw   rx   ry   rX   rX   rX   rY   ru      s
    ru   argsc                 C   s   t | S )z~
    Factory function used to instantiate serving server from provided command line arguments.

    Returns: ServeCommand
    )ServeCommand)rz   rX   rX   rY   serve_command_factory   s   r|   reqmodel_generation_configr(   returnc                 K   sZ  |  ddurtdi t| d }nt|}|jdi |}| D ]\}}|dur3t||| q%|  ddurBt	| d |_
|  ddurPt	| d |_
|  ddur^t| d |_|  ddurj| d |_|  ddurv| d |_|  ddurt| d |_t| d d	krd
|_|  ddurt| d |_|  ddurt| d  |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rP   NZmax_output_tokensZ
max_tokensZfrequency_penaltyZ
logit_biasrm   temperatureg        Ftop_pseedrX   )getr(   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatZrepetition_penaltyZsequence_biasZstop_stringsr   	do_sampler   torchmanual_seed)r}   r~   kwargsrP   Znon_standard_kwargskvrX   rX   rY   !create_generation_config_from_req   s6   


r   c                   @   s    e Zd ZdZdd Zdd ZdS )	ToolStatez7Lightweight class to keep track of the tool call state.c                 C   s   |    d S N)resetselfrX   rX   rY   __init__'  s   zToolState.__init__c                 C   s   d| _ d| _d| _d| _dS )z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   rX   rX   rY   r   *  s   
zToolState.resetN)rR   rS   rT   rU   r   r   rX   rX   rX   rY   r   $  s    r   c                	   @   sR   e Zd ZdZ	ddddedeed  fdd	Zd
d Zdd Z	dd Z
dd ZdS )
TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nmodelr)   timeout_seconds	processor)r    r   c                 C   s>   || _ t|j| _|| _|| _t| j| j| _	| j	
  d S r   )r   rV   Zname_or_path_name_or_pathr   r   	threadingTimertimeout_reached_timerrs   )r   r   r   r   rX   rX   rY   r   8  s   zTimedModel.__init__c                 C   s*   | j   t| j| j| _ | j   dS )z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   rs   r   rX   rX   rY   reset_timerE  s   
zTimedModel.reset_timerc                 C   sZ   t | dr)| jdur+| `| `d| _d| _t  tj r"tj  | j	
  dS dS dS )z>Delete the wrapped model and processor and clean up resources.r   N)hasattrr   r   gcZcollectr   cudaZis_availableZempty_cacher   r   r   rX   rX   rY   delete_modelK  s   

zTimedModel.delete_modelc                 C   s&   |    t| j d| j d d S )Nz was removed from memory after z seconds of inactivity)r   loggerinfor   r   r   rX   rX   rY   r   [  s   zTimedModel.timeout_reachedc                 C   s   t | d p
| jdu S )z)Check if the instances have been deleted.r   N)r   r   r   rX   rX   rY   
is_deleted_  s   zTimedModel.is_deletedr   )rR   rS   rT   rU   r   r   r   r   r   r   r   r   rX   rX   rX   rY   r   2  s    	

r   c                   @   s  e Zd ZU dZedddidZeed< edddidZe	ed	< ed
dg dddZ
ee	 ed< eddg dddZee	 ed< edddidZeed< ed
ddidZee	 ed< edddidZeed< edddidZeed< eddddgddZe	ed< edddidZeed< eddd idZe	ed!< ed"dd#idZeed$< ed%dd&idZeed'< ed(dd)idZe	ed*< ed
dd+idZee ed,< eddd-idZeed.< eddd/idZeed0< ed
dd1idZee	 ed2< d3d4 Zd
S )5ServeArgumentsz
    Arguments for the serve CLI.

    See the metadata arg for each argument's description -- the metadata will be printed with
    `transformers serve --help`
    Fhelpz8Whether to use continuous batching for chat completions.)defaultrp   continuous_batchingautozfDevice to use for inference; will default to `auto` andplace the model on an accelerator if available.deviceNzA`torch_dtype` is deprecated! Please use `dtype` argument instead.)r   Zbfloat16Zfloat16Zfloat32)r   choicestorch_dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.dtypez2Whether to trust remote code when loading a model.trust_remote_codezWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.attn_implementationzIWhether to use 8 bit precision for the base model - works only with LoRA.load_in_8bitzIWhether to use 4 bit precision for the base model - works only with LoRA.load_in_4bitZnf4zQuantization type.Zfp4bnb_4bit_quant_typez#Whether to use nested quantization.use_bnb_nested_quant	localhostz$Interface the server will listen to.hosti@  zPort the server will listen to.porti,  z@Time in seconds after which a model will be removed from memory.model_timeoutr   z8Logging level as a string. Example: 'info' or 'warning'.	log_levelz1The default seed for torch, should be an integer.default_seedztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.enable_corsz+Whether to turn on strict input validation.input_validationzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.force_modelc                 C   sN   | j dur#| jdu r| j | _dS | j | jkr%td| j  d| j ddS dS )z(Only used for BC `torch_dtype` argument.Nz`torch_dtype` z and `dtype` zn have different values. `torch_dtype` is deprecated and will be removed in 4.59.0, please set `dtype` instead.)r   r   
ValueErrorr   rX   rX   rY   __post_init__  s   

zServeArguments.__post_init__)rR   rS   rT   rU   r	   r   ra   rW   r   rV   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rX   rX   rX   rY   r   d  s   
 
r   c                   @   s$  e Zd ZedefddZdefddZdede	d	d
de
fddZdefddZdefddZdefddZ								dHdedee dee dee dee deed  dee dee defd d!Zd"d#defd$d%Zd&d' Zejdeeeef  fd(d)Zd*ededeedf fd+d,Zedd-defd.d/Zed0efd1d2Z d*ede!eddf fd3d4Z"d*ede!eddf fd5d6Z#d*ede!eddf fd7d8Z$d*ede%fd9d:Z&ededed; fd<d=Z'd>edefd?d@Z(dAefdBdCZ)dAede*d-ef fdDdEZ+dAede*d-e,f fdFdGZ-dS )Ir{   parserc                 C   s$   t f}| jd|d}|jtd dS )z
        Register this command to argparse so it's available for the transformer-cli

        Args:
            parser: Root parser to register command-specific arguments
        serve)dataclass_types)funcN)r   
add_parserset_defaultsr|   )r   r   Zserve_parserrX   rX   rY   register_subcommand  s   z ServeCommand.register_subcommandrz   c                 C   s  t std|| _| jj| _| jrAt }| jjd u r'|| j_t	d|  t
 }| jj|vrAtd| d| jj d| d| jj| _| jjd urSt| jj td}|tj| jj   td}|tj| jj   i | _d | _d | _d | _d | _d S )	NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`z-No attn_implementation passed, defaulting to z"Continuous batching only supports z as attn_implementation, got z#Try setting `--attn_implementation=`transformersz+transformers.generation.continuous_batching)serve_dependencies_availableImportErrorrz   r   use_continuous_batchingr*   Z default_attention_implementationr   r   r   Z#supported_attention_implementationsr   r   r   r   r   r#   
get_loggersetLevelZ
log_levelsr   lowerloaded_models#running_continuous_batching_managerlast_messageslast_kv_cache
last_model)r   rz   Zdefault_attn_implZsupported_attn_implZtransformers_loggerZ	cb_loggerrX   rX   rY   r     s@   




zServeCommand.__init__requestschema	validatorrL   unused_fieldsc           
   
   C   s   t d|  t| }|j}|| }|r(t d|  tdd| d| jjriz|	| W n t
yQ } zt d|   td| dd}~ww ||@ }	|	rkt d|	  tdd|	 ddS dS )a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`TypedDict`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   debugsetkeysZ__mutable_keys__errorr.   rz   r   Zvalidate_pythonrM   errors)
r   r   r   r   r   Z
input_keysZpossible_keysZunexpected_keyseZunused_fields_in_requestrX   rX   rY   _validate_request  s.   

zServeCommand._validate_requestc                 C      | j |tttd d S N)r   r   r   r   )r   rO   response_validatorUNUSED_RESPONSE_FIELDSr   r   rX   rX   rY   validate_response_requestA     
z&ServeCommand.validate_response_requestc                 C   r   r   )r   r\   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr   rX   rX   rY    validate_chat_completion_requestI  r   z-ServeCommand.validate_chat_completion_requestc                 C   r   r   )r   r]   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr   rX   rX   rY   validate_transcription_requestQ  r   z+ServeCommand.validate_transcription_requestr   N
request_idcontentr   rolefinish_reason
tool_callsr8   decode_stream	tokenizerr   c	           
   
   C   sl   |dur|dur|dur| |j|}t|tt |tt|||dd|dgddd}	d|	jd	d
 dS )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        N)r   r   r   r   )deltaindexr   r   zchat.completion.chunk)idcreatedr   r   Zsystem_fingerprintobjectdata: TZexclude_none

)stepZ
_tokenizerr5   r   timer6   r7   model_dump_json)
r   r   r   r   r   r   r   r   r   chunkrX   rX   rY   build_chat_completion_chunkY  s(   "
z(ServeCommand.build_chat_completion_chunkresponserK   c                 C   s   d|j dd dS )a  
        Builds a event of a streaming OpenAI Response response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            response (`BaseModel`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        r  Tr  r  )r  )r   r  rX   rX   rY   build_response_event  s   z!ServeCommand.build_response_eventc           
         s  t dtf fdd}t|d} jr%|jtdgddgdgd td d	d
lm} |	dd|dt
f fdd}|	ddt
f fdd}|	dd|f fdd}|d|d fdd}|ddd }|dd|fdd}	tj| jj jj jjd d S )!a  
        Setup and run the FastAPI server for transformers serve.

        Models will be loaded and unloaded automatically based on usage and a timeout.

        The server will expose the following endpoints:
        - POST /v1/chat/completions: Generates chat completions.
        - POST /v1/responses: Generates responses.
        - POST /v1/audio/transcriptions: Generates transcriptions from audio.
        - GET /v1/models: Lists available models for 3rd party tools.

        Requires FastAPI and Uvicorn to be installed.
        appc                   sB   d V   j  D ]}|  q	 jd ur jjddd d S d S )NT   blocktimeout)r   valuesr   r   rm   )r  r   r   rX   rY   lifespan  s   

z"ServeCommand.run.<locals>.lifespan)r  *T)Zallow_originsZallow_credentialsZallow_methodsZallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.r   )Requestz/v1/chat/completionsr   bodyc                    s:    j |d  jr || jj}n |}t|ddS Nr   text/event-stream
media_type)r   r   #continuous_batching_chat_completionstater   generate_chat_completionr1   )r   r  outputr   rX   rY   chat_completion  s
   
z)ServeCommand.run.<locals>.chat_completionz/v1/responsesc                    s"    j | d  | }t|ddS r  )r   generate_responser1   )r   r"  r   rX   rY   	responses  s   
z#ServeCommand.run.<locals>.responsesz/v1/audio/transcriptionsc              
      s   |   4 I d H 5}t|d  I d H |d d}td|d j d|d j d|d jd dd	 W d   I d H  n1 I d H sDw   Y   j|d
  	|}t
|ddS )Nr^   r   )r^   r   zReceived file: z; MIME type: z; size:    z.2fz KiBr  r  r  )formr]   readr   r   filenamecontent_typesizer   generate_transcriptionr1   )r   r'  Zparsed_requestr"  r   rX   rY   audio_transcriptions  s   (

z.ServeCommand.run.<locals>.audio_transcriptionsz
/v1/modelsc                      s   t d  dS )Nlist)r  data)r0   get_gen_modelsrX   r   rX   rY   get_all_models  s   z(ServeCommand.run.<locals>.get_all_modelsz/healthc                   S   s   t ddiS )Nstatusok)r0   rX   rX   rX   rY   healthcheck  s   z%ServeCommand.run.<locals>.healthcheckhttpc                    s>   | j tptt }|| j_|| I d H }||j t< |S r   )headersr   X_REQUEST_IDrV   uuiduuid4r   r   )r   Z	call_nextr   r  rX   rX   rY   get_or_set_request_id  s   
z/ServeCommand.run.<locals>.get_or_set_request_id)r   r   r   N)r   r-   r   Zadd_middlewarer/   r   Zwarning_oncefastapir  postdictoptionsr   Z
middlewareuvicornrunrz   r   r   r   )
r   r  r  r  r#  r%  r-  r1  r4  r:  rX   r   rY   r@    s:   
	
"zServeCommand.runc                 C   s6   g d}t rdd |D S dd |D }dd |D S )a.  
        This is by no means a limit to which models may be instantiated with `transformers serve`: any chat-based
        model working with generate can work.

        This is a limited list of models to ensure we have a discoverable /v1/models endpoint for third-party
        integrations.
        )zMenlo/Jan-nanozMenlo/Jan-nano-128kzQwen/Qwen2.5-0.5B-InstructzQwen/Qwen2.5-3B-InstructzQwen/Qwen2.5-7B-InstructzQwen/Qwen2.5-14B-Instructz meta-llama/Llama-3.1-8B-Instructz meta-llama/Llama-3.2-1B-Instructz!meta-llama/Llama-3.3-70B-InstructzHuggingFaceTB/SmolVLM-Instructz!ibm-granite/granite-vision-3.2-2bzQwen/Qwen2.5-VL-7B-Instructc                 S   s.   g | ]}|d t j   |dd dqS )r   /r   r  r  r  Zowned_by)datetimenow	timestampsplit.0r   rX   rX   rY   
<listcomp>  s    z/ServeCommand.get_gen_models.<locals>.<listcomp>c                 S   s   g | ]}t |qS rX   r   rG  rX   rX   rY   rI  #  s    c                 S   s$   g | ]}|j d |j |jdqS )r   rB  )r  
created_atrE  ZauthorrG  rX   rX   rY   rI  $  s    r   )r   modelsZmodel_infosrX   rX   rY   r0     s   	
zServeCommand.get_gen_modelsr}   c              	      s    |d jk}_|r!jdur!jjddd d_\}}t|dr0|jn|t||jj	j
dddd	 jdu rW|j dd
_t j_j  |j|d ddd|j}fdd fdd}||d |S )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   r  r   FZfifo)r~   eos_token_idpad_token_idZ	use_cacher   Z	scheduler)rP   Z	streamingmessagespt)return_tensorsadd_generation_promptc              
   3   s    z6j | d dV  j| D ]#}|jtjkr&j | d dV   W d S j | |jd  |dV  qW d S  ty` } zt	t
| j|  dt
| dV  W Y d }~d S d }~ww )	N	assistantr   r   rm   r   r   )r   r   r   r   r   data: {"error": ""})r  r   Zrequest_id_iterr2  r+   FINISHEDZgenerated_tokens	Exceptionr   r   rV   cancel_request)r   r   resultr   )model_id_and_revisionr   r   rX   rY   stream_chat_completion^  s2   
 zPServeCommand.continuous_batching_chat_completion.<locals>.stream_chat_completionc                   s   z't |  d}jj| | jd}||D ]}|V  tdI d H  qW d S  tjyB   j| t	
d| d Y d S w )NF)r   r   r   zRequest z was cancelled.)r   tolistr   Zadd_requestr   asynciosleepCancelledErrorrZ  r   warning)Z_inputsr   r   r  )rP   r   r]  rX   rY   cancellation_wrapperz  s   zNServeCommand.continuous_batching_chat_completion.<locals>.cancellation_wrapperr   )process_model_namer   r   rm   load_model_and_processorr   r   r   rP   rL  rM  Zinit_continuous_batchingr   Zlogit_processorrs   apply_chat_templatetor   )r   r}   r   must_discard_cacher   r   inputsrc  rX   )rP   r\  r   r]  r   rY   r  .  s<   





z0ServeCommand.continuous_batching_chat_completionr)   c                 C   sB   | j j}|t v rtj}|S |t v rtj}|S td| )NzUnknown modality: )		__class__rR   r   r  ru   rw   r   rv   r   )r   Zmodel_classnamemodalityrX   rX   rY   get_model_modality  s   zServeCommand.get_model_modalityrk  c                 C   s~  g }| D ]}|d g d}|t jkrEt|d tr|d }n"t|d tr@g }|d D ]}|d dkr:||d  q+d|}||d< nr|t jkrt|d tr^|d d|d d nY|d D ]T}|d dkrr|d | qb|d dkrd	|d d
 v rt	dd|d d
 }t
tt|}tjddd}	|	j}
||	j n|d d
 }
|d d|
d qb|| q|S )Nr   r   r   r   typerh    )rn  rh   Z	image_urlbase64urlz^data:image/.+;base64,r   z.pngF)suffixdeleteimage)rn  rq  )ru   rv   
isinstancerV   r.  appendjoinrw   resubr,   openr
   rp  	b64decodetempfileNamedTemporaryFilenamesave)rN  rk  processor_inputsmessageZparsed_messageZparsed_contentr   Z
image_datart  r^   rq  rX   rX   rY   *get_processor_inputs_from_inbound_messages  s@   




z7ServeCommand.get_processor_inputs_from_inbound_messagesc                    sv  j jdurj j|d< |d }|d d dkrdS |d jk}_\}}||}dtD ]}|jj	d 
 v rO| nq?|j|d|d	d
ddd}|j}|ddd}	djj	d 
 v rxd}	t||	dd}
t|jd}d}|r|sj }|d jd |krj}i ||
|d|d  fdd}||
S )a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   rN  rU  r   rR  r   TtoolsrO  )rQ  r  rP  Zreturn_dicttokenizer   req_0gptossFskip_special_tokensZskip_promptr~   	input_ids)streamerrP   return_dict_in_generatepast_key_valuesc              
   3   s   d}d }dj jd  v rd}d}fdd}t| d}d	}z#z|  t }jd
dV  | D ]}dj jd  v rH|d}||7 }|rV||v rUd}q7q7d ur| t	 d krhd|_
q7| t	 d kr|  j|d ddV  q7|j
r| j|7  _|jstd|j}	|	d u rq7|	d}	d|_tt|	ddd|d d}
n<|d	krq7d|jvrq7| j|d7  _| j|d8  _|jdk rd	|dd d d }tt|dddd}
j|d |
gdV  q7|d	krj||dV  q7j|dd V  |  W n# ty8 } ztt| d!t| d"V  W Y d }~nd }~ww W |  d S W |  d S |  w )#NFr  r   T<|channel|>final<|message|>c                         j di | }|j_d S NrX   generater  r   r   Zgenerate_outputr   r   rX   rY   generate_with_cache     zbServeCommand.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cachetargetr   r   rR  rS  
<|return|>rs   rt   r   )r   r   r   r   z\"name\": \"(.*?)\"r$   )r~  functionZ
_tool_call)r  r  rn  r  z"arguments": {{})	arguments)r  r  rn  )r   r   r   r   )r   r   rm   rT  rV  rW  )configarchitecturesr   r   rs   r   r  removesuffixstrip_TOOL_CALL_TOKENSr   r   r   r   rx  searchgroupr8   r9   r   countrw  rF  rY  r   r   rV   )r  _request_id
filter_cotcot_trace_endr  threadresultsZ
tool_stater[  Z	tool_nameZtoolr   generation_kwargsr   r\  r   r   Ztool_model_familyrX   rY   r]    s   





zEServeCommand.generate_chat_completion.<locals>.stream_chat_completion)rz   r   rd  r   re  rl  r  _MODELS_WITH_TOOL_SUPPORTr  r  r   rf  r   rg  r   r!   r   rP   is_continuationr   get_seq_lengthshape)r   r}   rN  rh  r   rk  r  Zsupported_model_familiesri  r  generation_streamerrP   r   seq_lenr]  rX   r  rY   r!    sf   




zz%ServeCommand.generate_chat_completionc                    s   d jk}_\}td tr6dv r)dd dgng }|dd d nUtd trjdv red d d dkrXdd dgd }n3d }d |d d	< n&d }n!td trdv r}dd dgng }|d  ntd
|j	|ddd}|
j}ddd}djjd  v rd}t||dd}tjd}d}r|sՈj }	|d jd |	krՈj}|t|||d|d  fdd}
|
|S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemrm  rc   r   r   r   z%inputs should be a list, dict, or strTrO  )rQ  rP  rd   r  r  Fr  r  Nr  rU  )ri  Zattention_maskr  rP   r  r  c                 3   s4   d}d }dj jd  v rd}d}fdd}t| d}d}d}d}zzz|  t }	td	|td
 |	dddddiidg g dddddd}
|d7 }	|
V  t
d|td
 |	dddddiidg g dddddd}|d7 }	|V  td||td dddg dd}|d7 }	|V  tdd |||td d!g d"d#}|d7 }	|V  d!}| D ]=}dj jd  v r|d$}||7 }|r||v rd}d!}qqtd%d ||||d!d&d'gd(}|d7 }	|V  qtd)d ||d|d!d&d'gd*}|d7 }	|V  td+d |||td |jg d"d#}|d7 }|d7 }	|V  td,||td dd-d|jgg d.d}|d7 }|d7 }	|V  td/|td
 |	d-ddddii|jgdg ddddd0d}|d7 }	|V  |  W nc ty } zVtd1t|  td2|t|d3}|d7 }	|V  td4|td
 |	d5ddddiig dg dddtd6t|d7d8d}|d7 }	|V  W Y d }~nd }~ww W |  d S W |  d S |  w )9NFr  r   Tr  c                     r  r  r  r  r  rX   rY   r    r  zTServeCommand.generate_response.<locals>.stream_response.<locals>.generate_with_cacher  zresponse.createdZresp_Zqueuedr  formatrn  rh   r  rq   r   rp   )r  rJ  r2  r   r  rh   r  r  r"  rq   rk   rp   )rn  sequence_numberr  r$   zresponse.in_progressZin_progresszresponse.output_item.addedZmsg_r  rR  )r  rn  r2  r   r   )rn  r  output_indexitemzresponse.content_part.addedZoutput_textr   )rn  rh   annotations)rn  item_idr  r  content_indexpartr  zresponse.output_text.deltagX@)tokenZlogprob)rn  r  r  r  r  r  rn   zresponse.output_text.done)rn  r  r  r  r  rh   rn   zresponse.content_part.donezresponse.output_item.done	completed)r  rn  r2  r   r   r  zresponse.completed)r  rJ  r2  r   r  rh   r"  r  r  rq   rk   rp   z"Exception in response generation: r   )rn  r  r  zresponse.failedfailedserver_error)coder  )r  rJ  r2  r   r  rh   r"  r  r  rq   rk   rp   r   ) r  r  r   r   rs   r
  r?   r;   r   r  rC   rD   rF   r=   rG   r  rH   rI   r>   rh   rE   r  r<   r  rw  rY  r   r   rV   rA   rB   r@   )r  r  r  r  r  r  r  r  r  rJ  Zresponse_createdZresponse_in_progressZresponse_output_item_addedZresponse_content_part_addedr  r[  Zresponse_output_text_deltaZresponse_output_text_doneZresponse_content_part_doneZresponse_output_item_doneZresponse_completedr   Zerror_eventZresponse_failedr  r   r\  r}   r   r   rX   rY   stream_response  s  





	
	


%z7ServeCommand.generate_response.<locals>.stream_response)rd  r   re  ru  rV   rv  r.  r=  r   rf  rg  r   r   r  r  r   r!   r   rP   r  r   r  r  r   Z	ones_like)r   r}   rh  r   ri  r  r  rP   r   r  r  rX   r  rY   r$    sZ   


	 
czServeCommand.generate_responsec           
         s   t  std| |d }| |\tjddd}t|jd}jj	}t
|d }tj||dd\}}||dd	j  d
 j d
< ||dd fdd}	|	 S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr  r  r^   )srmonorO  )sampling_raterP  Zinput_features)r  rP   r  c                  3   sH    j di  } j| jddd }t|d}|jdd V  d S )NT)r  r   )rh   r  rX   )r  Zbatch_decode	sequencesr2   r  )Zgenerated_idsZtranscription_textZtranscriptionZaudio_inputsaudio_modelaudio_processorr  rX   rY   _generate_transcription  s
   
zDServeCommand.generate_transcription.<locals>._generate_transcription)r   r   rd  load_audio_model_and_processorr!   r   r   rP   Zfeature_extractorr  ior
   librosaloadrg  r   r   )
r   r}   r\  r  rP   Zmodel_sampling_rateZaudio_bytesZaudio_array_r  rX   r  rY   r,    s2   z#ServeCommand.generate_transcriptionc                 C   sx   | dp	| d}d}| jdu rd}n#t| jt|kr d}ntt| jD ]}| j| || kr6d} nq'|| _|S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        rN  r  TNF)r   r   lenrange)r   r}   rN  Zreq_continues_last_messagesirX   rX   rY   r    s   
zServeCommand.is_continuationr'   c                 C   s@   | j rtd| j| j| j| jd}|S | jrtdd}|S d}|S )a  
        Returns the quantization config for the given CLI arguments.

        Args:
            args (`ServeArguments`): The serve arguments. May contain quantization settings, device, etc.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        T)r   Zbnb_4bit_compute_dtyper   Zbnb_4bit_use_double_quantZbnb_4bit_quant_storage)r   N)r   r'   r   r   r   r   )rz   quantization_configrX   rX   rY   get_quantization_config   s    z$ServeCommand.get_quantization_configmodel_idc                 C   s*   | j jdur
| j j}d|v r|S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        N@z@main)rz   r   )r   r  rX   rX   rY   rd    s
   
zServeCommand.process_model_namer\  c                 C   s>  | j }td|  d|v r|dd\}}n|d}}tj|||jd}|jdv r.|jntt	|j}| 
|}||j|d|jd}|d	urK||d
< tj|fi |}	tt|	jd }
|
j|fi |}t|dd	d	u rs||j}|jjd	u o~|jjdk}|jjd	uo|jjdk }|s|rd|j_td|  ||fS )a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        zLoading r  r$   main)revisionr   )r   Nr   )r  r   r   Z
device_mapr   Nr  r   Zhf_device_map   r&  zLoaded model )rz   r   r   rF  r&   Zfrom_pretrainedr   r   getattrr   r  r   r   r   r  rg  r   rP   r   
max_length)r   r\  rz   r  r  Zdata_processorr   r  Zmodel_kwargsr  architecturer   Zhas_default_max_lengthZhas_short_max_new_tokensrX   rX   rY   _load_model_and_data_processor.  sB   

z+ServeCommand._load_model_and_data_processorc                 C   t   || j vs| j |  r#| |\}}t|| jj|d| j |< ||fS | j |   | j | j}| j | j}||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   	r   r   r  r   rz   r   r   r   r   )r   r\  r   r   rX   rX   rY   re  k  s   
z%ServeCommand.load_model_and_processorc                 C   r  )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        r  r  )r   r\  r  r  rX   rX   rY   r    s   
z+ServeCommand.load_audio_model_and_processor)r   NNNNNNN).rR   rS   rT   staticmethodr   r   r   r   r=  r   r   r   r   r   r   rV   r   r   r.  r   r   r  r  r@  	functoolscacheanyr0  r   r  ru   rl  r  r   r!  r$  r,  ra   r  r  rd  r  tuplere  r    r  rX   rX   rX   rY   r{     s    -
1

	

8_-\- J  '0=

r{   __main__)r_  rp  r   rC  enumr  r   r  r   rx  r|  r   r
  r8  argparser   r   collections.abcr   r   r   
contextlibr   dataclassesr   r	   r
   r   typingr   r   r   Zhuggingface_hubr   Zhuggingface_hub.constantsr   Ztokenizers.decodersr   r   Z&transformers.models.auto.modeling_autor   r   Ztransformers.utils.import_utilsr   r   r   r   r   r   r   r   r   r   r    r!   utilsr"   r#   r%   r   r&   r'   r(   r)   Zgeneration.continuous_batchingr*   r+   r  ZPILr,   r   r?  r;  r-   r.   Zfastapi.middleware.corsr/   Zfastapi.responsesr0   r1   Z openai.types.audio.transcriptionr2   Z.openai.types.audio.transcription_create_paramsr3   Zopenai.types.chatr4   Z'openai.types.chat.chat_completion_chunkr5   r6   r7   r8   r9   Z*openai.types.chat.completion_create_paramsr:   Zopenai.types.responsesr;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   Z-openai.types.responses.response_create_paramsrJ   ZpydanticrK   rL   rM   rO   r\   r]   r   r   r   r   r   r   r   rR   r   r  r.  r   r  r7  Enumru   r|   r=  r   r   r   r   r{   r   r@  rX   rX   rX   rY   <module>   s    	D


	
;2q         
S