o
    )i                    @   s  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlZd dlZd dl Z!d dl"Z"d dl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) d d	l*m+Z+ d d
l,m-Z- d dl.m/Z/m0Z0m1Z1 d dlm2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZAmBZBmCZC d dlDmEZE d dlFmGZG d dlHmIZI d dlJmKZK d dlLmMZM d dlNmOZO d dlPmQZQ d dlRmSZS d dlTmUZUmVZVmWZW d dlXmYZY d dlZm[Z[ d dl\m]Z]m^Z^ d dl_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{ d dl|m}Z} d dl~mZ d d lmZ d d!lmZ d d"lmZ d d#lmZmZmZ d d$lmZ d d%lmZ d d&lmZ d d'lmZ d d(lmZmZ d d)lmZ d d*lmZmZmZ d d+lmZmZmZmZ d d,lmZ d d-lmZ d d.lmZ d d/lmZ d d0lmZ d d1lmZmZmZmZmZmZ d d2lmZ d d3lmZ ejed4< ed5Ze Zee j ed6< ed7e&fd8d9Zeejddd:d;ed<ed=ee d>eeeef  d?eeS f
d@dAZeejdBdd:dCeKd<ed=ed>eeeef  d?eeS f
dDdEZdFe)fdGdHZe$ ZG dIdJ dJe0Zd7e&fdKdLZdMe)d?efdNdOZdMe)d?efdPdQZdMe)d?ee fdRdSZdMe)d?ee} fdTdUZdMe)d?ee fdVdWZdMe)d?ee fdXdYZdMe)d?ee fdZd[ZdMe)d?ee fd\d]ZdMe)d?ee fd^d_ZdMe)d?ee fd`daZdMe)d?efdbdcZdMe)d?efdddeZdMe)d?efdfdgZdMe)d?eSfdhdiZejdje0dkdFe)d?e0fdldmZeĠdndMe)fdodpZejdqe0dkejdqe0dkdFe)d?e0fdrdsZejdte%eÃgejjduekiejjduekiejjduekiejjduekiidvedMeudFe)fdwdxZejdye%eÃgejjduekiejjduekiejjduekiidvedMefdFe)fdzd{Zd|d} ZeĠd~dFe)fddZeĠddd Zejde%eÃgejjddi iiejjduekiejjduekiejjduekiidvedMeqdFe)fddZeĠddedFe)fddZeĠddedFe)fddZejde%eÃgejjddi iiejjduekiejjduekiejjduekiidveedMe`dFe)fddZejde%eÃgejjddi iiejjduekiejjduekiejjduekiidveedMeddFe)fddZejde%eÃgejjduekiejjduekiidveedMehdFe)fddZejde%eÃgejjduekiejjduekiidveedMemdFe)fddZejde%eÃgdeedMebdFe)fddZejde%eÃgejjduekiejjduekiidveedMesdFe)fddZejde%eÃgejjduekiejjduekiidveedMesdFe)fddZejdejjddi iiejjduekiejjduekiejjduekiideedFe)dMeewe' f fddZejdejjddi iiejjduekiejjduekiejjduekiideedMeeye' f dFe)fddZejde%eÃgejjduekiejjduekiidveedMeodFe)fddZejde%eÃgejjduekiejjduekiidvedMeodFe)fddZejde%eÃgejjduekiejjduekiidvedMeodFe)fddZeGjred eĠddFe)fddZeĠddFe)fddZeĠddFe)fddZeĠddFe)fddZeĠdádFe)fddńZejde%eÃgejjdueiejjduekiejjduekiejjduekiidvdFe)fddȄZeĠdɡdFe)fdd˄ZeZee)gee f Z eee)gee f Ze`eeffedeeffeheeffebeeffeseeffeoeeffemeeffgZeeeee ef f  ed< dd΄ eD Zejde%eÃgejjduekiejjduekiejjduekiidvdFe)fddфZeGjredҡ eĠdӡdFe)fddՄZ	eĠd֡dFe)fdd؄Z
eGjred١ ejde%eÃgddMeldFe)fdd܄Zejde%eÃgddMe{dFe)fdd߄Zdee d?ee fddZG dd dZG dd dZdBaG dd dZded?efddZG dd dZded?dfddZded?dfddZd;ed?e&fddZdieSdeIde;d;ed?df
ddZdeeef d?ejfddZded?ejfddZdd  Zdd Zd
ddZ	d	d
ddZe dkre  edd	Z!e]e!Z!e!" Z#e^e# e"$ee# dS dS (      N)	Namespace)AsyncIterator	Awaitable)asynccontextmanager)partial)
HTTPStatus)	AnnotatedAnyCallableOptional)	APIRouterDependsFastAPIFormHTTPExceptionRequest)RequestValidationError)CORSMiddleware)JSONResponseResponseStreamingResponse)make_asgi_app)Instrumentatoriterate_in_threadpool)URLHeadersMutableHeadersState)Mount)ASGIAppMessageReceiveScopeSend)assert_never)
VllmConfig)AsyncEngineArgs)AsyncLLMEngine)MQLLMEngineClient)run_mp_engine)EngineClient)load_chat_templateresolve_hf_chat_templateresolve_mistral_chat_template)
serve_http)RequestLogger)make_arg_parservalidate_parsed_serve_args)ChatCompletionRequestChatCompletionResponseClassificationRequestClassificationResponseCompletionRequestCompletionResponseDetokenizeRequestDetokenizeResponseEmbeddingRequestEmbeddingResponse	ErrorInfoErrorResponseLoadLoRAAdapterRequestPoolingRequestPoolingResponseRerankRequestRerankResponseResponsesRequestResponsesResponseScoreRequestScoreResponseTokenizeRequestTokenizeResponseTranscriptionRequestTranscriptionResponseTranslationRequestTranslationResponseUnloadLoRAAdapterRequest)OpenAIServingChat)ServingClassification)OpenAIServingCompletion)OpenAIServingEmbedding)OpenAIServing)BaseModelPathLoRAModulePathOpenAIServingModels)OpenAIServingPooling)OpenAIServingResponses)ServingScores)OpenAIServingTokenization)OpenAIServingTranscriptionOpenAIServingTranslation)ToolParserManager)DemoToolServerMCPToolServer
ToolServer)cli_env_setupload_aware_calllog_non_default_argswith_cancellation)init_logger)ReasoningParserManager)(maybe_register_config_serialize_by_value)MistralTokenizer)UsageContext)DeviceFlexibleArgumentParserdecorate_logsget_open_zmq_ipc_pathis_valid_ipv6_address
set_ulimit)get_prometheus_registry)__version__prometheus_multiproc_dirz"vllm.entrypoints.openai.api_server_running_tasksappc                   s   zG| j jr"| j j  fdd}t| }t| |tj nd }t	
  t	  zd V  W |d ur9|  n
|d urC|  w w W | ` d S | ` w )Nc                      s&   	 t tjI d H    I d H  qN)asynciosleepenvsZVLLM_LOG_STATS_INTERVALZdo_log_stats engine_clientry   n/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py
_force_log   s
   zlifespan.<locals>._force_log)state	log_statsr{   rv   create_taskrs   addadd_done_callbackremovegcZcollectfreezecancel)rt   r}   taskry   rz   r|   lifespany   s*   

r   usage_context disable_frontend_multiprocessingclient_configargsr   r   r   returnc             	   C  s   t ddkr!td td tdg t  td t	
| }|d u r/t| j}t||||d4 I d H }|V  W d   I d H  d S 1 I d H sPw   Y  d S )NZVLLM_WORKER_MULTIPROC_METHOD
forkserverz!Setup forkserver with pre-importszvllm.v1.engine.async_llmzForkserver setup complete!r   )osgetenvloggerdebugmultiprocessingZset_start_methodZset_forkserver_preloadr   Zensure_runningr'   Zfrom_cli_argsboolr   *build_async_engine_client_from_engine_args)r   r   r   r   engine_argsZenginery   ry   r|   build_async_engine_client   s*   	



.r   Fr   c             	     s  | j |d}tjrW|rtd ddlm} d}|r |dnd}|r)|dnd}z#|j||| j	| j
|||d	}| I dH  |V  W |rM|  dS dS |rV|  w w t|s^|rd}	z tj||| j	| j
d
}	|	V  W |	r}t|	dr|	  dS dS dS |	rt|	dr|	  w w w dtjvrt atjtjd< ntd t  td  td}
t  tjdddd}|
jt|tj  | j
| j	|fd}|!  |j"}|dusJ dt#d|  fdd}t$%| t&t ||}t'( )d|I dH }zN	 z	|* I dH  W n t+y"   |, r|j-s t.ddY nw q|V  W |/  |0  |1d |j2du r@|3  ddl4m5} |6|j" dS |/  |0  |1d |j2du re|3  ddl4m5} |6|j" w )z
    Create EngineClient, either:
        - in-process using the AsyncLLMEngine Directly
        - multiprocess using AsyncLLMEngine RPC

    Returns the Client or None if the creation failed.
    )r   zrV1 is enabled, but got --disable-frontend-multiprocessing. To disable frontend multiprocessing, set VLLM_USE_V1=0.r   )AsyncLLMNclient_count   client_index)vllm_configr   enable_log_requestsdisable_log_statsZclient_addressesr   r   )r   r   r   r   shutdownZPROMETHEUS_MULTIPROC_DIRzFound PROMETHEUS_MULTIPROC_DIR was set by user. This directory must be wiped between vLLM runs or you will find inaccurate metrics. Unset the variable and vLLM will properly handle cleanup.z0Multiprocessing frontend to use %s for IPC Path.ZspawnbTF)lock)targetr   zEngine process failed to start.z"Started engine process with PID %dc                     s*     dd} tj| rt|  d S d S )Nzipc:// )replacer   pathexistsr   )Zsocket_pathZipc_pathry   r|   _cleanup_ipc_path)  s   zEbuild_async_engine_client_from_engine_args.<locals>._cleanup_ipc_pathzCEngine process failed to start. See stack trace for the root cause.   )multiprocess)7Zcreate_engine_configrx   VLLM_USE_V1r   warningZvllm.v1.engine.async_llmr   popZfrom_vllm_configr   r   Zreset_mm_cacher   r)   Zis_unsupported_configr(   hasattrr   environtempfileTemporaryDirectoryrr   namerm   r   r   Zget_contextrg   ValueProcessr*   ri   OPENAI_API_SERVERstartpidinfoatexitregisterr   rv   get_running_looprun_in_executorsetupTimeoutErroris_alivevalueRuntimeError	terminateclosejoinexitcodekillprometheus_clientr   Zmark_process_dead)r   r   r   r   r   r   Z	async_llmr   r   r{   contextZengine_aliveZengine_processZ
engine_pidr   Zbuild_clientZmq_engine_clientr   ry   r   r|   r      s   










r   raw_requestc                    s>   | j dd }|jdddd }|dkrtdgd	d S )
Ncontent-typer   ;r   )maxsplitr   zapplication/jsonz:Unsupported Media Type: Only 'application/json' is allowederrors)headersgetlowersplitr   )r   content_type
media_typery   ry   r|   validate_json_requestX  s   r   c                   @   s   e Zd ZejZdS )PrometheusResponseN)__name__
__module____qualname__r   ZCONTENT_TYPE_LATESTr   ry   ry   ry   r|   r   d  s    
r   c                 C   sV   t  }tg d|d | j| td tdt|d}t	d|_
| j| dS )z*Mount prometheus metrics to a FastAPI app.)/metrics/health/load/ping/version/server_info)Zexcluded_handlersregistryresponse_classr   )r   z^/metrics(?P<path>.*)$N)rp   r   r   Z
instrumentZexposer   r   r   recompileZ
path_regexZroutesappend)rt   r   Zmetrics_routery   ry   r|   mount_metricsh  s   
r   requestc                 C   s   t | S ru   )tokenizationr   ry   ry   r|   base     r   c                 C   
   | j jjS ru   )rt   r~   openai_serving_modelsr   ry   ry   r|   models     
r   c                 C   r   ru   )rt   r~   openai_serving_responsesr   ry   ry   r|   	responses  r   r   c                 C   r   ru   )rt   r~   openai_serving_chatr   ry   ry   r|   chat  r   r   c                 C   r   ru   )rt   r~   openai_serving_completionr   ry   ry   r|   
completion  r   r   c                 C   r   ru   )rt   r~   openai_serving_poolingr   ry   ry   r|   pooling  r   r   c                 C   r   ru   )rt   r~   openai_serving_embeddingr   ry   ry   r|   	embedding  r   r   c                 C   r   ru   rt   r~   openai_serving_scoresr   ry   ry   r|   score  r   r   c                 C   r   ru   )rt   r~   openai_serving_classificationr   ry   ry   r|   classify  r   r   c                 C   r   ru   r   r   ry   ry   r|   rerank  r   r   c                 C   r   ru   )rt   r~   openai_serving_tokenizationr   ry   ry   r|   r     r   r   c                 C   r   ru   )rt   r~   openai_serving_transcriptionr   ry   ry   r|   transcription  r   r  c                 C   r   ru   )rt   r~   openai_serving_translationr   ry   ry   r|   translation  r   r  c                 C   r   ru   )rt   r~   r{   r   ry   ry   r|   r{     r   r{   r   r   c                    s   t |  I dH  tddS )zHealth check.N   status_code)r{   Zcheck_healthr   r   ry   ry   r|   health  s   
r
  r   c                    s   t d| jjjidS )NZserver_loadcontent)r   rt   r~   server_load_metricsr   ry   ry   r|   get_server_load_metrics  s   r  r   c                    s   t | I dH S )z+Ping check. Endpoint required for SageMakerN)r
  r	  ry   ry   r|   ping  s   r  z	/tokenizemodel)dependenciesr   c              
      s   t |}z|| |I d H }W n- ty' } zttjjt|d|d }~w ty= } zttj	jt|d|d }~ww t
|trMt| |jjdS t
|trYt| dS t| d S )Nr  detailr  r  r  )r   Zcreate_tokenizeNotImplementedErrorr   r   NOT_IMPLEMENTEDr   str	ExceptionINTERNAL_SERVER_ERROR
isinstancer>   r   
model_dumperrorcoderI   r%   r   r   handler	generatorery   ry   r|   tokenize  s4   

r"  z/detokenizec              
      s   t |}z|| |I d H }W n+ ty% } z	tt|gd|d }~w ty; } zttjj	t|d|d }~ww t
|trKt| |jjdS t
|trWt| dS t| d S )Nr   r  r  r  )r   Zcreate_detokenizeOverflowErrorr   r  r  r   r   r  r   r  r>   r   r  r  r  r:   r%   r  ry   ry   r|   
detokenize  s,   

r$  c                 C   s,   t | ddrtddtfdd}dS dS )z>Conditionally register the tokenizer info endpoint if enabled.Zenable_tokenizer_info_endpointFz/tokenizer_infor   c                    s:   t |  I dH }t| t|tr|jjdS ddS )z(Get comprehensive tokenizer information.Nr  r  )r   get_tokenizer_infor   r  r  r>   r  r  )r   resultry   ry   r|   r%  ,  s   
zBmaybe_register_tokenizer_info_endpoint.<locals>.get_tokenizer_infoN)getattrrouterr   r   )r   r%  ry   ry   r|   &maybe_register_tokenizer_info_endpoint(  s   r)  z
/v1/modelsc                    s&   t | }| I d H }t| dS )Nr  )r   show_available_modelsr   r  )r   r  Zmodels_ry   ry   r|   r*  5  s   r*  r   c                     s   dt i} t| dS )Nversionr  )VLLM_VERSIONr   )verry   ry   r|   show_version=  s   
r.  z/v1/responsesr  text/event-streamc                    v   t |}|d u rt|jddS || |I d H }t|tr)t| |jj	dS t|t
r5t| dS t|ddS )N(The model does not support Responses APImessager  r  r/  r  r   )r   r   create_error_responsecreate_responsesr  r>   r   r  r  r  rE   r   r   r   r  r   ry   ry   r|   r6  C  s   

r6  z/v1/responses/{response_id}response_idc                    ^   t |}|d u rt|jddS || I d H }t|tr(t| |jj	dS t| dS Nr1  r2  r  r  )
r   r   r5  retrieve_responsesr  r>   r   r  r  r  r8  r   r  responsery   ry   r|   r;  f     
r;  z"/v1/responses/{response_id}/cancelc                    r9  r:  )
r   r   r5  cancel_responsesr  r>   r   r  r  r  r<  ry   ry   r|   r?  u  r>  r?  z/v1/chat/completionsc                    r0  )Nz/The model does not support Chat Completions APIr2  r  r  r/  r4  )r   r   r5  create_chat_completionr  r>   r   r  r  r  r4   r   r7  ry   ry   r|   r@    s   

r@  z/v1/completionsc              
      s   t |}|d u rt|jddS z|| |I d H }W n- ty3 } zttjjt	|d|d }~w t
yI } zttjjt	|d|d }~ww t|trYt| |jjdS t|tret| dS t|ddS )Nz*The model does not support Completions APIr2  r  r  r  r/  r4  )r   r   r5  create_completionr#  r   r   BAD_REQUESTr   r  r  r  r  r>   r   r  r  r  r8   r   r  ry   ry   r|   rA    s<   

rA  z/v1/embeddingsc                    v   t |}|d u rt|jddS || |I d H }t|tr)t| |jj	dS t|t
r5t| dS t| d S )Nz)The model does not support Embeddings APIr2  r  r  )r   r   r5  create_embeddingr  r>   r   r  r  r  r<   r%   r7  ry   ry   r|   rD    s   

rD  z/poolingc                    rC  )Nz&The model does not support Pooling APIr2  r  r  )r   r   r5  create_poolingr  r>   r   r  r  r  rA   r%   r7  ry   ry   r|   rE       

rE  z	/classify)r  c                    rC  )Nz-The model does not support Classification APIr2  r  r  )r   r   r5  create_classifyr  r>   r   r  r  r  r6   r%   r7  ry   ry   r|   rG    s   

rG  z/scorec                    rC  )Nz$The model does not support Score APIr2  r  r  )r   r   r5  create_scorer  r>   r   r  r  r  rG   r%   r7  ry   ry   r|   rH  %  rF  rH  z	/v1/scorec                       t d t| |I d H S )NzTo indicate that Score API is not part of standard OpenAI API, we have moved it to `/score`. Please update your client accordingly.)r   r   rH  r   r   ry   ry   r|   create_score_v1A  s
   rK  z/v1/audio/transcriptions)r   c                    s   t | }|d u rt| jddS |j I d H }|||| I d H }t|tr2t|	 |j
jdS t|tr>t|	 dS t|ddS )Nz-The model does not support Transcriptions APIr2  r  r  r/  r4  )r  r   r5  filereadZcreate_transcriptionr  r>   r   r  r  r  rK   r   )r   r   r  
audio_datar   ry   ry   r|   create_transcriptionsU  "   


rO  z/v1/audio/translationsc                    s   t |}|d u rt|jddS | j I d H }||| |I d H }t|tr2t|	 |j
jdS t|tr>t|	 dS t|ddS )Nz+The model does not support Translations APIr2  r  r  r/  r4  )r  r   r5  rL  rM  Zcreate_translationr  r>   r   r  r  r  rM   r   )r   r   r  rN  r   ry   ry   r|   create_translations~  rP  rQ  z/rerankc                    rC  )Nz-The model does not support Rerank (Score) APIr2  r  r  )r   r   r5  	do_rerankr  r>   r   r  r  r  rC   r%   r7  ry   ry   r|   rR    s   

rR  z
/v1/rerankc                    rI  )NzTo indicate that the rerank API is not part of the standard OpenAI API, we have located it at `/rerank`. Please update your client accordingly. (Note: Conforms to JinaAI rerank API))r   Zwarning_oncerR  rJ  ry   ry   r|   do_rerank_v1  s
   rS  z
/v2/rerankc                    s   t | |I d H S ru   )rR  rJ  ry   ry   r|   do_rerank_v2  s   rT  z[SECURITY WARNING: Development endpoints are enabled! This should NOT be used in production!r   c                    s   dt | jjji}t|dS )Nr   r  )r  rt   r~   r   r   )r   Zserver_infory   ry   r|   show_server_info  s   
rU  z/reset_prefix_cachec                    sT   d}| j d}|durt|  }tdt| t| |I dH  t	ddS )z
        Reset the prefix cache. Note that we currently do not check if the
        prefix cache is successfully reset in the API server.
        Ndevicez*Resetting prefix cache with specific %s...r  r  )
query_paramsr   rj   upperr   r   r  r{   reset_prefix_cacher   )r   rV  Z
device_strry   ry   r|   rY    s   
rY  z/sleepc                    s2   | j dd}t| t|I d H  tddS )Nlevel1r  r  )rW  r   r{   rw   intr   )r   rZ  ry   ry   r|   rw     s   
rw   z/wake_upc                    sD   | j d}|g krd }td| t| |I d H  tddS )Ntagsz wake up the engine with tags: %sr  r  )rW  getlistr   r   r{   wake_upr   )r   r]  ry   ry   r|   r_    s   
r_  z/is_sleepingc                    s,   t d t|  I d H }td|idS )Nz$check whether the engine is sleepingis_sleepingr  )r   r   r{   r`  r   )r   r`  ry   ry   r|   r`    s   
r`  z/scale_elastic_epc              
      sF  z	|   I d H }W n t jy } ztddd|d }~ww |d}|dd}|d u r4tdddt|tr=|dkrCtdd	dt|trL|dkrRtdd
ddat| }zGz|||I d H  t	dd| diW W daS  t
y } ztdd| dd|d }~w ty } ztd| tddd|d }~ww daw )Ni  zInvalid JSON formatr  new_data_parallel_sizedrain_timeoutx   z"new_data_parallel_size is requiredr   z1new_data_parallel_size must be a positive integerz(drain_timeout must be a positive integerTr3  z
Scaled to z data parallel enginesFi  z0Scale failed due to request drain timeout after z secondszScale failed: %si  zScale failed)jsonJSONDecodeErrorr   r   r  r\  _scaling_elastic_epr{   scale_elastic_epr   r   r  r   r  )r   bodyr!  ra  rb  clientry   ry   r|   rg    sj   

rg  z/is_scaling_elastic_epc                    s   t dtiS )Nis_scaling_elastic_ep)r   rf  r	  ry   ry   r|   rj  V  s   rj  INVOCATION_TYPESc                 C   s&   g | ]\}\}}t |||ffqS ry   )pydanticZTypeAdapter).0Zrequest_typeget_handlerendpointry   ry   r|   
<listcomp>m  s    
rp  z/invocationsc           
   
      s   z	   I dH }W n t jy# } zttjjd| d|d}~ww  fddtD }|D ]\}}z||}W n
 tj	yD   Y q/w || I dH   S fdd|D }d| }t
 j|d}	t|	 |	jjd	S )
z9For SageMaker, routes requests based on the request type.NzJSON decode error: r  c                    s(   g | ]\}\}}| d ur||fqS ru   ry   )rm  	validatorrn  ro  r	  ry   r|   rp    s    zinvocations.<locals>.<listcomp>c                    s.   g | ]\}}t |j  tr jnt qS ry   )r  _typetyper   r  )rm  rq  _)try   r|   rp    s    z;Cannot find suitable handler for request. Expected one of: r2  r  )rd  re  r   r   rB  r   INVOCATION_VALIDATORSZvalidate_pythonrl  ValidationErrorr   r5  r   r  r  r  )
r   rh  r!  Zvalid_endpointsZrequest_validatorro  r   Z
type_namesmsgresry   )r   ru  r|   invocationss  s:   

rz  z\Torch Profiler is enabled in the API server. This should ONLY be used for local development!z/start_profilec                    2   t d t|  I d H  t d tddS )NzStarting profiler...zProfiler started.r  r  )r   r   r{   start_profiler   r	  ry   ry   r|   r|    
   


r|  z/stop_profilec                    r{  )NzStopping profiler...zProfiler stopped.r  r  )r   r   r{   stop_profiler   r	  ry   ry   r|   r~    r}  r~  znLoRA dynamic loading & unloading is enabled in the API server. This should ONLY be used for local development!z/v1/load_lora_adapterc                    D   t |}|| I d H }t|trt| |jjdS td|dS Nr  r  )r  r  )	r   load_lora_adapterr  r>   r   r  r  r  r   r   r   r  r=  ry   ry   r|   r       
r  z/v1/unload_lora_adapterc                    r  r  )	r   unload_lora_adapterr  r>   r   r  r  r  r   r  ry   ry   r|   r    r  r  log_config_filec              
   C   sx   | sd S zt | }t|W  d    W S 1 sw   Y  W d S  ty; } ztd| | W Y d }~d S d }~ww )Nz0Failed to load log config from file %s: error %s)openrd  loadr  r   r   )r  fr!  ry   ry   r|   load_log_config  s   
(r  c                	   @   sH   e Zd ZdZdedee ddfddZded	e	d
e
ded fddZdS )AuthenticationMiddlewareaK  
    Pure ASGI middleware that authenticates each request by checking
    if the Authorization header exists and equals "Bearer {api_key}".

    Notes
    -----
    There are two cases in which authentication is skipped:
        1. The HTTP method is OPTIONS.
        2. The request path doesn't start with /v1 (e.g. /health).
    rt   tokensr   Nc                 C   s   || _ dd |D | _d S )Nc                 S   s   h | ]}d | qS )zBearer ry   )rm  tokenry   ry   r|   	<setcomp>  s    z4AuthenticationMiddleware.__init__.<locals>.<setcomp>)rt   
api_tokens)selfrt   r  ry   ry   r|   __init__  s   z!AuthenticationMiddleware.__init__scopereceivesendc                 C   s   |d dvs|d dkr|  |||S |dd}t|dj|}t|d}|drB|d	| jvrBtd
didd}||||S |  |||S )Nrs  httpZ	websocketmethodOPTIONS	root_pathr   r  z/v1Authorizationr  Unauthorizedi  r  )	rt   r   r   r   removeprefixr   
startswithr  r   )r  r  r  r  r  Zurl_pathr   r=  ry   ry   r|   __call__  s    
z!AuthenticationMiddleware.__call__)r   r   r   __doc__r    listr  r  r#   r"   r$   r   r  ry   ry   ry   r|   r    s    
r  c                	   @   @   e Zd ZdZdeddfddZdeded	ede	d fd
dZ
dS )XRequestIdMiddlewarez
    Middleware the set's the X-Request-Id header for each response
    to a random uuid4 (hex) value if the header isn't already
    present in the request, otherwise use the provided request id.
    rt   r   Nc                 C   
   || _ d S ru   rt   r  rt   ry   ry   r|   r    r   zXRequestIdMiddleware.__init__r  r  r  c                    sJ   |d dvr|  ||S t|d dtdd f fdd}|  |||S )Nrs  r  r  r3  r   c                    sL   | d dkrt | d d} dt j}|d| | I dH  dS )zx
            Custom send function to mutate the response headers
            and append X-Request-Id to it.
            rs  zhttp.response.startr   )rawzX-Request-IdN)r   r   uuiduuid4hexr   )r3  response_headersZ
request_idZrequest_headersr  ry   r|   send_with_request_id  s   z;XRequestIdMiddleware.__call__.<locals>.send_with_request_id)rt   r   r!   )r  r  r  r  r  ry   r  r|   r  
  s
   
zXRequestIdMiddleware.__call__r   r   r   r  r    r  r#   r"   r$   r   r  ry   ry   ry   r|   r     s    
r  c                	   @   r  )ScalingMiddlewarez
    Middleware that checks if the model is currently scaling and
    returns a 503 Service Unavailable response if it is.

    This middleware applies to all HTTP requests and prevents
    processing when the model is in a scaling state.
    rt   r   Nc                 C   r  ru   r  r  ry   ry   r|   r  .  r   zScalingMiddleware.__init__r  r  r  c                 C   sH   |d dkr|  |||S trtddidd}||||S |  |||S )Nrs  r  r  z7The model is currently scaling. Please try again later.i  r  )rt   rf  r   )r  r  r  r  r=  ry   ry   r|   r  1  s   zScalingMiddleware.__call__r  ry   ry   ry   r|   r  %  s    
r  
chunk_datac                 C   s  zQddl m}m} | ddkr-|| }|jr'|jd jjr*|jd jjW S W d
S W d
S | ddkrI|| }|jrL|jd jrO|jd jW S W d
S W d
S W d
S  t	j
y   d| v r| d r| d d }d|v rz|d drz|d d  Y S |d	r|d	  Y S Y d
S Y d
S Y d
S w )z0Extract content from a streaming response chunk.r   )ChatCompletionStreamResponseCompletionStreamResponseobjectzchat.completion.chunkZtext_completionchoicesdeltar  textr   ) vllm.entrypoints.openai.protocolr  r  r   Zmodel_validater  r  r  r  rl  rw  )r  r  r  Zchat_responseZcompletion_responsechoicery   ry   r|   _extract_content_from_chunkD  sT   

r  c                   @   s`   e Zd ZdZdd Zdedee fddZdede	fd	d
Z
de	ddfddZde	fddZdS )
SSEDecoderz:Robust Server-Sent Events decoder for streaming responses.c                 C   s   d| _ g | _d S )Nr   )buffercontent_bufferr  ry   ry   r|   r  d  s   
zSSEDecoder.__init__chunkr   c                 C   s   ddl }z|d}W n ty   g  Y S w |  j|7  _g }d| jv rn| jdd\}| _|d}|dri|dd  }|d	krM|d
di n|riz|	|}|d|d W n
 |j
yh   Y q w d| jv s%|S )z4Decode a chunk of SSE data and return parsed events.r   Nzutf-8
r   zdata:    z[DONE]rs  donedata)rs  r  )rd  decodeUnicodeDecodeErrorr  r   rstripr  stripr   loadsre  )r  r  rd  Z	chunk_streventslineZdata_str
event_datary   ry   r|   decode_chunkh  s0   




zSSEDecoder.decode_chunkr  c                 C   s   t |S )z Extract content from event data.)r  )r  r  ry   ry   r|   extract_content  r   zSSEDecoder.extract_contentr  Nc                 C   s   |r
| j | dS dS )zAdd content to the buffer.N)r  r   )r  r  ry   ry   r|   add_content  s   zSSEDecoder.add_contentc                 C   s   d | jS )z"Get the complete buffered content.r   )r   r  r  ry   ry   r|   get_complete_content  s   zSSEDecoder.get_complete_content)r   r   r   r  r  bytesr  dictr  r  r  r  r  ry   ry   ry   r|   r  a  s     r  response_bodyc                    sF   ddl m} t d  fdd}|| | _tdt dS )z/Log streaming response with robust SSE parsing.r   r   c                  3   s    D ]V}  d7  | V   | }|D ]E}|d dkr(|d }| q|d dkrX }|rNt|dkrC|d d d }	 td|    d S td    d S qqd S )	Nr   rs  r  r  i   r   z;response_body={streaming_complete: content='%s', chunks=%d}z9response_body={streaming_complete: no_content, chunks=%d})r  r  r  r  lenr   r   )r  r  eventr  Zfull_contentZchunk_countr  Zsse_decoderry   r|   buffered_iterator  s8   
	z2_log_streaming_response.<locals>.buffered_iteratorz,response_body={streaming_started: chunks=%d}N)starlette.concurrencyr   r  body_iteratorr   r   r  )r=  r  r   r  ry   r  r|   _log_streaming_response  s   !r  c                 C   s>   z| d   }td| W dS  ty   td Y dS w )zLog non-streaming response.r   zresponse_body={%s}zresponse_body={<binary_data>}N)r  r   r   r  )r  Zdecoded_bodyry   ry   r|   _log_non_streaming_response  s   r  c           
      C   sj  | j rtd d d td}nttd}|t | j|_t| |jt| j	| j
| j| jd |tdtdtfdd}|tdtdtfdd	}d
d | jpQtjgD  }r]|jt|d | jre|t |t tjr~td |ddtfdd}| jD ]1}|dd\}}tt||}	t !|	r||	 qt "|	r|d|	 qt#d| d|S )N)Zopenapi_urlZdocs_urlZ	redoc_urlr   )r   )Zallow_originsallow_credentialsZallow_methodsZallow_headersrt  excc                    s4   t t|jt|jj|jdd}t| |jdS )Nr3  rs  r  r  r  )r>   r=   r  r   r  phraser   r  )rt  r  errry   ry   r|   http_exception_handler  s   
z)build_app.<locals>.http_exception_handlerc                    sj   t |}t | }| r|r||kr| d| }n|}tt|tjjtjdd}t| tjdS )N r  r  r  )	r  r   r>   r=   r   rB  r  r   r  )rt  r  Zexc_strZ
errors_strr3  r  ry   ry   r|   validation_exception_handler  s   
z/build_app.<locals>.validation_exception_handlerc                 S   s   g | ]}|r|qS ry   ry   )rm  keyry   ry   r|   rp    s    zbuild_app.<locals>.<listcomp>)r  z}CAUTION: Enabling log response in the API Server. This can include sensitive information and should be avoided in production.r  r   c                    sz   || I d H }dd |j 2 I d H }tt||_ |jdd}|dk}|s.td |S |r7t|| |S t| |S )Nc                    s   g | z3 d H W }|q6 S ru   ry   )rm  sectionry   ry   r|   rp    s    z3build_app.<locals>.log_response.<locals>.<listcomp>r   r   z text/event-stream; charset=utf-8zresponse_body={<empty>})	r  r   iterr   r   r   r   r  r  )r   Z	call_nextr=  r  r   Zis_streamingry   ry   r|   log_response	  s    

zbuild_app.<locals>.log_response.r   zInvalid middleware z . Must be a function or a class.)$Zdisable_fastapi_docsr   r   Zinclude_routerr(  r  r   Zadd_middlewarer   Zallowed_originsr  allowed_methodsZallowed_headersZexception_handlerr   r   r   Zapi_keyrx   ZVLLM_API_KEYr  Zenable_request_id_headersr  r  Z"VLLM_DEBUG_LOG_API_SERVER_RESPONSEr   r   
middlewarersplitr'  	importlibimport_moduleinspectisclassiscoroutinefunction
ValueError)
r   rt   r  r  r  r  r  module_pathZobject_nameZimportedry   ry   r|   	build_app  sR   







r  r   r~   c                    sh   j d ur
 j }n jg} jrt jd}nd } fdd|D }| |_ j |_||_|j	}t
jr<|  I d H }n|j}td| t j}	|	d uru|  I d H }
t|
tr`t|	d}	nt|
d d |j	d}||	krutd|	 j  jdkr~t }n jrt }| jI d H  nd }|jd ur|jjni } j}|rd	d | D } jd u r|}n||7 }t | |||d
|_!|j!" I d H  d|v rt#| ||j!||	 j$ j% j& j'| j( j) j*dnd |_+d|v rt,| ||j! j-||	 j$ j% j& j. j' j( j) j*dnd |_/d|v rt0| ||j!| j% j) j*dnd |_1d|v r2t2| ||j!||	 j$dnd |_3d|v rFt4| ||j!||	 j$dnd |_5d|v rWt6| ||j!|dnd |_7d|v ogt8|j9dddk}d|v sp|ryt:| ||j!|dnd |_;t<| ||j!||	 j$d|_=d|v rt>| ||j!|dnd |_?d|v rt@| ||j!|dnd |_A jB|_Bd|_Cd S )N)max_log_lenc                    s   g | ]	}t | jd qS ))r   Z
model_path)rT   r  )rm  r   r   ry   r|   rp  ;  s    z"init_app_state.<locals>.<listcomp>zSupported_tasks: %s)chat_template)	tokenizerr  Ztoolsmodel_configzUsing supplied chat template: %s
It is different from official chat template '%s'. This discrepancy may lead to performance degradation.democ                 S   s   g | ]
\}}t ||d qS ))r   r   )rU   )rm  ZmodalityZ	lora_pathry   ry   r|   rp  s  s    )r{   r  base_model_pathslora_modulesgenerate)
request_loggerr  chat_template_content_formatreturn_tokens_as_token_idsenable_auto_toolstool_parsertool_serverreasoning_parserenable_prompt_tokens_detailsenable_force_include_usage)
r  r  r  r  r  #exclude_tools_when_tool_choice_noner   r  r  r  )r  r  r  r  encode)r  r  r  Zembedr   )r  Z
num_labelsr   r   r  )DZserved_model_namer  r   r0   r  r{   r   r   r   r  rx   r   Zget_supported_taskssupported_tasksr   r   r,   r  Zget_tokenizerr  rh   r.   r-   r   r  r^   r_   Zadd_tool_serverZlora_configdefault_mm_lorasr  itemsrV   r   Zinit_static_lorasrX   r  r  enable_auto_tool_choicetool_call_parserr  r  r  r   rO   Zresponse_roler  r   rQ   r   rW   r   rR   r   rP   r   r'  Z	hf_configrY   r   rZ   r  r[   r  r\   r  Zenable_server_load_trackingr  )r{   r   r~   r   Zserved_model_namesr  r  r  r  Zresolved_chat_templater  Zhf_chat_templater  r  r  Zdefault_mm_lora_pathsZenable_serving_rerankingry   r  r|   init_app_state+  st  
















r  addrc                 C   sZ   t j}t| d rt j}t j |t jd}|t jt jd |t jt jd |	|  |S )Nr   familyrs  r   )
socketAF_INETrn   AF_INET6SOCK_STREAM
setsockopt
SOL_SOCKETSO_REUSEADDRSO_REUSEPORTbind)r  r  sockry   ry   r|   create_server_socket  s   
r  r   c                 C   s    t j t jt jd}||  |S )Nr  )r  AF_UNIXr  r  )r   r  ry   ry   r|   create_server_unix_socket  s   
r  c                 C   sx   t j }| jr| j|vrtd| j dd| dtj }| j	r8| j	|vr:td| j	 dd| dd S d S )Nzinvalid tool call parser: z (chose from { ,z })zinvalid reasoning parser: )
r]   Ztool_parserskeysr
  r  KeyErrorr   rf   Zreasoning_parsersr  )r   Zvalid_tool_parsesZvalid_reasoning_parsesry   ry   r|   validate_api_server_args  s    





r   c           	      C   s   t dt t|  | jrt| jdkrt| j t|  | j	r't
| j	}n| jp+d| jf}t|}t  ddd}ttj| | j	rOd| j	 }||fS |\}}| joX| j}t|rcd	| d
n|pfd}d|rldnd d| d| }||fS )zRValidate API server args, set up signal handler, create socket
    ready to serve.zvLLM API server version %s   r   r   Nc                  W   s   t d)NZ
terminated)KeyboardInterrupt)rt  ry   ry   r|   signal_handler"  r   z$setup_server.<locals>.signal_handlerzunix:[]z0.0.0.0r  sz://:r   N)r   r   r,  rc   tool_parser_pluginr  r]   import_tool_parserr   Zudsr  hostportr  ro   signalSIGTERMssl_keyfilessl_certfilern   )	r   r  Z	sock_addrr#  listen_addressr  r,  Zis_sslZ	host_partry   ry   r|   setup_server	  s0   
r2  c                    s4   t d t| \}}t||| fi |I dH  dS )zRun a single-worker API server.Z	APIServerN)rl   r2  run_server_worker)r   uvicorn_kwargsr1  r  ry   ry   r|   
run_server3  s   r5  c                    sB  |j rt|j dkrt|j  |r|ddnd}t|j}|dur(||d< t||d4 I dH R}t| t	|}|
 I dH }	t||	|j|I dH  td||  t|f||j|j|j|j|j tj|j|j|j|j|j|jd|I dH }
W d  I dH  n1 I dH sw   Y  z|
I dH  W |  dS |  w )	zRun a single API server worker.r!  r   r   N
log_config)r   z!Starting vLLM API server %d on %s)r  enable_ssl_refreshr+  r,  	log_levelZ
access_logZtimeout_keep_aliver/  r0  ssl_ca_certsssl_cert_reqsh11_max_incomplete_event_sizeh11_max_header_count)r)  r  r]   r*  r   r  r  r   r)  r  Zget_vllm_configr  r~   r   r   r/   r7  r+  r,  Zuvicorn_log_levelZdisable_uvicorn_access_logrx   ZVLLM_HTTP_TIMEOUT_KEEP_ALIVEr/  r0  r9  r:  r;  r<  r   )r1  r  r   r   r4  Zserver_indexr6  r{   rt   r   Zshutdown_taskry   ry   r|   r3  =  sT   
(!r3  __main__z*vLLM OpenAI-Compatible RESTful API server.)descriptionr(  ru   (%  rv   r   r   r  r  rd  r   Zmultiprocessing.forkserverr   r   r-  r  r   r  argparser   collections.abcr   r   
contextlibr   	functoolsr   r  r   typingr   r	   r
   r   r   rl  regexr   ZuvloopZfastapir   r   r   r   r   r   Zfastapi.exceptionsr   Zfastapi.middleware.corsr   Zfastapi.responsesr   r   r   r   Z!prometheus_fastapi_instrumentatorr   r  r   Zstarlette.datastructuresr   r   r   r   Zstarlette.routingr   Zstarlette.typesr    r!   r"   r#   r$   Ztyping_extensionsr%   Z	vllm.envsrx   Zvllm.configr&   Zvllm.engine.arg_utilsr'   Zvllm.engine.async_llm_enginer(   Z"vllm.engine.multiprocessing.clientr)   Z"vllm.engine.multiprocessing.enginer*   Zvllm.engine.protocolr+   Zvllm.entrypoints.chat_utilsr,   r-   r.   Zvllm.entrypoints.launcherr/   Zvllm.entrypoints.loggerr0   Z vllm.entrypoints.openai.cli_argsr1   r2   r  r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   Z$vllm.entrypoints.openai.serving_chatrO   Z.vllm.entrypoints.openai.serving_classificationrP   Z*vllm.entrypoints.openai.serving_completionrQ   Z)vllm.entrypoints.openai.serving_embeddingrR   Z&vllm.entrypoints.openai.serving_enginerS   Z&vllm.entrypoints.openai.serving_modelsrT   rU   rV   Z'vllm.entrypoints.openai.serving_poolingrW   Z)vllm.entrypoints.openai.serving_responsesrX   Z%vllm.entrypoints.openai.serving_scorerY   Z,vllm.entrypoints.openai.serving_tokenizationrZ   Z-vllm.entrypoints.openai.serving_transcriptionr[   r\   Z$vllm.entrypoints.openai.tool_parsersr]   Zvllm.entrypoints.tool_serverr^   r_   r`   Zvllm.entrypoints.utilsra   rb   rc   rd   Zvllm.loggerre   Zvllm.reasoningrf   Zvllm.transformers_utils.configrg   Z!vllm.transformers_utils.tokenizerrh   Zvllm.usage.usage_libri   Z
vllm.utilsrj   rk   rl   rm   rn   ro   Zvllm.v1.metrics.prometheusrp   Zvllm.versionrq   r,  r   __annotations__r   setrs   Taskr   r   r   r  r  r   r   r   r(  r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r{   r   r
  r  postr  rB  r   	NOT_FOUNDr  r  r"  r$  r)  r*  r.  OKr6  r;  r?  r@  rA  rD  rE  rG  rH  rK  UNPROCESSABLE_ENTITYrO  rQ  rR  rS  rT  ZVLLM_SERVER_DEV_MODEr   rU  rY  rw   r_  r`  REQUEST_TIMEOUTrg  rj  ZRequestTypeZGetHandlerFnZ
EndpointFnrk  r  tuplerv  UNSUPPORTED_MEDIA_TYPErz  ZVLLM_TORCH_PROFILER_DIRr|  r~  Z VLLM_ALLOW_RUNTIME_LORA_UPDATINGr  r  r  r  r  rf  r  r  r  r  r  r  r  r\  r  r  r   r2  r5  r3  r   parser
parse_argsr   runry   ry   ry   r|   <module>   s  
 x 
" 	






	

.






,


#"5-	_ "=*8

