o
    )i                    @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZ
d dlZ
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& d dlm'Z'm(Z(m)Z) d dl*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z; d d	l<m=Z=m>Z> d d
l?m@Z@mAZAmBZBmCZC d dlmDZD d dlEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQ d dlRmSZS d dlmTZT d dlUZUd dlVZVd dlWZWd dlXZYd dlZmEZ[ d dl\Z\d dl]Z^d dl_Z_d dl`Z`d dlaZ`d dlbZbd dlcZcd dldZcd dlemfZf d dlgmhZh d dlimjZj d dlkmlZl d dlmmnZnmoZompZpmqZq d dlrmsZs d dltmuZumvZv d dlwmxZx eFrd dl myZy d dlzm{Z{m|Z| eve}Z~dZdZdZdZdZdZdZdZd Zd!Zd"Zd#Zeeeeeeeeed$	Zd%Zd&ed'< d(Zd&ed)< d*Zd&ed+< d,Zd&ed-< d.Zd&ed/< d0Zd&ed1< d2Zd&ed3< d4Zd&ed5< d6Z	 d7Z	 d8Zd9Ze`je`je`je`je`je`je`je`je`jd:	Ze`jeYje`jeYje`jeYje`jeYje`jeYje`jeYjiZejdd=d>Zeod?ZeNd@ZeNdAZeNdBe2dCZeNdDZeNdEZG dFdG dGZe ZG dHdI dIejZG dJdK dKejZG dLdM dMZG dNdO dOe+eef ZG dPdQ dQeKZG dRdS dSeUjeef eIeef ZG dTdU dUZe@dddXdYZddZd[Zdd\d]ZG d^d_ d_ZddbdcZddfdgZddldmZddpdqZ	d d!dwdxZd"d{d|Zd#ddZd$ddZÐdddZdd ZŐdddZƐd%ddZǐd&ddZȐd'ddZɐd(ddZʐd(ddZːdddZ̐dddZ͐dddZΐdddZϐd)ddZАd*ddZѐd+ddZҐd,ddZӐdddZԐd-ddZՐd.ddZ֐d.ddZאd/ddÄZ	d d0ddɄZ				ːd1d2dd؄Z			ʐd3d4ddڄZe@d5dd܄Ze@d5ddބZG dd dZddd6ddZddddd7ddZd8ddZd9ddZd9ddZd:ddZd;d dZd<ddZddd=ddZd>ddZd?ddZd@ddZe@dAddZdd d!Ze`jjZe ZdBd$d%Zee`j_dCd&d'ZdDd*d+ZdEd,d-ZeNd.eHd/eGf dCZ	0	dFdGd6d7Zd0dd8dHd:d;ZeAd<d=	d dId?d@ZddAdBZd5dCdDZd5dEdFZ	dJdKdJdKZdLdOdPZdMdSdTZG dUdV dVe!Z G dWdX dXe"e%ZG dYdZ dZe#ZdNd]d^Zdd0d_dOdedfZd0dd_dPdjdkZd5dldmZd5dndoZd5dpdqZG drds dsZ	G dtdu due6e
ef eIe ZG dvdw dwe+ee ef ZdQdydzZdRd~dZdSddZdTddZe@dd ZG dd dZG dd deZG dd deZejddZ				dUdVddZdWddZdXddZe=G dd dZe=G dd dZejdYddZdZddZdd Zd[ddZd d\ddZ 			d]d^dĐdńZ!ej		 	d_d`dǐdȄZ"dɐdʄ Z#dːd̄ Z$	d dadѐd҄Z%dbdِdڄZ&dېd܄ Z'dcdߐdZ(G dd dej)Z*ddddZ+ejd deddZ,dfdgddZ-dhddZ.dddZ/dddZ0diddZ1djddZ2dkddZ3e@dldd Z4d5ddZ5d5ddZ6d5ddZ7d5ddZ8			dmdnddZ9doddZ:d dpddZ;dS (q      )annotationsN)ActionArgumentDefaultsHelpFormatterArgumentParserArgumentTypeErrorRawDescriptionHelpFormatter_ArgumentGroup)FIRST_COMPLETEDAbstractEventLoopTask)UserDictdefaultdict)
AsyncGenerator	Awaitable
Collection	GeneratorHashableIterableIteratorKeysViewMappingSequence)ThreadPoolExecutor)ProcessPoolExecutor)	dataclassfield)cache	lru_cachepartialwraps)MappingProxyType)TYPE_CHECKINGAnyCallableGenericLiteral
NamedTupleOptionalTextIOTypeVarUnioncastoverload)urlparseuuid4)version)Version)Library)BatchEncoding)Never	ParamSpecTypeIsassert_never)enable_trace_function_callinit_logger)is_in_ray_actor)	Namespace)ModelConfig
VllmConfigi   i   i   zOSliding window attention for encoder/decoder models is not currently supported.zEPrefix caching for encoder/decoder models is not currently supported.zFChunked prefill for encoder/decoder models is not currently supported.ztModels with logits_soft_cap require FlashInfer backend, which is currently not supported for encoder/decoder models.z<LoRA is not currently supported with encoder/decoder models.zLPipeline parallelism is not currently supported with encoder/decoder models.zBMultimodal is not currently supported with encoder/decoder models.zLSpeculative decoding is not currently supported with encoder/decoder models.zcXFormers and Flash-Attention are the only backends currently supported with encoder/decoder models.)	STR_NOT_IMPL_ENC_DEC_SWA!STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE$STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL"STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAPSTR_NOT_IMPL_ENC_DEC_LORASTR_NOT_IMPL_ENC_DEC_PPSTR_NOT_IMPL_ENC_DEC_MMSTR_NOT_IMPL_ENC_DEC_SPEC_DECSTR_NOT_IMPL_ENC_DEC_BACKENDZVLLM_ATTENTION_BACKENDstrSTR_BACKEND_ENV_VARZ
FLASHINFERSTR_FLASHINFER_ATTN_VALZ
TORCH_SDPASTR_TORCH_SDPA_ATTN_VALZ
ROCM_FLASHSTR_ROCM_FLASH_ATTN_VALZXFORMERSSTR_XFORMERS_ATTN_VALZ
FLASH_ATTNSTR_FLASH_ATTN_VALZDUAL_CHUNK_FLASH_ATTNSTR_DUAL_CHUNK_FLASH_ATTN_VALINVALIDSTR_INVALID_VALi ʚ;   @z[1;36mz[0;0m)	float32halfbfloat16floatfp8Zfp8_e4m3Zfp8_e5m2int8Zfp8_incnum_threadsintc                 c  s(    t  }t |  dV  t | dS )zBSets the default number of threads for PyTorch to the given value.N)torchZget_num_threadsZset_num_threads)rX   Zold_num_threads r[   _/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/utils/__init__.pyset_default_torch_num_threads   s
   
r]   PTU_K)bound_V_Tc                   @  s   e Zd ZdS )	_SentinelN)__name__
__module____qualname__r[   r[   r[   r\   re      s    re   c                   @  s   e Zd Ze Ze ZdS )DeviceN)rf   rg   rh   enumautoZGPUZCPUr[   r[   r[   r\   ri      s    ri   c                   @  s   e Zd ZdZdZdS )LayerBlockType	attentionmambaN)rf   rg   rh   rm   rn   r[   r[   r[   r\   rl      s    rl   c                   @  s,   e Zd ZddddZddd	Zdd
dZdS )Counterr   startrY   returnNonec                 C  
   || _ d S Ncounter)selfrp   r[   r[   r\   __init__      
zCounter.__init__c                 C  s   | j }|  j d7  _ |S N   ru   )rw   ir[   r[   r\   __next__   s   zCounter.__next__c                 C  s
   d| _ d S Nr   ru   rw   r[   r[   r\   reset   ry   zCounter.resetNr   )rp   rY   rq   rr   rq   rY   rq   rr   )rf   rg   rh   rx   r}   r   r[   r[   r[   r\   ro      s    
ro   c                      s2   e Zd Zd fddZdd	d
ZdddZ  ZS )_MappingOrderCacheViewdataMapping[_K, _V]ordered_keysMapping[_K, None]c                   s   t  | || _d S rt   )superrx   r   )rw   r   r   	__class__r[   r\   rx      s   
z_MappingOrderCacheView.__init__rq   Iterator[_K]c                 C  
   t | jS rt   )iterr   r   r[   r[   r\   __iter__   ry   z_MappingOrderCacheView.__iter__KeysView[_K]c                 C  r   rt   )r   r   r   r[   r[   r\   keys   ry   z_MappingOrderCacheView.keys)r   r   r   r   )rq   r   )rq   r   )rf   rg   rh   rx   r   r   __classcell__r[   r[   r   r\   r      s    
r   c                   @  s6   e Zd ZU ded< ded< edddZdd	d
ZdS )	CacheInforY   hitstotalrq   rU   c                 C     | j dkrdS | j| j  S r~   )r   r   r   r[   r[   r\   	hit_ratio     
zCacheInfo.hit_ratiootherc                 C  s   t | j|j | j|j dS )Nr   r   )r   r   r   rw   r   r[   r[   r\   __sub__  s   

zCacheInfo.__sub__Nrq   rU   )r   r   )rf   rg   rh   __annotations__propertyr   r   r[   r[   r[   r\   r     s   
 r   c                      sB  e Zd Z	dFdG fddZdd	dH fddZdI fddZedJddZedKddZedLddZ	edLddZ
dd dMd#d$ZdId%d&ZedNd(d)ZedOd,d)Z	dFdPd.d)ZedQd/d0ZedRd1d0Z	dFdSd2d0ZdTd4d5ZdId6d7ZdId8d9ZdUd:d;Zdd<dVd>d?ZdWd@dAZdXdYdBdCZdWdDdEZ  ZS )ZLRUCacheNcapacityrU   	getsizeofOptional[Callable[[_V], float]]c                   s8   t  || tt  | _d| _d| _tddd| _d S )Nr   r   )	r   rx   setra   pinned_items_hits_totalr   
_last_info)rw   r   r   r   r[   r\   rx     s
   zLRUCache.__init__Tupdate_infokeyra   r   boolrq   rc   c                  s0   t  |}|r|  jd7  _|  jd7  _|S rz   )r   __getitem__r   r   )rw   r   r   valuer   r[   r\   r   "  s
   zLRUCache.__getitem__rr   c                   sN   || v }| j |dd}t | || jv r| | |r%| || d S d S NFr   )r   r   __delitem__r   _unpin
_on_remove)rw   r   Zrun_on_remover   r   r[   r\   r   +  s   

zLRUCache.__delitem__r   c                 C  s   t | j| jS )z:Return the internal cache dictionary in order (read-only).)r   Z_Cache__dataorderr   r[   r[   r\   r   6  s   zLRUCache.cacher   c                 C  r   )z1Return the internal order dictionary (read-only).)r    _LRUCache__orderr   r[   r[   r\   r   =     
zLRUCache.orderc                 C     | j S rt   maxsizer   r[   r[   r\   r   B     zLRUCache.capacityc                 C  r   r~   )r   currsizer   r[   r[   r\   usageF  r   zLRUCache.usageF)deltar   r   c                C  s,   t | j| jd}|r|| j }|| _|}|S )z
        Gets the cumulative number of hits and queries against this cache.

        If `delta=True`, instead gets these statistics
        since the last call that also passed `delta=True`.
        r   )r   r   r   r   )rw   r   infoZ
info_deltar[   r[   r\   statM  s   
zLRUCache.statc                 C  s2   z	| j | W d S  ty   d | j |< Y d S w rt   )r   move_to_endKeyErrorrw   r   r[   r[   r\   touch]  s
   zLRUCache.touchOptional[_V]c                C     d S rt   r[   r   r[   r[   r\   getc     zLRUCache.getdefaultUnion[_V, _T]c                C  r   rt   r[   rw   r   r   r[   r[   r\   r   g  r   Optional[Union[_V, _T]]c                C  s<   || v r| j |dd}|  jd7  _n|}|  jd7  _|S )NFr   r{   )r   r   r   rw   r   r   r   r[   r[   r\   r   k  s   c                 C  r   rt   r[   r   r[   r[   r\   pop|  r   zLRUCache.popc                 C  r   rt   r[   r   r[   r[   r\   r     r   c                 C  s(   || vr|S | j |dd}| | |S r   )r   r   r   r[   r[   r\   r     s   
r   c                 C  s   |  || d S rt   )__setitem__rw   r   r   r[   r[   r\   put  s   zLRUCache.putc                 C  s(   || vrt d| d| j| dS )zd
        Pins a key in the cache preventing it from being
        evicted in the LRU order.
        zCannot pin key: z not in cache.N)
ValueErrorr   addr   r[   r[   r\   pin  s   zLRUCache.pinc                 C  s   | j | dS )z_
        Unpins a key in the cache allowing it to be
        evicted in the LRU order.
        N)r   remover   r[   r[   r\   r     s   zLRUCache._unpinc                 C  r   rt   r[   r   r[   r[   r\   r     s   zLRUCache._on_removeremove_pinnedr   c                C  s    t | dkrd S | j|d d S )Nr   r   )lenpopitem)rw   r   r[   r[   r\   remove_oldest  s   zLRUCache.remove_oldestc                 C  s(   | j | jkr|   | j | jksd S d S rt   )r   r   r   r   r[   r[   r\   _remove_old_if_needed  s   zLRUCache._remove_old_if_neededc                   sV   |st  fdd jD t}|tu rtdnt t j} tt|}||fS )z>Remove and return the `(key, value)` pair least recently used.c                 3  s    | ]
}| j vr|V  qd S rt   )r   ).0r   r   r[   r\   	<genexpr>  s    z#LRUCache.popitem.<locals>.<genexpr>z:All items are pinned, cannot remove oldest from the cache.)nextr   ALL_PINNED_SENTINELRuntimeErrorr   r   r+   ra   )rw   r   Zlru_keyr   r[   r   r\   r     s   zLRUCache.popitemc                 C  sB   t | dkr| jdd t | dksd| _d| _tddd| _d S )Nr   Tr   r   )r   r   r   r   r   r   r   r[   r[   r\   clear  s   zLRUCache.clearrt   )r   rU   r   r   )r   ra   r   r   rq   rc   )r   ra   rq   rr   )rq   r   )rq   r   r   )r   r   rq   r   )r   ra   rq   r   )r   r   r   ra   rq   r   )r   r   r   ra   rq   r   )r   ra   rq   rc   )r   ra   r   r   rq   r   )r   ra   r   r   rq   r   )r   ra   r   rc   rq   rr   )r   ra   r   r   rq   rr   )r   r   rq   rr   r   F)r   r   )rf   rg   rh   rx   r   r   r   r   r   r   r   r   r   r,   r   r   r   r   r   r   r   r   r   r   r   r[   r[   r   r\   r     sF    	



	

r   c                   @  s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )PyObjectCachez^Used to cache python objects to avoid object allocations
    across scheduler iterations.
    c                 C  s4   || _ d| _g | _tdD ]
}| j|    qd S )Nr      )_obj_builder_index
_obj_cacherangeappend)rw   Zobj_builder_r[   r[   r\   rx     s   zPyObjectCache.__init__c                 C  s,   t | j}t|D ]
}| j|   q	d S rt   )r   r   r   r   r   )rw   Znum_objsr   r[   r[   r\   _grow_cache  s   
zPyObjectCache._grow_cachec                 C  sJ   | j t| jkr|   | j t| jk sJ | j| j  }|  j d7  _ |S )zxReturns a pre-allocated cached object. If there is not enough
        objects, then the cache size will double.
        r{   )r   r   r   r   )rw   objr[   r[   r\   
get_object  s   zPyObjectCache.get_objectc                 C  s
   d| _ dS )zMMakes all cached-objects available for the next scheduler iteration.
        r   N)r   r   r[   r[   r\   r     r   zPyObjectCache.resetN)rf   rg   rh   __doc__rx   r   r   r   r[   r[   r[   r\   r     s    r   gpurq   c                 C  s.   ddl m} || }|dksJ dt|S )z<Returns the maximum shared memory per thread block in bytes.r   _custom_opszmax_shared_mem can not be zero)vllmr   Z0get_max_shared_memory_per_block_device_attributerY   )r   opsZmax_shared_memr[   r[   r\   get_max_shared_memory_bytes  s
   r   c                   C  s
   t  jS )z2Returns the total CPU memory of the node in bytes.)psutilvirtual_memoryr   r[   r[   r[   r\   get_cpu_memory  s   
r   c                   C  s   t t jS rt   )rG   uuidr/   hexr[   r[   r[   r\   random_uuid      r   c                   @  s`   e Zd ZdZ		d(d)d
dZdd Zdd Zd*ddZd+ddZd,ddZ	d-d#d$Z
d%d& Zd'S ).AsyncMicrobatchTokenizerzAsynchronous tokenizer with micro-batching.

    Pulls pending encode/decode requests from a queue and batches them 
    up to reduce overhead. A single-thread ThreadPoolExecutor is used 
    so the event loop stays responsive.
        Mb`?max_batch_sizerY   batch_wait_timeout_srU   rq   rr   c                 C  s8   || _ || _|| _t | _i | _g | _tdd| _	d S )Nr{   )max_workers)
	tokenizerr   r   asyncioget_running_loop_loop_queues_batcher_tasksr   	_executor)rw   r   r   r   r[   r[   r\   rx     s   
z!AsyncMicrobatchTokenizer.__init__c                   sF   | j  }| d|}| | j |}||||fI d H  |I d H S )Nencoder  create_future
_queue_key
_get_queuer   )rw   promptkwargsresult_futurer   queuer[   r[   r\   __call__"  s   

z!AsyncMicrobatchTokenizer.__call__c                   sD   | j  }| d|}| | j |}|||fI d H  |I d H S )Ndecoder  )rw   	token_idsr  r  r   r  r[   r[   r\   r  )  s   

zAsyncMicrobatchTokenizer.decodeloopasyncio.AbstractEventLoopr   tupleXasyncio.Queue[Union[tuple[str, dict, asyncio.Future], tuple[list[int], asyncio.Future]]]c                 C  s   | j |}|du rDt  | j |< }|d dkr&|d dk}| ||}n|d dks6J d|d  d| |}| j|| |S )	zkGet the request queue for the given operation key, creating a new
        queue and batcher task if needed.Nr   r  r{   r   r  zUnknown operation type: .)	r  r   r  Queue_batch_encode_loop_batch_decode_loopr  r   create_task)rw   r  r   r  	can_batchcoror[   r[   r\   r  1  s   
z#AsyncMicrobatchTokenizer._get_queuer  asyncio.Queuer  r   c              
     s  	 |  I dH \}}}|g}|g}|g}j j }	t|jk rc|	j  }
|
dkr0n3z!t|  |
I dH \}}}|| || |sP|| W n
 tj	y[   Y nw t|jk s$zf|rt|dkrt
j|fi |}jj|I dH }t|D ]\ }| s fdd| D }|t| qn'||ffdd	}jj|I dH }t||D ]\}}| s|| qW n  ty } z|D ]}| s|| qW Y d}~nd}~ww q)	z.Batch incoming encode requests for efficiency.TNr   r{   c                   s   i | ]	\}}||  qS r[   r[   )r   kv)r|   r[   r\   
<dictcomp>e      z?AsyncMicrobatchTokenizer._batch_encode_loop.<locals>.<dictcomp>c                   s    fddt | |D S )Nc                   s"   g | ]\}} j |fi |qS r[   )r   )r   pkwr   r[   r\   
<listcomp>h  s    zQAsyncMicrobatchTokenizer._batch_encode_loop.<locals>.<lambda>.<locals>.<listcomp>)zip)promptsr  r   r[   r\   <lambda>h  s   
 z=AsyncMicrobatchTokenizer._batch_encode_loop.<locals>.<lambda>)r   r  timer   r   r   r  wait_forr   TimeoutErrorr   r   run_in_executorr  	enumeratedoneitems
set_resultr3   r&  	Exceptionset_exception)rw   r  r  r  r  r  r'  Zkwargs_listresult_futuresdeadlinetimeoutZ	encode_fnresultsfutr   reser[   )r|   rw   r\   r  D  sj   






z+AsyncMicrobatchTokenizer._batch_encode_loopc              
     s8  	 |  I dH \}}|g}|g}| j | j }t|| jk rW|| j  }|dkr,n+zt|  |I dH \}}|| || W n
 tj	yO   Y nw t|| jk s z#| j
| j| jj|I dH }t||D ]\}	}
|	 sx|	|
 qkW n  ty } z|D ]}	|	 s|	| qW Y d}~nd}~ww q)z.Batch incoming decode requests for efficiency.TNr   )r   r  r)  r   r   r   r  r*  r   r+  r,  r  r   Zbatch_decoder&  r.  r0  r1  r2  )rw   r  r  r  Ztoken_ids_listr3  r4  r5  r6  r7  r8  r9  r[   r[   r\   r  w  sN   




z+AsyncMicrobatchTokenizer._batch_decode_loopoprG   r  dictc                 C  st   |dkrdS | dd}| dd}| d}|sd|dd	fS t| jd
d	}|d	u s2|d	ur8||kr8d|ddfS dS )a  
        Return a normalized key describing operation + kwargs.
        
        - `add_special_tokens`: {True/False}
        - `truncation`: {True/False}
          - If `truncation` is False (`max_length` is None), 
            returns a key for a can_batch queue.
          - If `truncation` is True and `max_length` is None or equals
            `tokenizer.model_max_length`, returns a key for a can_batch queue.
          - Otherwise, returns a key for a cannot_batch queue.
        
        Examples:
          - Decode: ("decode",)
          - Encode typical: 
            ("encode", add_special_tokens, bool_truncation, max_length_label)
          - Fallback: ("encode", "other")
        r  )r  add_special_tokensT
truncationF
max_lengthr  NZmodel_max_length	model_max)r  r   )r   getattrr   )rw   r:  r  r<  r=  r>  r?  r[   r[   r\   r
    s   
z#AsyncMicrobatchTokenizer._queue_keyc                   sN   t | dd   r!t | dd  }r#| s% fdd}|| d S d S d S d S )Nr  r  c                    s    D ]} |    qd S rt   )canceltasktasksr[   r\   cancel_tasks  s   
z6AsyncMicrobatchTokenizer.__del__.<locals>.cancel_tasks)r@  	is_closedcall_soon_threadsafe)rw   r  rF  r[   rD  r\   __del__  s   z AsyncMicrobatchTokenizer.__del__N)r   r   )r   rY   r   rU   rq   rr   )r  r  r   r  rq   r  )r  r  r  r   )r  r  )r:  rG   r  r;  rq   r  )rf   rg   rh   r   rx   r  r  r  r  r  r
  rI  r[   r[   r[   r\   r     s    



3
!$r   rC  r   c                 C  s(   | r|   st|  | j d S d S d S rt   )r.  run_in_loopget_looprA  rB  r[   r[   r\   cancel_task_threadsafe  s   rL  sockets/Sequence[Union[zmq.Socket, zmq.asyncio.Socket]]c                 C  s"   | D ]}|d ur|j dd qd S )Nr   linger)close)rM  sockr[   r[   r\   close_sockets  s
   rS  r  r
   functionr#   c                 G  s6   t | r
||  d S |  s| j|g|R   d S d S rt   )in_looprG  rH  )r  rT  argsr[   r[   r\   rJ    s
   rJ  
event_loopr   c                 C  s$   zt  | kW S  ty   Y dS w )NF)r  r  r   )rW  r[   r[   r\   rU    s
   rU  funcCallable[P, T]executor%Optional[concurrent.futures.Executor]Callable[P, Awaitable[T]]c                   s   d
 fdd}|S )zTake a blocking function, and run it on in an executor thread.

    This function prevents the blocking function from blocking the
    asyncio event loop.
    The code in this function needs to be thread safe.
    rV  P.argsr  P.kwargsrq   asyncio.Futurec                    s,   t  }tg| R i |}|j |dS )NrZ  rX  )r  get_event_loopr   r,  )rV  r  r  Zp_funcr`  r[   r\   _async_wrapper  s   z"make_async.<locals>._async_wrapperN)rV  r]  r  r^  rq   r_  r[   )rX  rZ  rb  r[   r`  r\   
make_async  s   rc  iteratorAsyncGenerator[T, None]c                 C  s   | |  S rt   )r  	__anext__)rd  r  r[   r[   r\   
_next_task  s   rg  	iterators#AsyncGenerator[tuple[int, T], None]c            
       s  t | dkr| d 2 z3 dH W }d|fV  q6 dS t   fddt| D }zj|rftj| tdI dH \}}|D ]'}||}z|I dH }|\}}||t| < ||fV  W q< t	yc   Y q<w |s,W |
 D ]&\}	\}}tt |	  | I dH  W d   n1 sw   Y  qkdS |
 D ]&\}	\}}tt |	  | I dH  W d   n1 sw   Y  qw )zMerge multiple asynchronous iterators into a single iterator.

    This method handle the case where some iterators finish before others.
    When it yields, it yields a tuple (i, item) where i is the index of the
    iterator that yields the item.
    r{   r   Nc                   s   i | ]
}t |d   |qS r{   )rg  )r   pairr  r[   r\   r!    s    z)merge_async_iterators.<locals>.<dictcomp>)return_when)r   r  r  r-  waitr   r	   r   rg  StopAsyncIterationr/  
contextlibsuppressBaseExceptionrA  aclose)
rh  itemZawaitsr.  r   drk  r|   itfr[   rl  r\   merge_async_iterators  sL   	


rx  list[T]c                   s(   g }| 2 z3 dH W }| | q6 |S )z6Collect all items from an async generator into a list.N)r   )rd  r/  rt  r[   r[   r\   collect_from_async_generator!  s   rz  c                  C  s   t j} dtjv rdtjvrtd | r| S ttjtj}z|	d |
 d W S  ty3   Y nw zttjtj}|	d |
 d W S  tyQ   Y nw tjddd	 d
S )NZHOST_IPVLLM_HOST_IPa  The environment variable HOST_IP is deprecated and ignored, as it is often used by Docker and other software to interact with the container's network stack. Please use VLLM_HOST_IP instead to set the IP address for vLLM processes to communicate with each other.)z8.8.8.8P   r   )z2001:4860:4860::8888r|  zFailed to get the IP address, using 0.0.0.0 by default.The value can be set by the environment variable VLLM_HOST_IP or HOST_IP.   
stacklevelz0.0.0.0)envsr{  osenvironloggerwarningsocketAF_INET
SOCK_DGRAMconnectgetsocknamer1  AF_INET6warningswarn)host_ipsr[   r[   r\   get_ip*  s4   

r  c                 C  s@   zt  |t j}|| df |  W dS  ty   Y dS w )Nr   TF)r  r  bindrQ  OSError)addressfamilyr  r[   r[   r\   test_loopback_bindR  s   r  c                  C  s6   t j} | r| S tdtjrdS tdtjrdS td)Nz	127.0.0.1z::1zsNeither 127.0.0.1 nor ::1 are bound to a local interface. Set the VLLM_LOOPBACK_IP environment variable explicitly.)r  ZVLLM_LOOPBACK_IPr  r  r  r  r   )Zloopback_ipr[   r[   r\   get_loopback_ip\  s   r  r  c                 C  s&   zt |  W dS  ty   Y dS w )NTF)	ipaddressIPv6Addressr   )r  r[   r[   r\   is_valid_ipv6_addressm  s   
r  	host_porttuple[str, int]c                 C  sZ   |  dr | dd\}}|dd  }|dd }|t|fS | d\}}|t|fS )N[]r{   :)
startswithrsplitsplitrY   )r  hostportr[   r[   r\   split_host_portu  s   
r  r  r  c                 C  s&   t | rd|  d| S |  d| S )Nr  ]:r  r  )r  r  r[   r[   r\   join_host_port  s   r  ipc                 C  
   t | |S rt   )get_tcp_urir  r  r[   r[   r\   get_distributed_init_method  ry   r  c                 C  s(   t | rd|  d| S d|  d| S )Nztcp://[r  ztcp://r  r  r  r[   r[   r\   r    s   r  c                  C  s   t j} d|  dt  S )Nzipc:///)r  ZVLLM_RPC_BASE_PATHr/   )Zbase_rpc_pathr[   r[   r\   get_open_zmq_ipc_path  s   r  c                   C  s   dt   S )Nz	inproc://r.   r[   r[   r[   r\   get_open_zmq_inproc_path     r  c                  C  s:   dt jv rtj} t| | d }	 t }||vr|S qt S )a<  
    Get an open port for the vLLM process to listen on.
    An edge case to handle, is when we run data parallel,
    we need to avoid ports that are potentially used by
    the data parallel master process.
    Right now we reserve 10 ports for the data parallel master
    process. Currently it uses 2 ports.
    VLLM_DP_MASTER_PORT
   )r  r  r  r  r   _get_open_port)Zdp_master_portZreserved_port_rangeZcandidate_portr[   r[   r\   get_open_port  s   
	r  c                  C  s6  t j} | d urD	 z$ttjtj}|d| f | W  d    W S 1 s&w   Y  W n tyB   | d7 } td| d |  Y nw qz'ttjtj}|d |	 d W  d    W S 1 sdw   Y  W d S  ty   ttj
tj}|d |	 d W  d     Y S 1 sw   Y  Y d S w )NT r{   z)Port %d is already in use, trying port %d)r  r   )r  Z	VLLM_PORTr  r  SOCK_STREAMr  r  r  r   r  r  )r  r  r[   r[   r\   r    s4   &


(

*r  Optional[psutil.Process]c              	   C  sZ   t jdrd S t D ]}|jj| kr*z	t|jW   S  tj	y)   Y  d S w qd S )Ndarwin)
sysplatformr  r   Znet_connectionsladdrr  ProcesspidNoSuchProcess)r  connr[   r[   r\   find_process_using_port  s   r  r  dict[str, str]c                 C  sN   |   D ] \}}|tjv rtj| |krtd|tj| | |tj|< qd S )Nz5Overwriting environment variable %s from '%s' to '%s')r/  r  r  r  r  )r  r  r   r[   r[   r\   update_environment_variables  s   r  lst
chunk_sizec                 c  s.    t dt| |D ]}| |||  V  q	dS )z,Yield successive chunk_size chunks from lst.r   N)r   r   )r  r  r|   r[   r[   r\   
chunk_list  s   r  abc                 C  s   | |   S )zCeiling division.r[   )r  r  r[   r[   r\   cdiv  s   r  c                 C  s   | dk rdS d| d   > S )zThe next power of 2 (inclusive)r{   
bit_lengthnr[   r[   r\   next_power_of_2     r  r  c                 C  s   | dkrdS d|   d > S )z#The previous power of 2 (inclusive)r   r{   r  r  r[   r[   r\   prev_power_of_2  r  r  xyc                 C  s   | | d | | S rz   r[   r  r  r[   r[   r\   round_up  s   r  c                 C  s   | | | S rt   r[   r  r[   r[   r\   
round_down  r  r  tensortorch.TensorlowrU   highrr   c                 C  s:   ddl m} tj| tjd}||| || | ~d S )Nr   r   dtype)r   r   rZ   Z
empty_likefloat16uniform_Zconvert_fp8)r  r  r  r   Z
tensor_tmpr[   r[   r\   _generate_random_fp8  s
   r  cache_dtype!Optional[Union[str, torch.dtype]]model_dtypetorch.dtypec                 C  s   t | tr:| dkr)t |tr|tv rt| }|S t |tjr"|}|S td| | tv r3t|  }|S td|  t | tjrD| }|S td|  )Nrk   zInvalid model dtype: zInvalid kv cache dtype: )
isinstancerG   STR_DTYPE_TO_TORCH_DTYPErZ   r  r   )r  r  torch_dtyper[   r[   r\   get_kv_cache_torch_dtype  s(   
r  cudaNHD
num_blocks
block_size
num_layers	num_heads	head_sizeseedOptional[int]deviceOptional[str]cache_layout-tuple[list[torch.Tensor], list[torch.Tensor]]c
                   s  ddl m}
 |
| t||}| d|||f |	dv sJ |	dkr#dnd}t fdd	|D }|d
 }g }g }t|D ]B}tj|||dj| }|dv rU|	| | n|dkrat
|| | ntd| ||d d df  ||d d df  q<||fS )Nr   current_platformr}  )r  ZHNDr  )r   r{   r}        )r   r{   r  r}  r  c                 3  s    | ]} | V  qd S rt   r[   )r   r|   Zgeneric_kv_cache_shaper[   r\   r   B  s    z5create_kv_caches_with_random_flash.<locals>.<genexpr>      ࿩sizer  r  rk   rS   rT   rU   rV   #Does not support key cache of type r{   )vllm.platformsr  seed_everythingr  r  r   rZ   emptyZpermuter  r  r   r   )r  r  r  r  r  r  r  r  r  r  r  r  Zstride_orderZkv_cache_allocation_shapescale
key_cachesvalue_cachesr   Zkey_value_cacher[   r  r\   "create_kv_caches_with_random_flash-  s:   

r  c	                 C  sV  |dkr|d rt d| ddlm}	 |	| t||}
|d }dtjg |
d  }| ||| ||f}g }t|D ].}tj	||
|d}|d	v rS|
| | n|dkr_t|| | nt d
| || q=| |||f}g }t|D ].}tj	||
|d}|d	v r|
| | n|dkrt|| | nt d| || qx||fS )NrV      z6Does not support key cache of type fp8 with head_size r   r  r  r  r  r  r  z%Does not support value cache of type )r   r  r  r  r  rZ   r  element_sizer   r  r  r  r   )r  r  r  r  r  r  r  r  r  r  r  r  r  Zkey_cache_shaper   r   Z	key_cacheZvalue_cache_shaper  Zvalue_cacher[   r[   r\   create_kv_caches_with_randomY  sP   

r  c                  C  s   ddl m}  |  S Nr   r  )r  r  is_pin_memory_availabler  r[   r[   r\   r    s   r  c                   C  s   t  S )z7Check if Unified Virtual Addressing (UVA) is available.)r  r[   r[   r[   r\   is_uva_available  s   r  c                   @  s2   e Zd ZddddZddd	Zd
d Zdd ZdS )DeviceMemoryProfilerNr  Optional[torch.types.Device]c                 C  rs   rt   )r  )rw   r  r[   r[   r\   rx     ry   zDeviceMemoryProfiler.__init__rq   rU   c                 C  s    ddl m} t  || jS r  )r  r  gccollectZget_current_memory_usager  )rw   r  r[   r[   r\   current_memory_usage  s   z)DeviceMemoryProfiler.current_memory_usagec                 C  s   |   | _| S rt   )r  initial_memoryr   r[   r[   r\   	__enter__  s   
zDeviceMemoryProfiler.__enter__c                 C  s$   |   | _| j| j | _t  d S rt   )r  Zfinal_memoryr  Zconsumed_memoryr  r  )rw   exc_typeexc_valexc_tbr[   r[   r\   __exit__  s   
zDeviceMemoryProfiler.__exit__rt   )r  r
  r   )rf   rg   rh   rx   r  r  r  r[   r[   r[   r\   r	    s
    
r	  max_lenlist[list[T]]padr  npt.DTypeLiker  npt.NDArrayc                C  sl   |du rt tt| dd}tjt| |f||d}t| D ]\}}t||ks)J |||dt|f< q|S )z
    Make a padded array from 2D inputs.

    The padding is applied to the end of each inner list until it reaches
    `max_len`.
    Nr   )r   r  )maxmapr   npfullr-  )r  r  r  r  padded_xindZblocktbr[   r[   r\   make_ndarray_with_pad  s   r   F)r  r  
pin_memory"Optional[Union[str, torch.device]]r!  c          	      C  s8   t | }t| |||d}t||}|r| }|S )z
    Make a padded tensor from 2D inputs.

    The padding is applied to the end of each inner list until it reaches
    `max_len`.
    r  )TORCH_DTYPE_TO_NUMPY_DTYPEr   rZ   Z
from_numpytor!  )	r  r  r  r  r  r!  Znp_dtyper  r  r[   r[   r\   make_tensor_with_pad  s   r%  r   listtarget_deviceUnion[str, torch.device]c                 C  s    t j| ||dd}|j|ddS )z?Asynchronously create a tensor and copy it from host to device.cpu)r  r!  r  T)r  Znon_blocking)rZ   r  r$  )r   r  r'  r!  tr[   r[   r\   async_tensor_h2d  s   r+  c                 C  s   t jg | d S )z'Get the size of the data type in bytes.r  )rZ   r  r  r  r[   r[   r\   get_dtype_size     r,  c                 C  s   | t jk| j | jd  S )Nr}  )rZ   r   is_floating_point
is_complexr  r[   r[   r\   _get_precision_level  s   r0  	src_dtype	tgt_dtypec                 C  s   | |krdS t | }t |}||k rdS ||krdS | js6| js6t| }t|}|j|jko5|j|jkS t| }t|}|j|jkoQ|j|jkoQ|j|jkS )z[
    Test whether it is lossless to cast a tensor from
    `src_dtype` to `tgt_dtype`.
    TF)	r0  r.  r/  rZ   Ziinfominr  Zfinfo
resolution)r1  r2  Z	src_levelZ	tgt_levelZsrc_infoZtgt_infor[   r[   r\   is_lossless_cast  s"   




r5  dtypesCollection[torch.dtype]c                   s   t   fdddS )zv
    Get the common `dtype` where all of the other `dtypes` can be
    cast to it without losing any information.
    c                   s   t  fddD S )Nc                 3      | ]}t | V  qd S rt   )r5  )r   dtr  r[   r\   r   "      z?common_broadcastable_dtype.<locals>.<lambda>.<locals>.<genexpr>)sumr  r6  r  r\   r(  "  s    z,common_broadcastable_dtype.<locals>.<lambda>r   )r  r<  r[   r<  r\   common_broadcastable_dtype  s   
r>  
maybe_listIterable[T]c                 C  s   t | tr| S t| S )z5Convert iterable to list, unless it's already a list.)r  r&  )r?  r[   r[   r\   as_list&  s   rA  first)checkr   objecttyp#Union[type[T], tuple[type[T], ...]]rC  Literal['first', 'all']TypeIs[list[T]]c                  sZ   t | tsdS |dkrt| dkpt | d  S |dkr't fdd| D S t| d S )NFrB  r   allc                 3  r8  rt   )r  )r   r   rE  r[   r\   r   8  r:  zis_list_of.<locals>.<genexpr>)r  r&  r   rI  r7   )r   rE  rC  r[   rJ  r\   
is_list_of,  s   
rK  listsIterable[Iterable[T]]c                 C  s   dd | D S )z)Flatten a list of lists to a single list.c                 S  s   g | ]	}|D ]}|qqS r[   r[   )r   Zsublistrt  r[   r[   r\   r%  ?  r"  z$flatten_2d_lists.<locals>.<listcomp>r[   )rL  r[   r[   r\   flatten_2d_lists=  s   rN  valuesIterable[_V]r   Callable[[_V], _K]c                C  s8   t ttt f t}| D ]}||| | q| S )z[
    Unlike [`itertools.groupby`][], groups are not broken by
    non-contiguous data.
    )r   ra   r&  rc   r   r/  )rO  r   groupsr   r[   r[   r\   full_groupbyB  s   rS  c                  C  s   ddl m}  |   dS )z:
    Lazy initialization of the Hugging Face modules.
    r   init_hf_modulesN)Z!transformers.dynamic_module_utilsrU  rT  r[   r[   r\   init_cached_hf_modulesQ  s   
rV  lib_namec                   sj   t ddg } fdd| D }tj}|s'|r' fdd|dD }|s1td  d|d	 S )
z
    Find the library file in the system.
    `lib_name` is full filename, with both prefix and suffix.
    This function resolves `lib_name` to the full path of the library.
    z/sbin/ldconfigz-pc                   s    g | ]} |v r|  d  qS ))r  r   linerW  r[   r\   r%  g  s     z find_library.<locals>.<listcomp>c                   s0   g | ]}t jt j| rt j| qS r[   )r  pathexistsjoin)r   dirr[  r[   r\   r%  k  s    r  zCannot find z in the system.r   )
subprocesscheck_outputr  
splitlinesr  ZLD_LIBRARY_PATHr  r   )rW  ZlibslocsZenv_ld_library_pathr[   r[  r\   find_libraryY  s   
rd  c                  C  sV   t j} | rtd|  | S tjjdurd} ntjjdurd} ntdtd|  | S )a  
    We either use the library file specified by the `VLLM_NCCL_SO_PATH`
    environment variable, or we find the library file brought by PyTorch.
    After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be
    found by `ctypes` automatically.
    z9Found nccl from environment variable VLLM_NCCL_SO_PATH=%sNzlibnccl.so.2zlibrccl.so.1z*NCCL only supports CUDA and ROCm backends.zFound nccl from library %s)	r  ZVLLM_NCCL_SO_PATHr  r   rZ   r0   r  Zhipr   )Zso_filer[   r[   r\   find_nccl_libraryu  s   re  streamtorch.cuda.Streamc                 C  s   | t _t|  d S rt   )_current_stream_tlsr   prev_set_stream)rf  r[   r[   r\   _patched_set_stream  s   rj  c                  C  sD   ddl m}  ttdrtjdu r|  rtj ntj	 t_tjS )a  
    replace `torch.cuda.current_stream()` with `vllm.utils.current_stream()`.
    it turns out that `torch.cuda.current_stream()` is quite expensive,
    as it will construct a new stream object at each call.
    here we patch `torch.cuda.set_stream` to keep track of the current stream
    directly, so that we can avoid calling `torch.cuda.current_stream()`.

    the underlying hypothesis is that we do not call `torch._C._cuda_setStream`
    from C/C++ code.
    r   r  r   N)
r  r  hasattrrh  r   is_rocmrZ   r  ZStreamcurrent_streamr  r[   r[   r\   rm    s   
rm  vllm_configr=   c                 C  s   t jrEt }tj|t }dt	  dt
  dtj  ddd}tj|dd| j |}tjtj|d	d
 t| dS dS )ztSet up function tracing for the current thread,
    if enabled via the VLLM_TRACE_FUNCTION environment variable
    Z VLLM_TRACE_FUNCTION_for_process_Z_thread_Z_at_z.log r   r   zvllm-instance-T)exist_okN)r  ZVLLM_TRACE_FUNCTIONtempfile
gettempdirr  r\  r^  getpassgetusergetpid	threading	get_identdatetimenowreplaceZinstance_idmakedirsdirnamer8   )rn  Ztmp_dirfilenamelog_pathr[   r[   r\   %enable_trace_function_call_for_thread  s"   

r  c                 K  s   | S )z!Returns the first provided value.r[   )r   r  r[   r[   r\   identity  r   r  F.Tstart_indexis_deprecatedUnion[bool, Callable[[], bool]]additional_messageCallable[[F], F]c                   s(   t s	ttd fdd}|S )Nfnr  rq   c                   sR   t  j}t jjt jjffdd| D t  fdd}|S )Nc                   s   g | ]\}}|j  v r|qS r[   )kind)r   r$  param)	pos_typesr[   r\   r%    s    z3deprecate_args.<locals>.wrapper.<locals>.<listcomp>c                    s\    r't |  }|r'd| d} d ur|d  7 }tjt|dd | i |S )NzThe positional arguments 7 are deprecated and will be removed in a future update.ro  r  r~  )r   r  r  DeprecationWarning)rV  r  Zdeprecated_argsmsg)r  r  r  pos_kwsr  r[   r\   inner  s   
z.deprecate_args.<locals>.wrapper.<locals>.inner)inspect	signature
parameters	ParameterPOSITIONAL_ONLYPOSITIONAL_OR_KEYWORDr/  r   )r  paramsr  r  r  r  )r  r  r  r\   wrapper  s   
zdeprecate_args.<locals>.wrapperr  r  rq   r  )callabler   r  )r  r  r  r  r[   r  r\   deprecate_args  s   
r  )r  r  kwsc                   s0   t |tsttd fdd}|S )Nr  r  rq   c                   s   t   fdd}|S )Nc                    sX    r%|  @ }|r%d| d} d ur|d  7 }tjt|dd | i |S )NzThe keyword arguments r  ro  r  r~  )r   r  r  r  )rV  r  Zdeprecated_kwargsr  )r  deprecated_kwsr  r  r[   r\   r    s   
z0deprecate_kwargs.<locals>.wrapper.<locals>.innerr   )r  r  r  r  r  )r  r\   r    s   z!deprecate_kwargs.<locals>.wrapperr  )r   r  r   r  )r  r  r  r  r[   r  r\   deprecate_kwargs  s
   
r     r   cuda_visible_devicesc                 C  sv   dd l }dd l}ddlm} |j sdS | r't|jdr$|j nd}n|j	 }|dk r7|j
 }|S |}|S )Nr   r  _device_count_amdsmirX  )Z
torch.cudaZtorch.versionr  r  r  _is_compiledrl  rk  r  Z_device_count_nvml_CZ_cuda_getDeviceCount)r  rZ   r  Z	raw_countrr[   r[   r\   _cuda_device_count_stateless  s    


r  c                   C  s
   t tjS )zGet number of CUDA devices, caching based on the value of
    CUDA_VISIBLE_DEVICES at the time of call.

    This should be used instead of torch.cuda.device_count()
    unless CUDA_VISIBLE_DEVICES has already been set to the desired
    value.)r  r  ZCUDA_VISIBLE_DEVICESr[   r[   r[   r\   cuda_device_count_stateless7  s   

r  c                   C     t j sdS t j S )zCheck if CUDA is initialized.F)rZ   r  r  is_initializedr[   r[   r[   r\   cuda_is_initializedD     

r  c                   C  r  )zCheck if XPU is initialized.F)rZ   Zxpur  r  r[   r[   r[   r\   xpu_is_initializedK  r  r  namesSequence[str]tuple[Any, ...]c                   sz   |st  rtj|  t fdd|D S td}td|d}|t	| |d
 W  d   S 1 s6w   Y  dS )z_Get specified CUDA device property values without initializing CUDA in
    the current process.c                 3  s    | ]}t  |V  qd S rt   )r@  )r   namepropsr[   r\   r   Y  r:  z-cuda_get_device_properties.<locals>.<genexpr>forkr{   )r   Z
mp_contextTN)r  rZ   r  Zget_device_propertiesr  multiprocessingget_contextr   submitcuda_get_device_propertiesresult)r  r  Z	init_cudaZmp_ctxrZ  r[   r  r\   r  R  s   


$r  bound_methodCallable[..., Any]Callable[..., None]c                   s&   t | j | jd fdd}|S )zzMake an instance method that weakly references
    its associated instance and no-ops once that
    instance is collected.rq   rr   c                    s(      }r|g| R i | d S d S rt   r[   )rV  r  instrefunboundr[   r\   
weak_boundi  s   
zweak_bind.<locals>.weak_boundNr   )weakrefr  __self____func__)r  r  r[   r  r\   	weak_bindb  s   r  rw  Callable[P, None]c                   s$   d
 fddd	_ t _S )NrV  r]  r  r^  rq   rr   c                    s^   j rd S j j sd_  | i |W  d    S W d    d S 1 s(w   Y  d S NT)has_runlockrV  r  rw  r  r[   r\   r  r  s   "zrun_once.<locals>.wrapperF)rV  r]  r  r^  rq   rr   )r  rv  Lockr  )rw  r[   r  r\   run_oncep  s   	
r  c                   @  s   e Zd ZdddZdS )StoreBooleanNc                 C  sL   |  dkrt|| jd d S |  dkrt|| jd d S td| d)NtrueTfalseFzInvalid boolean value: z. Expected 'true' or 'false'.)lowersetattrdestr   )rw   parser	namespacerO  option_stringr[   r[   r\   r    s
   zStoreBoolean.__call__rt   )rf   rg   rh   r  r[   r[   r[   r\   r    s    r  c                      s(   e Zd ZdZdd Z fddZ  ZS )SortedHelpFormatterzASortedHelpFormatter that sorts arguments by their option strings.c                   sD   t d}t d}|d|}t ||}t fdd|D g S )z
        1. Sentences split across lines have their single newlines removed.
        2. Paragraphs and explicit newlines are split into separate lines.
        3. Each line is wrapped to the specified width (width of terminal).
        z(?<!\n)\n(?!\n)\s*z	\n{2,}\s*ro  c                   s   g | ]}t | qS r[   )textwrapwraprY  widthr[   r\   r%    s    z4SortedHelpFormatter._split_lines.<locals>.<listcomp>)recompilesubr  r;  )rw   textr  Zsingle_newlineZmultiple_newlineslinesr[   r  r\   _split_lines  s
   

z SortedHelpFormatter._split_linesc                   s    t |dd d}t | d S )Nc                 S  r   rt   )option_strings)r  r[   r[   r\   r(    s    z3SortedHelpFormatter.add_arguments.<locals>.<lambda>r=  )sortedr   add_arguments)rw   actionsr   r[   r\   r    s   z!SortedHelpFormatter.add_arguments)rf   rg   rh   r   r  r  r   r[   r[   r   r\   r    s    r  c                      s   e Zd ZU dZe Zded< dZded<  fddZe	j
d	k r8d$ fdd	Z fddZG dd deZdd Zd% fddZ	
	
d$d& fddZdd Zd'dd Zd(d"d#Z  ZS ))FlexibleArgumentParserz=ArgumentParser that allows both underscore and dash in names.zset[Action]_deprecateda}  When passing JSON CLI arguments, the following sets of arguments are equivalent:
   --json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'
   --json-arg.key1 value1 --json-arg.key2.key3 value2

Additionally, list elements can be passed individually using +:
   --json-arg '{"key4": ["value3", "value4", "value5"]}'
   --json-arg.key4+ value3 --json-arg.key4+='value4,value5'

rG   	_json_tipc                   s4   d|vrt |d< |dd| _t j|i | d S )Nformatter_classadd_json_tipT)r  r   r  r   rx   rw   rV  r  r   r[   r\   rx     s   zFlexibleArgumentParser.__init__)r     Nc                   sl   |d urd|v rt d t ||\}}tjD ]}t||j }r1t|||j	kr1t d| q||fS )Nz--disable-log-requestsz{argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.zargument '%s' is deprecated)
r  Zwarning_oncer   parse_known_argsr  r  rk  r  r@  r   )rw   rV  r  actionr  r   r[   r\   r    s   
z'FlexibleArgumentParser.parse_known_argsc                   2   | dd}t j|i |}|rtj| |S N
deprecatedFr   r   add_argumentr  r  r   rw   rV  r  r  r  r   r[   r\   r    
   z#FlexibleArgumentParser.add_argumentc                      s   e Zd Z fddZ  ZS )z-FlexibleArgumentParser._FlexibleArgumentGroupc                   r  r  r  r  r   r[   r\   r    r  z:FlexibleArgumentParser._FlexibleArgumentGroup.add_argument)rf   rg   rh   r  r   r[   r[   r   r\   _FlexibleArgumentGroup  s    r  c                 O  s(   | j | g|R i |}| j| |S rt   )r  _action_groupsr   )rw   rV  r  groupr[   r[   r\   add_argument_group  s   z)FlexibleArgumentParser.add_argument_grouprq   c                   s2   | j pd}| jr|tjstj| | _ t  S Nr  )epilogr  r  r  r  r   format_help)rw   r  r   r[   r\   r    s   


z"FlexibleArgumentParser.format_helprV  list[str] | Noner  Namespace | Nonec              	     s<  |d u rt jdd  }|r"|d dkr"tdd |D }|r"tdd|v r+| |}d.dd}td}tt  }t	|D ]\}}|
drtd|v rf|dd\}	|j|dd| d|	  q>|j||dd| q>|
dr|dkr|d dkr|d dkr|dd  n|dd  }
|d|
  q>|dkr|d t|k r||d  dv r|d q>|| q>d/dd}d0fd"d#tt   tttttf f t}tt  }t	|D ]\}}| v rq|
d$rhd|v rhd|v r|dd\}}d|vrqn||d  } |d  |d%r/|d d& }tt|d'}|d^}zt|}	W n tjjyK   |}	Y nw |||	}| |}|fd(d)|D O } | q fd*d+t	|D }|rtd,d-| | D ]\}}|| |t| qt  ||S )1Nr{   r   servec                 s      | ]}|d kV  qdS z--modelNr[   r   argr[   r[   r\   r         z4FlexibleArgumentParser.parse_args.<locals>.<genexpr>zWith `vllm serve`, you should provide the model as a positional argument or in a config file instead of via the `--model` option.--configmatchre.Matchrq   rG   c                 S  s   |  dddS )z7Replaces underscores with dashes in the matched string.r   r   -)r  rz  )r  r[   r[   r\   repl  r-  z/FlexibleArgumentParser.parse_args.<locals>.replz(?<=--)[^\.]*--=)countz-Or}  r  r  z	-O.level=>   2013z-O.levelr   	list[str]r   dict[str, Any]c                 S  s   |}t | D ]}||i}q|S )zCreates a nested dictionary from a list of keys and a value.

            For example, `keys = ["a", "b", "c"]` and `value = 1` will create:
            `{"a": {"b": {"c": 1}}}`
            )reversed)r   r   nested_dictr   r[   r[   r\   create_nested_dict  s   
z=FlexibleArgumentParser.parse_args.<locals>.create_nested_dictoriginalupdateset[str]c                   s   t t  }| D ]G\ }t|tr-t|  tr-|   |}| fdd|D O }q	t|trCt|  trC|    |7  < q	 | v rL|  ||  < q	|S )zRecursively updates a dictionary with another dictionary.
            Returns a set of duplicate keys that were overwritten.
            c                      h | ]	}  d | qS r  r[   r   ru  r  r[   r\   	<setcomp>1  r"  zSFlexibleArgumentParser.parse_args.<locals>.recursive_dict_update.<locals>.<setcomp>)r   rG   r/  r  r;  r   r&  r   )r  r  
duplicatesr   Znested_duplicates)recursive_dict_updater  r\   r  &  s   


z@FlexibleArgumentParser.parse_args.<locals>.recursive_dict_updater  +rX  ,c                   r  r  r[   r  r=  r[   r\   r  Y  r"  z4FlexibleArgumentParser.parse_args.<locals>.<setcomp>c                   s   g | ]
\}}| vr|qS r[   r[   )r   r|   r  )deleter[   r\   r%  \  s    z5FlexibleArgumentParser.parse_args.<locals>.<listcomp>zFound duplicate keys %sz, )r  r  rq   rG   )r   r  r   rG   rq   r  )r  r  r  r  rq   r  )!r  argvanyr   _pull_args_from_configr  r  r&  rG   r-  r  r  r  r   r   r   rY   r   r;  r"   r   endswithjsondumpsloadsdecoderJSONDecodeErrorr  r  r^  r/  r   
parse_args)rw   rV  r  Zmodel_in_cli_argsr	  patternprocessed_argsr|   r  r   levelr  Z	dict_argsr  Zprocessed_argZ	value_strr   Zarg_dictZarg_duplicatesZdict_argZ
dict_valuer   )r"  r   r  r\   r,    s   




$(








z!FlexibleArgumentParser.parse_argsc                 C  sR   zt |}W n ty   d}t|d w d|  kr"dks'td td|S )NzPort must be an integeri     z#Port must be between 1024 and 65535)rY   r   r   )rw   r   r  r[   r[   r\   
check_porti  s   
z!FlexibleArgumentParser.check_portr  c                 C  s0  | ddksJ d|d}|t|d krtd||d  }| |}|d dkrt|dko:|d d }tdd	 |D }|sL|sLtd
|rj|d g|d g | |d|  ||d d  }|S |d g| |d|  ||d d  }|S |d g| |d|  ||d d  }|S )a  Method to pull arguments specified in the config file
        into the command-line args variable.

        The arguments in config file will be inserted between
        the argument list.

        example:
        ```yaml
            port: 12323
            tensor-parallel-size: 4
        ```
        ```python
        $: vllm {serve,chat,complete} "facebook/opt-12B"             --config config.yaml -tp 2
        $: args = [
            "serve,chat,complete",
            "facebook/opt-12B",
            '--config', 'config.yaml',
            '-tp', '2'
        ]
        $: args = [
            "serve,chat,complete",
            "facebook/opt-12B",
            '--port', '12323',
            '--tensor-parallel-size', '4',
            '-tp', '2'
            ]
        ```

        Please note how the config args are inserted after the sub command.
        this way the order of priorities is maintained when these are args
        parsed by super().
        r  r{   z$More than one config file specified!z`No config file specified!                              Please check your command-line arguments.r   r  r  c                 s  r   r  r[   r  r[   r[   r\   r     r  z@FlexibleArgumentParser._pull_args_from_config.<locals>.<genexpr>z]No model specified! Please specify model either as a positional argument or in a config file.r}  N)r  indexr   r   _load_config_filer  r$  )rw   rV  r2  	file_pathZconfig_argsZmodel_in_cliZmodel_in_configr[   r[   r\   r%  u  sP   "




*z-FlexibleArgumentParser._pull_args_from_configr4  c           
   
   C  s   | dd }|dvrtd|g }i }zt|}t|}W d   n1 s)w   Y  W n tyC } ztd| |d}~ww dd | jD }|	 D ]%\}}	t
|	trg||vrg|	rf|d	|  qP|d	|  |t|	 qP|S )
ac  Loads a yaml file and returns the key value pairs as a
        flattened list with argparse like pattern
        ```yaml
            port: 12323
            tensor-parallel-size: 4
        ```
        returns:
            processed_args: list[str] = [
                '--port': '12323',
                '--tensor-parallel-size': '4'
            ]
        r  rX  )yamlZymlzPConfig file must be of a yaml/yml type.                              %s suppliedNzOUnable to read the config file at %s.                 Make sure path is correctc                 S  s   g | ]
}t |tr|jqS r[   )r  r  r  )r   r  r[   r[   r\   r%    s    z<FlexibleArgumentParser._load_config_file.<locals>.<listcomp>r
  )r  r   openr5  Z	safe_loadr1  r  error_actionsr/  r  r   r   rG   )
rw   r4  	extensionr.  configconfig_fileexZstore_boolean_argumentsr   r   r[   r[   r\   r3    s@   
z(FlexibleArgumentParser._load_config_file)NNrq   rG   )rV  r  r  r  )rV  r  rq   r  )r4  rG   rq   r  )rf   rg   rh   r   r   r  r   r  rx   r  version_infor  r  r   r  r  r  r,  r1  r%  r3  r   r[   r[   r   r\   r    s&   
 
	
	
 
Kr  r  asyncio.Lockc              	     sR   |4 I dH  | |i |I dH W  d  I dH  S 1 I dH s"w   Y  dS )z,Utility function to run async task in a lockNr[   )rC  r  rV  r  r[   r[   r\   _run_task_with_lock  s   0r@  requires_kw_onlyallow_var_kwargsr  Callable[..., object]kw_namerB  rC  c          	      C  s   t | j}|s
dS ||}tt jjt jjt jjf}|r?|j	|v }|r0|r0|j	t jjkr0dS |r9|j	t jjks=|s?|r?dS |rU|t
t| }|j	t jjkoT|j|kS dS )zCheck if a keyword is a valid kwarg for a callable; if requires_kw_only
    disallows kwargs names that can also be positional arguments.
    FT)r  r  r  r   r   r  r  r  KEYWORD_ONLYr  r   r  VAR_KEYWORDr  )	r  rE  rB  rC  r  Z	param_valZpassable_kw_typesZis_sig_paramZ
last_paramr[   r[   r\   supports_kw  s4   


rH  	overridesOptional[Mapping[str, object]]r  c                  sZ   |si S  fdd|  D }| |  }|r+r%td| |S td| |S )a  
    Given a callable which has one or more keyword only params and a dict
    mapping param names to values, drop values that can be not be kwarg
    expanded to overwrite one or more keyword-only args. This is used in a
    few places to handle custom processor overrides for multimodal models,
    e.g., for profiling when processor options provided by the user
    may affect the number of mm tokens per instance.

    Args:
        callable: Callable which takes 0 or more keyword only arguments.
                  If None is provided, all overrides names are allowed.
        overrides: Potential overrides to be used when invoking the callable.
        allow_var_kwargs: Allows overrides that are expandable for var kwargs.

    Returns:
        Dictionary containing the kwargs to be leveraged which may be used
        to overwrite one or more keyword only arguments when invoking the
        callable.
    c                   s&   i | ]\}}t | d r||qS )rA  )rH  )r   Z
kwarg_namevalrC  r  rB  r[   r\   r!  C  s    z4get_allowed_kwarg_only_overrides.<locals>.<dictcomp>zRThe following intended overrides are not keyword-only args and will be dropped: %szMThe following intended overrides are not keyword args and will be dropped: %s)r/  r   r  r  )r  rI  rB  rC  Zfiltered_overridesZdropped_keysr[   rL  r\    get_allowed_kwarg_only_overrides$  s$   
rM  c                  C  s   t t tjj} | t dkS )Nz2.4.0)r1   rZ   __version__base_version)Zbase_torch_versionr[   r[   r\   supports_dynamo^  s   rP  c                   C  s   t dotj S )Nz	2.8.0.dev)is_torch_equal_or_newerrZ   distributedZis_xccl_availabler[   r[   r[   r\   supports_xccld  s
   rS  c                   C  s   t tjdS )NZ	custom_op)rk  rZ   libraryr[   r[   r[   r\   supports_custom_opk  r  rU  c                   @  s:   e Zd ZdZdddZdddZddd	Zed
d ZdS )AtomicCounterzAn atomic, thread-safe counterr   c                 C  s   || _ t | _dS )z6Initialize a new atomic counter to given initial valueN)_valuerv  r  _lock)rw   initialr[   r[   r\   rx   r  s   zAtomicCounter.__init__r{   c                 C  s>   | j  |  j|7  _| jW  d   S 1 sw   Y  dS )z@Atomically increment the counter by num and return the new valueNrX  rW  rw   numr[   r[   r\   incw     $zAtomicCounter.incc                 C  s>   | j  |  j|8  _| jW  d   S 1 sw   Y  dS )z@Atomically decrement the counter by num and return the new valueNrZ  r[  r[   r[   r\   dec}  r^  zAtomicCounter.decc                 C  r   rt   )rW  r   r[   r[   r\   r     r   zAtomicCounter.valueNr   rj  )	rf   rg   rh   r   rx   r]  r_  r   r   r[   r[   r[   r\   rV  o  s    


rV  c                   @  s:   e Zd ZdddZdd	d
ZdddZdd Zdd ZdS )LazyDictfactorydict[str, Callable[[], T]]c                 C  s   || _ i | _d S rt   )_factory_dict)rw   ra  r[   r[   r\   rx     s   
zLazyDict.__init__r   rG   rq   r_   c                 C  s8   || j vr|| jvrt|| j|  | j |< | j | S rt   )rd  rc  r   r   r[   r[   r\   r     s
   


zLazyDict.__getitem__r   Callable[[], T]c                 C  s   || j |< d S rt   )rc  r   r[   r[   r\   r     r   zLazyDict.__setitem__c                 C  r   rt   )r   rc  r   r[   r[   r\   r     ry   zLazyDict.__iter__c                 C  r   rt   )r   rc  r   r[   r[   r\   __len__  ry   zLazyDict.__len__N)ra  rb  )r   rG   rq   r_   )r   rG   r   re  )rf   rg   rh   rx   r   r   r   rf  r[   r[   r[   r\   r`    s    


r`  c                   @  s0   e Zd ZdddZdd	d
ZdddddZdS )ClassRegistryr   type[T]rq   rc   c                 C  s.   |  D ]}|| jv r| j|   S qt|rt   )mror   r   )rw   r   clsr[   r[   r\   r     s
   
zClassRegistry.__getitem__rD  r   c                 C  s
   |  |S rt   )containsr   r[   r[   r\   __contains__  ry   zClassRegistry.__contains__F)strictrm  c                  s6   t |tsdS |r| jv S t fdd| D S )NFc                 3  s    | ]}| j v V  qd S rt   )r   )r   rj  r   r[   r\   r     r:  z)ClassRegistry.contains.<locals>.<genexpr>)r  typer   r$  ri  )rw   r   rm  r[   r   r\   rk    s
   

zClassRegistry.containsN)r   rh  rq   rc   )r   rD  rq   r   )r   rD  rm  r   rq   r   )rf   rg   rh   r   rl  rk  r[   r[   r[   r\   rg    s    

rg  r"   c                 C  s   t | tjrtjj| S | S )z
    Create a weak reference to a tensor.
    The new tensor will share the same data as the original tensor,
    but will not keep the original tensor alive.
    )r  rZ   Tensorr   r  weak_ref_tensor)r  r[   r[   r\   rp    s   rp  tensors<Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor]]/Union[torch.Tensor, list[Any], tuple[Any], Any]c                 C  sP   t | tjr
t| S t | trdd | D S t | tr$tdd | D S td)z
    Convenience function to create weak references to tensors,
    for single tensor, list of tensors or tuple of tensors.
    c                 S  s   g | ]}t |qS r[   rp  r   r*  r[   r[   r\   r%    s    z$weak_ref_tensors.<locals>.<listcomp>c                 s  s    | ]}t |V  qd S rt   rt  ru  r[   r[   r\   r     r  z#weak_ref_tensors.<locals>.<genexpr>zInvalid type for tensors)r  rZ   ro  rp  r&  r  r   )rq  r[   r[   r\   weak_ref_tensors  s   

rv  
cpu_tensorc                 C  s   |   sJ dtjj| S )zQ
    Get a CUDA view of a CPU tensor using Unified Virtual Addressing (UVA).
    zCPU tensor must be pinned)	is_pinnedrZ   r   r  get_cuda_view_from_cpu_tensor)rw  r[   r[   r\   ry    s   ry  module_namer4  Union[str, os.PathLike]c                 C  sZ   t j| |}|du rtd|  d|jdusJ t j|}|tj| < |j| |S )z
    Import a Python file according to its file path.

    Based on the official recipe:
    https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
    NzNo module named '')		importlibutilspec_from_file_locationModuleNotFoundErrorloadermodule_from_specr  modulesexec_module)rz  r4  specmoduler[   r[   r\   import_from_path  s   
r  c                    s6   t jd} | dg  | dg } fdd|D S )Nr   zRequires-DistzProvides-Extrac                   s    i | ]   fd dD qS )c                   s.   g | ]}| d   drtd|d qS )z
extra == ""z
;|>=|<=|==r   )r&  r  r  )r   reqextrar[   r\   r%    s    z=get_vllm_optional_dependencies.<locals>.<dictcomp>.<listcomp>r[   )r   requirementsr  r\   r!    s    z2get_vllm_optional_dependencies.<locals>.<dictcomp>)r}  metadataget_all)r  extrasr[   r  r\   get_vllm_optional_dependencies  s   
r  c                   @  s  e Zd ZdZd]ddZd^d
dZd^ddZd^ddZd^ddZd^ddZ	d^ddZ
dd Zdd Zd_ddZdd Zd`d d!Zdad#d$Zd`d%d&Zd^d'd(Zd^d)d*Zd^d+d,Zd^d-d.Zd^d/d0Zd^d1d2Zd^d3d4Zd^d5d6Zdbdcd9d:Zd^d;d<Zd^d=d>Zd^d?d@Zd^dAdBZd^dCdDZdEdF Z dGdH Z!dIdJ Z"dKdL Z#dMdN Z$dbdddPdQZ%dRdS Z&dTdU Z'dVdW Z(dXdY Z)d_dZd[Z*d\S )e_PlaceholderBaseaP  
    Disallows downstream usage of placeholder modules.

    We need to explicitly override each dunder method because
    [`__getattr__`][vllm.utils._PlaceholderBase.__getattr__]
    is not called when they are accessed.

    Info:
        [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
    r   rG   rq   r4   c                 C  s   t )z
        The main class should implement this to throw an error
        for attribute accesses representing downstream usage.
        )NotImplementedErrorr   r[   r[   r\   __getattr__	  s   z_PlaceholderBase.__getattr__r   rD  c                 C  
   |  dS )N__lt__r  r   r[   r[   r\   r  	  ry   z_PlaceholderBase.__lt__c                 C  r  )N__le__r  r   r[   r[   r\   r  	  ry   z_PlaceholderBase.__le__c                 C  r  )N__eq__r  r   r[   r[   r\   r  	  ry   z_PlaceholderBase.__eq__c                 C  r  )N__ne__r  r   r[   r[   r\   r  	  ry   z_PlaceholderBase.__ne__c                 C  r  )N__gt__r  r   r[   r[   r\   r  	  ry   z_PlaceholderBase.__gt__c                 C  r  )N__ge__r  r   r[   r[   r\   r   	  ry   z_PlaceholderBase.__ge__c                 C  r  )N__hash__r  r   r[   r[   r\   r  #	  ry   z_PlaceholderBase.__hash__c                 C  r  )N__bool__r  r   r[   r[   r\   r  &	  ry   z_PlaceholderBase.__bool__rV  r  c                 O  r  )Nr  r  r  r[   r[   r\   r  +	  ry   z_PlaceholderBase.__call__c                 C  r  )Nrf  r  r   r[   r[   r\   rf  0	  ry   z_PlaceholderBase.__len__c                 C  r  )Nr   r  r   r[   r[   r\   r   3	  ry   z_PlaceholderBase.__getitem__r   c                 C  r  )Nr   r  r   r[   r[   r\   r   6	  ry   z_PlaceholderBase.__setitem__c                 C  r  )Nr   r  r   r[   r[   r\   r   9	  ry   z_PlaceholderBase.__delitem__c                 C  r  )N__add__r  r   r[   r[   r\   r  D	  ry   z_PlaceholderBase.__add__c                 C  r  )Nr   r  r   r[   r[   r\   r   G	  ry   z_PlaceholderBase.__sub__c                 C  r  )N__mul__r  r   r[   r[   r\   r  J	  ry   z_PlaceholderBase.__mul__c                 C  r  )N
__matmul__r  r   r[   r[   r\   r  M	  ry   z_PlaceholderBase.__matmul__c                 C  r  )N__truediv__r  r   r[   r[   r\   r  P	  ry   z_PlaceholderBase.__truediv__c                 C  r  )N__floordiv__r  r   r[   r[   r\   r  S	  ry   z_PlaceholderBase.__floordiv__c                 C  r  )N__mod__r  r   r[   r[   r\   r  V	  ry   z_PlaceholderBase.__mod__c                 C  r  )N
__divmod__r  r   r[   r[   r\   r  Y	  ry   z_PlaceholderBase.__divmod__.moduloc                 C  r  )N__pow__r  )rw   r   r  r[   r[   r\   r  \	  ry   z_PlaceholderBase.__pow__c                 C  r  )N
__lshift__r  r   r[   r[   r\   r  _	  ry   z_PlaceholderBase.__lshift__c                 C  r  )N
__rshift__r  r   r[   r[   r\   r  b	  ry   z_PlaceholderBase.__rshift__c                 C  r  )N__and__r  r   r[   r[   r\   r  e	  ry   z_PlaceholderBase.__and__c                 C  r  )N__xor__r  r   r[   r[   r\   r  h	  ry   z_PlaceholderBase.__xor__c                 C  r  )N__or__r  r   r[   r[   r\   r  k	  ry   z_PlaceholderBase.__or__c                 C  r  )N__neg__r  r   r[   r[   r\   r  q	  ry   z_PlaceholderBase.__neg__c                 C  r  )N__pos__r  r   r[   r[   r\   r  t	  ry   z_PlaceholderBase.__pos__c                 C  r  )N__abs__r  r   r[   r[   r\   r  w	  ry   z_PlaceholderBase.__abs__c                 C  r  )N
__invert__r  r   r[   r[   r\   r  z	  ry   z_PlaceholderBase.__invert__c                 C  r  )N	__index__r  r   r[   r[   r\   r  	  ry   z_PlaceholderBase.__index__ndigitsc                 C  r  )N	__round__r  )rw   r  r[   r[   r\   r  	  ry   z_PlaceholderBase.__round__c                 C  r  )N	__trunc__r  r   r[   r[   r\   r  	  ry   z_PlaceholderBase.__trunc__c                 C  r  )N	__floor__r  r   r[   r[   r\   r  	  ry   z_PlaceholderBase.__floor__c                 C  r  )N__ceil__r  r   r[   r[   r\   r  	  ry   z_PlaceholderBase.__ceil__c                 C  r  )Nr  r  r   r[   r[   r\   r  	  ry   z_PlaceholderBase.__enter__c                 O  r  )Nr  r  r  r[   r[   r\   r  	  ry   z_PlaceholderBase.__exit__N)r   rG   rq   r4   )r   rD  )rV  rD  r  rD  )r   rD  )r   rD  r   rD  ).)r   rD  r  rD  )r  rD  )+rf   rg   rh   r   r  r  r  r  r  r  r  r  r  r  rf  r   r   r   r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r[   r[   r[   r\   r    sR    

	





















r  c                      s6   e Zd ZdZd fddZdd	d
ZdddZ  ZS )PlaceholderModulez
    A placeholder object to use when a module does not exist.

    This enables more informative errors when trying to access attributes
    of a module that does not exists.
    r  rG   rq   rr   c                   s   t    || _d S rt   )r   rx   _PlaceholderModule__name)rw   r  r   r[   r\   rx   	  s   

zPlaceholderModule.__init__	attr_pathc                 C  r  rt   )_PlaceholderModuleAttrrw   r  r[   r[   r\   placeholder_attr	  ry   z"PlaceholderModule.placeholder_attrr   c              
   C  sp   | j }z
t| W td ty7 } zt  D ]\}}||v r0d| d| d}t||q|d }~ww )NzPlease install vllm[z] for z supportMPlaceholderModule should not be used when the original module can be imported)r  r}  import_moduleImportErrorr  r/  AssertionError)rw   r   r  excr  r  r  r[   r[   r\   r  	  s   	
zPlaceholderModule.__getattr__)r  rG   rq   rr   r  rG   r   rG   )rf   rg   rh   r   rx   r  r  r   r[   r[   r   r\   r  	  s
    
r  c                      s2   e Zd Zd fddZdd	d
ZdddZ  ZS )r  r  r  r  rG   rq   rr   c                   s   t    || _|| _d S rt   )r   rx   _PlaceholderModuleAttr__module!_PlaceholderModuleAttr__attr_path)rw   r  r  r   r[   r\   rx   	  s   

z_PlaceholderModuleAttr.__init__c                 C  s   t | j| j d| S )Nr  )r  r  r  r  r[   r[   r\   r  	  s   z'_PlaceholderModuleAttr.placeholder_attrr   c                 C  s    t | j| j d|  td)Nr  r  )r@  r  r  r  r   r[   r[   r\   r  	  s   z"_PlaceholderModuleAttr.__getattr__)r  r  r  rG   rq   rr   r  r  )rf   rg   rh   rx   r  r  r   r[   r[   r   r\   r  	  s    
r  r   ZFRAGMENTCUDAr[   op_nameop_funcmutates_argsr  	fake_implOptional[Callable]
target_libOptional[Library]dispatch_keytagstuple[torch.Tag, ...]c                 C  s   t  sddlm} | rJ ddS ddl}t|jdr&|jj||d}	nddl}|j	j
||}	|p5t}
|
j| |	 |d |
j
| ||d |durS|
| | dS dS )	a  
    `torch.library.custom_op` can have significant overhead because it
    needs to consider complicated dispatching logic. This function
    directly registers a custom op and dispatches it to the CUDA backend.
    See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
    for more details.

    By default, the custom op is registered to the vLLM library. If you
    want to register it to a different library, you can pass the library
    object to the `target_lib` argument.

    IMPORTANT: the lifetime of the operator is tied to the lifetime of the
    library object. If you want to bind the operator to a different library,
    make sure the library object is alive when the operator is used.
    r   r  zcuda platform needs torch>=2.4 to support custom op, chances are you are using an old version of pytorch or a custom build of pytorch. It is recommended to use vLLM in a fresh new environment and let it install the required dependencies.Ninfer_schema)r  )r  )r  )rU  r  r  Zis_cuda_aliketorch.libraryrk  rT  r  Ztorch._custom_op.implZ
_custom_opimplvllm_libdefineZ_register_fake)r  r  r  r  r  r  r  r  rZ   Z
schema_strZmy_libr[   r[   r\   direct_register_custom_op	  s&   
r  qualnamec                 C  s$   |  dd\}}t|}t||S )z>
    Resolve an object by its fully-qualified class name.
    r  r{   )r  r}  r  r@  )r  rz  obj_namer  r[   r[   r\   resolve_obj_by_qualname
  s   

r  r  c              	   C  s   zt | }W n t jy   Y dS w |jdd}|D ]}tt t|j	t
j W d   n1 s5w   Y  qtt t| t
j W d   dS 1 sSw   Y  dS )z
    Kills all descendant processes of the given pid by sending SIGKILL.

    Args:
        pid (int): Process ID of the parent process
    NT)	recursive)r   r  r  childrenrp  rq  ProcessLookupErrorr  killr  signalSIGKILL)r  parentr  childr[   r[   r\   kill_process_tree
  s   "r  c                   @  s   e Zd ZU dZdZded< dZded< dZded< dZded< dZ	ded< dZ
ded	< d
Zded< dZded< dd Zdd ZdddZdS )MemorySnapshotzMemory snapshot.r   rY   
torch_peakfree_memorytotal_memorycuda_memorytorch_memorynon_torch_memory        rU   	timestampTr   auto_measurec                 C  s   | j r	|   d S d S rt   )r  measurer   r[   r[   r\   __post_init__1
  s   zMemorySnapshot.__post_init__c                 C  s\   t j dd| _t j \| _| _| j| j | _t j	 | _
| j| j
 | _t | _d S )Nzallocated_bytes.all.peakr   )rZ   r  Zmemory_statsr   r  Zmem_get_infor  r  r  Zmemory_reservedr  r  r)  r  r   r[   r[   r\   r  5
  s   
zMemorySnapshot.measurer   rq   c              
   C  sP   t | j|j | j|j | j|j | j|j | j|j | j|j | j|j ddS )NF)r  r  r  r  r  r  r  r  )r  r  r  r  r  r  r  r  r   r[   r[   r\   r   I
  s   






zMemorySnapshot.__sub__N)r   r  rq   r  )rf   rg   rh   r   r  r   r  r  r  r  r  r  r  r  r  r   r[   r[   r[   r\   r  %
  s   
 r  c                   @  s   e Zd ZU dZdZded< dZded< dZded< dZded< e	e
d	Zd
ed< e	e
d	Zd
ed< e	e
d	Zd
ed< dZded< dddZdS )MemoryProfilingResultz7Memory profiling result. All numbers are in bytes.
    r   rY   non_kv_cache_memorytorch_peak_increasenon_torch_increaserU   weights_memory)default_factoryr  before_createbefore_profileafter_profiler  profile_timerq   rG   c                 C  sH   d| j dd| jt dd| jt dd| jt dd| jt ddS )NzMemory profiling takes z.2fz% seconds. Total non KV cache memory: z!GiB; torch peak memory increase: z(GiB; non-torch forward increase memory: zGiB; weights memory: zGiB.)r  r  	GiB_bytesr  r  r  r   r[   r[   r\   __repr__c
  s   



zMemoryProfilingResult.__repr__Nr=  )rf   rg   rh   r   r  r   r  r  r  r   r  r  r  r  r  r  r[   r[   r[   r\   r  V
  s   
 r  baseline_snapshotr  ,Generator[MemoryProfilingResult, None, None]c                 c  s    t   tj  tj  t }| |_||_|j	
  |V  t   tj  |j
  |j|j	 }|j|j }|j|_|j|_|j|_|j|j |j |_dS )a  Memory profiling context manager.
    baseline_snapshot: the memory snapshot before the current vLLM instance.
    weights_memory: memory used by PyTorch when loading the model weights.
        Note that, before loading the model weights, we also initialize the device
        and distributed environment, which may consume some memory. This part is not
        included in the weights_memory because PyTorch does not control it.

    The memory in one GPU can be classified into 3 categories:
    1. memory used by anything other than the current vLLM instance.
    2. memory used by torch in the current vLLM instance.
    3. memory used in the current vLLM instance, but not by torch.

    A quantitive example:

    Before creating the current vLLM instance:
        category 1: 1 GiB
        category 2: 0 GiB
        category 3: 0 GiB

    After creating the current vLLM instance and loading the model,
    (i.e. before profiling):
        category 1: 1 GiB
        category 2: 2 GiB (model weights take 2 GiB)
        category 3: 0.5 GiB (memory used by NCCL)

    During profiling (peak):
        category 1: 1 GiB
        category 2: 4 GiB (peak activation tensors take 2 GiB)
        category 3: 1 GiB (memory used by NCCL + buffers for some attention backends)

    After profiling:
        category 1: 1 GiB
        category 2: 3 GiB (after garbage-collecting activation tensors)
        category 3: 1 GiB (memory used by NCCL + buffers for some attention backends)

    In this case, non-kv cache takes 5 GiB in total, including:
    a. 2 GiB used by the model weights (category 2)
    b. 2 GiB reserved for the peak activation tensors (category 2)
    c. 1 GiB used by non-torch components (category 3)

    The memory used for loading weights (a.) is directly given from the argument `weights_memory`.

    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.).

    The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.).
    N)r  r  rZ   r  Zempty_cacheZreset_peak_memory_statsr  r  r  r  r  r  r  r  r  r  r  r  r  )r  r  r  Zdiff_profileZdiff_from_creater[   r[   r\   memory_profilingn
  s$   2




r  r0  c              
   C  s   t jdrtd d S dd l}|j}||\}}|| k rEz||| |f W d S  t	yD } zt
d|| W Y d }~d S d }~ww d S )Nwinz-Windows detected, skipping ulimit adjustment.r   zFound ulimit of %s and failed to automatically increase with error %s. This can cause fd limit errors like `OSError: [Errno 24] Too many open files`. Consider increasing with ulimit -n)r  r  r  r  r   resourceZRLIMIT_NOFILEZ	getrlimitZ	setrlimitr   r  )Ztarget_soft_limitr  Zresource_typeZcurrent_softZcurrent_hardr9  r[   r[   r\   
set_ulimit
  s&   

r   c                  C  s&   t  \} }}dt| ||}|S r  )r  exc_infor^  	tracebackformat_exception)etyper   tbZerr_strr[   r[   r\   get_exception_traceback
  s   r  r\  tuple[str, str, str]c                 C  s   t | }|jstd|  |j}|jpd}t|jpd}|dkr.t||fs.td|  |dkr;|r;td|  |||fS )z Split a zmq path into its parts.zInvalid zmq path: r  tcp)r-   schemer   hostnamerG   r  rI  )r\  parsedr	  r  r  r[   r[   r\   split_zmq_path
  s   

r  r	  c                 C  sF   |du r|  d| S t |r|  d| d| S |  d| d| S )a8  Make a ZMQ path from its parts.

    Args:
        scheme: The ZMQ transport scheme (e.g. tcp, ipc, inproc).
        host: The host - can be an IPv4 address, IPv6 address, or hostname.
        port: Optional port number, only used for TCP sockets.

    Returns:
        A properly formatted ZMQ path string.
    Nz://z://[r  r  r  )r	  r  r  r[   r[   r\   make_zmq_path
  s
   r  ctx'Union[zmq.asyncio.Context, zmq.Context]socket_typer  Optional[bool]Optional[bytes]rP  %Union[zmq.Socket, zmq.asyncio.Socket]c                 C  sP  t  }| |}|jd }|jd }	|dkr |	dkr td}
nd}
|du r0|tjtjtj	fv}|tj
tjtjfv rH|tjd |tj|
 |tjtjtjfv r`|tjd |tj|
 |durk|tj| |durv|tj| |tjkr|tjd t|\}}}|d	krt|r|tjd
 |r|| |S || |S )z9Make a ZMQ socket with the proper bind/connect semantics.rQ   r   r  g      ArX  Nr   Tr  r{   )r   r   r  r   	availablerY   zmqZPUSHZSUBZXSUBZPULLZDEALERZROUTER
setsockoptZRCVHWMZRCVBUFZSNDHWMZSNDBUFZIDENTITYZLINGERZXPUBZXPUB_VERBOSEr  r  ZIPV6r  r  )r  r\  r  r  r  rP  Zmemr  Z	total_memZavailable_memZbuf_sizer	  r  r   r[   r[   r\   make_zmq_socket   s:   







r  Iterator[zmq.Socket]c              	   c  st    t  }z-zt|| |||dV  W n ty    td Y n
w W |j|d dS W |j|d dS |j|d w )z Context manager for a ZMQ socket)r  r  zGot Keyboard Interrupt.rO  N)r  Contextr  KeyboardInterruptr  debugdestroy)r\  r  r  rP  r  r  r[   r[   r\   zmq_socket_ctx;  s    
	r  c                  C  s   t jddkr
dS g } t r ddl}| jt jd< | d t r)| d nt	 r1| d | rCt
d	d
|  dt jd< dS dS )zWCheck if we need to force the use of the `spawn` multiprocessing start
    method.
    VLLM_WORKER_MULTIPROC_METHODZspawnNr   ZRAY_ADDRESSz&In a Ray actor and can only be spawnedzCUDA is initializedzXPU is initializedzWe must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reasons: %sz; )r  r  r   r:   rayZget_runtime_contextZgcs_addressr   r  r  r  r  r^  )reasonsr  r[   r[   r\   _maybe_force_spawnS  s$   

r!  c                  C  s   t   tj} t| S )aJ  Get a multiprocessing context with a particular method (spawn or fork).
    By default we follow the value of the VLLM_WORKER_MULTIPROC_METHOD to
    determine the multiprocessing method (default is fork). However, under
    certain conditions, we may enforce spawn and override the value of
    VLLM_WORKER_MULTIPROC_METHOD.
    )r!  r  r  r  r  )Z	mp_methodr[   r[   r\   get_mp_contextr  s   
r"  kv_cachelist[list[torch.Tensor]]shared_kv_cache_layersOptional[dict[str, str]]c                   s   |d u ri }ddl m  ddlm  fddD }ttfdd|D }|D ](}||}| }t|jt|ksCJ t	|D ]\}}	|	| |j|< qGq+|d uru|
 D ]\}}
|
|k slJ d|
 j| _q\d S d S )	Nr   )AttentionTypeextract_layer_indexc                   sB   g | ]}t | d r| j j jfv r| jdu r|qS )	attn_typeN)rk  r*  ZDECODERZENCODER_DECODERZkv_sharing_target_layer_namer   
layer_name)r'  r  r[   r\   r%    s    
z!bind_kv_cache.<locals>.<listcomp>c                 3  s    | ]} |V  qd S rt   r[   r+  r(  r[   r\   r     s
    
z bind_kv_cache.<locals>.<genexpr>z*v0 doesn't support interleaving kv sharing)Zvllm.attentionr'  Z vllm.model_executor.models.utilsr)  r  r   r2  r   r#  r-  r/  )r  r#  r%  Zlayer_need_kv_cacheZlayer_index_sortedr,  Zkv_cache_idxZforward_ctxveZve_kv_cacheZtarget_layer_namer[   )r'  r  r)  r\   bind_kv_cache~  s>   r.  r   methodUnion[str, bytes, Callable]rV  
tuple[Any]r  c                 C  sn   t |trtt|| }n"t |tr+zt| |}W n ty*   td|ddw t|| }||i |S )a3  
    Run a method of an object with the given arguments and keyword arguments.
    If the method is string, it will be converted to a method using getattr.
    If the method is serialized bytes and will be deserialized using
    cloudpickle.
    If the method is a callable, it will be called directly.
    zMethod z is not implemented.N)	r  bytesr   cloudpickler)  rG   r@  AttributeErrorr  )r   r/  rV  r  rX  r[   r[   r\   
run_method  s   
	

r5  c                  C  s   ddl m  m}  | S )a  
    Historical comments:

    libnvml.so is the library behind nvidia-smi, and
    pynvml is a Python wrapper around it. We use it to get GPU
    status without initializing CUDA context in the current process.
    Historically, there are two packages that provide pynvml:
    - `nvidia-ml-py` (https://pypi.org/project/nvidia-ml-py/): The official
        wrapper. It is a dependency of vLLM, and is installed when users
        install vLLM. It provides a Python module named `pynvml`.
    - `pynvml` (https://pypi.org/project/pynvml/): An unofficial wrapper.
        Prior to version 12.0, it also provides a Python module `pynvml`,
        and therefore conflicts with the official one. What's worse,
        the module is a Python package, and has higher priority than
        the official one which is a standalone Python file.
        This causes errors when both of them are installed.
        Starting from version 12.0, it migrates to a new module
        named `pynvml_utils` to avoid the conflict.
    It is so confusing that many packages in the community use the
    unofficial one by mistake, and we have to handle this case.
    For example, `nvcr.io/nvidia/pytorch:24.12-py3` uses the unofficial
    one, and it will cause errors, see the issue
    https://github.com/vllm-project/vllm/issues/12847 for example.
    After all the troubles, we decide to copy the official `pynvml`
    module to our codebase, and use it directly.
    r   N)Zvllm.third_party.pynvmlZthird_partypynvml)r6  r[   r[   r\   import_pynvml  s   r7  rj  rh  c                   s:   | j ddd td fdd}t| d	| | S )a-  
    A replacement for `abc.ABC`.
    When we use `abc.ABC`, subclasses will fail to instantiate
    if they do not implement all abstract methods.
    Here, we only require `raise NotImplementedError` in the
    base class, and log a warning if the method is not implemented
    in the subclass.
    rw   rD  c              	   S  s   g }t | D ].}|drqzt| |}t|r|j}W n	 ty%   Y qw t|}d|v r4|| q|rKd	|}d| d|  }t
| d S d S )Nr   r  r!  zMethods z not implemented in )r_  r  r@  r  r  r4  r  	getsourcer   r^  r  r  )rw   Zunimplemented_methods	attr_nameattrZ	attr_funcsrcZmethod_namesr  r[   r[   r\   find_unimplemented_methods  s*   




zBwarn_for_unimplemented_methods.<locals>.find_unimplemented_methodsrq   rr   c                   s"   | g|R i |  |  d S rt   r[   r  r<  Zoriginal_initr[   r\   wrapped_init  s   z4warn_for_unimplemented_methods.<locals>.wrapped_initrx   N)rw   rD  r   )rx   r   rn  __setattr__)rj  r>  r[   r=  r\   warn_for_unimplemented_methods  s   

r@  c                      s@   e Zd ZdZd fddZdddZdddZdddZ  ZS )
LazyLoadera  
    LazyLoader module borrowed from Tensorflow
    https://github.com/tensorflow/tensorflow/blob/main/tensorflow/python/util/lazy_loader.py
    with a addition of "module caching".

    Lazily import a module, mainly to avoid pulling in large dependencies.
    Modules such as `xgrammar` might do additional side effects, so we
    only want to use this when it is needed, delaying all eager effects
    
local_namerG   parent_module_globalsr  r  c                   s&   || _ || _d | _t t| d S rt   )_local_name_parent_module_globals_moduler   rx   rG   )rw   rB  rC  r  r   r[   r\   rx     s   zLazyLoader.__init__rq   types.ModuleTypec              
   C  sZ   zt | j}|| j| j< |tj| j< W n ty# } z|d d }~ww | j	|j |S rt   )
r}  r  rf   rE  rD  r  r  r  __dict__r  )rw   r  errr[   r[   r\   _load#  s   zLazyLoader._loadrt  r"   c                 C  s    | j d u r
|  | _ t| j |S rt   )rF  rJ  r@  )rw   rt  r[   r[   r\   r  4  s   

zLazyLoader.__getattr__r  c                 C  s   | j d u r
|  | _ t| j S rt   )rF  rJ  r_  r   r[   r[   r\   __dir__9  s   


zLazyLoader.__dir__)rB  rG   rC  r  r  rG   )rq   rG  )rt  r"   rq   r"   )rq   r  )	rf   rg   rh   r   rx   rJ  r  rK  r   r[   r[   r   r\   rA    s    


rA  dict[_K, _V]key1key2c                 C  sV   |  |}|  |}|dur|| |< n| |d |dur#|| |< dS | |d dS )z5
    Helper function to swap values for two keys
    N)r   r   )r   rM  rN  v1v2r[   r[   r\   swap_dict_values?  s   


rQ  	save_filec              	   c  s    ddl }| }|  zdV  W |  | r#| dkr#||  dS |jdd dS |  | r;| dkr;||  w |jdd w )zRun a cprofile

    Args:
        save_file: path to save the profile result. "1" or
          None will result in printing to stdout.
    r   Nr  Zcumtime)sort)cProfileZProfileenabledisableZ
dump_statsZprint_stats)rR  rT  Zprofr[   r[   r\   cprofile_contextO  s   rW  enabledc                   s   d fdd}|S )zDecorator to profile a Python method using cProfile.

    Args:
        save_file: Path to save the profile result.
            If "1", None, or "", results will be printed to stdout.
        enabled: Set to false to turn this into a no-op
    rX  r#   c                   s   t   fdd}|S )Nc                    sL    s	| i |S t  | i |W  d    S 1 sw   Y  d S rt   )rW  r  )rX  rX  rR  r[   r\   r  q  s
   
$z,cprofile.<locals>.decorator.<locals>.wrapperr  )rX  r  rX  rR  )rX  r\   	decoratoro  s   zcprofile.<locals>.decoratorN)rX  r#   r[   )rR  rX  rZ  r[   rY  r\   cprofilef  s   	r[  model_configr<   c                 C  st   | j }t|ddp9dt| jdg v p9t|dddkp9t|do9t|jtr,|jddp9t|jt o9t|jddS )NZalibiFZBloomForCausalLMZarchitecturesZposition_encoding_typer  attn_config)Zhf_text_configr@  Z	hf_configrk  r  r]  r;  r   )r\  cfgr[   r[   r\   check_use_alibi  s"   

r_  c                 C  s(   t j| t jd}tjt| ddS )u  Hash any picklable Python object using SHA-256.

    The input is serialized using pickle before hashing, which allows
    arbitrary Python objects to be used. Note that this function does
    not use a hash seed—if you need one, prepend it explicitly to the input.

    Args:
        input: Any picklable Python object.

    Returns:
        An integer representing the SHA-256 hash of the serialized input.
    )protocolbig	byteorder)pickler(  HIGHEST_PROTOCOLrY   
from_byteshashlibsha256digest)inputinput_bytesr[   r[   r\   rh    s   rh  c                 C  s.   t j| dd}tjt| dd}|d@ S )a;  
    Hash objects using CBOR serialization and SHA-256, then truncate to 64bits.

    This option is useful for non-Python-dependent serialization and hashing.

    Args:
        input: Object to be serialized and hashed. Supported types include
            basic Python types and complex structures like lists, tuples, and
            dictionaries.
            Custom classes must implement CBOR serialization methods.

    Returns:
        An integer in the range [0, 2^64-1] representing the lower 64 bits
        of the SHA-256 hash of the CBOR serialized input.
    T)	canonicalra  rb  l    )cbor2r(  rY   rf  rg  rh  ri  )rj  rk  Z	full_hashr[   r[   r\   sha256_cbor_64bit  s
   rn  hash_fn_namec                 C  s2   | dkrt S | dkrtS | dkrtS td|  )zGet a hash function by name, or raise an error if
    the function is not found.
    Args:
        hash_fn_name: Name of the hash function.
    Returns:
        A hash function.
    rh  rn  builtinzUnsupported hash function: )rh  rn  hashr   )ro  r[   r[   r\   get_hash_fn_by_name  s   rr  targetc                 C  s>   z	t ttj| W S  ty   ttjdt| k Y S w )zCheck if the installed torch version is >= the target version.

    Args:
        target: a version string, like "2.6.0".

    Returns:
        Whether the condition meets.
    rZ   )	_is_torch_equal_or_newerrG   rZ   rN  r1  r1   r}  r  r0   )rs  r[   r[   r\   rQ    s
   	rQ  torch_versionc                 C  s   t | } | t |kS rt   )r0   parse)ru  rs  r[   r[   r\   rt    s   
rt  c                 C  s   t j| duS )zReturn True if *module_name* can be found in the current environment.

    The result is cached so that subsequent queries for the same module incur
    no additional overhead.
    N)r}  r~  	find_spec)rz  r[   r[   r\   _has_module  s   rx  c                   C     t dS )z9Whether the optional `pplx_kernels` package is available.Zpplx_kernelsrx  r[   r[   r[   r\   has_pplx     r{  c                   C  ry  )z4Whether the optional `deep_ep` package is available.Zdeep_eprz  r[   r[   r[   r\   has_deep_ep  r|  r}  c                   C  ry  )z6Whether the optional `deep_gemm` package is available.Z	deep_gemmrz  r[   r[   r[   r\   has_deep_gemm  r|  r~  c                   C  ry  )z;Whether the optional `triton_kernels` package is available.Ztriton_kernelsrz  r[   r[   r[   r\   has_triton_kernels  r|  r  r  r  suffixr   c                 C  sH   |r	|  d| } |rt   d|  } ntj d|  } t  |  dS )a  
    Set the current process title to a specific name with an
    optional suffix.

    Args:
        name: The title to assign to the current process.
        suffix: An optional suffix to append to the base name.
        append: Whether to append to the existing process title.
    r   z::N)setproctitleZgetproctitler  ZVLLM_PROCESS_NAME_PREFIX)r  r  r   r[   r[   r\   set_process_title  s   r  filer(   worker_namec                   sD   t  d| d| dt d jd fdd}d	 _| _d
S )z5Prepend each output line with process-specific prefix(z pid=)ro  r  rG   c                   s   | sd S  j r d}| d| }dkr>|d7 }| ||  |t| kr.d _ d S  |}| d| }dks| |d   d _ d S )Nr   
rX  r{   TF)start_new_linefindr   )r  idxZnext_idxr  
file_writeprefixr[   r\   write_with_prefix  s    
z&_add_prefix.<locals>.write_with_prefixTN)r  rG   )CYANRESETwriter  )r  r  r  r  r[   r  r\   _add_prefix  s
   
r  process_namec                 C  s<   | du r
t   j} t }ttj| | ttj| | dS )aV  
    Adds a process-specific prefix to each line of output written to stdout and
    stderr.

    This function is intended to be called before initializing the api_server,
    engine_core, or worker classes, so that all subsequent output from the
    process is prefixed with the process name and PID. This helps distinguish
    log output from different processes in multi-process environments.

    Args:
        process_name: Optional; the name of the process to use in the prefix.
            If not provided, the current process name from the multiprocessing
            context is used.
    N)	r"  current_processr  r  ru  r  r  stdoutstderr)r  r  r[   r[   r\   decorate_logs1  s
   r  )rX   rY   r   )r   rY   rq   rY   r   r=  )rC  r   )rM  rN  )r  r
   rT  r#   )rW  r
   rq   r   rt   )rX  rY  rZ  r[  rq   r\  )rd  re  r  r
   rq   r   )rh  re  rq   ri  )rd  re  rq   ry  )r  rG   rq   r   )r  rG   rq   r  )r  rG   r  rY   rq   rG   )r  rG   r  rY   rq   rG   )r  rY   rq   r  )r  r  )r  ry  r  rY   )r  rY   r  rY   rq   rY   )r  rY   rq   rY   )r  rY   r  rY   rq   rY   )r  r  r  rU   r  rU   rq   rr   )r  r  r  r  rq   r  )NNr  r  )r  rY   r  rY   r  rY   r  rY   r  rY   r  r  r  r  r  r  r  r  r  r  rq   r  )NNr  )r  rY   r  rY   r  rY   r  rY   r  rY   r  r  r  r  r  r  r  r  rq   r  )rq   r   )
r  r  r  r_   r  r  r  r  rq   r  )r  r  r  r_   r  r  r  r  r  r"  r!  r   rq   r  )
r   r&  r  r  r'  r(  r!  r   rq   r  )r  r  rq   rY   )r1  r  r2  r  )r6  r7  )r?  r@  rq   ry  )r   rD  rE  rF  rC  rG  rq   rH  )rL  rM  rq   ry  )rO  rP  r   rQ  r   )rW  rG   rq   rG   )rf  rg  rq   rr   )rq   rg  )rn  r=   rq   rr   )r   r_   rq   r_   )TN)r  rY   r  r  r  r  rq   r  )r  rG   r  r  r  r  rq   r  )r  r  rq   rY   r   )r  r  rq   r  )r  r  rq   r  )rw  r  rq   r  )rC  r#   r  r?  )
r  rD  rE  rG   rB  r   rC  r   rq   r   )
r  rD  rI  rJ  rB  r   rC  r   rq   r  )r  r"   rq   r"   )rq  rr  rq   rs  )rw  r  rq   r  )rz  rG   r4  r{  )NNr  r[   )r  rG   r  r#   r  r  r  r  r  r  r  rG   r  r  )r  rG   rq   r"   )r  rY   )r  r  r  rY   rq   r  )r0  )r\  rG   rq   r  )r	  rG   r  rG   r  r  rq   rG   )NNN)r  r  r\  rG   r  r"   r  r  r  r  rP  r  rq   r  )Nr   N)r\  rG   r  r"   r  r  rP  rY   r  r  rq   r  )r  r  r#  r$  r%  r&  rq   rr   )
r   r"   r/  r0  rV  r1  r  r  rq   r"   )rj  rh  rq   rh  )r   rL  rM  ra   rN  ra   rq   rr   )rR  r  r  )rR  r  rX  r   )r\  r<   rq   r   )ro  rG   rq   r#   )rs  rG   rq   r   )ru  rG   rs  rG   rq   r   )rz  rG   rq   r   )r  F)r  rG   r  rG   r   r   rq   rr   )r  r(   r  rG   r  rY   rq   rr   )r  r  rq   rr   (<  
__future__r   r  
concurrentrp  rx  rj   r  rs  rg  r}  importlib.metadataimportlib.utilr  r  r'  r  r  rd  r  r  r`  r  rq  r  rv  r)  r  typesr   r  r  argparser   r   r   r   r   r   r	   r
   r   collectionsr   r   collections.abcr   r   r   r   r   r   r   r   r   r   concurrent.futuresr   concurrent.futures.processr   dataclassesr   r   	functoolsr   r   r   r   r    typingr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   urllib.parser-   r/   Z
cachetoolsrm  r3  numpyr  Znumpy.typingZnptr   regexr  r  rZ   Ztorch.typesr5  r  Zzmq.asyncio	packagingr0   Zpackaging.versionr1   r  r2   Z$transformers.tokenization_utils_baser3   Ztyping_extensionsr4   r5   r6   r7   Z	vllm.envsr  Zvllm.loggerr8   r9   Zvllm.ray.lazy_utilsr:   r;   Zvllm.configr<   r=   rf   r  ZDEFAULT_MAX_NUM_BATCHED_TOKENSZ$POOLING_MODEL_MAX_NUM_BATCHED_TOKENSZ'MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENSr>   r?   r@   rA   rB   rC   rD   rE   rF   ZSTR_NOT_IMPL_ENC_DEC_ERR_STRSrH   r   rI   rJ   rK   rL   rM   rN   rP   ZGB_bytesr  r  r  rR   rS   rT   rU   Zuint8rW   Zfloat8_e4m3fnr  r  Zfloat64Zint32Zint64r#  contextmanagerr]   r^   r_   r`   ra   rc   rd   re   r   Enumri   rl   ro   r   r   r   r   r   r   r   r   rL  rS  rJ  rU  rc  rg  rx  rz  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r   r%  r+  r,  r0  r5  r>  rA  rK  rN  rS  rV  rd  re  r  Z
set_streamri  localrh  rj  rm  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r@  rH  rM  rP  rS  rU  rV  rG   r`  rn  rg  rp  rv  ry  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r!  r"  r.  r5  r7  r@  
ModuleTyperA  rQ  rW  r[  r_  rh  rn  rr  rQ  rt  rx  r{  r}  r~  r  r  r  r  r[   r[   r[   r\   <module>   s   08
$ 5& E	'	(
	36
,#	  P1:(& "1	0M;.+3	