o
    )ij                     @   s  d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	m
Z
mZmZ ddlZddlmZmZ ddlmZ ddlmZ ddlZddlmZ dd	lmZ dd
lmZmZ ddlmZm Z m!Z!m"Z" erhddl#m$Z$m%Z% ee&Z'edZ(edZ)e Z*ej+j,-d de	e(e)f de	e(e)f fddZ.G dd de Z/G dd de/Z0G dd de/Z1dZ2zze*3  dZ2W n e4y   dZ2Y nw W e2re*5  ne2re*5  w w e2re0ne1Z6e67  dS )z~Code inside this file can safely assume cuda platform, e.g. importing
pynvml. However, it should not initialize cuda context.
    N)	timedelta)cachewraps)TYPE_CHECKINGCallableOptionalTypeVarUnion)PrefixStoreProcessGroup)is_nccl_available)	ParamSpec)init_logger)cuda_device_count_statelessimport_pynvml   )DeviceCapabilityPlatformPlatformEnum_Backend)ModelConfig
VllmConfig_P_RFfnreturnc                    s*   t  dtjdtjdtf fdd}|S )Nargskwargsr   c                     s,   t   z | i |W t   S t   w N)pynvmlnvmlInitnvmlShutdown)r   r   r    _/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/platforms/cuda.pywrapper*   s   z"with_nvml_context.<locals>.wrapper)r   r   r   r   r   )r   r%   r#   r"   r$   with_nvml_context(   s    r&   c                   @   s2  e Zd ZU ejZdZeed< dZ	eed< dZ
eed< dZeed< dZeed	< d
Zeed< edeej fddZedejddfddZe	dIdedee fddZedIdedefddZedIdedefddZedee defddZedee defdd Zed!d" ZedJd%d&Z e	dKdeej!j" de#fd'd(Z$edLd*ede%fd+d,Z&edefd-d.Z'edefd/d0Z(edefd1d2Z)edefd3d4Z*ed5d6defd7d8Z+edefd9d:Z,edefd;d<Z-ed=ed>e.d?ed@edAe/de0fdBdCZ1edefdDdEZ2edFedefdGdHZ3dS )MCudaPlatformBasecudadevice_namedevice_typeCUDAdispatch_keyZGPUray_device_keyZnccldist_backendZCUDA_VISIBLE_DEVICESdevice_control_env_varr   c                 C   sB   |  drtjtjtjgS |  ds|  drtjtjgS tjgS )NP   <   )has_device_capabilitytorchbfloat16float16Zfloat32)selfr#   r#   r$   supported_dtypes>   s   

z!CudaPlatformBase.supported_dtypesdeviceNc                 C   s   t j| t jd|d}dS )z:
        Set the device for the current platform.
        r   )r8   N)r3   r(   
set_deviceZzeros)clsr8   _r#   r#   r$   r9   K   s   zCudaPlatformBase.set_devicer   	device_idc                 C      t r   NotImplementedErrorr:   r<   r#   r#   r$   get_device_capabilityV   s   z&CudaPlatformBase.get_device_capabilityc                 C   r=   r   r>   r@   r#   r#   r$   get_device_name\      z CudaPlatformBase.get_device_namec                 C   r=   r   r>   r@   r#   r#   r$   get_device_total_memory`   rC   z(CudaPlatformBase.get_device_total_memoryenforce_eagerc                 C   s   |rt jstd dS dS )NzTo see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be usedFT)envsVLLM_USE_V1loggerwarning)r:   rE   r#   r#   r$   is_async_output_supportedd   s   
z*CudaPlatformBase.is_async_output_supported
device_idsc                 C   r=   r   r>   )r:   rK   r#   r#   r$   is_fully_connectedn   rC   z#CudaPlatformBase.is_fully_connectedc                 C   s   d S r   r#   r:   r#   r#   r$   log_warningsr   rC   zCudaPlatformBase.log_warningsvllm_configr   c           
      C   sh  |j }|j}|jdkr#|jrtjstdd|_n
tjr d|_nd|_|j}|r0|jd u r0d|_|d ur|j	rd}d}tj
d u rN| drKd}d	t_
nd}n
tj
d
k}tj
d	k}ddlm} |rr| d rr|jdkrrd|_td |r|jdkrd|_td ddlm} |j}	tjdkr|jdkr|	j|jkrtd |j|	_|d urd|_d S d S d S d S d S )Nautoz1Speculative decoding is not supported on vLLM V0.z vllm.v1.worker.gpu_worker.Workerzvllm.worker.worker.Worker   Fd   TCUTLASS_MLAZFLASHMLAr   is_flashmla_supported@   z7Forcing kv cache block size to 64 for FlashMLA backend.   z;Forcing kv cache block size to 128 for CUTLASS_MLA backend.)CUDAGraphModeZdeepep_high_throughputr   zData Parallel: disabling cudagraphs since DP with DeepEP high-throughput kernels are not CUDA Graph compatible. The DeepEP low-latency kernels are CUDA Graph compatible. Set the all_to_all backend to deepep_low_latency to use those kernels instead.)parallel_configmodel_configZ
worker_clsZspeculative_configrF   rG   r?   cache_config
block_sizeuse_mlaVLLM_ATTENTION_BACKENDis_device_capabilityZvllm.attention.ops.flashmlarU   rH   infovllm.configrX   compilation_configZVLLM_ALL2ALL_BACKENDZdata_parallel_sizeZcudagraph_modeNONErE   )
r:   rO   rY   rZ   r[   Zuse_flashmlaZuse_cutlass_mlarU   rX   rb   r#   r#   r$   check_and_update_configv   sh   








z(CudaPlatformBase.check_and_update_configc                 C   s"   t j  t j| t j|S r   )r3   r(   Zempty_cacheZreset_peak_memory_statsZmax_memory_allocated)r:   r8   r#   r#   r$   get_current_memory_usage   s   
z)CudaPlatformBase.get_current_memory_usageF
support_fac                 C   s6   |  dr|rddlm} | rtjS td tjS )Nr0   r   )is_flash_attn_2_availablezCurrent `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.)r2   Ztransformers.utilsrg   r   
FLASH_ATTNrH   warning_onceXFORMERS)r:   rf   rg   r#   r#   r$   get_vit_attn_backend   s   z%CudaPlatformBase.get_vit_attn_backendc	                 C   s  |rk|t jks| dr"|d u r"|dkr"|rtd dS td |t jks+|dkr;|r4td dS td	 d
S ddlm	}	 |	 d sPtd|	 d  n|dkr[td| n|rdtd dS td dS |rVd}
d}d}d}d}d}|t j
krtd | drddlm} |d |
S |t jkrtd |S |t jkrtd |S |t jkrtd |S |t jkrtd  |S |t jkrtd! |S dd"lm} | dr||
|| }rddlm} td# |d |
S |jstd$ | d%r&|r| d&std |S ||||d'd( }r%td |S ntd |S |r2J i }|js<||d)< |jsD||d*< td+d,d-d. | D  |S |t j
krxtd/ | drvddlm} td0 |d d1S |t jkrtd2 d3S |t jkrtd4 d5S |t jkrtd6 d7S |t jkrn|rtd8| j d9| d:| t j}| d%std; t j}n!|t j!t j"fvrtd< t j}n|d= dkrtd> t j}|t jkrAz>dd l#}dd?l$m%}m&} |' }||vrtd@| t j}|d uo|(dA}|r-| s-tdB tdC t j}W n t)y@   tdD t j}Y nw |t jkrNtd2 d3S tdE dFS )GNrR   rW   z'Using Cutlass MLA backend on V1 engine.z<vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackendz2Cutlass MLA backend is only supported on V1 enginerV   z&Using Triton MLA backend on V1 engine.z:vllm.v1.attention.backends.mla.triton_mla.TritonMLABackendzUsing Triton MLA backend.z3vllm.attention.backends.triton_mla.TritonMLABackendr   rT   z+FlashMLA backend is not supported due to %sr   z\FlashMLA backend is not supported for block size %d (currently only supports block size 64).z$Using FlashMLA backend on V1 engine.z7vllm.v1.attention.backends.mla.flashmla.FlashMLABackendzUsing FlashMLA backend.z0vllm.attention.backends.flashmla.FlashMLABackendz7vllm.v1.attention.backends.flashinfer.FlashInferBackendz>vllm.v1.attention.backends.flex_attention.FlexAttentionBackendz=vllm.v1.attention.backends.triton_attn.TritonAttentionBackendz;vllm.v1.attention.backends.flash_attn.FlashAttentionBackendz9vllm.v1.attention.backends.tree_attn.TreeAttentionBackendz<vllm.v1.attention.backends.xformers.XFormersAttentionBackendz&Using FlashInfer backend on V1 engine.)set_kv_cache_layoutZHNDz)Using FlexAttention backend on V1 engine.z"Using Triton backend on V1 engine.z+Using Flash Attention backend on V1 engine.z*Using Tree Attention backend on V1 engine.z$Using XFormers backend on V1 engine.)is_attn_backend_supportedzgUsing FlashInfer backend with HND KV cache layout on V1 engine by default for Blackwell (SM 10.0) GPUs.zFlashInfer failed to import for V1 engine on Blackwell (SM 10.0) GPUs; it is recommended to install FlashInfer for better performance.r0   Z   F)Zallow_import_error	head_sizedtypez0Using FlexAttention backend for %s on V1 engine., c                 s   s"    | ]\}}| d | V  qdS )=Nr#   ).0kvr#   r#   r$   	<genexpr>[  s    z8CudaPlatformBase.get_attn_backend_cls.<locals>.<genexpr>zUsing FlashInfer backend.zOUsing HND KV cache layout on V1 engine by default for Blackwell (SM 10.0) GPUs.z4vllm.attention.backends.flashinfer.FlashInferBackendzUsing XFormers backend.z0vllm.attention.backends.xformers.XFormersBackendz&Using DualChunkFlashAttention backend.zLvllm.attention.backends.dual_chunk_flash_attn.DualChunkFlashAttentionBackendz)Using DifferentialFlashAttention backend.zQvllm.attention.backends.differential_flash_attn.DifferentialFlashAttentionBackendzInvalid attention backend for z, with use_v1: z
 use_mla: z>Cannot use FlashAttention-2 backend for Volta and Turing GPUs.zYCannot use FlashAttention-2 backend for dtype other than torch.float16 or torch.bfloat16.rQ   zGCannot use FlashAttention-2 backend for block size not divisible by 16.)FlashAttentionBackendflash_attn_supports_fp8z5Cannot use FlashAttention-2 backend for head size %d.fp8z3Cannot use FlashAttention backend for FP8 KV cache.zPlease use FlashInfer backend with FP8 KV Cache for better performance by setting environment variable VLLM_ATTENTION_BACKEND=FLASHINFERzCannot use FlashAttention-2 backend because the vllm.vllm_flash_attn package is not found. Make sure that vllm_flash_attn was built and installed (on by default).zUsing Flash Attention backend.z8vllm.attention.backends.flash_attn.FlashAttentionBackend)*r   rS   r_   rH   Z	info_oncerI   Z
TRITON_MLAr`   Z vllm.attention.backends.flashmlarU   Z
FLASHINFERr2   Z vllm.v1.attention.backends.utilsrl   ZFLEX_ATTENTIONTRITON_ATTN_VLLM_V1rh   Z	TREE_ATTNZXFORMERS_VLLM_V1Zvllm.attention.selectorrm   Z
can_importri   ro   rp   joinitemsrj   ZDUAL_CHUNK_FLASH_ATTNZDIFFERENTIAL_FLASH_ATTN
ValueErrorr)   r3   r5   r4   Zvllm.vllm_flash_attnZ"vllm.attention.backends.flash_attnrw   rx   Zget_supported_head_sizes
startswithImportError)r:   Zselected_backendro   rp   kv_cache_dtyper\   Zuse_v1r]   Zhas_sinkrU   ZFLASHINFER_V1ZFLEX_ATTENTION_V1rz   ZFLASH_ATTN_V1ZTREE_ATTN_V1ZXFORMERS_V1rl   rm   Zis_default_backend_supportedZuse_flex_attention_reasonZtarget_backendvllmrw   rx   Zsupported_sizesZfp8_kv_cacher#   r#   r$   get_attn_backend_cls   sp  






















	











z%CudaPlatformBase.get_attn_backend_clsc                 C      dS )Nz4vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPUr#   rM   r#   r#   r$   get_punica_wrapper  rC   z#CudaPlatformBase.get_punica_wrapperc                 C   r   )NzHvllm.distributed.device_communicators.cuda_communicator.CudaCommunicatorr#   rM   r#   r#   r$   get_device_communicator_cls  rC   z,CudaPlatformBase.get_device_communicator_clsc                 C   s
   |  dS )NY   )r2   rM   r#   r#   r$   supports_fp8  s   
zCudaPlatformBase.supports_fp8rZ   r   c                 C   r   NTr#   )r:   rZ   r#   r#   r$   supports_v1  rC   zCudaPlatformBase.supports_v1c                 C   r   r   r#   rM   r#   r#   r$   use_custom_allreduce  rC   z%CudaPlatformBase.use_custom_allreducec                 C   r   )Nz,vllm.compilation.cuda_graph.CUDAGraphWrapperr#   rM   r#   r#   r$   get_static_graph_wrapper_cls  rC   z-CudaPlatformBase.get_static_graph_wrapper_clsbackendprefix_store
group_rank
group_sizetimeoutc                 C   st   t  sJ t|||}ddlm} | }||_|||||}	tjj}
t	d}|
|
 |	  |||
|	 |S )Nr   )ProcessGroupNCCLr(   )r   r   "torch.distributed.distributed_c10dr   Options_timeoutZBackendTypeZNCCLr3   r8   Z_set_default_backendZ_set_sequence_number_for_groupZ_register_backend)r:   r   r   r   r   r   Zpgr   Zbackend_optionsZbackend_classZbackend_typer8   r#   r#   r$   #stateless_init_device_torch_dist_pg  s$   
	

z4CudaPlatformBase.stateless_init_device_torch_dist_pgc                 C   s   t  S r   )r   rM   r#   r#   r$   device_count  s   zCudaPlatformBase.device_countr   c                 C   sT   | d}td ptjdk}d}| drd}|S |r(|r(ddlm} | }|S )	Nry   r^   ZFLASH_ATTN_VLLM_V1FrR   Tr   )rx   )r~   rF   is_setr^   r_   Zvllm.attention.utils.fa_utilsrx   )r:   r   Zfp8_attentionZwill_use_fa	supportedrx   r#   r#   r$   is_kv_cache_dtype_supported  s   

z,CudaPlatformBase.is_kv_cache_dtype_supportedr   )rO   r   r   Nr   )F)4__name__
__module____qualname__r   r+   Z_enumr)   str__annotations__r*   r,   r-   r.   r/   propertylistr3   rp   r7   classmethodr8   r9   intr   r   rA   rB   rD   boolrJ   rL   rN   rd   typesZDevicefloatre   r   rk   r   r   r   r   r   r   r   r
   r   r   r   r   r   r#   r#   r#   r$   r'   5   s   
 
	
N
 \r'   c                	       s  e Zd Zeee	ddedee fddZ	ee	dde
eeef ef dedef fddZeeddedefd	d
ZeeddedefddZeeddedefddZeedee defddZeddedefddZeedd Z  ZS )NvmlCudaPlatformr   r<   r   c                 C   sF   z|  |}t|}t|\}}t||dW S  ty"   Y d S w N)majorminor)device_id_to_physical_device_idr   nvmlDeviceGetHandleByIndexZ"nvmlDeviceGetCudaComputeCapabilityr   RuntimeError)r:   r<   physical_device_idhandler   r   r#   r#   r$   rA     s   

z&NvmlCudaPlatform.get_device_capability
capabilityc                    s&   zt  ||W S  ty   Y dS w )NF)superr2   r   )r:   r   r<   	__class__r#   r$   r2     s
   z&NvmlCudaPlatform.has_device_capabilityc                 C   s   |  |}| |S r   )r   _get_physical_device_name)r:   r<   r   r#   r#   r$   rB     s   

z NvmlCudaPlatform.get_device_namec                 C   s   |  |}t|}t|S r   )r   r   r   ZnvmlDeviceGetUUIDr:   r<   r   r   r#   r#   r$   get_device_uuid%  s   


z NvmlCudaPlatform.get_device_uuidc                 C   s$   |  |}t|}tt|jS r   )r   r   r   r   ZnvmlDeviceGetMemoryInfototalr   r#   r#   r$   rD   ,  s   

z(NvmlCudaPlatform.get_device_total_memoryphysical_device_idsc              
   C   s   dd |D }t |D ]8\}}t |D ]/\}}||k rBzt||tj}|tjkr.W   dS W q tjyA   td Y   dS w qqdS )zP
        query if the set of gpus are fully connected by nvlink (1 hop)
        c                 S   s   g | ]}t |qS r#   )r   r   rs   ir#   r#   r$   
<listcomp>9      
z7NvmlCudaPlatform.is_fully_connected.<locals>.<listcomp>FzONVLink detection failed. This is normal if your machine has no NVLink equipped.T)	enumerater   ZnvmlDeviceGetP2PStatusZNVML_P2P_CAPS_INDEX_NVLINKZNVML_P2P_STATUS_OKZ	NVMLErrorrH   	exception)r:   r   Zhandlesr   r   jZpeer_handleZ
p2p_statusr#   r#   r$   rL   3  s0   


z#NvmlCudaPlatform.is_fully_connectedc                 C   s   t |}t |S r   )r   r   ZnvmlDeviceGetName)r:   r<   r   r#   r#   r$   r   N  s   

z*NvmlCudaPlatform._get_physical_device_namec                    sh   t  }|dkr. fddt|D }tt|dkr0tjddkr2t	dd
| d S d S d S d S )Nr   c                    s   g | ]}  |qS r#   )r   r   rM   r#   r$   r   X  r   z1NvmlCudaPlatform.log_warnings.<locals>.<listcomp>ZCUDA_DEVICE_ORDERZ
PCI_BUS_IDzDetected different devices in the system: %s. Please make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to avoid unexpected behavior.rq   )r   ZnvmlDeviceGetCountrangelensetosenvirongetrH   rI   r{   )r:   rK   Zdevice_namesr#   rM   r$   rN   S  s   
zNvmlCudaPlatform.log_warningsr   )r   r   r   r   r   r&   r   r   r   rA   r	   tupler   r2   r   rB   r   rD   r   rL   r   rN   __classcell__r#   r#   r   r$   r     sJ    
r   c                   @   sr   e Zd ZeeddedefddZeddedefddZ	eddedefdd	Z
ed
ee defddZdS )NonNvmlCudaPlatformr   r<   r   c                 C   s   t j|\}}t||dS r   )r3   r(   rA   r   )r:   r<   r   r   r#   r#   r$   rA   g  s   z)NonNvmlCudaPlatform.get_device_capabilityc                 C   s   t j|S r   )r3   r(   rB   r@   r#   r#   r$   rB   m  s   z#NonNvmlCudaPlatform.get_device_namec                 C   s   t j|}|jS r   )r3   r(   Zget_device_propertiesZtotal_memory)r:   r<   Zdevice_propsr#   r#   r$   rD   q  s   z+NonNvmlCudaPlatform.get_device_total_memoryr   c                 C   s   t d dS )Nz^NVLink detection not possible, as context support was not found. Assuming no NVLink available.F)rH   r   )r:   r   r#   r#   r$   rL   v  s   z&NonNvmlCudaPlatform.is_fully_connectedNr   )r   r   r   r   r   r   r   rA   r   rB   rD   r   r   rL   r#   r#   r#   r$   r   e  s    r   T)8__doc__r   datetimer   	functoolsr   r   typingr   r   r   r   r	   r3   Ztorch.distributedr
   r   r   r   Ztyping_extensionsr   Zvllm._Cr   Z	vllm.envsrF   Zvllm.loggerr   Z
vllm.utilsr   r   Z	interfacer   r   r   r   ra   r   r   r   rH   r   r   r   backendsr(   Zenable_cudnn_sdpr&   r'   r   r   Znvml_availabler    	Exceptionr!   ZCudaPlatformrN   r#   r#   r#   r$   <module>   sZ   "   Qb
