o
    )icF                     @   s  U d dl Z d dlmZ d dlmZmZmZ d dlmZm	Z	 d dl
Z
d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d	d
lmZmZmZmZ erWd dlmZmZ eeZzd dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& W n e'y Z( ze)de( W Y dZ([(ndZ([(ww zd dl*Z+W n e'y Z( ze)de( W Y dZ([(ndZ([(ww zd dl,Z+W n e'y Z( ze)de( W Y dZ([(ndZ([(ww g Z-e.e/ e0d< dZ1e1e1e1dddZ2e3e/e/f e0d< ddddddddZ4e3e/e/f e0d< de j5v re j5d Z6e j57dd Z8re6e8ksJ ne6e j5d< dd Z9ed e:fd!d"Z;ed e:fd#d$Z<ed e:fd%d&Z=e		d4d'e
j>d(e?d)e?d*e?d+e?d,e?d-e/d.e	e
j@ d/e	e
j@ d e:fd0d1ZAG d2d3 d3eZBdS )5    N)	timedelta)cache	lru_cachewraps)TYPE_CHECKINGOptional)PrefixStoreProcessGroup)is_nccl_available)init_loggercuda_device_count_stateless   )DeviceCapabilityPlatformPlatformEnum_Backend)ModelConfig
VllmConfig)AmdSmiExceptionamdsmi_get_gpu_asic_infoamdsmi_get_processor_handlesamdsmi_initamdsmi_shut_downamdsmi_topo_get_link_typez$Failed to import from amdsmi with %rz%Failed to import from vllm._C with %rz*Failed to import from vllm._rocm_C with %r_ROCM_UNSUPPORTED_MODELSzSliding window attention (SWA) is not yet supported in Triton flash attention. For half-precision SWA support, please use CK flash attention by setting `VLLM_USE_TRITON_FLASH_ATTN=0`zMROCm flash attention does not yet fully support 32-bit precision on PaliGemmazROCm Triton flash attention may run into compilation errors due to excessive use of shared memory. If this happens, disable Triton FA by setting `VLLM_USE_TRITON_FLASH_ATTN=0`)ZQwen2ForCausalLMZMistralForCausalLMZMixtralForCausalLMZ!PaliGemmaForConditionalGenerationZPhi3VForCausalLM _ROCM_PARTIALLY_SUPPORTED_MODELSZAMD_Instinct_MI300AZAMD_Instinct_MI300XZAMD_Instinct_MI325XZAMD_Instinct_MI300X_HF)Z0x74a0Z0x74a1Z0x74b5Z0x74a5Z0x74b9Z0x74a9Z0x74bd_ROCM_DEVICE_ID_NAME_MAPZHIP_VISIBLE_DEVICESCUDA_VISIBLE_DEVICESc                    s   t   fdd}|S )Nc                     s&   t   z | i |W t  S t  w N)r   r   )argskwargsfn _/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/platforms/rocm.pywrapper\   s   z$with_amdsmi_context.<locals>.wrapper)r   )r#   r&   r$   r"   r%   with_amdsmi_contextZ   s   r'   returnc                      $   t jdj t fdddD S )Ncudac                 3       | ]}| v V  qd S r   r$   .0archZGPU_ARCHr$   r%   	<genexpr>j       zon_gfx1x.<locals>.<genexpr>Zgfx11gfx12torchr*   get_device_propertiesgcnArchNameanyr$   r$   r/   r%   on_gfx1xg      r9   c                      r)   )Nr*   c                 3   r+   r   r$   r,   r/   r$   r%   r0   p   r1   zon_mi3xx.<locals>.<genexpr>)gfx942gfx950r4   r$   r$   r/   r%   on_mi3xxm   r:   r=   c                      r)   )Nr*   c                 3   r+   r   r$   r,   r/   r$   r%   r0   v   r1   zon_gfx9.<locals>.<genexpr>Zgfx90ar;   r<   r4   r$   r$   r/   r%   on_gfx9s   r:   r?   qtype	head_size
block_size	gqa_ratiomax_seq_lensliding_windowkv_cache_dtypealibi_slopessinksc	                    s4  t jdj t fdddD }	t fdddD }
|	r_tj s+|dks+|dko^| t jkp4| t jko^|d	kp<|d
ko^|dkpD|dko^|dkoL|dko^|dko^tj	o^tj
oYtj o^|d u S |
otj sm|dksm|dko| t jkpv| t jko|d
ko|dko|dko|dko|dko|d u o|dkotj	o|d u S )Nr*   c                 3   r+   r   r$   r,   r/   r$   r%   r0      r1   z2use_rocm_custom_paged_attention.<locals>.<genexpr>r>   c                 3   r+   r   r$   r,   r/   r$   r%   r0      r1   r2   r   )rI   @             r   i      auto)r5   r*   r6   r7   r8   envsVLLM_USE_V1ZhalfZbfloat16ZVLLM_ROCM_CUSTOM_PAGED_ATTNZVLLM_ROCM_USE_AITER_PAGED_ATTNVLLM_ROCM_USE_AITER)r@   rA   rB   rC   rD   rE   rF   rG   rH   ZON_GFX9ZON_GFX11_GFX12r$   r/   r%   use_rocm_custom_paged_attentiony   s\   	rS   c                       s  e Zd ZU ejZdZeed< dZ	eed< dZ
eed< dZeed< d	Zeed
< dZeed< g dZee ed< edZdedefddZedefddZedejddfddZeedd	d[dedee fddZeed ee defd!d"Zeeeddd[dedefd#d$Zed[dedefd%d&Z ed'ee defd(d)Z!ed\d,d-Z"ed.eddfd/d0Z#ed1eddf fd2d3Z$edefd4d5Z%e	d]deej&j' de(fd6d7Z)edefd8d9Z*edefd:d;Z+edefd<d=Z,edefd>d?Z-edej.fd@dAZ/edBdCdefdDdEZ0edefdFdGZ1ed[dedefdHdIZ2edefdJdKZ3edefdLdMZ4edNedOe5dPedQedRe6de7fdSdTZ8edefdUdVZ9edWedefdXdYZ:  Z;S )^RocmPlatformZrocmdevice_namer*   device_typeCUDAdispatch_keyZGPUray_device_keyZnccldist_backendr   device_control_env_var)	awqZgptqZfp8zcompressed-tensorsZ
fbgemm_fp8ZggufZquarkZptpc_fp8Zmxfp4supported_quantizationF
support_far(   c                 C   s.   |rt jrt jrt rtjS t rtjS tjS r   )rP   rR   VLLM_ROCM_USE_AITER_MHAr?   r   ZROCM_AITER_FA
FLASH_ATTNZ
TORCH_SDPA)clsr^   r$   r$   r%   get_vit_attn_backend   s   z!RocmPlatform.get_vit_attn_backendc	           
      C   sh  |rqddl m}	 |d u r|	 s|dkrtjntj}|tjkr>|dkr2|r+td dS td dS td|j	 d	| d
|tjksH|tj
krh|dkr\|rUtd dS td dS td|j	 d	| dtd|j	 d|d u sz|tjkr}tj}tjrtjrtjrt rtd dS td dS |tjkr| dstd ntd| td dS )Nr   )is_aiter_mla_enabledr   z&Using Triton MLA backend on V1 engine.z:vllm.v1.attention.backends.mla.triton_mla.TritonMLABackendzUsing Triton MLA backend.z3vllm.attention.backends.triton_mla.TritonMLABackendz The selected backend, z,does not support block size .z%Using AITER MLA backend on V1 engine.z=vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackendzUsing AITER MLA backendz6vllm.attention.backends.rocm_aiter_mla.AiterMLABackendz'.(currently only supports block size 1)z1,is not MLA type while requested for MLA backend.z+Using Flash Attention backend on V1 engine.zCvllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackendz,Using Triton Attention backend on V1 engine.z=vllm.v1.attention.backends.triton_attn.TritonAttentionBackendZ   z)flash_attn is not supported on NAVI GPUs.z %s is not supported in AMD GPUs.z!Using ROCmFlashAttention backend.zAvllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend)Z&vllm.attention.backends.rocm_aiter_mlarc   r   ZROCM_AITER_MLAZ
TRITON_MLAloggerZ	info_onceinfo
ValueErrornameZROCM_AITER_MLA_VLLM_V1r`   Z
ROCM_FLASHrP   rQ   rR   r_   r?   Zhas_device_capability)
ra   Zselected_backendrA   dtyperF   rB   Zuse_v1Zuse_mlaZhas_sinkrc   r$   r$   r%   get_attn_backend_cls   sp   













z!RocmPlatform.get_attn_backend_clsdeviceNc                 C   s   t j| dS )z:
        Set the device for the current platform.
        N)r5   r*   
set_devicera   rl   r$   r$   r%   rm     s   zRocmPlatform.set_device   )maxsizer   	device_idc                 C   s   t j|\}}t||dS )N)majorminor)r5   r*   get_device_capabilityr   )ra   rq   rr   rs   r$   r$   r%   rt     s   z"RocmPlatform.get_device_capabilityphysical_device_idsc           	      C   s   dd |D }t |D ]F\}}t |D ]=\}}||k rPzt||}|d dks-|d dkr2W   dS W q tyO } ztjd|d	 W Y d
}~  dS d
}~ww qqdS )zN
        Query if the set of gpus are fully connected by xgmi (1 hop)
        c                 S   s   g | ]}t  | qS r$   )r   )r-   ir$   r$   r%   
<listcomp>  s    
z3RocmPlatform.is_fully_connected.<locals>.<listcomp>Zhopsr   type   Fz AMD 1 hop XGMI detection failed.)exc_infoNT)	enumerater   r   rf   error)	ra   ru   Zhandlesrv   handlejZpeer_handleZ	link_typer|   r$   r$   r%   is_fully_connected  s.   
zRocmPlatform.is_fully_connectedc                 C   s<   |  |}t | }t|}|d }|tv rt| S |d S )Nrq   Zmarket_name)Zdevice_id_to_physical_device_idr   r   r   )ra   rq   Zphysical_device_idr}   Z	asic_inforU   r$   r$   r%   get_device_name(  s   

zRocmPlatform.get_device_namec                 C   s   t j|}|jS r   )r5   r*   r6   Ztotal_memory)ra   rq   Zdevice_propsr$   r$   r%   get_device_total_memory4  s   z$RocmPlatform.get_device_total_memoryenforce_eagerc                 C   s   |rt jstd dS dS )NzTo see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be usedFT)rP   rQ   rf   warning)ra   r   r$   r$   r%   is_async_output_supported9  s   
z&RocmPlatform.is_async_output_supportedvllm_configr   c                 C   sf   |j }|r|jd u rd|_|j}|jdkr1|jr$tjstdd|_d S tjr,d|_d S d|_d S d S )NrL   rO   z1Speculative decoding is not supported on vLLM V0.z vllm.v1.worker.gpu_worker.Workerzvllm.worker.worker.Worker)cache_configrB   parallel_configZ
worker_clsZspeculative_configrP   rQ   NotImplementedError)ra   r   r   r   r$   r$   r%   check_and_update_configC  s    


z$RocmPlatform.check_and_update_config
model_archc                 C   s>   |t v rtd| d|tv rt| }td|| d S d S )NzModel architecture 'z#' is not supported by ROCm for now.z:Model architecture '%s' is partially supported by ROCm: %s)r   rh   r   rf   r   )ra   r   msgr$   r$   r%   verify_model_archW  s   zRocmPlatform.verify_model_archquantc                    s.   t  | |dkrtjstd dt_d S )Nr\   zcUsing AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ is not set, enabling VLLM_USE_TRITON_AWQ.T)superverify_quantizationrP   ZVLLM_USE_TRITON_AWQrf   r   )ra   r   	__class__r$   r%   r   c  s   
z RocmPlatform.verify_quantizationc                 C      dS )Nz4vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPUr$   ra   r$   r$   r%   get_punica_wrapperl     zRocmPlatform.get_punica_wrapperc                 C   s,   t j| t j|d t j|d  S )Nr   r   )r5   r*   Zreset_peak_memory_statsZmem_get_inforn   r$   r$   r%   get_current_memory_usagep  s   z%RocmPlatform.get_current_memory_usagec                 C   r   )NzHvllm.distributed.device_communicators.cuda_communicator.CudaCommunicatorr$   r   r$   r$   r%   get_device_communicator_clsx  r   z(RocmPlatform.get_device_communicator_clsc                    r)   )Nr   c                 3   r+   r   r$   r-   ZgfxZgcn_archr$   r%   r0     r1   z+RocmPlatform.supports_mx.<locals>.<genexpr>)gfx95r4   r   r$   r   r%   supports_mx|  r:   zRocmPlatform.supports_mxc                    r)   )Nr   c                 3   r+   r   r$   r   r   r$   r%   r0     r1   z,RocmPlatform.supports_fp8.<locals>.<genexpr>)gfx94r   r3   r4   r   r$   r   r%   supports_fp8  r:   zRocmPlatform.supports_fp8c                 C      dt jdjv S )Nr   r   r5   r*   r6   r7   r   r$   r$   r%   is_fp8_fnuz  s   zRocmPlatform.is_fp8_fnuzc                 C   s   |   rtjS tjS r   )r   r5   Zfloat8_e4m3fnuzZfloat8_e4m3fnr   r$   r$   r%   	fp8_dtype  s   zRocmPlatform.fp8_dtypemodel_configr   c                 C   r   NTr$   )ra   r   r$   r$   r%   supports_v1  s   zRocmPlatform.supports_v1c                    s,   t jdj ddg}t fdd|D S )Nr   r   r   c                 3   r+   r   r$   r   r   r$   r%   r0     r1   z4RocmPlatform.use_custom_allreduce.<locals>.<genexpr>r4   )ra   Zsupported_archsr$   r   r%   use_custom_allreduce  s   z!RocmPlatform.use_custom_allreducec                 C   s   t j|jS r   )r5   r*   r6   Zmulti_processor_count)ra   rq   r$   r$   r%   get_cu_count  s
   zRocmPlatform.get_cu_countc                 C   r   )NZgfx1r   r   r   r$   r$   r%   is_navi  s   zRocmPlatform.is_navic                 C   r   )Nz,vllm.compilation.cuda_graph.CUDAGraphWrapperr$   r   r$   r$   r%   get_static_graph_wrapper_cls  r   z)RocmPlatform.get_static_graph_wrapper_clsbackendprefix_store
group_rank
group_sizetimeoutc                 C   st   t  sJ t|||}ddlm} | }||_|||||}	tjj}
t	d}|
|
 |	  |||
|	 |S )Nr   )ProcessGroupNCCLr*   )r
   r	   "torch.distributed.distributed_c10dr   Options_timeoutZBackendTypeZNCCLr5   rl   Z_set_default_backendZ_set_sequence_number_for_groupZ_register_backend)ra   r   r   r   r   r   Zpgr   Zbackend_optionsZbackend_classZbackend_typerl   r$   r$   r%   #stateless_init_device_torch_dist_pg  s$   
	

z0RocmPlatform.stateless_init_device_torch_dist_pgc                 C   s   t  S r   r   r   r$   r$   r%   device_count  s   zRocmPlatform.device_countrF   c                 C   r   r   r$   )ra   rF   r$   r$   r%   is_kv_cache_dtype_supported  r   z(RocmPlatform.is_kv_cache_dtype_supported)F)r   )r   r   r(   Nr   )<__name__
__module____qualname__r   ZROCMZ_enumrU   str__annotations__rV   rX   rY   rZ   r[   r]   listclassmethodboolr   rb   rk   r5   rl   rm   r   intr   r   rt   r'   r   r   r   r   r   r   r   r   typesZDevicefloatr   r   r   r   r   rj   r   r   r   r   r   r   r   r   r	   r   r   r   __classcell__r$   r$   r   r%   rT      s   
 C		
rT   )NN)Cosdatetimer   	functoolsr   r   r   typingr   r   r5   Ztorch.distributedr   r	   r   r
   Z	vllm.envsrP   Zvllm.loggerr   Z
vllm.utilsr   Z	interfacer   r   r   r   Zvllm.configr   r   r   rf   Zamdsmir   r   r   r   r   r   ImportErrorer   Zvllm._CZvllmZvllm._rocm_Cr   r   r   r   Z_ROCM_SWA_REASONr   dictr   environvalgetZcuda_valr'   r   r9   r=   r?   rj   r   ZTensorrS   rT   r$   r$   r$   r%   <module>   s   
$

			(