o
    á)i  ã                   @  s6  U d Z ddlmZ ddlZddlZddlZddlmZmZm	Z	 ddl
Z
ddlmZ ddlmZ ddlmZ ddlmZmZ ejd9d
d„ƒZejd9dd„ƒZd:dd„Zd;dd„Zdaded< daded< daded< d<dd„Zd d!„ Zd"d#„ Zd$d%„ Z d=d(d)„Z!d>d,d-„Z"d.d.gZ#e#d/fd?d4d5„Z$d@d6d7„Z%g d8¢Z&dS )AzmCompatibility wrapper for DeepGEMM API changes.

Users of vLLM should always import **only** these wrappers.
é    )ÚannotationsN)ÚAnyÚCallableÚNoReturn)Úlogger)Úcurrent_platform)ÚcdivÚhas_deep_gemmÚreturnÚboolc                  C  s&   t  ¡ ot  d¡pt  d¡} tƒ o| S )zƒReturn ``True`` if DeepGEMM is supported on the current platform.
    Currently, only Hopper and Blackwell GPUs are supported.
    éZ   éd   )r   Úis_cudaZis_device_capabilityr	   )Zis_supported_arch© r   ú`/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/utils/deep_gemm.pyÚis_deep_gemm_supported   s
   
þ
r   c                  C  sŠ   t js
t d¡ dS tƒ st d¡ dS t jst d¡ dS tƒ  tdu r,t d¡ dS t 	¡ o4t 
d¡} | r>t d¡ | S t d	¡ | S )
zfReturn ``True`` if vLLM is configured to use DeepGEMM "
    "E8M0 scale on a Blackwell-class GPU.
    z-DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM=0.Fz1DeepGEMM E8M0 disabled: DeepGEMM backend missing.z2DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM_E8M0=0.Nz3DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not foundr   z'DeepGEMM E8M0 enabled on Blackwell GPU.z5DeepGEMM E8M0 disabled: not running on Blackwell GPU.)ÚenvsZVLLM_USE_DEEP_GEMMr   Z
debug_oncer	   ZVLLM_USE_DEEP_GEMM_E8M0Ú
_lazy_initÚ_fp8_gemm_nt_implr   r   Zhas_device_capability)Úenabledr   r   r   Ú is_blackwell_deep_gemm_e8m0_used!   s2   


ÿÿ
þÿr   Ú_r   Ú__r   c                  O  s   t dƒ‚)z-Placeholder for unavailable DeepGEMM backend.z`DeepGEMM backend is not available. Please install the `deep_gemm` package to enable FP8 kernels.)ÚRuntimeError)r   r   r   r   r   Ú_missingC   s   ÿr   ÚnewÚstrÚoldúCallable[..., Any] | Nonec                 C  s:   t | |ƒr
t| |ƒS t | |ƒrt d||¡ t| |ƒS dS )z>Return the *new* symbol if it exists, otherwise the *old* one.z¬Found legacy DeepGEMM symbol `%s`. Please upgrade the `deep_gemm` package so that `%s` is available. Support for the legacy symbol will be removed in a future vLLM release.N)ÚhasattrÚgetattrr   Zwarning_once)Úmoduler   r   r   r   r   Ú_resolve_symbolJ   s   


û
r"   r   Ú_grouped_implÚ_grouped_masked_implÚNonec                  C  s€   t dustdustdurdS tƒ sdS d} tj | d¡s'tj t	j
d¡tj| < t d¡}t|ddƒa t|ddƒat|dd	ƒadS )
z2Import deep_gemm and resolve symbols on first use.NZDG_JIT_CACHE_DIRZ	deep_gemmÚfp8_gemm_ntZgemm_fp8_fp8_bf16_ntÚ m_grouped_fp8_gemm_nt_contiguousZ)m_grouped_gemm_fp8_fp8_bf16_nt_contiguousÚfp8_m_grouped_gemm_nt_maskedZ%m_grouped_gemm_fp8_fp8_bf16_nt_masked)r   r#   r$   r	   ÚosÚenvironÚgetÚpathÚjoinr   ZVLLM_CACHE_ROOTÚ	importlibÚimport_moduler"   )ZDEEP_GEMM_JIT_CACHE_ENV_NAMEZ_dgr   r   r   r   `   s,   
ÿ
ÿþþr   c                  O  ó2   t ƒ  td u rt| i |¤ŽS t| dtƒ  i|¤ŽS ©NZdisable_ue8m0_cast)r   r   r   r   ©ÚargsÚkwargsr   r   r   r&   ~   ó   ÿþýr&   c                  O  r0   r1   )r   r#   r   r   r2   r   r   r   r'   ˆ   r5   r'   c                  O  r0   r1   )r   r$   r   r   r2   r   r   r   r(   ’   r5   r(   Úxútorch.Tensorc              	   C  s   t  dt  t  |  ¡ ¡¡¡S )Ng       @)ÚtorchÚpowÚceilÚlog2Úabs)r6   r   r   r   Ú_ceil_to_ue8m0œ   s   r=   ÚintÚyc                 C  s   t | |ƒ| S )N)r   )r6   r?   r   r   r   Ú_align    s   r@   é€   FÚ
block_sizeú	list[int]Ú	use_ue8m0ú!tuple[torch.Tensor, torch.Tensor]c                 C  sò   |   ¡ dksJ ‚| j\}}|\}}tjt||ƒt||ƒf| j| jd}| |d |…d |…f< | d|| d¡| |¡}| 	¡  
¡ jddd d¡}	|	d	 }
|rRt|
ƒn|
}
|d
|
   tj¡}| |¡d |…d |…f  ¡ |
 | d¡| d¡¡fS )Né   )ÚdtypeÚdeviceéÿÿÿÿé   )rJ   é   T)ÚdimZkeepdimg-Cëâ6?g      |@g      ð?r   )rL   Úshaper8   Zzerosr@   rG   rH   ÚviewÚsizer<   ÚfloatZamaxÚclampr=   ÚtoZfloat8_e4m3fnZview_asÚ
contiguous)r6   rB   rD   ÚmÚnZblock_mZblock_nZx_paddedZx_viewZx_amaxZsfZx_scaledr   r   r   Úper_block_cast_to_fp8©   s    
þ ÿrV   c                 C  sB   |   ¡ |  ¡ } }| |  ||   ¡ }d| |  ¡  | }d| S )a€  Return a global difference metric for unit tests.

    DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element
    error, causing ``torch.testing.assert_close`` to fail.  Instead of checking
    every element, we compute a cosine-style similarity over the whole tensor
    and report ``1 - sim``.  Once kernel accuracy improves this helper can be
    removed.
    rF   rJ   )ÚdoubleÚsum)r6   r?   ÚdenominatorÚsimr   r   r   Ú	calc_diff½   s   
r[   )r[   r&   r'   r(   rV   r   r   )r
   r   )r   r   r   r   r
   r   )r   r   r   r   r
   r   )r
   r%   )r6   r7   )r6   r>   r?   r>   r
   r>   )r6   r7   rB   rC   rD   r   r
   rE   )r6   r7   r?   r7   )'Ú__doc__Ú
__future__r   Ú	functoolsr.   r)   Útypingr   r   r   r8   Z	vllm.envsr   Zvllm.loggerr   Zvllm.platformsr   Z
vllm.utilsr   r	   Úcacher   r   r   r"   r   Ú__annotations__r#   r$   r   r&   r'   r(   r=   r@   ZDEFAULT_BLOCK_SIZErV   r[   Ú__all__r   r   r   r   Ú<module>   s@   

!






ý
