o
    )i&                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddlm	Z	m
Z
mZmZ ddlZddlZddlmZ ddlmZ ddlmZ eeZejddZejdXddZdYddZdZddZefd[ddZeddZeddZ edd Z!ed!d"Z"ed!d#Z#ed!d$Z$ed%d&d'd( d)Z%ejdXd*d+Z&ejdXd,d-Z'ejdXd.d/Z(ejd\d1d2Z)	3d]d^d=d>Z*e rej+j,d?g d@dAd_dKdLZ-ej+.d?d_dMdNZ/d`dUdVZ0g dWZ1dS )azoCompatibility wrapper for FlashInfer API changes.

Users of vLLM should always import **only** these wrappers.
    )annotationsN)AnyCallableNoReturnOptional)init_logger)current_platformFLASHINFER_CUBINS_REPOSITORYzWhttps://edge.urm.nvidia.com/artifactory/sw-kernelinferencelibrary-public-generic-local/returnboolc                   C  s   t jdduS )z+Return ``True`` if FlashInfer is available.
flashinferN)	importlibutil	find_spec r   r   a/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/utils/flashinfer.pyhas_flashinfer"   s   r   _r   __r   c                  O  s   t d)z/Placeholder for unavailable FlashInfer backend.zFlashInfer backend is not available. Please install the package to enable FlashInfer kernels: https://github.com/flashinfer-ai/flashinfer)RuntimeError)r   r   r   r   r   _missing*   s   r   module_namestr
Any | Nonec              	   C  s&   zt | W S  ttfy   Y dS w )zBSafely import a submodule and return it, or None if not available.N)r   import_moduleImportErrorModuleNotFoundError)r   r   r   r   _get_submodule2   s
   r   	attr_namefallback_fnCallable[..., Any]c                   s&   t jfdd  fdd}|S )z5Create a lazy import wrapper for a specific function.c                    s&   t  sd S t} | rt|  d S d S N)r   r   getattr)mod)r   r   r   r   	_get_impl@   s   z'_lazy_import_wrapper.<locals>._get_implc                    s*     }|d u r| i |S || i |S r!   r   )argskwargsimpl)r$   r   r   r   wrapperG   s   z%_lazy_import_wrapper.<locals>.wrapper)	functoolscache)r   r   r   r(   r   )r$   r   r   r   r   _lazy_import_wrapper;   s   r+   flashinfer.fused_moeZtrtllm_fp8_block_scale_moeZtrtllm_fp8_per_tensor_scale_moecutlass_fused_moer   fp4_quantizenvfp4_block_scale_interleavetrtllm_fp4_block_scale_moezflashinfer.autotunerautotunec                  O  s   t  S r!   )
contextlibnullcontext)r%   r&   r   r   r   <lambda>a   s    r4   )r   c                   C  s   t  o
tjdduS )z6Return ``True`` if FlashInfer MoE module is available.r,   N)r   r   r   r   r   r   r   r   has_flashinfer_moed   s
   r5   c                  C  s@   t  sdS g d} | D ]\}}t|}|rt||s dS qdS )z=Return ``True`` if FlashInfer CUTLASS fused MoE is available.F))r,   r-   )r   r.   )r   r/   )r,   r0   T)r5   r   hasattr)Zrequired_functionsr   r   r#   r   r   r    has_flashinfer_cutlass_fused_moek   s   r7   c               
   C  st   z t jtdd} | jdk}|rtd |W S td| j |W S  ty9 } ztd| W Y d}~dS d}~ww )	zReturn ``True`` if NVIDIA's artifactory is accessible.

    This checks connectivity to the kernel inference library artifactory
    which is required for downloading certain cubin kernels like TRTLLM FHMA.
       )timeout   z NVIDIA artifactory is accessiblez2NVIDIA artifactory returned failed status code: %dz+Failed to connect to NVIDIA artifactory: %sNF)requestsgetr	   status_codeloggerZ
debug_oncewarning_once	Exception)responseZ
accessibleer   r   r   has_nvidia_artifactory   s    

rC   tuple[bool, Optional[str]]c                  C  sT   t j} tdrt sd| fS | dur(td|  | dk}|r$td || fS dS )z2Cache result which only depends on the environmentd   FNz&VLLM_USE_TRTLLM_ATTENTION is set to %s1zUsing TRTLLM attention.)TN)envsZVLLM_USE_TRTLLM_ATTENTIONr   Zis_device_capabilityrC   r>   	info_once)	env_value
use_trtllmr   r   r   supports_trtllm_attention   s   

rK   F
num_tokensintmax_seq_lenkv_cache_dtypenum_qo_headsOptional[int]num_kv_headsattn_head_size	has_sinksc           	      C  s   t  \}}|s	dS |d u s|d u s|d u s|| dkrdS |r&td dS |d u r?| dko5|dk o5|dk}|r=td |S dS )	NFr   z6Using TRTLLM attention (required for attention sinks).T   i   autoz'Using TRTLLM attention (auto-detected).)rK   r>   rH   r?   )	rL   rN   rO   rP   rR   rS   rT   rJ   rI   r   r   r   use_trtllm_attention   s&   
	
rW   zvllm::flashinfer_mm_fp4cuda)Zmutates_argsZdevice_typesAtorch.TensorBA_scaleB_scaleg_scaledtypetorch.dtypebackendc              
   C  s$   ddl m} || |||||d|dS )Nr   )mm_fp4   )
block_sizera   )r   rb   )rY   r[   r\   r]   r^   r_   ra   Zflashinfer_mm_fp4_r   r   r   flashinfer_mm_fp4   s   re   c                 C  s    t j| jd |jd || jdS )Nr      )r_   device)torchemptyshaperg   )rY   r[   r\   r]   r^   r_   ra   r   r   r   flashinfer_mm_fp4_fake   s
   
rk   abblock_scale_ablock_scale_balpha	out_dtypec              	   C  s   | j dkr
|j dksJ |j dkr|j dksJ | ddkr&|ddks(J | jd |jd ks4J |jd | jd d ksBJ |jd |jd d ksPJ |dkr`|tj}|tj}t| | || |||dS )N   rf      Zcutlass)ra   )ndimZstriderj   viewrh   Zuint8re   t)rl   rm   rn   ro   rp   rq   ra   r   r   r   flashinfer_scaled_fp4_mm   s$    rx   )r   %flashinfer_trtllm_fp8_block_scale_moeflashinfer_cutlass_fused_moer.   r/   r0   r1   r5   r7   rC   rW   rx   )r
   r   )r   r   r   r   r
   r   )r   r   r
   r   )r   r   r   r   r   r    )r
   rD   )F)rL   rM   rN   rM   rO   r   rP   rQ   rR   rQ   rS   rQ   rT   r   r
   r   )rY   rZ   r[   rZ   r\   rZ   r]   rZ   r^   rZ   r_   r`   ra   r   r
   rZ   )rl   rZ   rm   rZ   rn   rZ   ro   rZ   rp   rZ   rq   r`   ra   r   r
   rZ   )2__doc__
__future__r   r2   r)   r   importlib.utilostypingr   r   r   r   r;   rh   Z	vllm.envsrG   Zvllm.loggerr   Zvllm.platformsr   __name__r>   environr<   r	   r*   r   r   r   r+   ry   Z*flashinfer_trtllm_fp8_per_tensor_scale_moerz   r.   r/   r0   r1   r5   r7   rC   rK   rW   ZlibraryZ	custom_opre   Zregister_fakerk   rx   __all__r   r   r   r   <module>   s   


%

