import contextlib
from typing import TYPE_CHECKING, Optional, Union

import torch

import vllm.envs as envs
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType

logger = init_logger(__name__)

if not current_platform.is_tpu() and not current_platform.is_xpu():
    try:
        import vllm._C  # noqa: F401
    except ImportError as e:
        logger.warning("Failed to import from vllm._C with %r", e)

supports_moe_ops = False
with contextlib.suppress(ImportError):
    import vllm._moe_C  # noqa: F401

    supports_moe_ops = True

if TYPE_CHECKING:

    def register_fake(fn):
        return lambda name: fn

else:
    try:
        from torch.library import register_fake
    except ImportError:
        from torch.library import impl_abstract as register_fake
 Znzd dlmZ W n ey   d dlmZ Y nw 	 	 	 		 d*dejdejdejdejdededejdejdededeej de dejdejdedededed ed!df(d"d#Z!	 	 	 		 d*dejd$ejd%ejd&ejdejdejdejdededejdejdededeej de dejdejdedededed ed!df.d'd(Z"	d+dejd$ejd%ejd&ejdejdejdejdededejdejd)eej dededeej de dejdejd*eej d!df(d+d,Z#dejdejd-ejdedejdejd!dfd.d/Z$	d+d0ejd1ejd2ejd3ejd4ejd5eej d!dfd6d7Z%	d,d8ejd9ejd:ejd;ejd<ed=ed>ed?e&d!e'ejejejejf fd@dAZ(	d,d8ejd9ejd:ejd;ejdBejdCejd<ed=ed>ed?e&d!e'ejejejejf fdDdEZ)dFejdejdGeej dHedIejdJe&d!dfdKdLZ*dFejdejdGeej dHedIejdJe&dMedNejd!dfdOdPZ+dejdQejdRejdSed!df
dTdUZ,dQejdVejdRejdSed!df
dWdXZ-dYejdZejd[ejd\ejd!df
d]d^Z.dYejdZejd[ejd\ejd!df
d_d`Z/dYejdZejd[ejd\ejd!df
dadbZ0		d-dQejdRejdSedcej1ddeej dVeej d!e'ejejf fdedfZ2dgejdhejdiejdjedkedled!ejfdmdnZ3dQejdgejdoejdhejdjed!ejfdpdqZ4drejdsejdtejduejdvejdwe&dxed!ejfdydzZ5e6ej7j8dzred{drejdsejdtejduejdvejdwe&dxed!ejfd|d}Z9d~ejdejdxed!dfddZ:drejdsejdejdejdededed!ejfddZ;drejdsejdejdejdejdedededed!ejfddZ<e6ej7j8dreddrejdsejdejdejdejdedej=dej=dej=d!ejfddZ>ed				d.drejdeej dsejdeej dejdeej deej deej deej dejdedej=dej=dej=de&de&de&de&d!ejf&ddZ?eddrejdsejdejdejdejdejdej=dej=dej=d!ejfddZ@eddrejdsejdejdejdej=dej=dej=d!ejfddZAeddgejdhejdiejdjej=dkedled!ejfddZBeddQejdgejdoejdhejdjej=d!ejfddZCed							d/drejdejdedeej1 deej deej dee deej deej dee  d!ejfddZDeddsejdej1dedeej1 d!ejf
ddZEe6ej7j8dreddrejdejdejdeej dej=dej=dej=dej=dej=de&de&d!ejfddʄZFe6ej7j8d˃r\ed̃	d+dejdedej=dej=deej1 d!ejfdd҄ZGedӃdejdejdedej=d!ejf
ddׄZHed؃dejdejdedej=d!ejf
ddڄZIedۃdejdejdejdejdejdedej=dej=dej=d!ejfddZJe6ej7j8dreddejdejdejdededej=dej=d!ejfddZKded!e&fddZLd0ejdrejdejdejdejdejdejfddZMdrejdejdejdejdejdej1d!ejfddZNded!e&fddZOded!e&fddZP	d+drejdejdejdejdej1deej d!ejfdd ZQ		d-drejdejdejdejdej1dejdeej deej d!ejfddZRded!e&fddZSded!e&fddZTdrejd!e'ejejf fd	d
ZU	d+drejdejdejdejdejdej1deej d!ejfddZV	d+dejdejdejdejdejdejdedededeej fddZWdejdejfddZXdejdejdejdejdedededefdd ZYd!ejd"ejd#ejd$ejdejdejdejd%ejd&ejd'ejd(e&d)e&fd*d+ZZd!ejd"ejd#ejd$ejdejd,ejdejdejd-ejfd.d/Z[dsejdejdeded0ed!ejfd1d2Z\dsejdeded0ed!ejf
d3d4Z]dsejdejdeded0ed!ejfd5d6Z^dsejdejdeded0ed!ejfd7d8Z_				d.drejdeej dsejdeej dejdeej deej deej deej dejdedededede&de&de&de&d!ejf&d9d:Z`				d0dej1dedeej1 d;eej1 d<eej1 d=eej1 deej1 d!eae  fd>d?Zb							d/drejdejdedeej1 deej deej dee deej deej dee  d!ejfd@dAZcdsejdej1dedeej1 d!ejf
dBdCZde6ej7j8dDrjedEdrejdejd!ejfdFdGZedrejdejd!ejfdHdDZfdQejdIejd!e'ejejf fdJdKZgdejdIejdejdejdLed!e'ejejf fdMdNZh					d1dQejdeej dOee ddeej dPe&d0eej d!e'ejejf fdQdRZi		d2dgejdejdSeej de&d!e'ejejejf f
dTdUZjdrejdejdejdeej dededededede&de&d!ejfdVdZk			d3dQejdeej deej dWe&d!e'ejejeej f f
dXdYZldrejdsejdejdejdejdejdededed!ejfdZd[Zmdejdedededeej1 d!ejfd\d˄Zndejdejdeded!ejf
d]d^Zodejdejdeded!ejf
d_d`Zpdejdejdejdejdejdedededed!ejfdadbZqdejdejdejdededej=dej=d!ejfdcdZrded!efdddeZsdfejdgejdhejdiejdjejdkeej dleej dmeej dne&d)eej doeej dpeej dqejdrefdsdtZtdrejdejdued!ejfdvdwZudrejdejdxed!ejfdydzZvdrejdejdej1dejdejdxed!ejfd{d|ZwdQejd0ejfd}d~Zxdejdededejdejdejd!dfddZydQejd0ejdejdejdeej deej dejdejdejdededededxed!ejfddZzdejdejdejdejd!df
ddZ{dQejd0eej dejdeej dejdeej deej deej deej dejdejdejdejdejdedede&de&dedededede&de&de&de&d!ejf6ddZ|ere6ej7j}dreddrejdejdejdejdejdejdejdejdejdejdedej=dej=dej=de&dedLedede&de&d!ejf*ddZ~eddQejd0eej dejdejdeej deej deej dejdejdejdejdejdedede&de&dedededede&de&de&de&d!ejf2ddZdGejdejdejdejdejde dejdejd!dfddZdGejdejdejdejdejde dejdejd!dfddZdejdejd-ejdejde dejd!dfddZdeaej deaej dejd!dfddZdeaej dejd!dfddZdejdejdejd!dfddZ		d4d0ejdQejdede d!df
ddZ	d+dejdejdejdejdedeej d!dfddZdeded!efdÐdĄZded!efdŐdƄZdeaej dejdede&d!ef
dːd̄Zdedejdejdeded!dfdѐd҄Zded!dfdӐdԄZd!efdՐdքZdedeae d!dfdאd؄Zded!e'eae eae f fdِdڄZdedeaeae  deaeae  d!dfdݐdބZded!e'eejf fddZdejfddZded!dfddZ	d+dededee d!efddZded!dfddZ	d5dedejdejdede&d!dfddZded!ejfddZdedeaej d!dfddZd!efddZdejdeded!e'ejejf fddZ		d2dejdejdejdejdedejd ejdee d?e&d!e'ejejf fddZdejdejdejdejdejdejded!ejfdd	Zdejdejdejdejdejdejdejded
ed!ejfddZdededed
ed!ef
ddZe6ej7j8dreddejdejdeej de&d!ejf
ddZe6ej7j8drFeddejdejdejdejdejde&de&de&deej d eej deeae  d!eej d"eej de&d!ejfd#d$Ze6ej7j8d%rwed&dejdejd'ejdeej dej1de&d!ejfd(d)ZdS dS (6      N)TYPE_CHECKINGOptionalUnion)init_logger)current_platform)
ScalarTypez%Failed to import from vllm._C with %rFTc                    s    fddS )Nc                    s    S N )namefnr	   \/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/_custom_ops.py<lambda>   s    zregister_fake.<locals>.<lambda>r	   r   r	   r   r   register_fake      r   )r   )impl_abstract@   outquery	key_cachevalue_cachenum_kv_headsscaleblock_tablesseq_lens
block_sizemax_seq_lenalibi_slopeskv_cache_dtypek_scalev_scaletp_rankblocksparse_local_blocksblocksparse_vert_strideblocksparse_block_sizeblocksparse_head_sliding_stepreturnc                 C   6   t jj| |||||||||	|
|||||||| d S r   )torchops_Cpaged_attention_v1)r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r	   r	   r   r+   '   s   
r+   exp_sum
max_logitstmp_outc                 C   s<   t jj| |||||||||	|
||||||||||| d S r   )r(   r)   r*   paged_attention_v2)r   r,   r-   r.   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r	   r	   r   r/   D   s   
r/   query_start_locfp8_out_scalec                 C   r'   r   )r(   r)   _rocm_CZpaged_attention)r   r,   r-   r.   r   r   r   r   r   r   r   r0   r   r   r   r   r   r    r1   r	   r	   r   paged_attention_rocmd   s   r3   kv_cachec                 C      t jj| ||||| d S r   )r(   r)   Z_C_cpuZmla_decode_kvcache)r   r   r4   r   r   r   r	   r	   r   mla_decode_kvcache_cpu      r6   outputprefix_output
prefix_lsesuffix_output
suffix_lse
output_lsec                 C   s   t jj| ||||| d S r   )r(   r)   r*   merge_attn_states)r8   r9   r:   r;   r<   r=   r	   r	   r   r>         r>   	q_seqlens
kv_seqlensvertical_indexesslash_indexescontext_sizeblock_size_Mblock_size_Ncausalc                 C   s   | d}| d}	| d}
| d}|| d | }tj||	|| j| jd}tj||	||
| j| jd}tj||	|| j| jd}tj||	||| j| jd}tjj||||| ||||||| ||||fS Nr         dtypedevice)sizer(   zerosrL   rM   r)   r*   convert_vertical_slash_indexes)r@   rA   rB   rC   rD   rE   rF   rG   
batch_size	num_heads	nnz_slashnnz_verticalnum_rowsblock_countblock_offsetcolumn_countcolumn_indexr	   r	   r   rP      sJ   





rP   vertical_indices_countslash_indices_countc
                 C   s   | d}
| d}| d}| d}|| d | }tj|
||| j| jd}tj|
|||| j| jd}tj|
||| j| jd}tj|
|||| j| jd}tjj||||| |||||||||	 ||||fS rH   )rN   r(   emptyrL   rM   r)   r*   (convert_vertical_slash_indexes_mergehead)r@   rA   rB   rC   rZ   r[   rD   rE   rF   rG   rQ   rR   rS   rT   rU   rV   rW   rX   rY   r	   r	   r   r]      sJ   





r]   	positionskey	head_sizecos_sin_cacheis_neoxc                 C   r5   r   )r(   r)   r*   rotary_embedding)r^   r   r_   r`   ra   rb   r	   r	   r   rc      r7   rc   rot_dimcos_sin_cache_offsetsc              
   C       t jj| ||||||| d S r   )r(   r)   r*   batched_rotary_embedding)r^   r   r_   r`   ra   rb   rd   re   r	   r	   r   rg     s   rg   inputweightepsilonc                 C   s    |  }tjj| ||| d S r   )
contiguousr(   r)   r*   rms_norm)r   rh   ri   rj   Zinput_contiguousr	   r	   r   rl     s   rl   residualc                 C      t jj| ||| d S r   )r(   r)   r*   fused_add_rms_norm)rh   rm   ri   rj   r	   r	   r   ro     s   ro   logitsprompt_maskoutput_maskrepetition_penaltiesc                 C   sN   |j ddd| d}t||B |d}t| dkd| |}| |9 } d S )NrI   )dim      ?r   )Z	unsqueezerepeatrN   r(   where)rp   rq   rr   rs   Z	penaltiesZscalingr	   r	   r    apply_repetition_penalties_torch  s   
rx   c                 C   rn   r   )r(   r)   r*   Zapply_repetition_penalties_rp   rq   rr   rs   r	   r	   r   apply_repetition_penalties_cuda(     rz   c                 C   s2   | j r|  rt| ||| dS t| ||| dS )aw  Apply repetition penalties to logits in-place.

    Args:
        logits: The logits tensor of shape [num_seqs, vocab_size].
        prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
        output_mask: A boolean tensor indicating which tokens appear in the output.
        repetition_penalties: The repetition penalties of shape (num_seqs, ).
    N)is_cudais_contiguousrz   rx   ry   r	   r	   r   apply_repetition_penalties/  s   r~   quant_dtypescale_ubc              	   C   sV   t j| |d}t j|  | jd  df| jt jd}t jj	|| ||||| ||fS )NrL   rI   rM   rL   )
r(   
empty_liker\   numelshaperM   float32r)   r*    rms_norm_dynamic_per_token_quant)rh   ri   rj   r   r   rm   r8   scalesr	   r	   r   r   C  s   r   qweightr   rO   split_k_itersthxthyc                 C   s6   t jrddlm} || ||S tjj| |||||S )Nr   )awq_dequantize_triton)envsVLLM_USE_TRITON_AWQ2vllm.model_executor.layers.quantization.awq_tritonr   r(   r)   r*   awq_dequantize)r   r   rO   r   r   r   r   r	   r	   r   r   X  s   r   qzerosc                 C   s8   t jrddlm} || ||||S tjj| ||||S )Nr   )awq_gemm_triton)r   r   r   r   r(   r)   r*   awq_gemm)rh   r   r   r   r   r   r	   r	   r   r   c  s   r   a
b_q_weightb_gptq_qzerosb_gptq_scalesb_g_idxuse_exllamabitc              	   C      t jj| ||||||S r   )r(   r)   r*   	gptq_gemmr   r   r   r   r   r   r   r	   r	   r   r   m  s   r   z_C::gptq_gemmc                 C   s$   t j| d|df| j| jdS )Nr   rI   rK   r(   r\   rN   rL   rM   r   r	   r	   r   _gptq_gemm_fakew  s   r   q_weightq_permc                 C      t jj| || d S r   )r(   r)   r*   gptq_shuffle)r   r   r   r	   r	   r   r        r   b_scales	workspacesize_msize_nsize_kc              	   C   r   r   )r(   r)   r*   marlin_gemmr   r   r   r   r   r   r   r	   r	   r   r     s   r   b_metab_q_typec	           	      C   s    t jj| |||||j|||	S r   )r(   r)   r*   gptq_marlin_24_gemmid	r   r   r   r   r   r   r   r   r   r	   r	   r   r     s   r   z_C::gptq_marlin_24_gemmc	           	      C   s   t j||f| j| jdS Nr   r(   r\   rM   rL   r   r	   r	   r   _gptq_marlin_24_gemm_fake     r   z_C::gptq_marlin_gemmcb_biasglobal_scaleb_zerosg_idxpermb_q_type_id	is_k_fulluse_atomic_adduse_fp32_reduceis_zp_floatc                 C   s   t j||f| j| jdS r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   r	   r   _gptq_marlin_gemm_fake  s   r   z_C::marlin_qqq_gemms_toks_chs_groupc	           	      C   s   t j||ft j| jdS NrK   r(   r\   float16rM   	r   r   r   r   r   r   r   r   r   r	   r	   r   _marlin_qqq_gemm_fake  s   
r   z_C::marlin_gemmc                 C   s   t j||ft j| jdS r   r   r   r	   r	   r   _marlin_gemm_fake  s   
r   z_C::awq_dequantizec           	      C   s4   |  d}|  d}|d }tj||f|j|jdS Nr   rI      rK   rN   r(   r\   rL   rM   )	r   r   rO   r   r   r   Zin_cZqout_cZout_cr	   r	   r   _awq_dequantize_fake  s   


r   z_C::awq_gemmc                 C   s4   |  d}tj||| dd f| j| jddS r   )rN   r(   r\   rL   rM   sum)rh   r   r   r   r   Znum_in_featsr	   r	   r   _awq_gemm_fake  s   
r   z_C::machete_mmb_qb_typeout_typeb_group_scalesb_group_zerosb_group_sizeb_channel_scalesa_token_scalesschedulec
                 C   s,   |  d}
| d}tj|
|f| j| jdS )Nr   rI   r   rN   r(   r\   rM   rL   )r   r   r   r   r   r   r   r   r   r   mnr	   r	   r   machete_mm_fake  s   

r   z_C::machete_prepack_Ba_typegroup_scales_typec                 C   s   t j| t jdS )N)Zmemory_format)r(   r   Zcontiguous_formatr   r   r   r   r	   r	   r   machete_prepack_B_fake  s   r   allspark_w8a16_gemmz_C::allspark_w8a16_gemm	b_qweightb_qzerosr   
group_sizesm_count
sm_versionCUBLAS_M_THRESHOLDhas_zpn32k16_reorderc                 C   s"   |  d}tj||f| j| jdS )Nr   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r	   r	   r   _allspark_w8a16_gemm_fake  s   

r   ggml_dequantizez_C::ggml_dequantizeW
quant_typer   rL   c                 C   s   t j||ft j| jdS r   r   r   r   r   r   rL   r	   r	   r   _ggml_dequantize_fake  r   r   z_C::ggml_mul_mat_vec_a8Xrowc                 C   s   t j|jd |f|j| jdS Nr   rK   )r(   r\   r   rL   rM   r   r   r   r   r	   r	   r   _ggml_mul_mat_vec_a8_fake  s   r   z_C::ggml_mul_mat_a8c                 C   s"   | d}tj||f|j| jdS r   r   )r   r   r   r   batchr	   r	   r   _ggml_mul_mat_a8_fake  s   
r   z_C::ggml_moe_a8sorted_token_ids
expert_idsnum_tokens_post_paddedtop_ktokensc	           	      C   s&   |  d}tj|| |ftj|jdS r   )rN   r(   r\   r   rM   	r   r   r   r   r   r   r   r   r   r	   r	   r   _ggml_moe_a8_fake'  s
   
r   ggml_moe_a8_vecz_C::ggml_moe_a8_vectopk_idsc                 C   s&   |  d}tj|| |f| j|jdS r   r   r   r   r  r   r   r   r   r	   r	   r   _ggml_moe_a8_vec_fake;  s
   

r  cuda_device_capabilityc                 C      t jj| S r   )r(   r)   r*   cutlass_scaled_mm_supports_fp4r  r	   r	   r   r  L     r  bscales_ascales_bproblem_sizesexpert_offsetsc              	   C   s   t jj| |||||| d S r   )r(   r)   r*   #cutlass_blockwise_scaled_grouped_mm)r8   r   r	  r
  r  r  r  r	   r	   r   r  P  s   	r  block_scale_ablock_scale_balpha	out_dtypec           	      C   s`   | j dkr
|j dksJ | jd |jd }}tj||f|| jd}tjj|| |||| |S )NrJ   r   rK   )ndimr   r(   r\   rM   r)   r*   cutlass_scaled_fp4_mm)	r   r	  r  r  r  r  r   r   r   r	   r	   r   r  ^  s   r  c                 C   r  r   )r(   r)   r*   cutlass_scaled_mm_supports_fp8r  r	   r	   r   r  j  r  r  c                 C   r  r   )r(   r)   r*   $cutlass_scaled_mm_supports_block_fp8r  r	   r	   r   r  n     r  scale_ascale_bbiasc           
      C   s  |t ju s|t ju sJ |du s | |jd kr|j|ks J g | jdd |jd R }| d| jd } |jd d dkoI|jd d dk}t sP|s`ddl	m
} || |||||}	nt j| jd |jd f|| jd}	t jj|	| |||| |	j| S )a  
    `cutlass_scaled_mm` implements a fused version of
        `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
    where scale_a * a and scale_b * b are implemented using numpy-style
    broadcasting.

    In order to support blockwise scaling like that found in DeepSeek V3, we also
    support extended "group" broadcast rules. We extend the numpy-style
    broadcasting rules with the following rule:
        "if the extent of a dimension in the source shape is between 1 and the
        corresponding extent in the target shape, we repeat each element along
        that dimension src_shape[dim] // target_shape[dim] times consecutively"
    For example, if we have:
          a = [[1, 2], and target_shape = (2, 4)
               [3, 4]]
    then we would expand a to:
          a = [[1, 1, 2, 2],
               [3, 3, 4, 4]]
    Currently we only support the case:
        scale_a.shape * [1, 128] == a.shape
        scale_b.shape * [128, 128] == b.shape
    NrI   r   r      )triton_scaled_mmrK   )r(   bfloat16r   r   r   rL   viewr   is_rocmZKvllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mmr  r\   rM   r)   r*   cutlass_scaled_mm)
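# Illustrative sketch of a cutlass_scaled_mm call (not part of the original
# module). Shapes and the per-token / per-channel scale layout are assumptions
# based on the docstring above; B is passed column-major, as produced here by
# transposing a contiguous (n, k) weight.
def _example_cutlass_scaled_mm() -> torch.Tensor:
    m, k, n = 16, 256, 64
    a = torch.randint(-8, 8, (m, k), device="cuda", dtype=torch.int8)
    b = torch.randint(-8, 8, (n, k), device="cuda", dtype=torch.int8).t()
    scale_a = torch.rand(m, 1, device="cuda", dtype=torch.float32)  # per token
    scale_b = torch.rand(1, n, device="cuda", dtype=torch.float32)  # per channel
    return cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)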
r   r	  r  r  r  r  target_shapeZcutlass_compatible_br  r   r	   r	   r   r   s  s"   $
r   azp_adjazpc           
   
   C   s   |j d d dkr|j d d dksJ |tju s |tju s J |du s4| |j d kr2|j|ks4J g | j dd |j d R }| d| j d } |du s[| | j d ks[J tj| j d |j d f|| jd}	tj	j
|	| |||||| |	j| S )z
    :param azp_adj: In the per-tensor case, this should include the azp.
    Always per-channel.
    :param azp: Only set in the per-token case. Per-token if set.
    r   r  rI   Nr   rK   )r   r(   r  r   r   rL   r  r\   rM   r)   r*   cutlass_scaled_mm_azp)
r   r	  r  r  r  r"  r#  r  r!  r   r	   r	   r   r$    s"   (
r$  c                 C   r  r   )r(   r)   r*   "cutlass_sparse_scaled_mm_supportedr  r	   r	   r   r%    r  r%  c                 C   r  r   )r(   r)   r*   cutlass_group_gemm_supportedr  r	   r	   r   r&    r  r&  c                 C   sV   | j tjtjtjtjfv sJ |  sJ d}| jd d|  dks$J tjj	
| S )a  
    Compresses a sparse matrix for use with Cutlass sparse operations.

    This function takes a dense tensor and compresses it into two components:
    non-zero elements and metadata. The compressed representation is compatible
    with Cutlass sparse kernels.

    Args:
        a (torch.Tensor):
            The input tensor to be compressed. Must have one of the following data types:
            - `torch.int8`
            - `torch.float8_e4m3fn`
            - `torch.bfloat16`
            - `torch.float16`

    Returns:
        tuple[torch.Tensor, torch.Tensor]:
            A tuple containing:
            - `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`.
            - `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation.

    Raises:
        ValueError: If the compression operation fails.

    Notes:
        - The `a_meta` tensor has a data type of `torch.uint8`.
        - Each metadata element encodes the sparsity of 4 non-zero elements (i.e., `elemsPerMetaElem = 4`).
        - The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor.
        - The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`.
       rI   rJ   r   )rL   r(   int8float8_e4m3fnr  r   r}   r   r)   r*   cutlass_sparse_compress)r   ZelemsPerMetaElemr	   r	   r   r*    s    
r*  bt_nzsbt_metac           
   	   C   s   |j d d dkr|j d d dksJ |tju s |tju s J |du s5|j d |j d kr3|j|ks5J | j d }|j d }tj||f|| jd}	tjj	|	| ||||| |	S )aG  
    Performs a scaled sparse matrix multiplication using Cutlass.

    Steps:
    1. Create a dense matrix `a` of shape (m, k) on the CUDA device:
    `a = torch.randn((m, k), device='cuda')`.

    2. Create a dense matrix `b` of shape (k, n) on the CUDA device:
    `b = torch.randn((k, n), device='cuda')`.

    3. Prune matrix `b` to 2:4 sparsity along the specified dimension:
    `b = prune_to_2_4(b, dim=0)`.

    4. Compress the transposed sparse matrix `b.t()`:
    `bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`.

    5. Perform sparse matrix multiplication using the compressed matrix,
    applying scaling factors for `a` and `b`, and the output data type:
    `out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`.

    Returns:
    - The result of the scaled sparse matrix multiplication.
    r   r  rI   NrK   )
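# Illustrative sketch of the steps listed in the docstring above (not part of
# the original module). prune_to_2_4 is a hypothetical helper, as in the
# docstring, that zeroes two of every four consecutive elements so b satisfies
# 2:4 sparsity; it is not defined in this module.
def _example_cutlass_scaled_sparse_mm() -> torch.Tensor:
    m, k, n = 16, 256, 64
    a = torch.randn(m, k, device="cuda", dtype=torch.float16)
    b = torch.randn(k, n, device="cuda", dtype=torch.float16)
    b = prune_to_2_4(b, dim=0)  # hypothetical 2:4 pruning helper
    bt_nzs, bt_meta = cutlass_sparse_compress(b.t().contiguous())
    scale_a = torch.ones(1, device="cuda", dtype=torch.float32)
    scale_b = torch.ones(1, device="cuda", dtype=torch.float32)
    return cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b,
                                    torch.float16)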
r   r(   r  r   rL   r\   rM   r)   r*   cutlass_scaled_sparse_mm)
r   r+  r,  r  r  r  r  r   r   r   r	   r	   r   r-    s   (

r-  problem_sizes1problem_sizes2input_permutationoutput_permutationnum_expertskblockscale_offsetsc
           
      C   s    t jj| |||||||||	
S )a1  
    Prepare data necessary to perform CUTLASS grouped matrix multiplications
    used in CUTLASS-based fused MoE.

    The function takes in topk_ids (token-expert mapping) and uses it to
    compute:
    - expert_offsets: Indices that mark at which token index each expert begins
                      its computation after the input is sorted with
                      input_permutation. The number of tokens computed with
                      expert E is expert_offsets[E + 1] - expert_offsets[E]
    - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's
                                      multiplication in two grouped MMs used in
                                      the fused MoE operation.
    - input_permutation: Permutation that must be used to shuffle the input
                         before executing the MMs.
    - output_permutation: Permutation that must be used to shuffle the output
                          after executing the MMs.
    - blockscale_offsets: Optional argument passed for fp4 moe. Indices that
                          mark at which block scale index each expert begins
                          its computation. The number of block scale rows
                          computed with expert E is blockscale_offsets[E + 1] -
                          blockscale_offsets[E]
    )r(   r)   r*   get_cutlass_moe_mm_data)
r  r  r.  r/  r0  r1  r2  r   r3  r4  r	   r	   r   r5  &  s   !r5  input_tensordst2src_mapc                 C   s>   |j d }tj|| j d f| j| jd}tjj| || |S )z
    Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor.
    This is used in MoE to permute the input tensor before performing grouped matrix multiplications.
    r   rI   r   )r   r(   r\   rM   rL   r)   _moe_Cshuffle_rows)r6  r7  Znum_tokens_permutedZoutput_tensorr	   r	   r   r9  O  s   
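# Illustrative sketch (not part of the original module): expanding and
# permuting activation rows before a grouped MoE GEMM. The index dtype and
# shapes are assumptions.
def _example_shuffle_rows() -> torch.Tensor:
    activations = torch.randn(4, 128, device="cuda", dtype=torch.float16)
    dst2src_map = torch.tensor([0, 0, 1, 3, 2], device="cuda", dtype=torch.int32)
    return shuffle_rows(activations, dst2src_map)  # -> (5, 128)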
r9  expert_num_tokensnum_local_expertspadded_mc              
   C   s   t jj| |||||||S )a~  
    Prepare data necessary to perform CUTLASS grouped matrix multiplications
    used in CUTLASS-based fused MoE.

    The function takes in expert_num_tokens (token count per expert) and
    non_zero_expert_idxs (consecutive indices of experts with non-zero token 
    counts) and uses them to compute:
    - expert_offsets: Indices that mark at which token index each expert begins
                      its computation.
    - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's
                                      multiplication in two grouped MMs used in
                                      the fused MoE operation.
    )r(   r)   r*   get_cutlass_pplx_moe_mm_data)r  r.  r/  r:  r;  r<  r   r3  r	   r	   r   r=  \  s   r=  out_tensors	a_tensors	b_tensorsa_scales	a_strides	b_strides	c_stridesper_act_token
per_out_chc                 C   s$   t jj| |||||||||	|
|S )aY  
    A single grouped matrix multiplication used in CUTLASS-based fused MoE.
    The function executes fp8-quantized OUT = AB matrix multiplication.

    - expert_offsets: Indices that mark at which token index each expert begins
                      its computation. The number of tokens computed with
                      expert E is expert_offsets[E + 1] - expert_offsets[E]
    - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped
                     MMs used in the fused MoE operation.
    - a/b/c_strides: The data strides passed to grouped matrix multiplication.
    )r(   r)   r*   cutlass_moe_mm)r>  r?  r@  rA  r   r  r  rB  rC  rD  rE  rF  r	   r	   r   rG  t  s
   rG  alphas
sf_offsetsc	           	      C      t jj| ||||||||	S )av  
    An FP4 Blockscaled Group Gemm that takes in  a_tensors, b_tensors and runs
    the gemms for each combination based on the specified problem sizes.

    This is used as the MoE gemm during NVFP4 Quantized FusedMoE forward.
    - a/b_tensors: the NVFP4 a_ptrs and b_ptrs tensors which are quantized
                     input and expert weights.
    - a_/b_scales: The blockscales in FP8-E4M3 precision
    - expert_offsets/sf_offsets: Indices that mark at which token index
                    each expert begins its computation. The number of tokens
                    computed with expert E is expert_offsets[E + 1] -
                    expert_offsets[E] And the sf_size per expert is
                    sf_offset[E+1] - sf_offset[E]
    - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped
                     MMs used in the fused MoE operation.
    )r(   r)   r*   Zcutlass_fp4_group_mm)	r>  r?  r@  rA  r   rH  r  r  rI  r	   r	   r   cutlass_fp4_moe_mm  s
   rK  num_bitsc                 C      t jj| ||||S r   )r(   r)   r*   gptq_marlin_repack)r   r   r   r   rL  r	   r	   r   rN       rN  c                 C      t jj| |||S r   )r(   r)   r*   awq_marlin_repack)r   r   r   rL  r	   r	   r   rQ       rQ  c                 C   st   | j d }|d dksJ tj||d ||d  f| j| jd}t|D ]}tjj| | || |||||< q$|S Nr   r  rJ   r   )	r   r(   r\   rM   rL   ranger)   r*   rN  r   r   r   r   rL  r2  r8   er	   r	   r   gptq_marlin_moe_repack  s   

rW  c                 C   sn   | j d }|d dksJ tj||d ||d  f| j| jd}t|D ]}tjj| | |||||< q$|S rS  )	r   r(   r\   rM   rL   rT  r)   r*   rQ  rU  r	   r	   r   awq_marlin_moe_repack  s   

rX  c                 C   s2   t jj| |||||||||	|
j|||||||S r   )r(   r)   r*   gptq_marlin_gemmr   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   r	   r   rY    s   rY  group_zeros_typechannel_scales_typetoken_scales_typec              	   C   s   t jj| |j|||||S r   )r(   r)   r*   machete_supported_schedulesr   )r   r   r   rZ  r[  r\  r   r	   r	   r   r]    s   
r]  c
           
      C   s"   t jj| ||j|||||||	
S r   )r(   r)   r*   
machete_mmr   )
r   r   r   r   r   r   r   r   r   r   r	   r	   r   r^    s   r^  c                 C   s   t jj| ||j|S r   )r(   r)   r*   machete_prepack_Br   r   r	   r	   r   r_  	  rO  r_  permute_colsz_C::permute_colsc                 C   
   t | S r   r(   r   r   r   r	   r	   r   _permute_cols_fake  s   
rd  c                 C      t jj| |S r   )r(   r)   r*   r`  rc  r	   r	   r   r`       input_global_scalec                 C   s   t  rJ | jdksJ d| j d| jdkrdnd}| || jd } | j\}}d}| j}|| dks>J d| d| jtjtj	fv sPJ d| j dtj
||d	 f|tjd
}dd }||d}	|| }
||
d}tj
|	|d f|tjd
}tjj|| || |tj}||fS )a'  
    Quantize input tensor to FP4 and return quantized tensor and scale.

    This function quantizes the last dimension of the given tensor `input`. For
    every 16 consecutive elements, a single dynamically computed scaling factor
    is shared. This scaling factor is quantized using the `input_global_scale`
    and is stored in a swizzled layout (see
    https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x).

    Args:
        input: The input tensor to be quantized to FP4
        input_global_scale: A scalar scaling factor for the entire tensor.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4, with every
            two values packed into one uint8, and the float8_e4m3 scaling factors
            in the swizzled layout.
    rI   z%input.ndim needs to be >= 1, but got .r   r  r   z+last dim has to be multiple of 16, but got z-input.dtype needs to be fp16 or bf16 but got rJ   r   c                 S   s   | | d | | S )NrI   r	   )xyr	   r	   r   r   H  s    z"scaled_fp4_quant.<locals>.<lambda>   r'  )r   r  r  Zreshaper   rM   rL   r(   r   r  r\   uint8int32r)   r*   scaled_fp4_quantr  r)  )rh   rg  Z
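# Illustrative sketch (not part of the original module). The shapes and the
# global scale value (an arbitrary placeholder here) are assumptions; the
# output packs two FP4 values per uint8 and the scales come back in the
# swizzled float8_e4m3 layout described above.
def _example_scaled_fp4_quant() -> tuple[torch.Tensor, torch.Tensor]:
    x = torch.randn(32, 128, device="cuda", dtype=torch.bfloat16)
    input_global_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    x_fp4, x_scales = scaled_fp4_quant(x, input_global_scale)
    return x_fp4, x_scales  # x_fp4: (32, 64) uint8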
other_dimsr   r   r   rM   r8   Zround_upZ	rounded_mZscale_nZ	rounded_nZoutput_scaler	   r	   r   rn    s:   



rn  topkc                 C   s   t  rJ | jdksJ d| j dtj}| j\}}||| ks-J d| d| d|d }|d d	 }	tj||d | jtj	d
}
tj|| |	tj
| jd}tjj|
|| ||| |tj}|
|fS )a  
    Quantize input tensor to FP4 and return quantized tensor and scale, for
    packed MoE Inputs.
    Args:
        input_tensor: The input tensor to be quantized to FP4
        input_global_scale: A scalar scaling factor for the entire tensor.
        expert_offsets: The expert offsets tensor
        blockscale_offsets: The blockscale offsets tensor
    Outputs:
        output: The quantized tensor in FP4
        output_scales: The blockscale tensor in FP8-E4M3
    rJ   z%input.ndim needs to be == 2, but got rh  z2m_numtopk must be less than MAX_TOKENS_PER_EXPERT(z,) for cutlass_moe_fp4, observed m_numtopk = z;. Use VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE to set this value.r     r'  r   rK   )r   r  r  r   Z"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOEr   r(   r\   rM   rl  rm  r)   r*   scaled_fp4_experts_quantr  r)  )r6  rg  r  r4  ro  ZMAX_TOKENS_PER_EXPERTZ	m_numtopkr3  Zscales_kZpadded_kr8   Zoutput_scalesr	   r	   r   rq  V  s>   

rq  num_token_paddinguse_per_token_if_dynamicc                 C   s  | j dksJ | j}t }|rt|| jd |d f}|du r*tj|| j|d}n|du s2J d|j|ks9J |du rr|r[tj|d df| jtj	d}tj
j|| || ||fS tjd| jtj	d}tj
j|| | ||fS | dks~J |j tj
j|| | ||fS )a  
    Quantize input tensor to FP8 and return quantized tensor and scale.

    This function supports both static and dynamic quantization: If you
    provide the scale, it will use static scaling and if you omit it,
    the scale will be determined dynamically. The function also allows
    optional padding of the output tensors for downstream kernels that
    will benefit from padding.

    Args:
        input: The input tensor to be quantized to FP8
        scale: Optional scaling factor for the FP8 quantization
        scale_ub: Optional upper bound for scaling factor in dynamic
            per token case
        num_token_padding: If specified, pad the first dimension
            of the output to at least this value.
        use_per_token_if_dynamic: Whether to do per_tensor or per_token
            in the dynamic quantization case.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
            scaling factor.
    rJ   r   rI   Nr   z)padding not supported if output passed in)r  r   r   Z	fp8_dtypemaxr(   r\   rM   rL   r   r)   r*   Z"dynamic_per_token_scaled_fp8_quantZdynamic_scaled_fp8_quantr   Zstatic_scaled_fp8_quant)rh   r   rr  r   rs  r8   r   r  r	   r	   r   scaled_fp8_quant  s6    
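# Illustrative sketch (not part of the original module): dynamic per-token FP8
# quantization of a 2-D activation. Shapes are assumptions.
def _example_scaled_fp8_quant() -> tuple[torch.Tensor, torch.Tensor]:
    x = torch.randn(16, 4096, device="cuda", dtype=torch.float16)
    x_fp8, x_scale = scaled_fp8_quant(x, use_per_token_if_dynamic=True)
    return x_fp8, x_scale  # x_scale has shape (16, 1) in the per-token case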
	ru  
zero_pointc           
      C   s   | j d }| j d }|d d d d }tj||f| j| jd}tjd|f|j|jd}d}	|rD|dus8J dtjd|f|j|jd}	tjj| ||||||	|||
 |||	fS )a  
    Rearrange qweight, scale, and zero_point (if asymmetric) to n32k16 format
    for the Ampere W8A16 fused GEMM kernel.

    Args:
        qweight: uint8 weight tensor, original k x n format.
        scale: fp16/bf16 weight scale tensor, 1 x n format.
        zero_point: fp16/bf16 weight zero_point tensor, 1 x n format.
            Must be provided for asymmetric quantization.
        has_zp: False for symmetric quantization,
            True for asymmetric quantization.

    Returns:
        tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] :
            rearranged weight, scale, and optionally zero_point.
    r   rI       r   Nz8zero_point must be provided for asymmetric quantization.)r   r(   r\   rM   rL   r)   r*   Z#rearrange_kn_weight_as_n32k16_order)
r   r   rv  r   KNZ	N_32alignZqweight_reorderZscale_reorderZzero_point_reorderr	   r	   r   allspark_repack_weight  s2   






rz  c                 C   s"   t jj| |||||||||	|
S r   )r(   r)   r*   r   )r   r   r   r   r   r   r   r   r   r   r   r	   r	   r   r     s
   	symmetricc                 C   s   t j| t jd}|dur%||du ksJ dt jj|| || |||fS t j|  | jd  df| j	t j
d}|r<dnt j|t jd}t jj||  || |||fS )a  
    Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.

    Args:
        input: The input tensor to be quantized to int8.
        scale: Optional scaling factor for the int8 quantization.
            When not provided, we invoke dynamic-per-token quantization.
        azp: Optional zero-point for the int8 quantization.
            Must be provided for asymmetric quantization if `scale` is provided.
        symmetric: Whether to use symmetric quantization (scale only, azp ignored).

    Returns:
      tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp.
    r   Nz6azp must only be provided for asymmetric quantization.r   rI   r   )r(   r   r(  r)   r*   Zstatic_scaled_int8_quantr\   r   r   rM   r   rm  Zdynamic_scaled_int8_quantrk   )rh   r   r#  r{  r8   Zinput_scalesZ	input_azpr	   r	   r   scaled_int8_quant	  s,   
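# Illustrative sketch (not part of the original module): dynamic, symmetric
# per-token int8 quantization. Shapes are assumptions; the third return value
# (azp) is None in the symmetric case.
def _example_scaled_int8_quant() -> tuple[torch.Tensor, torch.Tensor]:
    x = torch.randn(16, 4096, device="cuda", dtype=torch.float16)
    x_int8, x_scales, _ = scaled_int8_quant(x, symmetric=True)
    return x_int8, x_scales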

r|  c	           	      C   rJ  r   )r(   r)   r*   marlin_qqq_gemmr   r	   r	   r   r}  2  s   r}  c                 C   rM  r   )r(   r)   r*   r   r   r	   r	   r   r   ;  r   c                 C   rP  r   )r(   r)   r*   ggml_mul_mat_vec_a8r   r	   r	   r   r~  @     r~  c                 C   rP  r   )r(   r)   r*   ggml_mul_mat_a8r   r	   r	   r   r  I  r  r  c	           	      C   rJ  r   )r(   r)   r*   ggml_moe_a8r   r	   r	   r   r  R  s   r  c              	   C   r   r   )r(   r)   r*   r   r  r	   r	   r   r   b  s   	c                 C   r  r   )r(   r)   r*   ggml_moe_get_block_size)r   r	   r	   r   r  o  r  r  udeltaABCD_z_delta_bias_delta_softpluscache_indiceshas_initial_state
ssm_statespad_slot_idc                 C   s,   t jj| |||||||||	|
||| d S r   )r(   r)   r*   selective_scan_fwd)r  r  r  r  r  r  r  r  r  r0   r  r  r  r  r	   r	   r   r  t  s
   	r  rows_per_blockc                 C      t jj| ||S r   )r(   r)   r2   LLMM1)r   r	  r  r	   r	   r   r    s   r  cu_countc                 C   r  r   )r(   r)   r2   wvSplitK)r   r	  r  r	   r	   r   r       r  c                 C   s>   t j|jd | jd f||jd}t jj| ||||| |S r   )r(   r\   r   rM   r)   r2   	wvSplitKQ)r   r	  r  r  r  r  r   r	   r	   r   r    s   r  c                 C      t jj| | d S r   )r(   r)   r8  moe_sum)rh   r8   r	   r	   r   r    s   r  experts_idsnum_tokens_post_padc                 C   r5   r   )r(   r)   r8  moe_align_block_size)r  r2  r   r   r  r  r	   r	   r   r    s   r  topk_weightsBLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_Kc                 C   s<   t  stdtjj| |||||||||	|
||| d S )NzGThe optimized moe_wna16_gemm kernel is only available on CUDA platforms)r   r|   NotImplementedErrorr(   r)   r8  moe_wna16_gemm)rh   r8   r   r   r   r  r   r  r  r   r  r  r  r   r	   r	   r   r    s   r  token_expert_indicesgating_outputc                 C   rn   r   )r(   r)   r8  topk_softmax)r  r  r  r  r	   r	   r   r    r{   r  num_tokens_past_paddedmoe_block_sizemul_topk_weightsis_epc                 C   sB   t jj| |||||||||	|
||||||||j|||||||S r   )r(   r)   r8  moe_wna16_marlin_gemmr   )rh   r8   r   r   r   r   r   r   r   r   r   r   r  r  r  r   r  r  r   r   r   r   r   r   r   r   r	   r	   r   r    s   
r  Zmarlin_gemm_moez_moe_C::marlin_gemm_moeb_q_weights
sorted_idsb_zero_pointsreplicate_inputapply_weightsc                 C   s   t j|||f| j| jdS r   r(   r\   rL   rM   )r   r  r  r  r  r   r  r   r   r   r   r   r   r   r   r2  ro  r  r  r  r	   r	   r   marlin_gemm_moe_fake  s   r  z_moe_C::moe_wna16_marlin_gemmc                 C   s   t j|| |f| j| jdS r   r  )rh   r8   r   r   r   r   r   r   r   r   r  r  r  r   r  r  r   r   r   r   r   r   r   r   r	   r	   r   moe_wna16_marlin_gemm_fake  s   r  valueslot_mappingc              
   C   rf   r   )r(   r)   _C_cache_opsreshape_and_cacher_   r  r   r   r  r   r   r    r	   r	   r   r    s   
r  c              
   C   rf   r   )r(   r)   r  reshape_and_cache_flashr  r	   r	   r   r    s
   
r  kv_ck_pec                 C   r5   r   )r(   r)   r  concat_and_cache_mla)r  r  r4   r  r   r   r	   r	   r   r  "  s   r  
key_cachesvalue_cachesblock_mappingc                 C   r   r   )r(   r)   r  copy_blocks)r  r  r  r	   r	   r   r  /  s   r  	kv_cachesc                 C   r  r   )r(   r)   r  copy_blocks_mla)r  r  r	   r	   r   r  5  rR  r  srcdstc                 C   r   r   )r(   r)   r  swap_blocks)r  r  r  r	   r	   r   r  :  r   r  ru   fp8kv_dtypec                 C   rn   r   )r(   r)   r  convert_fp8)r8   rh   r   r  r	   r	   r   r  ?  s   r  	src_cacheblock_tablecu_seq_lensrQ   
seq_startsc                 C   r5   r   )r(   r)   r  gather_cache)r  r  r  r  rQ   r  r	   r	   r   r  F  r?   r  	attributerM   c                 C   re  r   )r(   r)   _C_cuda_utilsget_device_attribute)r  rM   r	   r	   r   r  P  rf  r  c                 C   r  r   )r(   r)   r  0get_max_shared_memory_per_block_device_attribute)rM   r	   r	   r   r  T  s   r  ipc_tensors	rank_datarankfully_connectedc                 C   rP  r   )r(   r)   _C_custom_arinit_custom_ar)r  r  r  r  r	   r	   r   r  [  s   r  fainp
reg_bufferreg_buffer_sz_bytesc                 C      t jj| |||| d S r   )r(   r)   r  
all_reduce)r  r  r   r  r  r	   r	   r   r  a  s   r  c                 C      t jj|  d S r   )r(   r)   r  disposer  r	   r	   r   r  g  r  r  c                   C      t jj S r   )r(   r)   r  	meta_sizer	   r	   r	   r   r  k  r   r  c                 C   re  r   )r(   r)   r  register_buffer)r  r  r	   r	   r   r  o  rf  r  c                 C   r  r   )r(   r)   r  get_graph_buffer_ipc_metar  r	   r	   r   r  s  r  r  handlesoffsetsc                 C   r   r   )r(   r)   r  register_graph_buffers)r  r  r  r	   r	   r   r  w  r   r  rN   c                 C   r  r   )r(   r)   r  !allocate_shared_buffer_and_handle)rN   r	   r	   r   r  |  r  r  
mem_handlec                 C   r  r   )r(   r)   r  open_mem_handle)r  r	   r	   r   r    r  r  ptrc                 C   r  r   )r(   r)   r  free_shared_buffer)r  r	   r	   r   r    r  r  
world_sizeqr_max_sizec                 C   r  r   )r(   r)   r  init_custom_qr)r  r  r  r	   r	   r   r    s   r  c                 C   r  r   )r(   r)   r  
qr_destroyr  r	   r	   r   r    r  r  quant_levelcast_bf2halfc                 C   r  r   )r(   r)   r  qr_all_reduce)r  r  r   r  r  r	   r	   r   r    s   r  c                 C   r  r   )r(   r)   r  qr_get_handler  r	   r	   r   r    r  r  c                 C   re  r   )r(   r)   r  qr_open_handles)r  r  r	   r	   r   r    rf  r  c                   C   r  r   )r(   r)   r  r  r	   r	   r	   r   r    r   cache_seqlensnum_heads_per_head_knum_heads_kc                 C   r  )ac  
    Arguments:
        cache_seqlens: (batch_size), dtype torch.int32.
        num_heads_per_head_k: Equal to seq_len_q * num_heads_q // num_heads_k.
        num_heads_k: num_heads_k.

    Return:
        tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize), dtype torch.int32.
        num_splits: (batch_size + 1), dtype torch.int32.
    )r(   r)   r*   get_flash_mla_metadata)r  r  r  r	   r	   r   r    s   
r  qk_cache
head_dim_vtile_scheduler_metadata
num_splitssoftmax_scalec	                 C   sB   |du r| j d d }tjj| |d|||||||
\}	}
|	|
fS )a5  
    Arguments:
        q: (batch_size, seq_len_q, num_heads_q, head_dim).
        k_cache: (num_blocks, page_block_size, num_heads_k, head_dim).
        block_table: (batch_size, max_num_blocks_per_seq), torch.int32.
        cache_seqlens: (batch_size), torch.int32.
        head_dim_v: Head_dim of v.
        tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize), torch.int32, returned by get_mla_metadata.
        num_splits: (batch_size + 1), torch.int32, returned by get_mla_metadata.
        softmax_scale: float. The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim).
        causal: bool. Whether to apply causal attention mask.

    Return:
        out: (batch_size, seq_len_q, num_heads_q, head_dim_v).
        softmax_lse: (batch_size, num_heads_q, seq_len_q), torch.float32.
    Nr   g      )r   r(   r)   r*   Zflash_mla_fwd_kvcache)r  r  r  r  r  r  r  r  rG   r   Zsoftmax_lser	   r	   r   flash_mla_with_kvcache  s   r  q_nopeq_pekv_c_and_k_pe_cache
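# Illustrative end-to-end sketch (not part of the original module). All shapes
# below are assumptions chosen only to show how the two calls fit together:
# get_flash_mla_metadata is computed once per batch, and its outputs are passed
# to flash_mla_with_kvcache.
def _example_flash_mla_decode() -> torch.Tensor:
    batch, seq_len_q, num_heads_q, num_heads_k = 2, 1, 16, 1
    head_dim, head_dim_v, page_size, num_blocks = 576, 512, 64, 128
    q = torch.randn(batch, seq_len_q, num_heads_q, head_dim,
                    device="cuda", dtype=torch.bfloat16)
    k_cache = torch.randn(num_blocks, page_size, num_heads_k, head_dim,
                          device="cuda", dtype=torch.bfloat16)
    block_table = torch.zeros(batch, 4, device="cuda", dtype=torch.int32)
    cache_seqlens = torch.tensor([17, 33], device="cuda", dtype=torch.int32)
    metadata, num_splits = get_flash_mla_metadata(
        cache_seqlens, seq_len_q * num_heads_q // num_heads_k, num_heads_k)
    out, _lse = flash_mla_with_kvcache(q, k_cache, block_table, cache_seqlens,
                                       head_dim_v, metadata, num_splits,
                                       causal=True)
    return out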
page_tablec              	   C   s   t jj| |||||| | S r   )r(   r)   r*   cutlass_mla_decode)r   r   r  r  r   r  r   r	   r	   r   r    s   r  num_kv_splitsc	           	      C   s"   t jj| ||||||||	 | S r   )r(   r)   r*   sm100_cutlass_mla_decode)	r   r   r  r  r   r  r   r   r  r	   r	   r   r    s   r  num_batchesc                 C   rP  r   )r(   r)   r*   $sm100_cutlass_mla_get_workspace_size)r   r  r   r  r	   r	   r   r    s   r  Zweight_packed_linearz_C::weight_packed_linearmat1mat2is_vnnic                 C   s$   t j| d|df| j|jdS r   r   )r	  r
  r  r  r	   r	   r   weight_packed_linear_fake  s   r  Zfused_experts_cpuz_C::fused_experts_cpuhidden_statesw1w2inplaceuse_int8_w8a8use_fp8_w8a16w1_scalew2_scalea1_scalea2_scalec                 C   ra  r   rb  )r  r  r  r  r  r  r  r  r  r  r   r  r  r  r	   r	   r   fused_experts_cpu_fake  s   
r  Zint8_scaled_mm_with_quantz_C::int8_scaled_mm_with_quantscales2c                 C   s&   |  d}| d}tj||f|dS )Nr   r   )rN   r(   r\   )r	  r
  r  r  r  r  Mry  r	   r	   r   int8_scaled_mm_with_quant_fake(  s   
	
r  )r   r   r   r   r   r   )T)NN)TFFF)NNNNNNN)NNNN)NNNFN)NF)NNT)ru   r  )F)
contextlibtypingr   r   r   r(   Z	vllm.envsr   Zvllm.loggerr   Zvllm.platformsr   Zvllm.scalar_typer   __name__loggerZis_tpuZis_xpuZvllm._CZvllmImportErrorrV  warningZsupports_moe_opssuppressZvllm._moe_Cr   Ztorch.libraryr   ZTensorintfloatstrr+   r/   r3   r6   r>   booltuplerP   r]   rc   rg   rl   ro   rx   rz   r~   rL   r   r   r   r   hasattrr)   r*   r   r   r   r   ZSymIntr   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r   r$  r%  r&  r*  r-  r5  r9  r=  rG  rK  rN  rQ  rW  rX  rY  listr]  r^  r_  rd  r`  rn  rq  ru  rz  r   r|  r}  r   r~  r  r  r   r  r  r  r  r  r  r  r  r  r  r8  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	   r	   r	   r   <module>   s  
	

/	

3	



	
9	

2











	


		
	
	
		



837	)
	
	
 99B/)				
&		
		

	
		
"(
"	$	
,		

