o
    )i                  #   @   s  d dl mZ d dlmZ d dlZd dlm  mZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ ddlmZ e  Z!edrzd dl"m#Z$ e%e$drue$ndZ$W n e&y   dZ$Y nw dZ$ee'Z(ej)j*j+j,Z-ej)j.j/j,Z0ej)j.j1j,Z2ej)j.j3j,Z4ej)j.j5j,Z6G dd dZ7G dd de7Z8G dd de7Z9G dd de7Z:G dd de7Z;G dd de7Z<G dd de7Z=G dd  d eZ>e$durda?d!Z@d"e@ e@e@d# e@d# d$ZAe@d# ZB				dMd%ejCd&ejCd'ejCd(eDd)eEd*eEd+eFd,eFd-eFd.eEd/eEd0eFd1eejC d2eejC d3eejC d4eejC d5df"d6d7ZG				dMd%ejCd&ejCd'ejCd(eDd)eEd*eEd+eFd,eFd-eFd.eEd/eEd0eFd1eejC d2eejC d3eejC d4eejC d5df"d8d9ZHed:eGg d;eHejId< ej)j*jJj,ZJG d=d> d>ZKG d?d@ d@e7ZLG dAdB dBe7ZMG dCdD dDe7ZNG dEdF dFe7ZOG dGdH dHe7ZPG dIdJ dJe7ZQG dKdL dLeZRdS )N    )	find_spec)OptionalN)auto_functionalized)PatternMatcherPass)enable_symm_mem_for_group)
VllmConfig)get_tp_group tensor_model_parallel_all_reduce)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)init_logger)current_platform)direct_register_custom_op   )VllmInductorPassZ
flashinfertrtllm_allreduce_fusionc                   @   s    e Zd ZdejdefddZdS )BasePatterndtypedevicec                 C   s    || _ || _t | _t | _d S N)r   r   r   tpr   tp_size)selfr   r    r   n/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/compilation/collective_fusion.py__init__.   s   zBasePattern.__init__N)__name__
__module____qualname__torchr   strr   r   r   r   r   r   ,   s    r   c                   @   "   e Zd Zdd ZdefddZdS )GEMMReduceScatterPatternc                 C   s8   t jddg| j| jd}t jddg| j| jd}||gS )N      r   r   r   emptyr   r   )r   mul	mm_weightr   r   r   
get_inputs7   s   z#GEMMReduceScatterPattern.get_inputspm_passc                    sP   dt jdt jf fdd}dt jdt jf fdd}t||  tj| d S )Nr(   r)   c                    s4   t jjj| |}t jjjj|d j jj	d}|S Nr   dim
world_size
group_name)
r   opsatenmmdefaultvllmreduce_scatterr   r   unique_name)r(   r)   r3   r6   r   r   r   pattern>   s   
z2GEMMReduceScatterPattern.register.<locals>.patternc                    s"   t jjj| |dd jjjd}|S )Navgr   )scatter_dimr0   )r   r1   symm_memZfused_matmul_reduce_scatterr   device_groupr0   )r(   r)   gemm_rsr8   r   r   replacementH   s   z6GEMMReduceScatterPattern.register.<locals>.replacementr   Tensorpmregister_replacementr*   fwd_onlyr   r+   r9   r?   r   r8   r   register<   s
   
z!GEMMReduceScatterPattern.registerNr   r   r   r*   r   rF   r   r   r   r   r"   5   s    r"   c                   @   r!   )AllGatherGEMMPatternc                 C   s8   t jddg| j| jd}t jddg| j| jd}||gS Nr$   r%   r&   )r   xweightr   r   r   r*   Y   s   zAllGatherGEMMPattern.get_inputsr+   c                    sp   dt jdt jdtt jt jf f fdd}dt jdt jdtt jt jf f fdd}t||  tj| d S )NrJ   rK   returnc                    s0   t jjjj| d j jjd}t jjj	||S r,   )
r   r1   r5   
all_gatherr4   r   r   r7   r2   r3   )rJ   rK   rM   r8   r   r   r9   a   s   
z.AllGatherGEMMPattern.register.<locals>.patternc                    s&   t jjj| |gd jjjd\}}|S )Nr   )
gather_dimr0   )r   r1   r<   Zfused_all_gather_matmulr   r=   r0   )rJ   rK   	ag_output
mm_outputsr8   r   r   r?   n   s   
z2AllGatherGEMMPattern.register.<locals>.replacement)r   rA   tuplerB   rC   r*   rD   rE   r   r8   r   rF   _   s"   zAllGatherGEMMPattern.registerNrG   r   r   r   r   rH   W   s    rH   c                   @   r!   )ScaledMMReduceScatterPatternc                 C   st   t jddg| jtd}t jddg| jtd dd}t jddg| jt jd}t jddg| jt jd}||||gS Nr#   r%   r   r   )r   r'   r   	FP8_DTYPE
contiguous	transposefloat32)r   inputr)   scale_ascale_br   r   r   r*      s   z'ScaledMMReduceScatterPattern.get_inputsr+   c              
      t   dt jdt jdt jdt jdt jf
 fdd}dt jdt jdt jdt jdt jf
 fdd	}t||  tj| d S )
NrX   mat2rY   rZ   rL   c              	      sB   t jjjj| |||d d  jd}t jjjj|d j j	j
d}|S )Nr\   rY   rZ   biasZscale_result	out_dtyper   r-   )r   r1   r2   
_scaled_mmr4   r   r5   r6   r   r   r7   )rX   r\   rY   rZ   Z	scaled_mmr6   r8   r   r   r9      s   
z6ScaledMMReduceScatterPattern.register.<locals>.patternc              
      s*   t jjj| |||dd j jjjd}|S Nr:   r   )r;   r_   r0   r   r1   r<   Z"fused_scaled_matmul_reduce_scatterr   r   r=   r0   )rX   r\   rY   rZ   r>   r8   r   r   r?         z:ScaledMMReduceScatterPattern.register.<locals>.replacementr@   rE   r   r8   r   rF      s"   z%ScaledMMReduceScatterPattern.registerNrG   r   r   r   r   rR   }   s    rR   c                   @   r!   )AllGatherScaledMMPatternc                 C   s   t jddg| jtd}t jddg| jtd dd}|jd | j }t j|dg| jt jd}t jddg| jt jd}||||gS N   r#   r%   r   r   )	r   r'   r   rT   rU   rV   shaper   rW   )r   rJ   rK   s1rY   rZ   r   r   r   r*      s   z#AllGatherScaledMMPattern.get_inputsr+   c              
      r[   )
NrJ   rK   rY   rZ   rL   c              	      s>   t jjjj| d j jjd}t jjj	j||||d d  j
dS )Nr   r-   r]   )r   r1   r5   rM   r4   r   r   r7   r2   r`   r   )rJ   rK   rY   rZ   rM   r8   r   r   r9      s   
z2AllGatherScaledMMPattern.register.<locals>.patternc                    s>   t jjj| |g||gdd gd g jgdg jjjd
\}}|S Nr   F)rN   ZbiasesZresult_scalesZ
out_dtypesZuse_fast_accumr0   r   r1   r<   Zfused_all_gather_scaled_matmulr   r   r=   r0   )rJ   rK   rY   rZ   rO   rP   r8   r   r   r?         
z6AllGatherScaledMMPattern.register.<locals>.replacementr@   rE   r   r8   r   rF      s*   z!AllGatherScaledMMPattern.registerNrG   r   r   r   r   rd          rd   c                   @   r!   )#CutlassScaledMMReduceScatterPatternc                 C   s   t jddg| jtd}t jddg| jtd dd}t jddg| jt jd}t jddg| jt jd}t jddg| j| jd}|||||gS rS   )r   r'   r   rT   rU   rV   rW   r   )r   rX   r)   rY   rZ   cutlass_mm_outputr   r   r   r*      s   
z.CutlassScaledMMReduceScatterPattern.get_inputsr+   c                    s   dt jdt jdt jdt jdt jdt jf fdd}dt jd	t jdt jdt jdt jdt jf fd
d}t||  tj| d S )NrX   rK   rY   rZ   rn   rL   c              	      sJ   t jjjt jjjj|| |||d d}t jjjj|d d j	 j
jd}|S )NoutabZa_scalesZb_scalesr^   r   r   r-   )r   r1   higher_orderr   _Ccutlass_scaled_mmr4   r5   r6   r   r   r7   )rX   rK   rY   rZ   rn   ru   r6   r8   r   r   r9      s    

	z=CutlassScaledMMReduceScatterPattern.register.<locals>.patternr\   c              
      s*   t jjj| |||dd j jjjd}|S ra   rb   )rX   r\   rY   rZ   rn   r>   r8   r   r   r?   
  rc   zACutlassScaledMMReduceScatterPattern.register.<locals>.replacementr@   rE   r   r8   r   rF      s*   z,CutlassScaledMMReduceScatterPattern.registerNrG   r   r   r   r   rm      rl   rm   c                   @   r!   )AllGatherCutlassScaledMMPatternc                 C   s   t jddg| jtd}t jddg| jtd dd}|jd | j }t j|dg| jt jd}t jddg| jt jd}|jd }t j||g| j| j	d}|||||gS re   )
r   r'   r   rT   rU   rV   rg   r   rW   r   )r   rJ   rK   rh   rY   rZ   s2outputr   r   r   r*      s   
z*AllGatherCutlassScaledMMPattern.get_inputsr+   c                    s   dt jdt jdt jdt jdt jdt jf fdd}dt jdt jdt jdt jdt jdt jf fd	d
}t||  tj| d S )NrJ   rK   rY   rZ   rx   rL   c              	      sJ   t jjjj| d j jjd}t jjj	t jj
jj|||||d d}|d S )Nr   r-   ro   r   )r   r1   r5   rM   r4   r   r   r7   rs   r   rt   ru   )rJ   rK   rY   rZ   rx   rM   ru   r8   r   r   r9   1  s    

z9AllGatherCutlassScaledMMPattern.register.<locals>.patternc                    s>   t jjj| |g||gdd gd g jgdg jjjd
\}}|S ri   rj   )rJ   rK   rY   rZ   rx   rO   rP   r8   r   r   r?   H  rk   z=AllGatherCutlassScaledMMPattern.register.<locals>.replacementr@   rE   r   r8   r   rF   /  s2   z(AllGatherCutlassScaledMMPattern.registerNrG   r   r   r   r   rv     s    rv   c                       sH   e Zd Zdef fddZdee defddZde	j
fd	d
Z  ZS )AsyncTPPassconfigc                    s   t  | tt jj tdd| _t| j	| j
| j t| j	| j
| j | j	tjkr]t| j	| j
| j t| j	| j
| j t| j	| j
| j t| j	| j
| j d S d S )NZasync_tp_passZ	pass_name)superr   r   r   r=   r0   r   patternsr"   model_dtyper   rF   rH   r   Zbfloat16rR   rd   rm   rv   )r   rz   	__class__r   r   r   _  s>   




zAsyncTPPass.__init__rg   rL   c                 C   s   t  }|d uo|| dkS )Nr   )r   )r   rg   r   r   r   r   is_applicable_for_shapez  s   z#AsyncTPPass.is_applicable_for_shapegraphc                 C   sD   |    | |d | j|}td| | |d |   d S )NZbefore_async_tp_passz(Replaced %s patterns with async TP pass.Zafter_async_tp_pass)begin
dump_graphr}   applyloggerdebugend_and_logr   r   countr   r   r   __call__  s   zAsyncTPPass.__call__)r   r   r   r   r   r   intboolr   fxGraphr   __classcell__r   r   r   r   ry   ]  s    ry   i   @      )r   r$      rf   allreduce_inresidual	rms_gammarms_eps
world_rankr/   launch_with_pdltrigger_completion_at_endfp32_accmax_token_numpattern_codefuse_rms_quantnorm_out	quant_out	scale_outscale_factorrL   c                 C   s  | j \}}|  }|| | }|	| | }|tt|t|k}|rtd us*J d|d u r3| }|}n| }tjdi d| d| j d d|d|d|d|d	|d
|d|d| j d dtd|ddd|d|d|
dd d|d|dtj	j
d| d S t| }|d ur|d u r|r|d u rtjj|||||| nCtjj||||| n7|d u rtjj|||| |}n
tjj|||| |d ur|d urtjj|||| n	tjj||| |d u s|d ur| | d S d S )Nz0Flashinfer must be enabled when using flashinferr   Z	token_numr   Zresidual_inresidual_outr   r   r   r   r/   
hidden_dimZworkspace_ptrsr   use_oneshotTr   r   r   allreduce_outr   r   Zlayout_coder   r   )rg   element_sizemin_FI_MAX_SIZESget_DEFAULT_FI_MAX_SIZE_FI_WORKSPACE_TENSORflashinfer_commr   ZFP4QuantizationSFLayoutZSWIZZLEDr	   r   r1   rt   Z#fused_add_rms_norm_static_fp8_quantZrms_norm_static_fp8_quantfused_add_rms_normrms_normscaled_fp4_quantstatic_scaled_fp8_quantZcopy_)r   r   r   r   r   r/   r   r   r   r   r   r   r   r   r   r   Z
num_tokensZhidden_sizer   Zcurrent_tensor_sizeZmax_fusion_sizeZuse_flashinferr   r   r   r   r    call_trtllm_fused_allreduce_norm  s   




	

r   c                 C   s   d S r   r   )r   r   r   r   r   r/   r   r   r   r   r   r   r   r   r   r   r   r   r   %call_trtllm_fused_allreduce_norm_fake  s   r   &flashinfer_trtllm_fused_allreduce_norm)r   r   r   r   r   )Zop_nameZop_funcZmutates_argsZ	fake_impldispatch_keyc                   @   s>   e Zd ZdZ			ddededededef
d	d
Zdd ZdS )FlashInferFusedAllReduceParamsz5Parameters for FlashInfer fused allreduce operations.F   rankr/   use_fp32_lamportr   r   c                 C   s:   || _ || _|| _d| _d| _d| _d| _|| _|| _d S )NTF)	r   r/   r   r   r   r   r   r   r   )r   r   r/   r   r   r   r   r   r   r      s   
z'FlashInferFusedAllReduceParams.__init__c                 C   s"   | j | j| j| j| j| j| jdS )N)r   r/   r   r   r   r   r   )r   r/   r   r   r   r   r   r8   r   r   r   !get_trtllm_fused_allreduce_kwargs2  s   z@FlashInferFusedAllReduceParams.get_trtllm_fused_allreduce_kwargsN)Fr   F)r   r   r   __doc__r   r   r   r   r   r   r   r   r     s"    
r   c                       J   e Zd ZdZdedejdedef fddZ	dd	 Z
d
efddZ  ZS )AllReduceRMSNormPatternz
    This pattern replaces the allreduce + rms norm (without residual) 
    with fused flashinfer implementation.
    Applies to allreduce + rmsnorm before attn in the first Transformer block.
    epsilonr   r   allreduce_paramsc                       t  || || _|| _d S r   r|   r   r   r   r   r   r   r   r   r   r   r   r   E     
z AllReduceRMSNormPattern.__init__c                 C   sP   t jg d| j| jd}t jg d| j| jd}t jdg| j| jd}|||gS )Nr   rf   r$   r%   r$   r&   )r   rX   
rms_resultrK   r   r   r   r*   P  s   

z"AllReduceRMSNormPattern.get_inputsr+   c                    \   dt jdt jdt jf fdd}dt jdt jdt jf fdd}t||  tj| d S )NrX   r   rK   c                    s(   t | }tt||| jd}|d |fS )NresultrX   rK   r   r   )r	   r   RMS_OPr   )rX   r   rK   allreduce_outputrmsr8   r   r   r9   [  s   z1AllReduceRMSNormPattern.register.<locals>.patternc                    sH   t | }ttf| ||d d | jtjjd j	 }|d |d fS )Nr   r   r   r   r   r   r   r      r   )
r   
zeros_liker   r   r   r   AllReduceFusionPatternkARResidualRMSNormr   r   )rX   r   rK   r   	allreducer8   r   r   r?   h  s"   
z5AllReduceRMSNormPattern.register.<locals>.replacementr@   rE   r   r8   r   rF   Y  s   z AllReduceRMSNormPattern.registerr   r   r   r   floatr   r   r    r   r   r*   r   rF   r   r   r   r   r   r   >  s    	r   c                       r   )AllReduceFusedAddRMSNormPatternz
    This pattern replaces the allreduce + rms norm (with residual) 
    with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn and mlp + rmsnorm before attn.
    r   r   r   r   c                    r   r   r   r   r   r   r   r     r   z(AllReduceFusedAddRMSNormPattern.__init__c                 C   sR   t jddg| j| jd}t jddg| j| jd}t jddg| j| jd}|||gS rI   r&   )r   rX   r   rK   r   r   r   r*     s   z*AllReduceFusedAddRMSNormPattern.get_inputsr+   c                    r   )Nr   rX   rK   c                    s,   t |}tt|| | jd}|d |d fS )NrX   r   rK   r   r   r   )r	   r   
RMS_ADD_OPr   )r   rX   rK   r   r   r8   r   r   r9     s   z9AllReduceFusedAddRMSNormPattern.register.<locals>.patternc                    s>   t tf|| d d d | jtjjd j }|d |d fS )Nr   r   r   )r   r   r   r   r   r   r   r   )r   rX   rK   r   r8   r   r   r?     s    z=AllReduceFusedAddRMSNormPattern.register.<locals>.replacementr@   rE   r   r8   r   rF     s   z(AllReduceFusedAddRMSNormPattern.registerr   r   r   r   r   r     s    
r   c                       B   e Zd ZdZdedejdedef fddZ	de
fd	d
Z  ZS )*AllReduceFusedRMSNormStaticQuantFP8Patternz
    This pattern replaces the allreduce + rms norm (without residual) 
    + static fp8 quant with fused flashinfer implementation.
    Applies to allreduce + rmsnorm + quant before attn 
    in the first Transformer block.
    r   r   r   r   c                    &   t  || || _|| _tj| _d S r   r|   r   r   r   r   Zfloat8_e4m3fnquant_dtyper   r   r   r   r        z3AllReduceFusedRMSNormStaticQuantFP8Pattern.__init__r+   c              
      s~    fdd}dt jdt jdt jdt jdt jf
 fdd	}dt jd
t jdt jdt jdt jf
 fdd}t||| tj| d S )Nc                     s   t jg d j jd} t jg d j jd}t jg d j jd}t jdg j jd}t jd jt jd}| ||||gS )Nr   r%   r$   g      ?)r   Zzerosr   r   r'   r   ZtensorrW   )rX   rmsnorm_resultquant_resultrK   scaler8   r   r   r*     s   


zGAllReduceFusedRMSNormStaticQuantFP8Pattern.register.<locals>.get_inputsrX   r   r   rK   r   c                    s<   t | }tt||| jd}tt||d |d}|d |fS )Nr   r   r   rX   r   )r	   r   r   r   STATIC_FP8_QUANT_OP)rX   r   r   rK   r   
all_reducermsnorm_out_tuplequant_out_tupler8   r   r   r9     s   zDAllReduceFusedRMSNormStaticQuantFP8Pattern.register.<locals>.pattern
result_rmsc                    sJ   t | }ttf| |||d | jtjj|d	 j	 }|d |d fS )N	r   r   r   r   r   r   r   r   r   r$   r   )
r   r   r   r   r   r   r   kARResidualRMSNormFP8Quantr   r   )rX   r   r   rK   r   r   r   r8   r   r   r?     s$   
zHAllReduceFusedRMSNormStaticQuantFP8Pattern.register.<locals>.replacementr   rA   rB   rC   rD   r   r+   r*   r9   r?   r   r8   r   rF     s,   z3AllReduceFusedRMSNormStaticQuantFP8Pattern.registerr   r   r   r   r   r   r   r    r   r   r   rF   r   r   r   r   r   r         r   c                       r   )-AllReduceFusedAddRMSNormStaticQuantFP8Patternz
    This pattern replaces the allreduce + rms norm (with residual)
    + static fp8 quant with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn + quant and 
    mlp + rmsnorm + quant before attn.
    r   r   r   r   c                    r   r   r   r   r   r   r   r     r   z6AllReduceFusedAddRMSNormStaticQuantFP8Pattern.__init__r+   c              
      s~    fdd}dt jdt jdt jdt jdt jf
 fdd	}dt jdt jdt jdt jdt jf
 fd
d}t||| tj| d S )Nc                     s   t jddg j jd} t jddg j jd}t jddg j jd}t jddg j jd}t jddg jt jd}||| ||gS )Nr$   r%   r   )r   r'   r   r   r   rW   )rX   r   rK   r   r   r8   r   r   r*      s(   


zJAllReduceFusedAddRMSNormStaticQuantFP8Pattern.register.<locals>.get_inputsr   r   rX   rK   r   c                    s@   t |}tt||| jd}tt| |d |d}|d |d fS )Nr   r   r   r   )r	   r   r   r   r   )r   r   rX   rK   r   r   fused_add_rmsnorm_out_tupler   r8   r   r   r9   6  s    zGAllReduceFusedAddRMSNormStaticQuantFP8Pattern.register.<locals>.patternc                    s@   t tf||d | d | jtjj|d	 j }|d |d fS )Nr   r$   r   )r   r   r   r   r   r   r   r   )r   r   rX   rK   r   r   r8   r   r   r?   O  s"   zKAllReduceFusedAddRMSNormStaticQuantFP8Pattern.register.<locals>.replacementr   r   r   r8   r   rF     s,   z6AllReduceFusedAddRMSNormStaticQuantFP8Pattern.registerr   r   r   r   r   r     r   r   c                       r   ),AllReduceFusedRMSNormStaticQuantNVFP4Patternz
    This pattern replaces the allreduce + rms norm (without residual) 
    + static nvfp4 quant with fused flashinfer implementation.
    Applies to allreduce + rmsnorm + quant before attn 
    in the first Transformer block.
    r   r   r   r   c                    r   r   r   r   r   r   r   r   o     
z5AllReduceFusedRMSNormStaticQuantNVFP4Pattern.__init__r+   c                    s    fdd}dt jdt jdt jdt jdt jdt jf fd	d
}dt jdt jdt jdt jdt jdt jf fdd}t||| tj| d S )Nc                     s   t jg d j jd} t jg d j jd}t jd jt jd}t jddg jt jd}t jdg j jd}t jddg jt jd}| |||||gS )N)r   r#   r#   r%   r#   rf   r   r#      r$   r   r'   r   r   Zuint8rW   Zint32)rX   r   r   input_global_scalerK   output_scaler8   r   r   r*   w  s0   



zIAllReduceFusedRMSNormStaticQuantNVFP4Pattern.register.<locals>.get_inputsrX   r   r   rK   r   r   c           	         sD   t | }tt||| jd}tt||d ||d}|d ||d fS )Nr   r   rx   rX   r   Zinput_scaler   )r	   r   r   r   STATIC_FP4_QUANT_OP)	rX   r   r   rK   r   r   r   r   r   r8   r   r   r9     s   zFAllReduceFusedRMSNormStaticQuantNVFP4Pattern.register.<locals>.patternr   c                    sP   t | }ttf| ||||| jtjj|d	 j	 }|d |d |d fS )Nr   r$   r      )
r   r   r   r   r   r   r   kARResidualRMSNormFP4Quantr   r   )rX   r   r   rK   r   r   r   r   r8   r   r   r?     s$   
zJAllReduceFusedRMSNormStaticQuantNVFP4Pattern.register.<locals>.replacementr   r   r   r8   r   rF   u  s4   z5AllReduceFusedRMSNormStaticQuantNVFP4Pattern.registerr   r   r   r   r   r   g      r   c                       r   )/AllReduceFusedAddRMSNormStaticQuantNVFP4Patternz
    This pattern replaces the allreduce + rms norm (with residual)
    + static nvfp4 quant with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn + quant and 
    mlp + rmsnorm + quant before attn.
    r   r   r   r   c                    r   r   r   r   r   r   r   r     r   z8AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.__init__r+   c                    s    fdd}dt jdt jdt jdt jdt jdt jf fd	d
}dt jdt jdt jdt jdt jdt jf fdd}t||| tj| d S )Nc                     s   t jddg j jd} t jddg j jd}t jddg j jd}t jd jt jd}t jddg jt jd}t jddg jt jd}||| |||gS )Nr#   r%   r   r   r   r$   r   )rX   r   rK   r   r   r   r8   r   r   r*     s8   



zLAllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.register.<locals>.get_inputsr   r   rX   r   rK   r   c           	         sH   t |}tt||| jd}tt| |d ||d}|d |d |d fS )Nr   r   r   r   )r	   r   r   r   r   )	r   r   rX   r   rK   r   r   r   r   r8   r   r   r9     s*   zIAllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.register.<locals>.patternc                    sF   t tf||d | || jtjj|d	 j }|d |d |d fS )Nr   r$   r   r   )r   r   r   r   r   r   r   r   )r   r   rX   r   rK   r   r   r8   r   r   r?     s"   zMAllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.register.<locals>.replacementr   r   r   r8   r   rF     s,   z8AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.registerr   r   r   r   r   r     r   r   c                       s:   e Zd Zdef fddZdejfddZdd Z  Z	S )	AllReduceFusionPassrz   c                    s  t  | d| _t | _| jdkrd S tdd| _|jd u r!d S |j | _	t
 j| _t }| jtjk}td u r@td d S | jtvrNtd| j d S tt| jt| j	| j |r^dnd  |jjj}tj|| j|| j	| j|d	\| _}|at|| j|||jjjd
| _ dD ]b}t!|| j| j"| j #| j t$|| j| j"| j #| j t%&drt'|| j| j"| j #| j t(|| j| j"| j #| j t)|| j| j"| j #| j t*|| j| j"| j #| j tj+j,j-.  qd| _d S )NTr   Zall_reduce_fusion_passr{   zTFlashinfer is not installed or comm module not found, skipping allreduce fusion passz>Flashinfer allreduce fusion is not supported for world size %sr$   r   )Ztp_rankr   r   r   groupr   )r   r/   r   r   r   )gh㈵>gư>d   F)/r|   r   disabledr   r   r   r}   Zmodel_configZget_hidden_sizer   r   r=   r   r
   r~   r   rW   r   r   warningr   r   r   r   Zcompilation_configZpass_configZ!fi_allreduce_fusion_max_token_numZ1trtllm_create_ipc_workspace_for_all_reduce_fusionipc_handlesr   r   Zenable_fusionr   r   r   rF   r   r   Zhas_device_capabilityr   r   r   r   	_inductorpattern_matcherZ_seen_patternsclear)r   rz   r   r   Zmax_num_tokenZworkspace_tensorr   r   r   r   r   !  s   



	







zAllReduceFusionPass.__init__r   c                 C   sN   | j rd S |   | |d | j|}td| | |d |   d S )NZbefore_all_reduce_fusion_passzReplaced %s patternsZafter_all_reduce_fusion_pass)r   r   r   r}   r   r   r   r   r   r   r   r   r     s   zAllReduceFusionPass.__call__c                 C   s*   | j rd S td urt| j| j d S d S r   )r   r   Z+trtllm_destroy_ipc_workspace_for_all_reducer  r   r8   r   r   r   __del__  s   zAllReduceFusionPass.__del__)
r   r   r   r   r   r   r   r   r  r   r   r   r   r   r     s    b
r   )NNNN)Simportlib.utilr   typingr   r   Ztorch._inductor.pattern_matcherr  r  rB   Ztorch.fxr   Z*torch._higher_order_ops.auto_functionalizer   r   Z#torch.distributed._symmetric_memoryr   Zvllm.configr   Zvllm.distributedr   r	   Zvllm.distributed.parallel_stater
   r   Zvllm.loggerr   Zvllm.platformsr   Z
vllm.utilsr   Zvllm_inductor_passr   Z	fp8_dtyperT   Zflashinfer.commZcommr   hasattrImportErrorr   r   r1   r5   r   r4   ZALLREDUCE_OPrt   r   r   r   r   r   r   r   r   r   r"   rH   rR   rd   rm   rv   ry   r   MiBr   r   rA   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s$  	"&197?
+	

n	


!AAOX\\