o
    )i[I                     @   s,  d dl mZ d dlZd dlm  mZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZ d dlmZ d	d
lmZ eeZG dd dZG dd deZG dd deZG dd deZG dd deZe ZG dd deZ G dd deZ!G dd deZ"G dd deZ#dS )    )OptionalN)PatternMatcherPass)
VllmConfig)get_tp_group tensor_model_parallel_all_reduce$get_tensor_model_parallel_world_size)init_logger)current_platform   )VllmInductorPassc                
   @   sX   e Zd ZdZ	ddedejdedeej	j
 fddZd	d
 Zdd Zdd Zdd ZdS )_RMSNormAndQuantOpHelperzEBase helper for RMSNorm and RMSNorm + Quantization functionalization.Nepsilondtypedevicequant_opc                 K   s   || _ || _|| _|| _d S N)r   r   r   r   selfr   r   r   r   kwargs r   q/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/compilation/sequence_parallelism.py__init__   s   
z!_RMSNormAndQuantOpHelper.__init__c                 C   "   t jjjt jjjj|||| jdS )N)resultinputweightr   )torchopshigher_orderauto_functionalized_CZrms_normdefaultr   )r   Zresult_bufferinput_tensorweight_tensorr   r   r   _functional_rmsnorm$   s   
z,_RMSNormAndQuantOpHelper._functional_rmsnormc                 C   r   )N)r   residualr   r   )r   r   r   r    r!   Zfused_add_rms_normr"   r   )r   r#   residual_tensorr$   r   r   r   _functional_fused_add_rmsnorm,   s   
z6_RMSNormAndQuantOpHelper._functional_fused_add_rmsnormc                 C   s@   | j d u r	td| |||}tjjj| j ||d |d}|S )N=_RMSNormAndQuantOpHelper was not initialized with a quant_op.r   r   r   scale)r   RuntimeErrorr%   r   r   r   r    )r   Zrmsnorm_result_bufferquant_result_bufferr#   r$   scale_tensorZrmsnorm_out_tuplequant_out_tupler   r   r   _functional_rmsnorm_then_quant5   s   
z7_RMSNormAndQuantOpHelper._functional_rmsnorm_then_quantc                 C   sH   | j d u r	td| |||}tjjj| j ||d |d}||d fS )Nr)   r   r*      )r   r,   r(   r   r   r   r    )r   r-   r#   r'   r$   r.   Zfused_add_rmsnorm_out_tupler/   r   r   r   (_functional_fused_add_rmsnorm_then_quantF   s   
zA_RMSNormAndQuantOpHelper._functional_fused_add_rmsnorm_then_quantr   )__name__
__module____qualname____doc__floatr   r   strr   _ops
OpOverloadr   r%   r(   r0   r2   r   r   r   r   r      s     

	r   c                
       s   e Zd ZdZ	ddedejdedeej	j
 f fddZd	ejd
ejfddZd	ejd
ejfddZd	ejd
ejfddZ  ZS )_SequenceParallelPatternHelperz)Helper for sequence parallelism patterns.Nr   r   r   r   c                    s0   t  j|||fd|i| t | _t | _d S )Nr   )superr   r   tp_groupr   tp_sizer   	__class__r   r   r   Z   s   z'_SequenceParallelPatternHelper.__init__xreturnc                 C   s   t |S r   )r   r   rA   r   r   r   _all_reduced   s   z*_SequenceParallelPatternHelper._all_reducec                 C      t jjjj|d| j| jjdS Nr   )dimZ
world_size
group_name)r   r   vllmreduce_scatterr"   r>   r=   unique_namerC   r   r   r   _reduce_scatterg      
z._SequenceParallelPatternHelper._reduce_scatterc                 C   rE   rF   )r   r   rI   
all_gatherr"   r>   r=   rK   rC   r   r   r   _all_gathern   rM   z*_SequenceParallelPatternHelper._all_gatherr   )r3   r4   r5   r6   r7   r   r   r8   r   r9   r:   r   TensorrD   rL   rO   __classcell__r   r   r?   r   r;   W   s    

r;   c                   @   "   e Zd Zdd ZdefddZdS )FirstAllReduceRMSNormPatternc                 C   sP   t jg d| j| jd}t jg d| j| jd}t jdg| j| jd}|||gS )Nr         r   r   rV   r   emptyr   r   )r   r   permutearg3_1r   r   r   
get_inputsx   s   
z'FirstAllReduceRMSNormPattern.get_inputspm_passc                    s\   dt jdt jdt jf fdd}dt jdt jdt jf fdd}t||  tj| d S )Nr   rZ   r[   c                    s$     | } |||}|d |fS Nr   )rD   r%   )r   rZ   r[   
all_reducermsnormr   r   r   pattern   s   
z6FirstAllReduceRMSNormPattern.register.<locals>.patternc                    s8     | }t|} |||} |d }||fS r^   )rL   r   
empty_liker%   rO   )r   rZ   r[   rJ   rmsnorm_resultr`   rN   ra   r   r   replacement   s   

z:FirstAllReduceRMSNormPattern.register.<locals>.replacementr   rP   pmregister_replacementr\   fwd_onlyr   r]   rb   re   r   ra   r   register   s"   
z%FirstAllReduceRMSNormPattern.registerNr3   r4   r5   r\   r   rk   r   r   r   r   rS   v   s    rS   c                   @   rR   )MiddleAllReduceRMSNormPatternc                 C   R   t jddg| j| jd}t jddg| j| jd}t jddg| j| jd}|||gS NrV   rW   rX   r   mm_1r&   rms_norm_weightsr   r   r   r\         
z(MiddleAllReduceRMSNormPattern.get_inputsr]   c              
      |   dt jdt jdt jdtt jt jf f fdd}dt jdt jdt jdtt jt jf f fdd}t||  tj| d S )	Nr&   rq   rr   rB   c                    s(     |} || |}|d |d fS Nr   r1   rD   r(   r&   rq   rr   r_   r`   ra   r   r   rb      s
   
z7MiddleAllReduceRMSNormPattern.register.<locals>.patternc                    s2     |} || |} |d }||d fS ru   rL   r(   rO   )r&   rq   rr   rJ   r`   rN   ra   r   r   re      s   
z;MiddleAllReduceRMSNormPattern.register.<locals>.replacementr   rP   tuplerg   rh   r\   ri   rj   r   ra   r   rk      *   
z&MiddleAllReduceRMSNormPattern.registerNrl   r   r   r   r   rm          rm   c                   @   rR   )LastAllReduceRMSNormPatternc                 C   rn   ro   rX   rp   r   r   r   r\      rs   z&LastAllReduceRMSNormPattern.get_inputsr]   c              
      rt   )	Nr&   rq   rr   rB   c                    s      |} || |}|d S r^   rv   rw   ra   r   r   rb      s
   
z5LastAllReduceRMSNormPattern.register.<locals>.patternc                    s*     |} || |} |d }|S r^   rx   )r&   rq   rr   rJ   r`   
normalizedra   r   r   re      s   
z9LastAllReduceRMSNormPattern.register.<locals>.replacementry   rj   r   ra   r   rk      r{   z$LastAllReduceRMSNormPattern.registerNrl   r   r   r   r   r}      r|   r}   c                       J   e Zd Zdedejdedejjf fddZ	dd Z
d	efd
dZ  ZS )%FirstAllReduceRMSNormStaticFP8Patternr   r   r   opc                       t  j||||d d S N)r   r<   r   r   r   r   r   r   r?   r   r   r         z.FirstAllReduceRMSNormStaticFP8Pattern.__init__c                 C   s~   t jg d| j| jd}t jg d| j| jd}t jg d| jtd}t jdg| j| jd}t jd| jt jd}|||||gS )NrT   rW   rV   g      ?)r   Zzerosr   r   rY   	FP8_DTYPEZtensorfloat32)r   r   rd   quant_resultr   r+   r   r   r   r\      s   

z0FirstAllReduceRMSNormStaticFP8Pattern.get_inputsr]   c              
      st   dt jdt jdt jdt jdt jf
 fdd}dt jdt jdt jdt jdt jf
 fdd	}t||  tj| d S )
Nr   rd   r   r   r+   c                    s(     | } |||||}|d |fS r^   )rD   r0   )r   rd   r   r   r+   r_   
static_fp8ra   r   r   rb     s
   

z?FirstAllReduceRMSNormStaticFP8Pattern.register.<locals>.patternc                    sR     | }tj||jd}tj||jd} |||||} |d }||fS N)r   r   )rL   r   rc   r   r0   rO   )r   rd   r   r   r+   rJ   r   rN   ra   r   r   re     s   

zCFirstAllReduceRMSNormStaticFP8Pattern.register.<locals>.replacementrf   rj   r   ra   r   rk   	  s2   z.FirstAllReduceRMSNormStaticFP8Pattern.registerr3   r4   r5   r7   r   r   r8   r9   r:   r   r\   r   rk   rQ   r   r   r?   r   r      s    r   c                       r   )&MiddleAllReduceRMSNormStaticFP8Patternr   r   r   r   c                    r   r   r   r   r?   r   r   r   1  r   z/MiddleAllReduceRMSNormStaticFP8Pattern.__init__c                 C      t jddg| j| jd}t jddg| j| jd}t jddg| j| jd}t jddg| jtd}t jddg| jt jd}|||||gS NrV   rW   r   r   rY   r   r   r   r   r   rq   r&   rr   r   r+   r   r   r   r\   5     
z1MiddleAllReduceRMSNormStaticFP8Pattern.get_inputsr]   c                       dt jdt jdt jdt jdt jdtt jt jf f fdd}dt jdt jdt jdt jdt jdtt jt jf f fd	d
}t||  tj| d S )Nr   r&   rq   rr   r+   rB   c                    s,     |} | ||||\}}|d |fS r^   rD   r2   )r   r&   rq   rr   r+   r_   r   rmsnorm_residual_outra   r   r   rb   I  s
   

z@MiddleAllReduceRMSNormStaticFP8Pattern.register.<locals>.patternc           
         sF     |}tj|| jd} |||||\}} |d }	|	|fS r   rL   r   rc   r   r2   rO   )
r   r&   rq   rr   r+   rJ   quant_result_bufr   r   rN   ra   r   r   re   U  s   
zDMiddleAllReduceRMSNormStaticFP8Pattern.register.<locals>.replacementry   rj   r   ra   r   rk   G  :   z/MiddleAllReduceRMSNormStaticFP8Pattern.registerr   r   r   r?   r   r   /      r   c                       r   )$LastAllReduceRMSNormStaticFP8Patternr   r   r   r   c                    r   r   r   r   r?   r   r   r   k  r   z-LastAllReduceRMSNormStaticFP8Pattern.__init__c                 C   r   r   r   r   r   r   r   r\   o  r   z/LastAllReduceRMSNormStaticFP8Pattern.get_inputsr]   c                    r   )Nr   r&   rq   rr   r+   rB   c                    s(     |} | ||||\}}|d S r^   r   )r   r&   rq   rr   r+   r_   r   _ra   r   r   rb     s
   

z>LastAllReduceRMSNormStaticFP8Pattern.register.<locals>.patternc           
         sB     |}tj|| jd} |||||\}} |d }	|	S r   r   )
r   r&   rq   rr   r+   rJ   r   r   r   r~   ra   r   r   re     s   
zBLastAllReduceRMSNormStaticFP8Pattern.register.<locals>.replacementry   rj   r   ra   r   rk     r   z-LastAllReduceRMSNormStaticFP8Pattern.registerr   r   r   r?   r   r   i  r   r   c                       sL   e Zd ZdZdef fddZdee defddZ	d	e
jfd
dZ  ZS )SequenceParallelismPassa  
    This pass enables sequence parallelism for models.
    It identifies patterns where an AllReduce operation is followed by
    an RMSNorm (or RMSNorm and then Quantization) operation.
    These patterns are replaced with a ReduceScatter operation, followed by
    a local RMSNorm/Quantization, and then an AllGather operation.

    The general transformation is:
    Input -> AllReduce -> RMSNorm -> Output
    becomes
    Input -> ReduceScatter -> RMSNorm -> AllGather -> Output

    While this pass itself does not directly yield performance improvements,
    it lays the groundwork for subsequent fusion passes, such as
    GEMM + ReduceScatter and AllGather + GEMM fusions. These fusions can
    significantly reduce communication overhead and improve overall model
    performance.
    configc                    s   t  | tdd| _dD ]Z}tjjjj}t	|| j
| j|| j t|| j
| j|| j t|| j
| j|| j t|| j
| j| j t|| j
| j| j t|| j
| j| j tjjj  qd S )NZsequence_parallelism_pass)Z	pass_name)gh㈵>gư>)r<   r   r   patternsr   r   r!   Zstatic_scaled_fp8_quantr"   r   Zmodel_dtyper   rk   r   r   rS   rm   r}   	_inductorpattern_matcherZ_seen_patternsclear)r   r   r   Zfp8_quant_opr?   r   r   r     sF   








z SequenceParallelismPass.__init__shaperB   c                 C   s   t  }|d uo|| dkS )Nr   r   )r   r   r>   r   r   r   is_applicable_for_shape  s   z/SequenceParallelismPass.is_applicable_for_shapegraphc                 C   sD   |    | |d | j|}td| | |d |   d S )NZ before_sequence_parallelism_passz.Replaced %s patterns with sequence parallelismZafter_sequence_parallelism_pass)beginZ
dump_graphr   applyloggerdebugZend_and_log)r   r   countr   r   r   __call__  s   z SequenceParallelismPass.__call__)r3   r4   r5   r6   r   r   r   intboolr   fxZGraphr   rQ   r   r   r?   r   r     s
    !r   )$typingr   r   Ztorch._inductor.pattern_matcherr   r   rg   Ztorch.fxr   r   Zvllm.configr   Zvllm.distributedr   r   Zvllm.distributed.parallel_stater   Zvllm.loggerr	   Zvllm.platformsr
   Zvllm_inductor_passr   r3   r   r   r;   rS   rm   r}   Z	fp8_dtyper   r   r   r   r   r   r   r   r   <module>   s,   A(++8::