o
    )ia                  	   @   sB  U d dl mZmZmZ d dlZd dlm  mZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ ddlmZ ddlmZ ddlmZ eeZe Z dd Z!dd Z"ej#j$j%j&Z'ej#j$j(j&Z)G dd deZ*e*e dej+dZ,e*e dej+dZ-e*e dej.dZ/e,ej#j$j0j&e-ej#j$j1j&e/ej#j$j2j&iZ3e4e*ef e5d< G dd deZ6e6e,dej#j$j7j&e6e,dej#j$j8j&e6e/dej#j$j9j&e6e/dej#j$j9j&iZ:e4e6ef e5d< G dd deZ;G dd dZ<G dd  d e<Z=G d!d" d"e<Z>G d#d$ d$e<Z?G d%d& d&e<Z@G d'd( d(eZAdS ))    )Callable
NamedTupleOptionalN)fx)auto_functionalized)PatternMatcherPass)
OpOverload)
VllmConfig)init_logger)
GroupShape)current_platform   )find_getitem_maybe)MultiOutputMatch)VllmInductorPassc                  O      t j| i |t jddS Ncuda)dtypedevice)torchemptyZbfloat16argskwargs r   c/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/compilation/fusion.py
empty_bf16      r   c                  O   r   r   )r   r   Zfloat32r   r   r   r   
empty_fp32   r   r   c                   @   s@   e Zd ZU dZejed< eed< eed< dZ	eed< dd Z
d	S )
QuantKeyae  
    Named tuple for identifying the type of quantization.
    dtype: quantized data type
    static: static quantization if True, dynamic if False
    group_shape: quantization group shape
    symmetric: symmetric if True, asymmetric if False

    TODO(luka) use QuantDescriptor once standardized:
    https://github.com/vllm-project/vllm/issues/8913

    r   staticgroup_shapeT	symmetricc              	   C   sl   | j tjkrdn| j tjkrdnt| j }d| jrdnd dtjj| j	  d| d| j
s1d d		S d d		S )
NZ
per_tensorZ	per_tokenz	QuantKey(r!   Zdynamic,a z
symmetric))r"   r   
PER_TENSOR	PER_TOKENstrr!   r   graphZdtype_abbrsr   r#   )selfr"   r   r   r   __str__7   s   zQuantKey.__str__N)__name__
__module____qualname____doc__r   r   __annotations__boolr   r#   r,   r   r   r   r   r    &   s   
 
r    TF	QUANT_OPSc                   @   s*   e Zd ZU dZeed< eed< dd ZdS )FusedRMSQuantKeyz
    Named tuple for identifying the type of RMSNorm + quant fusion.
    quant: type of quantization
    fused_add: does the op also perform the residual add
    quant	fused_addc                 C   s$   d| j  d| jrd dS d dS )NzFusedQuantKey(z, withr&   outz
 residual))r5   r6   r+   r   r   r   r,   Y   s
   
zFusedRMSQuantKey.__str__N)r-   r.   r/   r0   r    r1   r2   r,   r   r   r   r   r4   P   s
   
 r4   	FUSED_OPSc                       sD   e Zd Zdejf fddZdeeee	j
ef f fddZ  ZS )QuantMultiOutputMatchmatchc                    s8   t  | t|tsJ t|tsJ || _|| _d S N)super__init__
isinstancer   QUANT_OPFUSED_OP)r+   r;   Zquant_opZfused_op	__class__r   r   r>   l   s
   
zQuantMultiOutputMatch.__init__fused_return_mappingc                 K   s   |  | j|}| }| ||}dgt|d  }t||D ])\}}|| \}	}
t|	|
}|dur?|| |jd |jd< |	jd |
 ||< qt	||jd< dS )aa  
        This utility function inserts an auto-functionalized node for FUSED_OP.
        It also correctly sets its meta value and rebinds the users of the
        unfused nodes to use the fused node instead.

        :param fused_return_mapping: A dictionary, mapping from getitem indices
        of the fused node result to a tuple of the old node and a getitem index.
        :param kwargs: kwargs that get directly forwarded to the auto_fn node

        Example:
        If we want to replace this graph:
        _, x1, x2 = auto_fn(op1)
        _, y1, y2 = auto_fn(op2)

        with
        _, x1, y2, x2 = auto_fn(FUSED_OP)

        we would call:
        insert_fused_node({1: (op1_node, 1), 2: (op2_node, 2), 3: (op1_node, 2)}

        Note that the 0th element is None for auto-functionalized in-place ops.
        Hence, others appear 1-indexed.
        Nr   val)
Zinsert_auto_fnrA   keysZinsert_getitemsmaxzipr   Zreplace_all_uses_withmetatuple)r+   rD   r   Z
fused_nodeindicesZgetitem_nodesZmeta_validxZgetitem_nodeold_nodeZold_idxZold_getitemr   r   r   insert_fused_nodes   s   

z'QuantMultiOutputMatch.insert_fused_node)r-   r.   r/   pmMatchr>   dictintrJ   r   NoderN   __classcell__r   r   rB   r   r:   j   s
    r:   c                   @   s   e Zd ZdedefddZdS )RMSNormQuantPatternepsilonkeyc                 C   sZ   || _ |jj| _|jtv sJ d|j t|j | _|tv s&J d| t| | _d S )Nz unsupported quantization scheme z'unsupported fused rmsnorm+quant op for )rV   r5   r   quant_dtyper3   r@   r9   rA   )r+   rV   rW   r   r   r   r>      s   


zRMSNormQuantPattern.__init__N)r-   r.   r/   floatr4   r>   r   r   r   r   rU      s    rU   c                       s:   e Zd Z	d	dedejf fddZdefddZ  Z	S )
RMSNormStaticQuantPatternTrV   rX   c                    s,   t dt|dtj|dd}t || d S )NFTr   r!   r"   r#   r6   r5   r4   r    r   r'   r=   r>   )r+   rV   rX   r#   Z	fused_keyrB   r   r   r>         z"RMSNormStaticQuantPattern.__init__pm_passc              
      s   dt jdt jdt jdt jdt jf
 fdd}dt jdt jdt jdt jdt jf
 fdd	}t jd
dd jdtd
dtd
dtdd
tddg}t|||tj| d S )Nresult
result_rmsinputweightscalec                    s2   t t||| jd}t  j| |d |d}|d S )Nr`   rb   rc   rV   r   r`   rb   rd   r   RMS_OPrV   r@   r`   ra   rb   rc   rd   at1Zat2r8   r   r   pattern   s   z3RMSNormStaticQuantPattern.register.<locals>.patternc                    s    t  j| ||| jd}|d S )N)r`   rb   rc   rd   rV   r   r   rA   rV   r`   ra   rb   rc   rd   atr8   r   r   replacement   s   z7RMSNormStaticQuantPattern.register.<locals>.replacement      r   r   r   r   	r   ZTensorr   rX   r   r   rO   Zregister_replacementZfwd_only)r+   r_   rk   ro   inputsr   r8   r   register   s.   z"RMSNormStaticQuantPattern.registerT)
r-   r.   r/   rY   r   r   r>   r   ru   rT   r   r   rB   r   rZ      s    rZ   c                       sX   e Zd Z	ddedejf fddZdedee	ge
f fdd	ZG d
d deZ  ZS )!FusedAddRMSNormStaticQuantPatternTrV   rX   c                    s,   t dt|dtj|dd}t || d S )NTr[   r\   r]   )r+   rV   rX   r#   rW   rB   r   r   r>      r^   z*FusedAddRMSNormStaticQuantPattern.__init__r_   record_matchc              
         dt jdt jdt jdt jdt jf
fdd}dt jdt jdt jdt jdt jf
fdd	}t jd
ddjdtd
dtd
dtdd
tddg}tj|||tj| fddd d S )Nr`   rb   residualrc   rd   c                    s:   t t||| jd}t  j| |d |d}|d |d fS )Nrb   rz   rc   rV   r   rf      r   
RMS_ADD_OPrV   r@   r`   rb   rz   rc   rd   rn   rj   r8   r   r   rk     s   z;FusedAddRMSNormStaticQuantPattern.register.<locals>.patternc              	      s*   t  j| |||| jd}|d |d fS )N)r`   rb   rz   rc   rd   rV   r   r|   rl   r`   rb   rz   rc   rd   rn   r8   r   r   ro     s   	z?FusedAddRMSNormStaticQuantPattern.register.<locals>.replacementrp   rq   r   rr   r   c                         | jjS r<   rP   r@   rA   mrx   r+   r   r   <lambda>-      z<FusedAddRMSNormStaticQuantPattern.register.<locals>.<lambda>Zextra_checkrs   r+   r_   rx   rk   ro   rt   r   r   r   ru      s8   
z*FusedAddRMSNormStaticQuantPattern.registerc                   @      e Zd Zdd ZdS )z'FusedAddRMSNormStaticQuantPattern.Matchc                 C   s   |  t}|  | j}t|jdksJ t|jdksJ |  ( | jj }|df|dfd}| j	|fi |d|jd i W d    d S 1 sLw   Y  d S )Nr|   r   r   r|   rV   
find_auto_fnr~   r@   lenusersinserting_after_matchr;   r   copyrN   r+   Zrms_nodeZ
quant_noder   rD   r   r   r   process2  s   




"z/FusedAddRMSNormStaticQuantPattern.Match.processNr-   r.   r/   r   r   r   r   r   rP   0      rP   rv   )r-   r.   r/   rY   r   r   r>   r   r   r   r2   ru   r:   rP   rT   r   r   rB   r   rw      s    
2rw   c                       `   e Zd Zejdfdedejdef fddZde	de
egef fd	d
ZG dd deZ  ZS )RMSNormDynamicQuantPatternTrV   rX   r"   c                    s*   t dt|d||dd}t || d S )NFr[   r\   r4   r    r=   r>   r+   rV   rX   r"   r#   rW   rB   r   r   r>   O     z#RMSNormDynamicQuantPattern.__init__r_   rx   c              
      ry   )Nr`   ra   rb   rc   rd   c                    s<   t t||| jd}t  j| |d |d d}|d |d fS )Nre   r   r`   rb   rd   scale_ubr|   rg   ri   r8   r   r   rk   ^  s   z4RMSNormDynamicQuantPattern.register.<locals>.patternc              
      s,   t  j| ||| jd d d}|d |d fS )Nr`   rb   rc   rd   rV   r   rz   r   r|   rl   rm   r8   r   r   ro   o  s   
z8RMSNormDynamicQuantPattern.register.<locals>.replacementrp   rq   r   rr   r   c                    r   r<   r   r   r   r   r   r     r   z5RMSNormDynamicQuantPattern.register.<locals>.<lambda>r   rs   r   r   r   r   ru   [  8   
z#RMSNormDynamicQuantPattern.registerc                   @   r   )z RMSNormDynamicQuantPattern.Matchc                 C   s   |  t}|  | j}t|jdksJ t|jdksJ |  + | jj }|d= |df|dfd}| j	|f|jd d d d| W d    d S 1 sOw   Y  d S )Nr   r|   ra   r   rV   )rV   r   rz   )
r   rh   r@   r   r   r   r;   r   r   rN   r   r   r   r   r     s$   


"z(RMSNormDynamicQuantPattern.Match.processNr   r   r   r   r   rP     r   rP   r-   r.   r/   r   r(   rY   r   r   r>   r   r   r   r2   ru   r:   rP   rT   r   r   rB   r   r   M      
4r   c                       r   )"FusedAddRMSNormDynamicQuantPatternTrV   rX   r"   c                    s*   t dt|d||dd}t || d S )NTFr[   r\   r   r   rB   r   r   r>     r   z+FusedAddRMSNormDynamicQuantPattern.__init__r_   rx   c              
      ry   )Nr`   rb   rz   rc   rd   c                    sB   t t||| jd}t  j| |d |d d}|d |d |d fS )Nr{   r   r   r|   r}   r   r8   r   r   rk     s   z<FusedAddRMSNormDynamicQuantPattern.register.<locals>.patternc              
      s2   t  j| ||| jd |d}|d |d |d fS )Nr   r      r|   rl   r   r8   r   r   ro     s   
z@FusedAddRMSNormDynamicQuantPattern.register.<locals>.replacementrp   rq   r   rr   r   c                    r   r<   r   r   r   r   r   r     r   z=FusedAddRMSNormDynamicQuantPattern.register.<locals>.<lambda>r   rs   r   r   r   r   ru     r   z+FusedAddRMSNormDynamicQuantPattern.registerc                   @   r   )z(FusedAddRMSNormDynamicQuantPattern.Matchc                 C   s   |  t}|  | j}t|jdksJ t|jdksJ |  * | jj }|df|df|dfd}| j	|f|jd d d| W d    d S 1 sNw   Y  d S )Nr|   r   )r   r|   r   rV   )rV   r   r   r   r   r   r   r     s&   

"z0FusedAddRMSNormDynamicQuantPattern.Match.processNr   r   r   r   r   rP     r   rP   r   r   r   rB   r   r     r   r   c                       sx   e Zd ZU dZdZded< edefddZdef fdd	Z	d
e
defddZdejfddZdejfddZ  ZS )
FusionPassa  
    This pass fuses a pre-defined set of custom ops into fused ops.
    It uses the torch pattern matcher to find the patterns and replace them.
    It also manually processes multi-output matches, as those are broken in
    the torch pattern matcher.

    Because patterns can only be registered once, the pass is a singleton.
    This will be addressed in a future version of PyTorch:
    https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980
    NzOptional[FusionPass]	_instanceconfigc                 C   s,   | j du rt|| _ | j S |jj| j _| j S )z
        Get the singleton instance of the FusionPass.
        If the instance exists, the config is updated but
        initialization is not repeated.
        N)r   r   Zcompilation_configZpass_config)clsr   r   r   r   instance"  s
   

zFusionPass.instancec                    s   | j jd u s
J dt | g | _tdd| _dD ]3}t|t	| j t
|t	| j| j t|t	| j| j t|t	| j| j tjjj  qd S )Nz,FusionPass singleton instance already existsZfusion_pass)Z	pass_name)gh㈵>gư>)rC   r   r=   r>   matchesr   patternsrZ   	FP8_DTYPEru   rw   rx   r   r   r   	_inductorpattern_matcherZ_seen_patternsclear)r+   r   rV   rB   r   r   r>   /  s0   



zFusionPass.__init__r;   returnc                 C   s   | j | dS )NF)r   append)r+   r;   r   r   r   rx   P  s   zFusionPass.record_matchr*   c                    s<   | j D ]}|  q   t fdd| j D sJ dS )z
        Manually process multi-output matches and replace them with fused nodes.
        See MultiOutputMatch for more details.
        c                 3   s(    | ]}|j jD ]}| jvV  qqd S r<   )r;   nodes).0r;   noder*   r   r   	<genexpr>b  s    z-FusionPass.process_matches.<locals>.<genexpr>N)r   r   Zeliminate_dead_codeall)r+   r*   r;   r   r   r   process_matchesX  s   

 zFusionPass.process_matchesc                 C   sv   |    | |d | j|}td| | |d | | tdt| j | |d | j	  | 
  d S )NZbefore_fusionzReplaced %s patternsZafter_pattern_matchzPost-processed %s matchesZafter_fusion)beginZ
dump_graphr   applyloggerdebugr   r   r   r   Zend_and_log)r+   r*   countr   r   r   __call__e  s   

zFusionPass.__call__)r-   r.   r/   r0   r   r1   classmethodr	   r   r>   r   r2   rx   r   ZGraphr   r   rT   r   r   rB   r   r     s   
 !r   )Btypingr   r   r   r   Ztorch._inductor.pattern_matcherr   r   rO   r   Z*torch._higher_order_ops.auto_functionalizer   r   Z
torch._opsr   Zvllm.configr	   Zvllm.loggerr
   Z9vllm.model_executor.layers.quantization.utils.quant_utilsr   Zvllm.platformsr   Zfx_utilsr   Zmulti_output_matchr   Zvllm_inductor_passr   r-   r   Z	fp8_dtyper   r   r   opsZ_CZrms_normdefaultrh   Zfused_add_rms_normr~   r    r'   ZkFp8StaticTensorSymZkFp8DynamicTensorSymr(   ZkFp8DynamicTokenSymZstatic_scaled_fp8_quantZdynamic_scaled_fp8_quantZ"dynamic_per_token_scaled_fp8_quantr3   rQ   r1   r4   Zrms_norm_static_fp8_quantZ#fused_add_rms_norm_static_fp8_quantZ rms_norm_dynamic_per_token_quantr9   r:   rU   rZ   rw   r   r   r   r   r   r   r   <module>   s`   







>9]be