import torch
import torch._inductor.pattern_matcher as pm
from torch._higher_order_ops.auto_functionalize import auto_functionalized
from torch._inductor.pattern_matcher import PatternMatcherPass
from torch._subclasses.fake_tensor import (FakeTensorMode,
                                           unset_fake_temporarily)

from vllm.attention import Attention
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.platforms import current_platform

from .fusion import QUANT_OPS, GroupShape, QuantKey, empty_bf16, empty_fp32
from .vllm_inductor_pass import VllmInductorPass

logger = init_logger(__name__)

ATTN_OP = torch.ops.vllm.unified_attention_with_output.default
RESHAPE_OP = torch.ops.aten.reshape.default


class AttentionStaticQuantPattern:
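    """
    Matches the static quantization of one attention layer's output and
    rewrites the pair of ops into a single attention call with fused output
    quantization. One instance handles exactly one layer (identified by
    layer_name), since layer names cannot be wildcarded by the matcher.
    """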

    def __init__(self,
                 layer_name: str,
                 num_heads: int,
                 head_size: int,
                 quant_dtype: torch.dtype,
                 symmetric=True):
        self.layer_name = layer_name
        self.num_heads = num_heads
        self.head_size = head_size
        self.quant_dtype = quant_dtype
        self.quant_key = QuantKey(dtype=quant_dtype,
                                  static=True,
                                  group_shape=GroupShape.PER_TENSOR,
                                  symmetric=symmetric)
        assert self.quant_key in QUANT_OPS, \
            f"unsupported quantization scheme {self.quant_key}"
        self.QUANT_OP = QUANT_OPS[self.quant_key]

    def empty_quant(self, *args, **kwargs):
        kwargs = {'dtype': self.quant_dtype, 'device': "cuda", **kwargs}
        return torch.empty(*args, **kwargs)

    def register_if_supported(self, pm_pass: PatternMatcherPass,
                              layer: Attention):
        if layer.impl.fused_output_quant_supported(self.quant_dtype,
                                                   self.quant_key.static,
                                                   self.quant_key.group_shape):
            self._register(pm_pass)
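
    # NOTE: pattern() and replacement() below are traced into fx graphs by
    # the inductor pattern matcher. In-place custom ops appear as
    # auto_functionalized nodes in the functionalized post-grad graph, so the
    # pattern is written in that form; indexing with [1] extracts the mutated
    # output buffer from an auto_functionalized node.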
    def _register(self, pm_pass: PatternMatcherPass):

        def pattern(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    output_attn: torch.Tensor, output_quant: torch.Tensor,
                    scale: torch.Tensor):
            view_7 = RESHAPE_OP(output_attn,
                                [-1, self.num_heads, self.head_size])

            at1 = auto_functionalized(ATTN_OP,
                                      query=q,
                                      key=k,
                                      value=v,
                                      output=view_7,
                                      layer_name=self.layer_name,
                                      output_scale=None)
            attn_out_view = RESHAPE_OP(at1[1],
                                       [-1, self.num_heads * self.head_size])

            at2 = auto_functionalized(self.QUANT_OP,
                                      result=output_quant,
                                      input=attn_out_view,
                                      scale=scale)
            return at2[1]

        def replacement(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                        output_attn: torch.Tensor, output_quant: torch.Tensor,
                        scale: torch.Tensor):
            # Write attention output directly into the quantized buffer.
            view_7 = RESHAPE_OP(output_quant,
                                [-1, self.num_heads, self.head_size])

            at1 = auto_functionalized(ATTN_OP,
                                      query=q,
                                      key=k,
                                      value=v,
                                      output=view_7,
                                      layer_name=self.layer_name,
                                      output_scale=scale)

            # Reshape back to 2D.
            return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size])

        # Use a fresh fake mode so tracing does not run the unified_attention
        # custom op on real tensors.
        with unset_fake_temporarily(), FakeTensorMode():
            inputs = [
                empty_bf16(5, self.num_heads, self.head_size),  # q
                empty_bf16(5, self.num_heads, self.head_size),  # k
                empty_bf16(5, self.num_heads, self.head_size),  # v
                empty_bf16(5, self.num_heads * self.head_size),  # attn output
                self.empty_quant(5, self.num_heads *
                                 self.head_size),  # quant output
                empty_fp32(1, 1)  # scale
            ]

            def wrap_trace_fn(process_fx, trace_fn):

                def wrapped(*args, **kwargs):
                    return process_fx(trace_fn(*args, **kwargs))

                return wrapped

            def fx_view_to_reshape(gm: torch.fx.GraphModule):
                from torch._inductor.fx_passes.post_grad import view_to_reshape
                view_to_reshape(gm)
                return gm

            pm.register_replacement(
                pattern, replacement, inputs,
                wrap_trace_fn(fx_view_to_reshape, pm.fwd_only), pm_pass)


class AttnFusionPass(VllmInductorPass):
    """
    This pass fuses post-attention quantization onto attention if supported.

    It uses the pattern matcher and matches each layer manually, as strings
    cannot be wildcarded. This also lets us check support on attention layers
    upon registration instead of during pattern matching.
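
    Schematically, each registered pattern rewrites (a simplified sketch,
    not the exact traced fx graph):

        attn_out = unified_attention_with_output(q, k, v, output=out_buf)
        quant_out = quant_op(attn_out, scale)  # separate quant kernel

    into a single attention call that writes the quantized result directly:

        quant_out = unified_attention_with_output(q, k, v, output=quant_buf,
                                                  output_scale=scale)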

    Currently, only static fp8 quant is supported, but patterns could easily be
    added for other quant schemes and dtypes. The bigger hurdle for wider
    support is attention kernels, which need to support fusing output quant.
    """

    def __init__(self, config: VllmConfig):
        super().__init__(config)
        self.static_fwd_ctx = config.compilation_config.static_forward_context

        self.patterns = PatternMatcherPass(pass_name="attn_fusion_pass")

        for key, layer in self.static_fwd_ctx.items():
            pattern = AttentionStaticQuantPattern(key, layer.num_heads,
                                                  layer.head_size,
                                                  current_platform.fp8_dtype())
            pattern.register_if_supported(self.patterns, layer)
        if len(self.static_fwd_ctx) == 0:
            logger.warning(
                "Attention + quant fusion is enabled, but "
                "CompilationConfig.static_forward_context is empty. "
                "Cannot access attention layers so no fusion patterns "
                "were registered.")

    def __call__(self, graph: torch.fx.graph.Graph) -> None:
        self.begin()
        self.dump_graph(graph, "before_attn_fusion")

        count = self.patterns.apply(graph)
        logger.debug("Fused quantization onto %s attention nodes", count)

        self.dump_graph(graph, "after_attn_fusion")
        self.end_and_log()

    def uuid(self):
        return VllmInductorPass.hash_source(self, AttentionStaticQuantPattern)
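

# Minimal usage sketch (hypothetical wiring; in vllm this pass is constructed
# and invoked by the inductor compilation backend, not by user code):
#
#     fusion_pass = AttnFusionPass(vllm_config)  # vllm_config: a VllmConfig
#                                                # with a populated forward ctx
#     fusion_pass(graph)  # graph: the post-grad torch.fx.graph.Graph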