"""Attention layer."""
from typing import List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

import vllm.envs as envs
from vllm.attention import AttentionType
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.selector import backend_name_to_enum, get_attn_backend
from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
from vllm.config import CacheConfig, get_current_vllm_config
from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                          has_kv_transfer_group,
                                          is_v1_kv_transfer_group)
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.platforms import _Backend, current_platform
from vllm.utils import direct_register_custom_op

logger = init_logger(__name__)
USE_XFORMERS_OPS = None


def check_xformers_availability():
    global USE_XFORMERS_OPS
    if USE_XFORMERS_OPS is not None:
        return USE_XFORMERS_OPS

    if current_platform.is_cuda() and current_platform.has_device_capability(
            100):
        # Xformers flash attention is not compatible with devices of
        # capability 10.0.
        USE_XFORMERS_OPS = False
    else:
        try:
            from importlib.util import find_spec
            find_spec("xformers.ops")
            USE_XFORMERS_OPS = True
        except ImportError:
            USE_XFORMERS_OPS = False

    if not USE_XFORMERS_OPS:
        logger.warning("Xformers is not available, falling back.")

    return USE_XFORMERS_OPS


class Attention(nn.Module):
    """Attention layer.

    This class takes query, key, and value tensors as input. The input tensors
    can either contain prompt tokens or generation tokens.
    The class does the following:

    1. Store the input key and value tensors in the KV cache.
    2. Perform (multi-head/multi-query/grouped-query) attention.
    3. Return the output tensor.
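
    Example (an illustrative sketch, not from the original file: the sizes
    and prefix are assumptions, and construction requires the current vLLM
    config to be set by the engine):

        attn = Attention(num_heads=32, head_size=128, scale=128**-0.5,
                         prefix="model.layers.0.self_attn.attn")
        out = attn(query, key, value)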
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: Optional[int] = None,
        alibi_slopes: Optional[List[float]] = None,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        logits_soft_cap: Optional[float] = None,
        per_layer_sliding_window: Optional[int] = None,
        use_mla: bool = False,
        prefix: str = "",
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: Optional[str] = None,
        attn_backend: Optional[type[AttentionBackend]] = None,
        **extra_impl_args,
    ) -> None:
        """
        The KV cache is stored inside this class and is accessed via
        `self.kv_cache`.
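
        A sketch of the cache layout (illustrative): `self.kv_cache` is a
        per-virtual-engine list of placeholder tensors that `bind_kv_cache`
        replaces before execution, indexed as

            kv_cache = self.kv_cache[forward_context.virtual_engine]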
        Nauto   Fr   num_heads ($) is not divisible by num_kv_heads ()g      ?)dtypesinks)r.   Zfp8_e5m2z8fp8_e5m2 kv-cache is not supported with fp8 checkpoints.)r-   has_sinkzDuplicate layer name: c                 S   s   g | ]}t g qS r   )torchtensor).0_r   r   r    
<listcomp>   s    
z&Attention.__init__.<locals>.<listcomp>)?super__init__sliding_windowZcache_dtype
block_sizeis_attention_freecalculate_kv_scaleskv_cache_dtyper;   r<   float32_k_scale_v_scale_q_scaleZ_prob_scale_k_scale_float_v_scale_floatr-   r$   r%   r'   getr:   Zget_quant_method
isinstancer   r   
ValueErrorquant_methodZcreate_weightsget_default_dtyper   r1   Zget_impl_clsimplr   get_namebackendr8   r   Zis_cuda_alikeZis_cpuuse_direct_callZaccept_output_buffer
use_outputr
   compilation_configZstatic_forward_context
layer_namer/   r   r0   rangeZparallel_configZpipeline_parallel_sizekv_cacheenvsZQ_SCALE_CONSTANTq_rangeZK_SCALE_CONSTANTk_rangeZV_SCALE_CONSTANTv_range)selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   Zextra_impl_argsrB   rF   rC   rD   rE   rP   r8   Zimpl_clsrW   	__class__r   r    rA   E   s   
	








zAttention.__init__querykeyvalueoutput_shapec           
   	   C   s  | j rt j}|jr| ||| | jr|dur|n|j}tj||j	|j
d}|d }| jsZ|d| j| j}|d| j| j}|durM|d| j| j}|durZ|d| j| j}| jrt }|j}t|trm|| j }| j|j }	| jj| ||||	||d ntjj||||| j |d|S | jrt }|j}t|tr|| j }| j|j }	| j| ||||	|S tjj|||| jS )a_  
        The KV cache is stored inside this class and is accessed via
        `self.kv_cache`.

        Attention metadata (`attn_metadata`) is set using a context manager in
        the model runner's `execute_model` method. It is accessed via forward
        context using
        `vllm.forward_context.get_forward_context().attn_metadata`.
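
        A sketch of that access pattern (illustrative; `attn_metadata` may be
        a dict keyed by layer name):

            metadata = get_forward_context().attn_metadata
            if isinstance(metadata, dict):
                metadata = metadata[self.layer_name]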
        N)r8   device)output)rE   r   attn_metadataZenable_kv_scales_calculationcalc_kv_scalesrV   shaper;   Zzerosr8   rf   r-   viewr$   r%   r'   rU   rN   dictrX   rZ   virtual_enginerR   forwardopsZvllmunified_attention_with_outputunified_attention)
r_   rb   rc   rd   re   ri   rh   Zhidden_sizeforward_contextZself_kv_cacher   r   r    ro      sf   




zAttention.forwardc                 C   sv   | j t| | j  | jt| | j  | jt| | j	  | j
 | _| j
 | _d| _d S )NF)rJ   Zcopy_r;   absmaxr\   rH   r]   rI   r^   itemrK   rL   rE   )r_   rb   rc   rd   r   r   r    rj   "  s   
zAttention.calc_kv_scalesc                 C   s\   d| j j }|d| j j 7 }|d| j j 7 }|d| j j 7 }|d| j jj 7 }|S )Nz

    def extra_repr(self) -> str:
        s = f"head_size={self.impl.head_size}"
        s += f", num_heads={self.impl.num_heads}"
        s += f", num_kv_heads={self.impl.num_kv_heads}"
        s += f", scale={self.impl.scale}"
        s += f", backend={self.impl.__class__.__name__}"
        return s

    def process_weights_after_loading(self, act_dtype: torch.dtype):
        if hasattr(self.impl, "process_weights_after_loading"):
            self.impl.process_weights_after_loading(act_dtype)

        # FlashInfer requires attention sinks to be float32.
        if (self.backend == _Backend.FLASHINFER_VLLM_V1
                and hasattr(self.impl, "sinks")):
            from vllm.v1.attention.backends.flashinfer import FlashInferImpl
            assert isinstance(self.impl, FlashInferImpl)
            if (self.impl.sinks is not None
                    and self.impl.sinks.dtype != torch.float32):
                self.impl.sinks = self.impl.sinks.to(torch.float32)

    def get_attn_backend(self) -> type[AttentionBackend]:
        return self.attn_backend


class MultiHeadAttention(nn.Module):
    """Multi-headed attention without any cache, used for ViT."""

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: Optional[int] = None,
    ):
        super().__init__()
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = scale
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads

        assert self.num_heads % self.num_kv_heads == 0, (
            f"num_heads ({self.num_heads}) is not "
            f"divisible by num_kv_heads ({self.num_kv_heads})")
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads

        dtype = torch.get_default_dtype()
        attn_backend = get_attn_backend(head_size,
                                        dtype,
                                        kv_cache_dtype=None,
                                        block_size=16,
                                        is_attention_free=False)
        backend = backend_name_to_enum(attn_backend.get_name())
        if current_platform.is_rocm():
            # Currently, only torch_sdpa is supported on ROCm.
            self.attn_backend = _Backend.TORCH_SDPA
        else:
            if backend in (_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1,
                           _Backend.FLEX_ATTENTION):
                backend = _Backend.XFORMERS

            self.attn_backend = (backend if backend in {
                _Backend.TORCH_SDPA,
                _Backend.XFORMERS,
                _Backend.PALLAS_VLLM_V1,
            } else _Backend.TORCH_SDPA)

            if (self.attn_backend == _Backend.XFORMERS
                    and not check_xformers_availability()):
                self.attn_backend = _Backend.TORCH_SDPA

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
    ) -> torch.Tensor:
        """Input shape: batch_size x seq_len x hidden_size"""
        bsz, q_len, _ = query.size()
        kv_len = key.size(1)

        query = query.view(bsz, q_len, self.num_heads, self.head_size)
        key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size)
        value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size)

        if (num_repeat := self.num_queries_per_kv) > 1:
            # Handle MQA and GQA by repeating the KV heads.
            key = torch.repeat_interleave(key, num_repeat, dim=2)
            value = torch.repeat_interleave(value, num_repeat, dim=2)

        if self.attn_backend == _Backend.XFORMERS:
            from xformers import ops as xops

            out = xops.memory_efficient_attention_forward(query,
                                                          key,
                                                          value,
                                                          scale=self.scale)
        elif self.attn_backend == _Backend.TORCH_SDPA:
            query, key, value = (x.transpose(1, 2)
                                 for x in (query, key, value))
            out = F.scaled_dot_product_attention(query,
                                                 key,
                                                 value,
                                                 scale=self.scale)
            out = out.transpose(1, 2)
        elif self.attn_backend == _Backend.PALLAS_VLLM_V1:
            query, key, value = (x.transpose(1, 2)
                                 for x in (query, key, value))
            from torch_xla.experimental.custom_kernel import flash_attention
            out = flash_attention(query, key, value, sm_scale=self.scale)
            out = out.transpose(1, 2)

        return out.reshape(bsz, q_len, -1)


def wait_for_kv_layer_from_connector(layer_name: str):
    if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
        return

    connector = get_kv_transfer_group()

    forward_context: ForwardContext = get_forward_context()
    attn_metadata = forward_context.attn_metadata
    if attn_metadata is None:
        return
    assert isinstance(attn_metadata, dict)
    connector.wait_for_layer_load(layer_name)


def maybe_save_kv_layer_to_connector(
    layer_name: str,
    kv_cache_layer: List[torch.Tensor],
):
    if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
        return

    connector = get_kv_transfer_group()

    forward_context: ForwardContext = get_forward_context()
    attn_metadata = forward_context.attn_metadata
    if attn_metadata is None:
        return
    assert isinstance(attn_metadata, dict)
    connector.save_kv_layer(layer_name, kv_cache_layer,
                            attn_metadata[layer_name])


def unified_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    layer_name: str,
) -> torch.Tensor:
    wait_for_kv_layer_from_connector(layer_name)

    forward_context: ForwardContext = get_forward_context()
    attn_metadata = forward_context.attn_metadata
    if isinstance(attn_metadata, dict):
        attn_metadata = attn_metadata[layer_name]
    self = forward_context.no_compile_layers[layer_name]
    kv_cache = self.kv_cache[forward_context.virtual_engine]
    output = self.impl.forward(self, query, key, value, kv_cache,
                               attn_metadata)

    maybe_save_kv_layer_to_connector(layer_name, kv_cache)
    return output


def unified_attention_fake(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    layer_name: str,
) -> torch.Tensor:
    return torch.empty_like(query).contiguous()


direct_register_custom_op(
    op_name="unified_attention",
    op_func=unified_attention,
    mutates_args=[],
    fake_impl=unified_attention_fake,
    dispatch_key=current_platform.dispatch_key,
)


def unified_attention_with_output(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
    output_scale: Optional[torch.Tensor] = None,
) -> None:
    wait_for_kv_layer_from_connector(layer_name)
    forward_context: ForwardContext = get_forward_context()
    attn_metadata = forward_context.attn_metadata
    if isinstance(attn_metadata, dict):
        attn_metadata = attn_metadata[layer_name]
    self = forward_context.no_compile_layers[layer_name]
    kv_cache = self.kv_cache[forward_context.virtual_engine]
    self.impl.forward(self,
                      query,
                      key,
                      value,
                      kv_cache,
                      attn_metadata,
                      output=output,
                      output_scale=output_scale)

    maybe_save_kv_layer_to_connector(layer_name, kv_cache)


def unified_attention_with_output_fake(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
    output_scale: Optional[torch.Tensor] = None,
) -> None:
    return


direct_register_custom_op(
    op_name="unified_attention_with_output",
    op_func=unified_attention_with_output,
    mutates_args=["output"],
    fake_impl=unified_attention_with_output_fake,
    dispatch_key=current_platform.dispatch_key,
)