"""Attention layer with xFormers and PagedAttention."""
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Type

import torch
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import (AttentionBias,
                                         BlockDiagonalCausalMask,
                                         BlockDiagonalMask,
                                         LowerTriangularMaskWithTensorBias)

from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionLayer,
                                              AttentionMetadata, AttentionType)
from vllm.attention.backends.utils import (
    CommonAttentionState, CommonMetadataBuilder,
    get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args,
    is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set)
from vllm.attention.ops.paged_attn import (PagedAttention,
                                           PagedAttentionMetadata)
from vllm.logger import init_logger

logger = init_logger(__name__)


class XFormersBackend(AttentionBackend):

    @staticmethod
    def get_name() -> str:
        return "XFORMERS"

    @staticmethod
    def get_impl_cls() -> Type["XFormersImpl"]:
        return XFormersImpl

    @staticmethod
    def get_metadata_cls() -> Type["AttentionMetadata"]:
        return XFormersMetadata

    @staticmethod
    def get_builder_cls() -> Type["XFormersMetadataBuilder"]:
        return XFormersMetadataBuilder

    @staticmethod
    def get_state_cls() -> Type["CommonAttentionState"]:
        return CommonAttentionState

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[int, ...]:
        return PagedAttention.get_kv_cache_shape(num_blocks, block_size,
                                                 num_kv_heads, head_size)

    @staticmethod
    def swap_blocks(
        src_kv_cache: torch.Tensor,
        dst_kv_cache: torch.Tensor,
        src_to_dst: Dict[int, int],
    ) -> None:
        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)

    @staticmethod
    def copy_blocks(
        kv_caches: List[torch.Tensor],
        src_to_dists: torch.Tensor,
    ) -> None:
        PagedAttention.copy_blocks(kv_caches, src_to_dists)

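# A minimal usage sketch (not part of the original module; the sizes below
# are illustrative assumptions). The cache shape comes straight from
# PagedAttention, which packs a key plane and a value plane together:
#
#     shape = XFormersBackend.get_kv_cache_shape(
#         num_blocks=1024, block_size=16, num_kv_heads=8, head_size=128)
#     # -> (2, 1024, 16 * 8 * 128); the key/value planes are recovered
#     # later via PagedAttention.split_kv_cache().
#     kv_cache = torch.empty(shape, dtype=torch.float16, device="cuda")
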

@dataclass
class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata):
    """Metadata for XFormersBackend.

    NOTE: Any python object stored here is not updated when it is
    cuda-graph replayed. If you have values that need to be changed
    dynamically, it should be stored in tensor. The tensor has to be
    updated from `CUDAGraphRunner.forward` API.
    """

    # (batch_size,). The sequence length per sequence. Sequence length
    # means the computed tokens + new tokens.
    seq_lens_tensor: Optional[torch.Tensor]

    # Maximum sequence length among prefill batch. 0 if there are decode
    # requests only.
    max_prefill_seq_len: int
    # Maximum sequence length among decode batch. 0 if there are prefill
    # requests only.
    max_decode_seq_len: int

    # Whether or not cuda graph is enabled.
    use_cuda_graph: bool

    # (batch_size,). The sequence length per sequence as a Python list.
    # None if decoding only.
    seq_lens: Optional[List[int]] = None

    # (batch_size + 1,). The cumulative sequence lengths of the sequences
    # in the batch, used to index into sequence. E.g., if the sequence
    # length is [4, 6], it is [0, 4, 10].
    seq_start_loc: Optional[torch.Tensor] = None

    # (batch_size,). A tensor of context lengths (tokens that are computed
    # so far).
    context_lens_tensor: Optional[torch.Tensor] = None

    # Maximum query length in the batch. None for decoding.
    max_query_len: Optional[int] = None

    # Max number of query tokens among requests in the batch.
    max_decode_query_len: Optional[int] = None

    # (batch_size + 1,). The cumulative subquery lengths of the sequences
    # in the batch, used to index into subquery.
    query_start_loc: Optional[torch.Tensor] = None

    # Cached prefill/decode views of this metadata (see the two
    # properties below).
    _cached_prefill_metadata: Optional["XFormersMetadata"] = None
    _cached_decode_metadata: Optional["XFormersMetadata"] = None

    # Begin encoder attn & enc/dec cross-attn fields...

    # Encoder sequence lengths representation
    encoder_seq_lens: Optional[List[int]] = None
    encoder_seq_lens_tensor: Optional[torch.Tensor] = None
    # (batch_size + 1,). The cumulative sequence lengths of the encoder
    # sequences in the batch.
    encoder_seq_start_loc: Optional[torch.Tensor] = None

    # Maximum sequence length among encoder sequences
    max_encoder_seq_len: Optional[int] = None

    # Number of tokens input to encoder
    num_encoder_tokens: Optional[int] = None

    # Cross-attention memory-mapping data structures: slot mapping
    # and block tables
    cross_slot_mapping: Optional[torch.Tensor] = None
    cross_block_tables: Optional[torch.Tensor] = None

    def __post_init__(self):
        # Set during the execution of the first attention op.
        # It is a list because it needs to be set per prompt when ALiBi
        # slopes are used; this is a limitation of the xformers API.
        # These attributes do not appear in __repr__ and __init__.
        self.attn_bias: Optional[List[AttentionBias]] = None
        self.encoder_attn_bias: Optional[List[AttentionBias]] = None
        self.cross_attn_bias: Optional[List[AttentionBias]] = None

    @property
    def is_all_encoder_attn_metadata_set(self):
        """
        All attention metadata required for encoder attention is set.
        """
        return is_all_encoder_attn_metadata_set(self)

    @property
    def is_all_cross_attn_metadata_set(self):
        """
        All attention metadata required for enc/dec cross-attention is set.

        Superset of encoder attention required metadata.
        """
        return is_all_cross_attn_metadata_set(self)

    @property
    def prefill_metadata(self) -> Optional["XFormersMetadata"]:
        if self.num_prefills == 0:
            return None

        if self._cached_prefill_metadata is not None:
            # Recover cached prefill-phase attention metadata structure.
            return self._cached_prefill_metadata

        assert ((self.seq_lens is not None)
                or (self.encoder_seq_lens is not None))
        assert ((self.seq_lens_tensor is not None)
                or (self.encoder_seq_lens_tensor is not None))

        # Compute some attn_metadata fields which default to None.
        query_start_loc = (None if self.query_start_loc is None else
                           self.query_start_loc[:self.num_prefills + 1])
        seq_start_loc = (None if self.seq_start_loc is None else
                         self.seq_start_loc[:self.num_prefills + 1])
        slot_mapping = (None if self.slot_mapping is None else
                        self.slot_mapping[:self.num_prefill_tokens])
        seq_lens = (None if self.seq_lens is None else
                    self.seq_lens[:self.num_prefills])
        seq_lens_tensor = (None if self.seq_lens_tensor is None else
                           self.seq_lens_tensor[:self.num_prefills])
        context_lens_tensor = (None if self.context_lens_tensor is None else
                               self.context_lens_tensor[:self.num_prefills])
        block_tables = (None if self.block_tables is None else
                        self.block_tables[:self.num_prefills])

        # Construct & cache prefill-phase attention metadata structure.
        self._cached_prefill_metadata = XFormersMetadata(
            num_prefills=self.num_prefills,
            num_prefill_tokens=self.num_prefill_tokens,
            num_decode_tokens=0,
            slot_mapping=slot_mapping,
            multi_modal_placeholder_index_maps=self.
            multi_modal_placeholder_index_maps,
            enable_kv_scales_calculation=self.enable_kv_scales_calculation,
            seq_lens=seq_lens,
            seq_lens_tensor=seq_lens_tensor,
            max_query_len=self.max_query_len,
            max_prefill_seq_len=self.max_prefill_seq_len,
            max_decode_seq_len=0,
            query_start_loc=query_start_loc,
            seq_start_loc=seq_start_loc,
            context_lens_tensor=context_lens_tensor,
            block_tables=block_tables,
            use_cuda_graph=False,
            # Begin encoder & cross-attn fields below...
            encoder_seq_lens=self.encoder_seq_lens,
            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
            max_encoder_seq_len=self.max_encoder_seq_len,
            cross_slot_mapping=self.cross_slot_mapping,
            cross_block_tables=self.cross_block_tables)
        return self._cached_prefill_metadata

    @property
    def decode_metadata(self) -> Optional["XFormersMetadata"]:
        if self.num_decode_tokens == 0:
            return None

        if self._cached_decode_metadata is not None:
            # Recover cached decode-phase attention metadata structure.
            return self._cached_decode_metadata
        assert ((self.seq_lens_tensor is not None)
                or (self.encoder_seq_lens_tensor is not None))

        # Compute some attn_metadata fields which default to None.
        slot_mapping = (None if self.slot_mapping is None else
                        self.slot_mapping[self.num_prefill_tokens:])
        seq_lens_tensor = (None if self.seq_lens_tensor is None else
                           self.seq_lens_tensor[self.num_prefills:])
        block_tables = (None if self.block_tables is None else
                        self.block_tables[self.num_prefills:])

        # Construct & cache decode-phase attention metadata structure.
        self._cached_decode_metadata = XFormersMetadata(
            num_prefills=0,
            num_prefill_tokens=0,
            num_decode_tokens=self.num_decode_tokens,
            slot_mapping=slot_mapping,
            multi_modal_placeholder_index_maps=None,
            enable_kv_scales_calculation=True,
            seq_lens_tensor=seq_lens_tensor,
            max_prefill_seq_len=0,
            max_decode_seq_len=self.max_decode_seq_len,
            block_tables=block_tables,
            use_cuda_graph=self.use_cuda_graph,
            # Begin encoder & cross-attn fields below...
            encoder_seq_lens=self.encoder_seq_lens,
            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
            max_encoder_seq_len=self.max_encoder_seq_len,
            cross_slot_mapping=self.cross_slot_mapping,
            cross_block_tables=self.cross_block_tables)

        # The batch may be composed of prefills|decodes; when the two are
        # split apart, rebase the decode query start indices so they start
        # at 0. E.g. with tokens [3 prefill|6 decode],
        # query_start_loc=[3, 9] becomes [0, 6].
        if self._cached_decode_metadata.query_start_loc is not None:
            qs = self._cached_decode_metadata.query_start_loc
            self._cached_decode_metadata.query_start_loc = qs - qs[0]
        return self._cached_decode_metadata

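# Illustrative example (assumed numbers, not from the original file) of the
# split performed by prefill_metadata/decode_metadata above: for a mixed
# batch with num_prefills=2, num_prefill_tokens=7, num_decode_tokens=2,
#
#     meta.prefill_metadata  # slot_mapping[:7], seq_lens[:2], cuda graph off
#     meta.decode_metadata   # slot_mapping[7:], block_tables[2:],
#                            # query_start_loc rebased to start at 0
#
# Both views are cached, so repeated access during a forward pass does not
# re-slice the tensors.
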

def _get_attn_bias(
    attn_metadata: XFormersMetadata,
    attn_type: str,
) -> Optional[AttentionBias]:
    '''
    Extract appropriate attention bias from attention metadata
    according to attention type.

    Arguments:

    * attn_metadata: Attention metadata structure associated with attention
    * attn_type: encoder attention, decoder self-attention,
                 encoder/decoder cross-attention

    Returns:
    * Appropriate attention bias value given the attention type
    '''

    if (attn_type == AttentionType.DECODER
            or attn_type == AttentionType.ENCODER_ONLY):
        return attn_metadata.attn_bias
    elif attn_type == AttentionType.ENCODER:
        return attn_metadata.encoder_attn_bias
    elif attn_type == AttentionType.ENCODER_DECODER:
        return attn_metadata.cross_attn_bias
    else:
        raise AttributeError(f"Invalid attention type {str(attn_type)}")


def _set_attn_bias(
    attn_metadata: XFormersMetadata,
    attn_bias: List[Optional[AttentionBias]],
    attn_type: str,
) -> None:
    '''
    Update appropriate attention bias field of attention metadata,
    according to attention type.

    Arguments:

    * attn_metadata: Attention metadata structure associated with attention
    * attn_bias: The desired attention bias value
    * attn_type: encoder attention, decoder self-attention,
                 encoder/decoder cross-attention
    '''

    if (attn_type == AttentionType.DECODER
            or attn_type == AttentionType.ENCODER_ONLY):
        attn_metadata.attn_bias = attn_bias
    elif attn_type == AttentionType.ENCODER:
        attn_metadata.encoder_attn_bias = attn_bias
    elif attn_type == AttentionType.ENCODER_DECODER:
        attn_metadata.cross_attn_bias = attn_bias
    else:
        raise AttributeError(f"Invalid attention type {str(attn_type)}")


class XFormersMetadataBuilder(CommonMetadataBuilder[XFormersMetadata]):

    _metadata_cls = XFormersMetadata

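# Sketch of how the two helpers above are used (an assumption drawn from
# _run_memory_efficient_xformers_forward below, not a separate API): they
# memoize one bias list per attention type on the metadata object, e.g.
#
#     bias = _get_attn_bias(meta, AttentionType.DECODER)
#     if bias is None:
#         bias = [BlockDiagonalCausalMask.from_seqlens(meta.seq_lens)]
#         _set_attn_bias(meta, bias, AttentionType.DECODER)
#
# so the mask is built once per batch and reused by every attention layer.
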

class XFormersImpl(AttentionImpl[XFormersMetadata]):
    """
    If the input tensors contain prompt tokens, the layout is as follows:
    |<--------------- num_prefill_tokens ----------------->|
    |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->|

    Otherwise, the layout is as follows:
    |<----------------- num_decode_tokens ------------------>|
    |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->|

    Generation tokens can contain padding when cuda-graph is used.
    Currently, prompt tokens don't contain any padding.

    The prompts might have different lengths, while the generation tokens
    always have length 1.

    If chunked prefill is enabled, prefill tokens and decode tokens can be
    batched together in a flattened 1D query.

    |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->|
    |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->|

    Currently, cuda graph is disabled for chunked prefill, meaning there's
    no padding between prefill and decode tokens.
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: Optional[List[float]],
        sliding_window: Optional[int],
        kv_cache_dtype: str,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: Optional[str] = None,
        use_irope: bool = False,
    ) -> None:
        if kv_sharing_target_layer_name is not None:
            raise NotImplementedError(
                "KV sharing is not supported in V0 XFORMERS backend.")
        if logits_soft_cap is not None:
            logger.warning_once(
                "XFormers does not support logits soft cap. "
                "Outputs may be slightly off.")
        if use_irope:
            logger.warning_once(
                "Using irope in XFormers is not supported yet, it will fall"
                " back to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
        self.num_kv_heads = num_kv_heads
        if alibi_slopes is not None:
            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
        self.alibi_slopes = alibi_slopes
        self.sliding_window = sliding_window
        self.kv_cache_dtype = kv_cache_dtype

        self.num_queries_per_kv = self.num_heads // self.num_kv_heads

        supported_head_sizes = PagedAttention.get_supported_head_sizes()
        if head_size not in supported_head_sizes:
            raise ValueError(
                f"Head size {head_size} is not supported by PagedAttention. "
                f"Supported head sizes are: {supported_head_sizes}.")

        self.attn_type = attn_type

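    # Worked example of the flattened-1D layout from the class docstring
    # (illustrative numbers, not from the original file): with two prefills
    # of lengths 3 and 4 plus two decodes, forward() sees 9 query tokens,
    #     [p0 p0 p0 | p1 p1 p1 p1 | d0 | d1]
    # so num_prefill_tokens == 7 and num_decode_tokens == 2; query[:7] goes
    # down the xformers prefill path and query[7:] down paged decode.
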
    def forward(
        self,
        layer: AttentionLayer,
        query: torch.Tensor,
        key: Optional[torch.Tensor],
        value: Optional[torch.Tensor],
        kv_cache: torch.Tensor,
        attn_metadata: "XFormersMetadata",
        output: Optional[torch.Tensor] = None,
        output_scale: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Forward pass with xFormers and PagedAttention.

        For decoder-only models: query, key and value must be non-None.

        For encoder/decoder models:
        * XFormersImpl.forward() may be invoked for both self- and cross-
          attention layers.
        * For self-attention: query, key and value must be non-None.
        * For cross-attention:
            * Query must be non-None
            * During prefill, key and value must be non-None; key and value
              get cached for use during decode.
            * During decode, key and value may be None, since:
              (1) key and value tensors were cached during prefill, and
              (2) cross-attention key and value tensors do not grow during
                  decode

        A note on how the attn_type (attention type enum) argument impacts
        attention forward() behavior:

            * DECODER: normal decoder-only behavior;
                use decoder self-attention block table
            * ENCODER: no KV caching; pass encoder sequence
                attributes (encoder_seq_lens/encoder_seq_lens_tensor/
                max_encoder_seq_len) to kernel, in lieu of decoder
                sequence attributes (seq_lens/seq_lens_tensor/max_seq_len).
                Used for encoder branch of encoder-decoder models.
            * ENCODER_ONLY: no KV caching; uses the normal attention
                attributes (seq_lens/seq_lens_tensor/max_seq_len).
            * ENCODER_DECODER: cross-attention behavior;
                use cross-attention block table for caching KVs derived
                from encoder hidden states; since KV sequence lengths
                will match encoder sequence lengths, pass encoder sequence
                attributes to kernel (encoder_seq_lens/encoder_seq_lens_tensor/
                max_encoder_seq_len)

        Args:
            layer: Attention layer instance (supplies the k/v scale factors).
            query: shape = [num_tokens, num_heads * head_size]
            key: shape = [num_tokens, num_kv_heads * head_size]
            value: shape = [num_tokens, num_kv_heads * head_size]
            kv_cache: shape = [2, num_blocks,
                block_size * num_kv_heads * head_size].
                NOTE: kv_cache will be an empty tensor with shape [0]
                for profiling run.
            attn_metadata: Metadata for attention.

        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
        if output_scale is not None:
            raise NotImplementedError(
                "fused output quantization is not yet supported"
                " for XFormersImpl")

        attn_type = self.attn_type
        # Check that the appropriate attention metadata attributes are set
        # for the desired attention type.
        if (attn_type == AttentionType.ENCODER
                and (not attn_metadata.is_all_encoder_attn_metadata_set)):
            raise AttributeError("Encoder attention requires setting "
                                 "encoder metadata attributes.")
        elif (attn_type == AttentionType.ENCODER_DECODER
              and (not attn_metadata.is_all_cross_attn_metadata_set)):
            raise AttributeError("Encoder/decoder cross-attention "
                                 "requires setting cross-attention "
                                 "metadata attributes.")

        # Reshape the query, key, and value tensors.
        query = query.view(-1, self.num_heads, self.head_size)
        if key is not None:
            assert value is not None
            key = key.view(-1, self.num_kv_heads, self.head_size)
            value = value.view(-1, self.num_kv_heads, self.head_size)
        else:
            assert value is None

        # Self-attention vs. cross-attention will impact which KV cache
        # memory-mapping & which seqlen datastructures we utilize.
        if attn_type != AttentionType.ENCODER and kv_cache.numel() > 0:
            # KV-cache during decoder self-attention or encoder-decoder
            # cross-attention, but not during encoder attention.
            #
            # Even if there are no new key/value pairs to cache, we still
            # need to break out key_cache and value_cache for later use by
            # paged attention.
            key_cache, value_cache = PagedAttention.split_kv_cache(
                kv_cache, self.num_kv_heads, self.head_size)

            if (key is not None) and (value is not None):
                if attn_type == AttentionType.ENCODER_DECODER:
                    # Update cross-attention KV cache (prefill-only).
                    updated_slot_mapping = attn_metadata.cross_slot_mapping
                else:
                    # Update self-attention KV cache (prefill/decode).
                    updated_slot_mapping = attn_metadata.slot_mapping

                # Reshape the input keys and values and store them in the
                # cache. If kv_cache is not provided, the new key and value
                # tensors are not cached. This happens during the initial
                # memory profiling run.
                PagedAttention.write_to_paged_cache(
                    key, value, key_cache, value_cache, updated_slot_mapping,
                    self.kv_cache_dtype, layer._k_scale, layer._v_scale)

        (num_prefill_query_tokens, num_prefill_kv_tokens,
         num_decode_query_tokens) = \
            get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type)

        output = torch.empty_like(query)
        # Query for decode. KV is not needed because it is already cached.
        decode_query = query[num_prefill_query_tokens:]
        # QKV for prefill.
        query = query[:num_prefill_query_tokens]
        if key is not None and value is not None:
            key = key[:num_prefill_kv_tokens]
            value = value[:num_prefill_kv_tokens]

        assert query.shape[0] == num_prefill_query_tokens
        assert decode_query.shape[0] == num_decode_query_tokens

        if prefill_meta := attn_metadata.prefill_metadata:
            # Prompt run.
            if kv_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0:
                # Normal attention: block tables are empty if the prompt
                # does not have a cached prefix.
                out = self._run_memory_efficient_xformers_forward(
                    query, key, value, prefill_meta, attn_type=attn_type)
                assert out.shape == output[:num_prefill_query_tokens].shape
                output[:num_prefill_query_tokens] = out
            else:
                assert attn_type != AttentionType.ENCODER_ONLY, (
                    "Encoder-only models should not have prefix attention.")
                assert prefill_meta.query_start_loc is not None
                assert prefill_meta.max_query_len is not None

                # Prefix-enabled attention.
                out = PagedAttention.forward_prefix(
                    query,
                    key,
                    value,
                    self.kv_cache_dtype,
                    key_cache,
                    value_cache,
                    prefill_meta.block_tables,
                    prefill_meta.query_start_loc,
                    prefill_meta.seq_lens_tensor,
                    prefill_meta.max_query_len,
                    self.alibi_slopes,
                    self.sliding_window,
                    layer._k_scale,
                    layer._v_scale,
                )
                assert output[:num_prefill_query_tokens].shape == out.shape
                output[:num_prefill_query_tokens] = out

        if decode_meta := attn_metadata.decode_metadata:
            assert attn_type != AttentionType.ENCODER_ONLY, (
                "Encoder-only models should not have decode metadata.")

            (
                seq_lens_arg,
                max_seq_len_arg,
                block_tables_arg,
            ) = get_seq_len_block_table_args(decode_meta, False, attn_type)

            output[num_prefill_query_tokens:] = PagedAttention.forward_decode(
                decode_query,
                key_cache,
                value_cache,
                block_tables_arg,
                seq_lens_arg,
                max_seq_len_arg,
                self.kv_cache_dtype,
                self.num_kv_heads,
                self.scale,
                self.alibi_slopes,
                layer._k_scale,
                layer._v_scale,
            )

        # Reshape the output tensor.
        return output.view(-1, self.num_heads * self.head_size)

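    # Sketch of the GQA/MQA head expansion performed below (illustrative
    # shapes, not from the original file): with num_heads=8 and
    # num_kv_heads=2, query [T, 8, 128] is viewed as [T, 2, 4, 128] and
    # key/value [T, 2, 128] are broadcast to [T, 2, 4, 128], so each KV
    # head serves num_queries_per_kv = 4 query heads inside the kernel.
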
    def _run_memory_efficient_xformers_forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attn_metadata: XFormersMetadata,
        attn_type: str = AttentionType.DECODER,
    ) -> torch.Tensor:
        """Attention for 1D query of multiple prompts. Multiple prompt
        tokens are flattened into the `query` input.

        See https://facebookresearch.github.io/xformers/components/ops.html
        for API spec.

        Args:
            query: shape = [num_prefill_tokens, num_heads, head_size]
            key: shape = [num_prefill_tokens, num_kv_heads, head_size]
            value: shape = [num_prefill_tokens, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
            attn_type: Select attention type, between encoder attention,
                       decoder self-attention, or encoder/decoder cross-
                       attention. Defaults to decoder self-attention,
                       which is the vLLM default generally.

        Returns:
            shape = [num_prefill_tokens, num_heads, head_size]
        """
        original_query = query
        if self.num_kv_heads != self.num_heads:
            # GQA/MQA requires the shape [B, M, G, H, K].
            # Note that the output also has the same shape (which is
            # different from a spec from the doc).
            query = query.view(query.shape[0], self.num_kv_heads,
                               self.num_queries_per_kv, query.shape[-1])
            key = key[:, :, None, :].expand(key.shape[0], self.num_kv_heads,
                                            self.num_queries_per_kv,
                                            key.shape[-1])
            value = value[:, :, None, :].expand(value.shape[0],
                                                self.num_kv_heads,
                                                self.num_queries_per_kv,
                                                value.shape[-1])

        # Set attention bias if not provided. This typically happens at
        # the very first attention layer of every iteration.
        attn_bias = _get_attn_bias(attn_metadata, attn_type)
        if attn_bias is None:
            if self.alibi_slopes is None:
                # The decoder branch of an encoder-decoder model uses
                # seq_lens for the query and encoder_seq_lens for the keys.
                if attn_type == AttentionType.ENCODER_DECODER:
                    assert attn_metadata.seq_lens is not None
                    assert attn_metadata.encoder_seq_lens is not None
                    # Cross-attention mask is non-causal.
                    attn_bias = BlockDiagonalMask.from_seqlens(
                        attn_metadata.seq_lens,
                        attn_metadata.encoder_seq_lens,
                        device=query.device)
                elif attn_type == AttentionType.ENCODER:
                    assert attn_metadata.encoder_seq_lens is not None
                    # Encoder self-attention mask is non-causal.
                    attn_bias = BlockDiagonalMask.from_seqlens(
                        attn_metadata.encoder_seq_lens, device=query.device)
                elif attn_type == AttentionType.ENCODER_ONLY:
                    assert attn_metadata.seq_lens is not None
                    # Encoder-only self-attention mask is non-causal.
                    attn_bias = BlockDiagonalMask.from_seqlens(
                        attn_metadata.seq_lens, device=query.device)
                elif attn_type == AttentionType.DECODER:
                    assert attn_metadata.seq_lens is not None
                    # Decoder self-attention mask is causal.
                    attn_bias = BlockDiagonalCausalMask.from_seqlens(
                        attn_metadata.seq_lens, device=query.device)
                else:
                    raise ValueError(f"Unknown AttentionType: {attn_type}")

                if self.sliding_window is not None:
                    attn_bias = attn_bias.make_local_attention(
                        self.sliding_window)
                attn_bias = [attn_bias]
            else:
                assert attn_type == AttentionType.DECODER
                assert attn_metadata.seq_lens is not None
                attn_bias = _make_alibi_bias(self.alibi_slopes,
                                             self.num_kv_heads, query.dtype,
                                             attn_metadata.seq_lens)

            _set_attn_bias(attn_metadata, attn_bias, attn_type)

        if self.alibi_slopes is None:
            # No ALiBi slopes: a single kernel call over the whole
            # flattened batch. Add the batch dimension first.
            query = query.unsqueeze(0)
            key = key.unsqueeze(0)
            value = value.unsqueeze(0)
            out = xops.memory_efficient_attention_forward(
                query,
                key,
                value,
                attn_bias=attn_bias[0],
                p=0.0,
                scale=self.scale)
            return out.view_as(original_query)

        # Attention with ALiBi slopes.
        # FIXME(woosuk): Because xformers does not support dynamic sequence
        # lengths with custom attention bias, we process each prompt one by
        # one. This is inefficient, especially when we have many short
        # prompts.
        assert attn_metadata.seq_lens is not None
        output = torch.empty_like(original_query)
        start = 0
        for i, seq_len in enumerate(attn_metadata.seq_lens):
            end = start + seq_len
            out = xops.memory_efficient_attention_forward(
                query[None, start:end],
                key[None, start:end],
                value[None, start:end],
                attn_bias=attn_bias[i],
                p=0.0,
                scale=self.scale)
            # TODO(woosuk): Unnecessary copy. Optimize.
            output[start:end].copy_(out.view_as(original_query[start:end]))
            start += seq_len
        return output


def _make_alibi_bias(
    alibi_slopes: torch.Tensor,
    num_kv_heads: int,
    dtype: torch.dtype,
    seq_lens: List[int],
) -> List[AttentionBias]:
    attn_biases: List[AttentionBias] = []
    for seq_len in seq_lens:
        bias = torch.arange(seq_len, dtype=dtype)
        # NOTE(zhuohan): HF uses
        #     `bias = bias[None, :].repeat(seq_len, 1)`
        # here. We find that both biases give the same results, but
        # the bias below more accurately follows the original ALiBi paper.
        # The element at [i, j] is j - i, i.e. the signed distance between
        # key j and query i.
        bias = bias[None, :] - bias[:, None]

        # xformers requires the last dimension to be padded to a
        # multiple of 8.
        padded_len = (seq_len + 7) // 8 * 8
        num_heads = alibi_slopes.shape[0]
        bias = torch.empty(
            1,  # batch size
            num_heads,
            seq_len,
            padded_len,
            device=alibi_slopes.device,
            dtype=dtype,
        )[:, :, :, :seq_len].copy_(bias)
        bias.mul_(alibi_slopes[:, None, None])
        if num_heads != num_kv_heads:
            bias = bias.unflatten(1,
                                  (num_kv_heads, num_heads // num_kv_heads))
        attn_biases.append(LowerTriangularMaskWithTensorBias(bias))

    return attn_biases