"""Attention layer with FlashAttention."""
from collections import defaultdict
from dataclasses import dataclass
from itertools import accumulate
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type

import torch

from vllm import _custom_ops as ops
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionLayer,
                                              AttentionMetadata,
                                              AttentionMetadataBuilder,
                                              AttentionType,
                                              is_quantized_kv_cache)
from vllm.attention.backends.utils import (
    PAD_SLOT_ID, CommonAttentionState, compute_slot_mapping,
    compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens,
    get_seq_len_block_table_args, is_all_cross_attn_metadata_set,
    is_all_encoder_attn_metadata_set, is_block_tables_empty)
from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8,
                                           get_flash_attn_version)
from vllm.logger import init_logger
from vllm.multimodal import MultiModalPlaceholderMap
from vllm.utils import async_tensor_h2d, make_tensor_with_pad
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
                                  flash_attn_with_kvcache)

if TYPE_CHECKING:
    from vllm.worker.model_runner import ModelInputForGPUBuilder

logger = init_logger(__name__)

  e Zd ZU dZeed< edee fddZ	ede
fddZeded fd	d
Zeded fddZeded fddZeded fddZedededededeedf f
ddZedejdejdejddfdd Zed!eej d"ejddfd#d$ZdS )%FlashAttentionBackendTaccept_output_bufferreturnc                   C   s   g dS )N)    @   `                   r0   r0   r0   n/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/attention/backends/flash_attn.pyget_supported_head_sizes,   s   z.FlashAttentionBackend.get_supported_head_sizesc                   C   s   dS )NZ
FLASH_ATTNr0   r0   r0   r0   r1   get_name0      zFlashAttentionBackend.get_nameFlashAttentionImplc                   C      t S N)r5   r0   r0   r0   r1   get_impl_cls4   r4   z"FlashAttentionBackend.get_impl_clsr   c                   C   r6   r7   )FlashAttentionMetadatar0   r0   r0   r1   get_metadata_cls8   r4   z&FlashAttentionBackend.get_metadata_clsFlashAttentionMetadataBuilderc                   C   r6   r7   )r;   r0   r0   r0   r1   get_builder_cls<   r4   z%FlashAttentionBackend.get_builder_clsr   c                   C   r6   r7   )r   r0   r0   r0   r1   get_state_cls@   r4   z#FlashAttentionBackend.get_state_cls
num_blocks
block_sizenum_kv_heads	head_size.c                 C   s"   |d dkr
t dd| |||fS )N   r   z$Block size must be a multiple of 16.   )
ValueError)r>   r?   r@   rA   r0   r0   r1   get_kv_cache_shapeD   s   z(FlashAttentionBackend.get_kv_cache_shapesrc_kv_cachedst_kv_cache
src_to_dstNc                 C   s@   | d }|d }t ||| | d }|d }t ||| d S )Nr      )opsswap_blocks)rF   rG   rH   Zsrc_key_cacheZdst_key_cacheZsrc_value_cacheZdst_value_cacher0   r0   r1   rK   O   s   z!FlashAttentionBackend.swap_blocks	kv_cachessrc_to_distsc                 C   s.   dd | D }dd | D }t ||| d S )Nc                 S      g | ]}|d  qS )r   r0   .0kv_cacher0   r0   r1   
<listcomp>a       z5FlashAttentionBackend.copy_blocks.<locals>.<listcomp>c                 S   rN   )rI   r0   rO   r0   r0   r1   rR   b   rS   )rJ   copy_blocks)rL   rM   Z
key_cachesZvalue_cachesr0   r0   r1   rT   \   s   z!FlashAttentionBackend.copy_blocks)__name__
__module____qualname__r&   bool__annotations__staticmethodr   intr2   strr3   r
   r8   r:   r<   r=   r	   rE   torchTensorrK   rT   r0   r0   r0   r1   r%   (   sX   
 

r%   c                   @   s  e Zd ZU dZeee  ed< eej	 ed< eed< eed< eej	 ed< eej	 ed< e
ed< d	Zee ed
< d	Zee ed< d	Zeej	 ed< d	Zeej	 ed< d	Zed  ed< d	Zed  ed< d	Zeee  ed< d	Zeej	 ed< d	Zeej	 ed< d	Zee ed< d	Zee ed< d	Zeej	 ed< d	Zeej	 ed< edd Zedd Zeded  fddZeded  fddZd	S ) r9   a$  Metadata for FlashAttentionBackend.

    NOTE: Any python object stored here is not updated when it is
    cuda-graph replayed. If you have values that need to be changed
    dynamically, it should be stored in tensor. The tensor has to be
    updated from `CUDAGraphRunner.forward` API.
    seq_lensseq_lens_tensormax_prefill_seq_lenmax_decode_seq_lencontext_lens_tensorblock_tablesuse_cuda_graphNmax_query_lenmax_decode_query_lenquery_start_locseq_start_loc_cached_prefill_metadata_cached_decode_metadataencoder_seq_lensencoder_seq_lens_tensorencoder_seq_start_locmax_encoder_seq_lennum_encoder_tokenscross_slot_mappingcross_block_tablesc                 C      t | S )zO
        All attention metadata required for encoder attention is set.
        )r   selfr0   r0   r1   r      s   z7FlashAttentionMetadata.is_all_encoder_attn_metadata_setc                 C   rs   )z
        All attention metadata required for enc/dec cross-attention is set.

        Superset of encoder attention required metadata.
        )r   rt   r0   r0   r1   r      s   z5FlashAttentionMetadata.is_all_cross_attn_metadata_setr'   c                 C   s  | j dkrd S | jd ur| jS | jd us| jd usJ | jd us'| jd us'J | jd u r.d n	| jd | j d  }| jd u r?d n| jd | j }| jd u rNd n| jd | j  }| jd u r]d n| jd | j  }| j	d u rld n	| j	d | j d  }| j
d u r}d n| j
d | j  }| jd u rd n| jd | j  }tdi d| j d| jddd|d| jd| jd	|d
|d| jd| jddddd|d|d|d|ddd| jd| jd| jd| jd| jd| j| _| jS )Nr   rI   num_prefillsnum_prefill_tokensnum_decode_tokensslot_mapping"multi_modal_placeholder_index_mapsenable_kv_scales_calculationr_   r`   rf   ra   rg   rb   rh   ri   rc   rd   re   Frl   rm   rn   ro   rq   rr   r0   )rv   rj   r_   rl   r`   rm   rh   ry   rw   ri   rc   rd   r9   rz   r{   rf   ra   rn   ro   rq   rr   )ru   rh   ry   r_   r`   ri   rc   rd   r0   r0   r1   prefill_metadata   s   



	
z'FlashAttentionMetadata.prefill_metadatac                 C   s  | j dkrd S | jd ur| jS | jd us| jd usJ | jd u r"d n| j| jd  }| jd u r1d n| j| jd  }| jd u r@d n| j| jd  }tdi ddddd| j d|dd ddd	d d
|d| j	d| j
ddd| jd| jd ur| j| jd  | j| j  nd d| jd ur| j| jd  n*d dd d|d| jd| jd| jd| jd| jd| jd| j| _| jS dd d|d| jd| jd| jd| jd| jd| jd| j| _| jS )Nr   rv   rw   rx   ry   rz   r{   Tr_   r`   rg   rf   ra   rb   rh   ri   rc   rd   re   rl   rm   rn   ro   rq   rr   r0   )rx   rk   r`   rm   ry   rw   rv   rd   r9   rg   rf   rb   rh   ri   re   rl   rn   ro   rq   rr   )ru   ry   r`   rd   r0   r0   r1   decode_metadata  s   


	



z&FlashAttentionMetadata.decode_metadata)rU   rV   rW   __doc__r   r   r[   rY   r]   r^   rX   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   propertyr   r   r|   r}   r0   r0   r0   r1   r9   g   s<   
 	

8r9   c                   @   sv   e Zd ZdddZdd Zddd	ed
efddZdedeee  de	j
fddZdee dee dedefddZdS )r;   input_builderr$   c                 C   s"   || _ |j| _|j| _|j| _d S r7   )r   runnersliding_windowr?   )ru   r   r0   r0   r1   __init__;  s   z&FlashAttentionMetadataBuilder.__init__c                 C   sD   g | _ g | _g | _g | _g | _tt| _d| _d| _	d| _
d| _d S )Nr   F)ry   prefill_seq_lenscontext_lensrd   curr_seq_lensr   r   multimodal_placeholder_mapsrv   rw   rx   Zhas_prefix_cache_hitrt   r0   r0   r1   prepareA  s   
z%FlashAttentionMetadataBuilder.prepare
inter_dataz,ModelInputForGPUBuilder.InterDataForSeqGroupchunked_prefill_enabledprefix_cache_hitc              
   C   sV  |j }|j}t|jdd |jD |j|j|j|j|j	D ]\}}}}	}
}}| j
| |rX|j}|rC| D ]\}}| j| | q6|  jd7  _|  j|7  _| j
| n|  j|
7  _| j
|	 g }|rn|| }n|sr|s|dur|dkr|| }n	|| | d }| j
| t|}t||
|| j}t|| j||||| j|j qdS )zAdd a sequence group to the metadata. Specifically update/append
        1. context length.
        2. block table.
        3. slot mapping.
        c                 S   s   g | ]}t |qS r0   )len)rP   tr0   r0   r1   rR   \  rS   z@FlashAttentionMetadataBuilder._add_seq_group.<locals>.<listcomp>rI   Nr   )	is_promptrd   zipZseq_idsZinput_tokensZorig_seq_lensr_   
query_lensr   Zcurr_sliding_window_blocksappendZmulti_modal_placeholder_mapsitemsr   extendrv   rw   r   rx   r   r   r   r   r   ry   r?   )ru   r   r   r   r   rd   Zseq_idZ	token_lenZseq_lenZcurr_seq_lenZ	query_lenZcontext_lenZcurr_sliding_window_blockZmm_mapsmodalityZplaceholdersblock_tableZis_profile_runZ	start_idxr0   r0   r1   _add_seq_groupO  sX   



z,FlashAttentionMetadataBuilder._add_seq_groupnum_seqsrd   r'   c           	      C   s   | j jj\}}||ksJ | j jd | }t|D ]#\}}|r<t|}||kr0|||d |f< q|d | ||d |f< qt|j| j jddS )NT)deviceZnon_blocking)	r   graph_block_tablesshape	enumerater   r]   Z
from_numpytor   )	ru   r   rd   Zmax_batch_sizeZ
max_blocksr   ir   r>   r0   r0   r1   _get_graph_runner_block_tables  s    


z<FlashAttentionMetadataBuilder._get_graph_runner_block_tablesr_   r   cuda_graph_pad_size
batch_sizec                 C   s&  t dd | jjD }| jjD ]}| || jj| q| jj}|dk}t|}	|| jd }
t	|
dkr9t|
}nd}t| j
dd}t| jdd}| j}tt|dd}tt|dd}t	|}|r| jtg|  | jg |  || j }| || j}n
t| jdtj|d	}|	dksJ d
||dusJ t| jtj|| jj}t|tj|| jj}t| jtj|| jj}t|tj|| jj}t|tj|| jj}dd | j D }t di d| jd|d| jd|d|d|ddd|d|	d|d|d|d|d|d|d|d|S ) a  Build attention metadata with on-device tensors.

        Args:
            seq_lens: The maybe padded sequence lengths of the input sequences.
            query_lens: The query lengths of the input sequences.
            cuda_graph_pad_size: The padding size for cuda graph.
                                 -1 if cuda graph is not used.
            batch_size: The maybe padded batch size.
        c                 S   s   g | ]}|j qS r0   )r   )rP   r   r0   r0   r1   rR     s    z7FlashAttentionMetadataBuilder.build.<locals>.<listcomp>Nr   rI   )default)initial)paddtyper   zquery_lens: {}c                 S   s   i | ]	\}}||  qS r0   )Z	index_map)rP   r   Zplaceholder_mapr0   r0   r1   
<dictcomp>  s    z7FlashAttentionMetadataBuilder.build.<locals>.<dictcomp>rv   ry   rw   rx   r_   rz   r{   Tr`   rf   rg   ra   rb   rh   ri   rc   rd   re   r0   )!anyr   Zinter_data_listr   r   r   r   maxrv   r   r   r   rx   listr   ry   r   r   rd   rw   r   r!   r]   r[   formatr    r   Z
pin_memorylongZint32r   r   r9   )ru   r_   r   r   r   r   r   r   Zuse_captured_graphrf   Zdecode_query_lensrg   ra   rb   rx   rh   ri   r   rd   rc   r`   Zslot_mapping_tensorZquery_start_loc_tensorZseq_start_loc_tensorZplaceholder_index_mapsr0   r0   r1   build  s   




	
z#FlashAttentionMetadataBuilder.buildN)r   r$   )rU   rV   rW   r   r   rX   r   r[   r   r]   r^   r   r   r0   r0   r0   r1   r;   8  s,    

<

r;   c                   @   s   e Zd ZdZdejddfdededededee	e  d	ee d
e
dee de
dee
 deddfddZ		ddedejdejdejdejdedeej deej dejfddZdS )r5   a  
    If the input tensors contain prompt tokens, the layout is as follows:
    |<--------------- num_prefill_tokens ----------------->|	
    |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->|

    Otherwise, the layout is as follows:	
    |<----------------- num_decode_tokens ------------------>|	
    |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->|

    Generation tokens can contain padding when cuda-graph is used.
    Currently, prompt tokens don't contain any padding.

    The prompts might have different lengths, while the generation tokens
    always have length 1.

    If chunked prefill is enabled, prefill tokens and decode tokens can be
    batched together in a flattened 1D query.

    |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->|
    |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->|

    Currently, cuda graph is disabled for chunked prefill, meaning there's no
    padding between prefill and decode tokens.
    NF	num_headsrA   scaler@   alibi_slopesr   kv_cache_dtypelogits_soft_cap	attn_typekv_sharing_target_layer_name	use_iroper'   c                 C   s  |
d urt d|rtd || _|| _t|| _|| _|d ur)tj	|tj
d}|| _|d ur6|d dfnd| _|| _t| jd ud| _t| jr`| jdrSt s`t d	| j d
t  d|d u rfd}|| _| j| j | _t }||vrtd| d| d|	| _d S )Nz5KV sharing is not supported in V0 FLASH_ATTN backend.z_Using irope in V0 is not supported yet, it will fall back to global attention for long context.)r   rI   r   )r   r   )Zrequires_alibifp8z FlashAttention does not support z, kv-cache on this device (FA supports fp8 = z).z
Head size z? is not supported by FlashAttention. Supported head sizes are: .)NotImplementedErrorloggerwarningr   rA   floatr   r@   r]   ZtensorZfloat32r   r   r   r   vllm_flash_attn_versionr   
startswithr   r   Znum_queries_per_kvr%   r2   rD   r   )ru   r   rA   r   r@   r   r   r   r   r   r   r   Zsupport_head_sizesr0   r0   r1   r     sZ   




zFlashAttentionImpl.__init__layerquerykeyvaluerQ   attn_metadataoutputoutput_scalec	           )      C   s  |dusJ d|durt dt r|jtjkr'|jdkr#|jdks'J d| j}	|	tj	kr6|j
s6td|	tjkrB|jsBtd| j}
| j}| j}| j}| j}|
d}|r_t s_t d	| d
kr|d
 }|d }|	tj	kr|dur|dur|	tjkr|j}n|j}tjj|||d
 |d | |
|j|j |r|tj}|tj}|tj}|r|j\}}}t |!||| f" |j#\}}|!|||f}t$||	\}}}||d }||d }|d| }|d| }|jd
 |ksJ |jd
 |ksJ |j% }r?| d
ks |j&du s |j& d
krt'|d|	\}}} }!|d| }|d| }|rs|j\}"}#}t |!|"|#| f" |j\}}|!|"|#|f}t |!|"|#| f" |j\}}|!|"|#|f}|jd
 d |jd f}$t(d%i d|d|d|d|d| d|d|!d|dt)|	d|d|d|d|d| j*d|j#+|$d|j+|$d|j+|$ nx|	tj,ksJ d|j-dusJ |j.dusJ t/|j-}%|j.jd
 d |jd f}$t(d%i d|d|d|d|j.d|j0d|j1d|%d|ddd|d|d |j&d|d|d| j*d|j#+|$d|j+|$d|j+|$ |j2 }&r|&j3dusMJ |&j3dkr|	tj,ks]J d!|&j.duseJ |&j.jd
 d |jd f}$t(d%i d|d|d|d|&j.d|&j3d|&j1d|&j4d|ddd|d|d|d |&j&d|d| j*d|j#+|$d|j+|$d|j+|$ |S t5|&d"|	\}'}}(|'jd
 |jd# f}$t6|7d|||(|'|d||||7d| j*|j#+|$|j+|$|j+|$d$ |S )&a  Forward pass with FlashAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            output: shape = [num_tokens, num_heads, head_size]
            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
                NOTE: kv_cache will be an empty tensor with shape [0]
                for profiling run.
            attn_metadata: Metadata for attention.
        NOTE: It in-place updates the output tensor.
        NOTE: FP8 quantization, flash-attn expect the size of
              {q,k,v}_descale to be (num_sequences, num_kv_heads).
              We use torch's .expand() to avoid duplicating values
        NzOutput tensor must be provided.zEfused output quantization is not yet supported for FlashAttentionImplg      ?zJkey/v_scale is only supported in FlashAttention 3 with base dtype bfloat16z?Encoder attention requires setting encoder metadata attributes.zUEncoder/decoder cross-attention requires setting cross-attention metadata attributes.r   z<FlashAttention does not support FP8 kv-cache on this device.r   rI   TqkvZcu_seqlens_qZcu_seqlens_kZmax_seqlen_qZmax_seqlen_ksoftmax_scalecausalwindow_sizer   softcapout
fa_version	q_descale	k_descale	v_descalez/Only decoder-only models support prefix cachingZ	seqused_kr   z9Only decoder-only models support max_decode_query_len > 1F)r   Zk_cacheZv_cacher   Zcache_seqlensr   r   r   r   r   r   r   r   r   r   r0   )8r   r   r   r]   Zbfloat16Z_k_scale_floatZ_v_scale_floatr   r   ENCODERr   AttributeErrorENCODER_DECODERr   r   r   r   r   r   r   Znumelrq   ry   rJ   Z_C_cache_opsZreshape_and_cache_flashflattenZ_k_scaleZ_v_scaleviewZfloat8_e4m3fnr   Zscaled_fp8_quantZreshape
contiguousZ_q_scaler   r|   rd   _get_query_key_seq_metadatar"   _get_causal_optionr   expandDECODERr_   rh   r   rf   r`   r}   rg   rb   r   r#   Z	unsqueeze))ru   r   r   r   r   rQ   r   r   r   r   r   r   r   r   r   Zfp8_attentionZ	key_cacheZvalue_cacheZupdated_slot_mappingZ
num_tokensr   rA   _Znum_prefill_query_tokensZnum_prefill_kv_tokensZnum_decode_query_tokensZdecode_queryZdecode_outputZprefill_outputZprefill_metaZq_seq_start_locZ	q_seq_lenZk_seq_start_locZ	k_seq_lenZnum_kv_tokensr@   Zdescale_shapemax_seq_lenZdecode_metaZseq_lens_argZblock_tables_argr0   r0   r1   forwardM  s  







	

	
	
-



zFlashAttentionImpl.forward)NN)rU   rV   rW   r~   r   r   r[   r   r   r   r\   rX   r   r   r]   r^   r9   r   r0   r0   r0   r1   r5     sh    "
	

>	
r5   r   r   r'   c                 C   s   |t jkr|r| j}n| j}| j|| j|fS |t jkr-|r!| j}n| j}| j|| j| jfS |t jkr<| j| j| j| jfS |t j	krQ|sGJ d| j| j| j| jfS t
dt| )aO  
    Returns sequence metadata for key and query based on the specified 
    attention type and whether input is a prompt.

    This function computes the starting locations and maximum sequence lengths 
    for key and query sequences for different attention types.

    Args:
        attn_metadata: The attention metadata object
        is_prompt (bool): A flag indicating if the input is a prompt
        attn_type (AttentionType): The type of attention being used.

    Returns:
        tuple: A tuple containing four integers:
            - Starting location for the query sequence.
            - Maximum sequence length for the query sequence.
            - Starting location for the key sequence.
            - Maximum sequence length for the key sequence.

    Raises:
        AttributeError: If an invalid attention type is provided.
    z.Should not have decode for encoder only model.zInvalid attention type )r   r   ra   rb   ri   r   rn   ro   r   ENCODER_ONLYr   r\   )r   r   r   r   r0   r0   r1   r   S  s6   



r   c                 C   s    | t jkp| t jkp| t jk S )a  
    Determine whether the given attention type is suitable for causal 
    attention mechanisms.

    Args:
        attn_type (AttentionType): The type of attention being evaluated

    Returns:
        bool: Returns `True` if the attention type is suitable for causal 
        attention (i.e., not encoder, encoder-only, or encoder-decoder), 
        otherwise returns `False`.
    )r   r   r   r   )r   r0   r0   r1   r     s
   
r   )>r~   collectionsr   dataclassesr   	itertoolsr   typingr   r   r   r   r	   r
   r]   Zvllmr   rJ   Z vllm.attention.backends.abstractr   r   r   r   r   r   r   Zvllm.attention.backends.utilsr   r   r   r   r   r   r   r   r   Zvllm.attention.utils.fa_utilsr   r   Zvllm.loggerr   Zvllm.multimodalr   Z
vllm.utilsr    r!   Zvllm.vllm_flash_attnr"   r#   Zvllm.worker.model_runnerr$   rU   r   r%   r9   r;   r5   rX   r\   tupler   r   r0   r0   r0   r1   <module>   sH    $,? 
Q F  X
A
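

# Illustrative sketch (not part of the upstream module): a minimal,
# CPU-only demonstration of the two pure helpers defined above. The
# concrete numbers (1024 blocks, block_size 16, 8 KV heads, head_size 128)
# are hypothetical and chosen only to show the paged KV-cache layout.
if __name__ == "__main__":
    # The cache stores key and value planes first, then the paged
    # [num_blocks, block_size, num_kv_heads, head_size] layout that
    # reshape_and_cache_flash writes into and the decode kernels read.
    shape = FlashAttentionBackend.get_kv_cache_shape(num_blocks=1024,
                                                     block_size=16,
                                                     num_kv_heads=8,
                                                     head_size=128)
    assert shape == (2, 1024, 16, 8, 128)

    # Only decoder self-attention runs with a causal mask; encoder,
    # encoder-only, and encoder/decoder cross-attention are bidirectional.
    assert _get_causal_option(AttentionType.DECODER)
    assert not _get_causal_option(AttentionType.ENCODER_DECODER)
    print("layout:", shape)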