"""Attention backend utils"""
from collections import defaultdict
from contextlib import contextmanager
from dataclasses import dataclass
from itertools import accumulate
from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
                    TypeVar, Union)

import numpy as np
import torch

from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder,
                            AttentionState)
from vllm.attention.backends.abstract import AttentionType
from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.multimodal import MultiModalPlaceholderMap
from vllm.utils import async_tensor_h2d, make_tensor_with_pad

logger = init_logger(__name__)

if TYPE_CHECKING:
    from vllm.worker.model_runner_base import ModelRunnerBase

# Error string for unsupported encoder/decoder attention scenarios on ROCm.
STR_NOT_IMPL_ENC_DEC_ROCM_HIP = ("ROCm/HIP is not currently supported "
                                 "with encoder/decoder models.")

# Dummy slot index used to pad slot mappings (e.g. during profiling or for
# tokens masked out by a sliding window).
PAD_SLOT_ID = -1

# Switch to the numpy implementation of compute_slot_mapping once a sequence
# has at least this many slots to fill; below this the pure-Python loop wins.
_COMPUTE_SLOT_MAPPING_NUMPY_NUMEL = 256

if TYPE_CHECKING:
    from vllm.worker.model_runner import ModelInputForGPUBuilder


def is_block_tables_empty(block_tables: Union[None, Dict]) -> bool:
    """
    Check if block_tables is None or a dictionary with all None values.
    """
    if block_tables is None:
        return True
    return (isinstance(block_tables, dict)
            and all(value is None for value in block_tables.values()))
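
# Illustrative usage (an assumption, not part of the original module): a
# profiling run passes either no block tables at all or a mapping whose values
# are all None, and both cases count as "empty":
#
#   assert is_block_tables_empty(None)
#   assert is_block_tables_empty({0: None, 1: None})
#   assert not is_block_tables_empty({0: [7, 1]})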
   
def compute_slot_mapping_start_idx(is_prompt: bool, query_len: int,
                                   context_len: int, sliding_window: int):
    """
    Compute the start index of slot mapping.
    """
    start_idx = 0
    if is_prompt and sliding_window is not None:
        start_idx = max(0, query_len - sliding_window)
    return start_idx
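
# Worked example (illustrative, not from the original module): a 10-token
# prompt with sliding_window=8 only needs KV slots for its last 8 positions,
# so the mapping starts at index max(0, 10 - 8) == 2:
#
#   assert compute_slot_mapping_start_idx(
#       is_prompt=True, query_len=10, context_len=0, sliding_window=8) == 2
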
def _compute_slot_mapping_python(slot_mapping: List[int],
                                 block_table: List[int], range_start: int,
                                 range_end: int, block_size: int):
    for i in range(range_start, range_end):
        block_number = block_table[i // block_size]
        block_offset = i % block_size
        slot = block_number * block_size + block_offset
        slot_mapping.append(slot)


def _compute_slot_mapping_numpy(slot_mapping: List[int],
                                block_table: List[int], range_start: int,
                                range_end: int, block_size: int):
    block_table_array = np.array(block_table)
    idx = np.arange(range_start, range_end)
    block_offset = idx % block_size
    idx //= block_size
    seq_slot_mapping_array = block_table_array[idx]
    seq_slot_mapping_array *= block_size
    seq_slot_mapping_array += block_offset
    slot_mapping.extend(seq_slot_mapping_array)


def compute_slot_mapping(is_profile_run: bool, slot_mapping: List[int],
                         seq_id: int, seq_len: int, context_len: int,
                         start_idx: int, block_size: int,
                         block_tables: Dict[int, List[int]]):
    """
    Compute slot mapping.
    """
    if is_profile_run:
        # During memory profiling, the block tables are not initialized yet.
        # In this case, we just use a dummy slot mapping.
        # In embeddings, the block tables are {seq_id: None}.
        slot_mapping.extend([PAD_SLOT_ID] * seq_len)
        return

    # Mask the [0, start_idx) tokens of the prompt with PAD_SLOT_ID, where
    # start_idx is max(0, seq_len - sliding_window). For example, if the
    # prompt len is 10, sliding window is 8, and block size is 4, the first
    # two tokens are masked and the slot mapping will be
    # [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
    padding_mask_len = max(0, start_idx - context_len)
    slot_mapping.extend([PAD_SLOT_ID] * padding_mask_len)

    range_start = max(start_idx, context_len)
    range_end = seq_len
    numel = range_end - range_start
    block_table = block_tables[seq_id]

    # The numpy implementation is faster than the Python loop once there are
    # many elements; below the threshold the loop wins.
    if numel < _COMPUTE_SLOT_MAPPING_NUMPY_NUMEL:
        _compute_slot_mapping_python(slot_mapping, block_table, range_start,
                                     range_end, block_size)
    else:
        _compute_slot_mapping_numpy(slot_mapping, block_table, range_start,
                                    range_end, block_size)

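# Worked example (illustrative, not from the original module): with
# block_size=4 and block_table=[7, 1], token i lives in physical block
# block_table[i // 4] at offset i % 4, so tokens 0..7 map to slots 28..31
# followed by 4..7:
#
#   slots: List[int] = []
#   _compute_slot_mapping_python(slots, [7, 1], 0, 8, 4)
#   assert slots == [28, 29, 30, 31, 4, 5, 6, 7]
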
TAttentionMetadata = TypeVar("TAttentionMetadata", bound=AttentionMetadata)


class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):

    _metadata_cls: Type[TAttentionMetadata]

    def __init__(self, input_builder: "ModelInputForGPUBuilder"):
        self.input_builder = input_builder
        self.runner = input_builder.runner

        self.sliding_window = input_builder.sliding_window
        self.block_size = input_builder.block_size

    def prepare(self):
        self.slot_mapping: List[int] = []
        self.prefill_seq_lens: List[int] = []
        self.context_lens: List[int] = []
        self.block_tables: List[List[int]] = []
        self.curr_seq_lens: List[int] = []
        self.multimodal_placeholder_maps: Dict[
            str,
            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
        self.num_prefills = 0
        self.num_prefill_tokens = 0
        self.num_decode_tokens = 0
    def _add_seq_group(
            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
            chunked_prefill_enabled: bool):
        is_prompt = inter_data.is_prompt
        block_tables = inter_data.block_tables

        for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
             curr_sliding_window_block) in zip(
                 inter_data.seq_ids,
                 [len(t) for t in inter_data.input_tokens],
                 inter_data.orig_seq_lens, inter_data.seq_lens,
                 inter_data.query_lens, inter_data.context_lens,
                 inter_data.curr_sliding_window_blocks):
            self.context_lens.append(context_len)
            if is_prompt:
                mm_maps = inter_data.multi_modal_placeholder_maps
                if mm_maps:
                    for modality, placeholders in mm_maps.items():
                        self.multimodal_placeholder_maps[modality].extend(
                            placeholders)
                self.num_prefills += 1
                self.num_prefill_tokens += token_len
                self.prefill_seq_lens.append(seq_len)
            else:
                assert query_len == 1, (
                    "seq_len: {}, context_len: {}, query_len: {}".format(
                        seq_len, context_len, query_len))
                self.num_decode_tokens += query_len
                self.curr_seq_lens.append(curr_seq_len)

            # Compute the block table. During decode (and during chunked
            # prefill) only the blocks inside the current sliding window are
            # kept, as illustrated in the sketch after this method.
            block_table = []
            if inter_data.prefix_cache_hit:
                block_table = block_tables[seq_id]
            elif ((chunked_prefill_enabled or not is_prompt)
                  and block_tables is not None):
                if curr_sliding_window_block == 0:
                    block_table = block_tables[seq_id]
                else:
                    block_table = block_tables[seq_id][
                        -curr_sliding_window_block:]
            self.block_tables.append(block_table)

            # Compute slot mapping.
            is_profile_run = is_block_tables_empty(block_tables)
            start_idx = compute_slot_mapping_start_idx(is_prompt, query_len,
                                                       context_len,
                                                       self.sliding_window)
            compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id,
                                 seq_len, context_len, start_idx,
                                 self.block_size, inter_data.block_tables)
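
    # Worked example (illustrative, not from the original module): with
    # block_size=16 and a 32-token sliding window, curr_sliding_window_block
    # is 2, so a decode step keeps only the last two entries of the
    # sequence's block table, e.g. [0, 1, 2, 3] -> [2, 3].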
    def build(self, seq_lens: List[int], query_lens: List[int],
              cuda_graph_pad_size: int, batch_size: int):
        """Build attention metadata with on-device tensors.

        Args:
            seq_lens: The maybe padded sequence lengths of the input sequences.
            query_lens: The query lengths of the input sequences.
            cuda_graph_pad_size: The padding size for cuda graph.
                                 -1 if cuda graph is not used.
            batch_size: The maybe padded batch size.
        """
        for inter_data in self.input_builder.inter_data_list:
            self._add_seq_group(inter_data,
                                self.input_builder.chunked_prefill_enabled)

        device = self.runner.device
        use_captured_graph = cuda_graph_pad_size != -1

        max_query_len = max(query_lens)
        max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
        max_decode_seq_len = max(self.curr_seq_lens, default=0)
        num_decode_tokens = self.num_decode_tokens
        query_start_loc = list(accumulate(query_lens, initial=0))
        seq_start_loc = list(accumulate(seq_lens, initial=0))

        if use_captured_graph:
            # Pad the accumulated entries out to the captured batch size.
            self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
            self.block_tables.extend([[]] * cuda_graph_pad_size)
            num_decode_tokens = batch_size - self.num_prefill_tokens

            # The shape of graph_block_tables is
            # [max batch size, max context len // block size].
            input_block_tables = self.runner.graph_block_tables[:batch_size]
            for i, block_table in enumerate(self.block_tables):
                if block_table:
                    input_block_tables[i, :len(block_table)] = block_table
            block_tables = torch.from_numpy(input_block_tables).to(
                device, non_blocking=True)
        else:
            block_tables = make_tensor_with_pad(
                self.block_tables,
                pad=0,
                dtype=torch.int,
                device=device,
            )
        assert max_query_len > 0, "query_lens: {}".format(query_lens)

        assert device is not None
        context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int,
                                               device, self.runner.pin_memory)
        seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
                                           self.runner.pin_memory)
        slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long,
                                               device, self.runner.pin_memory)
        query_start_loc_tensor = async_tensor_h2d(query_start_loc,
                                                  torch.int32, device,
                                                  self.runner.pin_memory)
        seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
                                                device, self.runner.pin_memory)
        placeholder_index_maps = {
            modality: placeholder_map.index_map()
            for modality, placeholder_map in
            self.multimodal_placeholder_maps.items()
        }

        return self._metadata_cls(  # type: ignore
            num_prefills=self.num_prefills,
            slot_mapping=slot_mapping_tensor,
            multi_modal_placeholder_index_maps=placeholder_index_maps,
            enable_kv_scales_calculation=True,
            num_prefill_tokens=self.num_prefill_tokens,
            num_decode_tokens=num_decode_tokens,
            seq_lens=seq_lens,
            seq_lens_tensor=seq_lens_tensor,
            max_query_len=max_query_len,
            max_prefill_seq_len=max_prefill_seq_len,
            max_decode_seq_len=max_decode_seq_len,
            query_start_loc=query_start_loc_tensor,
            seq_start_loc=seq_start_loc_tensor,
            context_lens_tensor=context_lens_tensor,
            block_tables=block_tables,
            use_cuda_graph=use_captured_graph,
        )


class CommonAttentionState(AttentionState):

    def __init__(self, runner: "ModelRunnerBase"):
        self.runner = runner
        self._is_graph_capturing = False
zCommonAttentionState.__init__max_batch_sizec                 c   st    d| _ tj|fttj| jjd| _tj|tj	| jjd| _
t| jjj| jjd| _d V  d| _ | `| `
| `d S )NT)rh   ri   )ri   F)r   rx   fullrB   r|   rJ   ri   _graph_slot_mappingZonesr}   _graph_seq_lensry   rv   rz   _graph_block_tables)rK   r   r   r   r!   graph_capture*  s*   z"CommonAttentionState.graph_capturerc   returnc                 C   s   | j sJ | | jS r   )r   	__class__rJ   )rK   rc   r   r   r!   graph_clone@  s   
z CommonAttentionState.graph_cloneFis_encoder_decoder_modelc                 C   s   | j sJ | jjjdi ddddd|d| jd | dd ddd	d d
| jd | ddddddd| jjdd dd dd d| jd | dd}|rn| jj dv sgJ d| jj  d| j	||d |S )NrQ   r   rR   rS   r/   rk   rl   Tr\   rm   rn   rZ   Zmax_decode_query_lenro   rp   rq   rr   rs   r   rt   XFORMERS
FLASH_ATTNZ
ROCM_FLASH[Expected attn_backend name to be either 'XFORMERS','ROCM_FLASH', or 'FLASH_ATTN', but got '')rc   attn_metadatar   )
r   rJ   attn_backendZmake_metadatar   r   max_seq_len_to_capturer   get_name+_update_captured_metadata_for_enc_dec_model)rK   rc   r   r   r   r   r!   $graph_capture_get_metadata_for_batchD  s`   
	


z9CommonAttentionState.graph_capture_get_metadata_for_batchc                 C   sT   |j |jj|jjd}|r(| jj dv s!J d| jj  d| j||d |S )N)r/   rm   r   r   r   r   )r   input_buffers)r/   decode_metadatarm   r   rJ   r   r   /_add_additional_input_buffers_for_enc_dec_model)rK   r   r   r   r   r   r!   get_graph_input_buffersg  s    

z,CommonAttentionState.get_graph_input_buffersNc                 C   sl   |d j |jjdd |d j |jjdd |r4| jj dv s,J d| jj  d| || d S d S )Nrm   Trf   r   )r   r   zMExpected attn_backend name to be either 'XFORMERS' or 'FLASH_ATTN', but got 'r   )copy_r   rm   r   rJ   r   r   (_prepare_input_buffers_for_enc_dec_model)rK   r   r   r   r   r   r!   prepare_graph_input_buffers|  s$   

z0CommonAttentionState.prepare_graph_input_buffersc                 C   s   d S r   r   )rK   Zmodel_inputr   r   r!   begin_forward  s   z"CommonAttentionState.begin_forwardc                 C   s   t jg t jd |_t j|| j fdt jd |_t j|fdt jd |_	t j|fdt jd |_
| jj|_d|_dS )a{  
    def _update_captured_metadata_for_enc_dec_model(self, batch_size: int,
                                                    attn_metadata):
        """
        Updates the attention metadata parameters for CUDA graph capture in an
        encoder-decoder model.

        This method modifies attention-related tensors and metadata required
        for CUDA graph capture in encoder-decoder models. Specifically, it
        updates the cross-attention and encoder sequence tensors in the
        AttentionMetadata object.
        """
        # During the decode phase the cross_slot_mapping will be empty, so set
        # an empty tensor for CUDA graph capture.
        attn_metadata.cross_slot_mapping = torch.tensor(
            [], dtype=torch.int).cuda()
        attn_metadata.cross_block_tables = torch.full(
            (batch_size, self.runner.get_max_block_per_batch()),
            1,
            dtype=torch.int).cuda()
        attn_metadata.encoder_seq_lens = torch.full((batch_size, ),
                                                    1,
                                                    dtype=torch.int).cuda()
        attn_metadata.encoder_seq_lens_tensor = torch.full(
            (batch_size, ), 1, dtype=torch.int).cuda()
        attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture
        attn_metadata.num_encoder_tokens = 0
    def _add_additional_input_buffers_for_enc_dec_model(
            self, attn_metadata, input_buffers: Dict[str, Any]):
        """
        Saves additional input buffers specific to the encoder-decoder model
        from the attention metadata.

        This method extracts and stores encoder-decoder related input buffers
        from the `attn_metadata` into the `input_buffers` dictionary. The
        buffers include encoder sequence lengths, cross-slot mappings, and
        cross-block tables, which are essential for the encoder-decoder model
        during CUDA graph replay.
        """
        input_buffers["encoder_seq_lens_tensor"] = (
            attn_metadata.decode_metadata.encoder_seq_lens_tensor)
        input_buffers["cross_slot_mapping"] = (
            attn_metadata.decode_metadata.cross_slot_mapping)
        input_buffers["cross_block_tables"] = (
            attn_metadata.decode_metadata.cross_block_tables)
    def _prepare_input_buffers_for_enc_dec_model(self, attn_metadata,
                                                 input_buffers: Dict[str,
                                                                     Any]):
        """
        Populates input buffers with data from the encoder-decoder model's
        attention metadata.

        This method fills the input buffers with encoder-decoder specific
        tensors, copying attention-related metadata from `attn_metadata` into
        the corresponding buffers in the `input_buffers` dictionary for reuse
        during CUDA graph replay.
        """
        input_buffers["encoder_seq_lens_tensor"].copy_(
            attn_metadata.decode_metadata.encoder_seq_lens_tensor,
            non_blocking=True)
        input_buffers["cross_slot_mapping"].copy_(
            attn_metadata.decode_metadata.cross_slot_mapping,
            non_blocking=True)
        input_buffers["cross_block_tables"].copy_(
            attn_metadata.decode_metadata.cross_block_tables,
            non_blocking=True)

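# Illustrative capture loop (an assumption about the surrounding runner, not
# part of the original module): a runner exposing .device, .graph_block_tables
# and .attn_backend is expected to drive the state object roughly like this
# during CUDA graph capture:
#
#   state = CommonAttentionState(runner)
#   with state.graph_capture(max_batch_size):
#       for batch_size in capture_batch_sizes:
#           attn_metadata = state.graph_capture_get_metadata_for_batch(
#               batch_size)
#           ...  # run and capture the forward pass using attn_metadata
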
def is_all_encoder_attn_metadata_set(attn_metadata):
    """
    All attention metadata required for encoder attention is set.
    """
    return ((attn_metadata.encoder_seq_lens is not None)
            and (attn_metadata.encoder_seq_lens_tensor is not None)
            and (attn_metadata.max_encoder_seq_len is not None))

def is_all_cross_attn_metadata_set(attn_metadata):
    """
    All attention metadata required for enc/dec cross-attention is set.

    Superset of encoder attention required metadata.
    """
    return (attn_metadata.is_all_encoder_attn_metadata_set
            and (attn_metadata.cross_slot_mapping is not None)
            and (attn_metadata.cross_block_tables is not None))


def get_seq_len_block_table_args(
    attn_metadata,
    is_prompt: bool,
    attn_type: str,
) -> tuple:
    """
    The particular choice of sequence-length- and block-table-related
    attributes which should be extracted from attn_metadata is dependent
    on the type of attention operation.

    Decoder attn -> select entirely decoder self-attention-related fields
    Encoder/decoder cross-attn -> select encoder sequence lengths &
                                  cross-attn block-tables fields
    Encoder attn -> select encoder sequence lengths fields & no block tables

    Arguments:

    * attn_metadata: Attention metadata structure associated with attention op
    * is_prompt: True if prefill, False otherwise
    * attn_type: encoder attention, decoder self-attention,
                 encoder/decoder cross-attention

    Returns:

    * Appropriate sequence-lengths tensor
    * Appropriate max sequence-length scalar
    * Appropriate block tables (or None)
    """

    if attn_type == AttentionType.DECODER:
        # Decoder self-attention: choose max_seq_len based on whether this is
        # a prompt run.
        if is_prompt:
            max_seq_len = attn_metadata.max_prefill_seq_len
        else:
            max_seq_len = attn_metadata.max_decode_seq_len
        return (attn_metadata.seq_lens_tensor, max_seq_len,
                attn_metadata.block_tables)
    elif attn_type == AttentionType.ENCODER_DECODER:
        # Enc/dec cross-attention KVs match the encoder sequence length;
        # cross-attention uses special "cross" block tables.
        return (attn_metadata.encoder_seq_lens_tensor,
                attn_metadata.max_encoder_seq_len,
                attn_metadata.cross_block_tables)
    elif attn_type == AttentionType.ENCODER:
        # No block tables are associated with encoder attention.
        return (attn_metadata.encoder_seq_lens_tensor,
                attn_metadata.max_encoder_seq_len, None)
    else:
        raise AttributeError(f"Invalid attention type {str(attn_type)}")

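# Summary (illustrative, not from the original module) of the dispatch above:
#
#   DECODER         -> (seq_lens_tensor,
#                       max_prefill_seq_len | max_decode_seq_len,
#                       block_tables)
#   ENCODER_DECODER -> (encoder_seq_lens_tensor, max_encoder_seq_len,
#                       cross_block_tables)
#   ENCODER         -> (encoder_seq_lens_tensor, max_encoder_seq_len, None)
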
def get_num_prefill_decode_query_kv_tokens(
    attn_metadata,
    attn_type: str,
) -> Tuple[int, int, int]:
    """
    Calculate the number of prefill and decode tokens for query, key/value
    based on the attention metadata and the specified attention type.

    Args:
        attn_metadata (AttentionMetadata): Attention Metadata object.
        attn_type (AttentionType): The type of attention being used.
    Returns:
        Tuple[int, int, int]: A tuple containing three integers:
            - The number of prefill query tokens.
            - The number of prefill key/value tokens.
            - The number of decode query tokens.

    Raises:
        AssertionError: If the number of encoder tokens in `attn_metadata`
        is `None` when required for the calculations.
    """
    num_prefill_query_tokens = 0
    num_decode_query_tokens = 0
    num_prefill_kv_tokens = 0
    if attn_type == AttentionType.ENCODER:
        # Encoder attention is only invoked during the prefill phase.
        # The same input serves as both query and key.
        assert attn_metadata.num_encoder_tokens is not None
        num_prefill_query_tokens = attn_metadata.num_encoder_tokens
        num_prefill_kv_tokens = attn_metadata.num_encoder_tokens
        num_decode_query_tokens = 0
    elif attn_type == AttentionType.ENCODER_DECODER:
        assert attn_metadata.num_encoder_tokens is not None
        num_prefill_query_tokens = attn_metadata.num_prefill_tokens
        # The key/value side comes from the encoder (cross-attention), so its
        # token count matches the encoder sequence.
        num_prefill_kv_tokens = attn_metadata.num_encoder_tokens
        num_decode_query_tokens = attn_metadata.num_decode_tokens
    else:
        # Decoder self-attention supports chunked prefill.
        num_prefill_query_tokens = attn_metadata.num_prefill_tokens
        num_prefill_kv_tokens = attn_metadata.num_prefill_tokens
        num_decode_query_tokens = attn_metadata.num_decode_tokens

    return (num_prefill_query_tokens, num_prefill_kv_tokens,
            num_decode_query_tokens)


@dataclass
class MLADims:
    q_lora_rank: Optional[int]
    kv_lora_rank: int
    qk_nope_head_dim: int
    qk_rope_head_dim: int
    v_head_dim: int


def get_mla_dims(model_config: ModelConfig) -> MLADims:
    hf_text_config = model_config.hf_text_config

    return MLADims(
        q_lora_rank=getattr(hf_text_config, "q_lora_rank", None),
        kv_lora_rank=hf_text_config.kv_lora_rank,
        qk_nope_head_dim=hf_text_config.qk_nope_head_dim,
        qk_rope_head_dim=hf_text_config.qk_rope_head_dim,
        v_head_dim=hf_text_config.v_head_dim,
    )