o
    )i!                     @   s   d dl mZ d dlmZ d dlmZmZmZmZ d dl	Z	d dl
mZmZ d dlmZmZmZmZmZ d dlmZmZmZ G dd	 d	eZeG d
d deZG dd dee ZG dd dee ZG dd dee ZdS )    )contextmanager)	dataclass)ListOptionalTupleTypeN)AttentionTypeis_quantized_kv_cache)MLACommonBackendMLACommonImplMLACommonMetadataMLACommonMetadataBuilderMLACommonState)flash_mla_with_kvcacheget_mla_metadatais_flashmla_supportedc                   @   sv   e Zd ZedefddZeded fddZeded fdd	Zeded
 fddZ	eded fddZ
dS )FlashMLABackendreturnc                   C   s   dS )NZFLASHMLA r   r   r   l/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/attention/backends/flashmla.pyget_name      zFlashMLABackend.get_nameFlashMLAImplc                   C      t S N)r   r   r   r   r   get_impl_cls   r   zFlashMLABackend.get_impl_clsFlashMLAMetadatac                   C   r   r   )r   r   r   r   r   get_metadata_cls    r   z FlashMLABackend.get_metadata_clsFlashMLAMetadataBuilderc                   C   r   r   )r   r   r   r   r   get_builder_cls$   r   zFlashMLABackend.get_builder_clsFlashMLAStatec                   C   r   r   )r    r   r   r   r   get_state_cls(   r   zFlashMLABackend.get_state_clsN)__name__
__module____qualname__staticmethodstrr   r   r   r   r   r!   r   r   r   r   r      s    r   c                       sP   e Zd ZU dZeeejejf  ed< dZ	eej ed< e
 fddZ  ZS )r   Ndecode_tile_scheduler_metadatadecode_num_splitsc                    s$   t  j}|d ur| j|_| j|_|S r   )superdecode_metadatar'   r(   )selfr*   	__class__r   r   r*   3   s   z FlashMLAMetadata.decode_metadata)r"   r#   r$   r'   r   r   torchTensor__annotations__r(   propertyr*   __classcell__r   r   r,   r   r   -   s   
 
r   c                       sB   e Zd Z fddZdee dee dedef fddZ  ZS )	r   c                    *   t  j|i | | jj| jj| _d S r   r)   __init__runnerZmodel_configZget_num_attention_headsZparallel_confignum_q_heads)r+   argskwargsr,   r   r   r5   A      
z FlashMLAMetadataBuilder.__init__seq_lens
query_lenscuda_graph_pad_size
batch_sizec                    sB   t  ||||}|jdkrt|j|jd  | jd\|_|_|S Nr      )	r)   buildnum_decode_tokensr   seq_lens_tensorZnum_prefillsr7   r'   r(   )r+   r;   r<   r=   r>   mr,   r   r   rA   G   s   

zFlashMLAMetadataBuilder.build)r"   r#   r$   r5   r   intrA   r2   r   r   r,   r   r   ?   s    r   c                       sx   e Zd Z fddZedef fddZ	ddedef fd	d
Z	ddef fddZ		ddef fddZ
  ZS )r    c                    r3   r   r4   )r+   r8   kwdsr,   r   r   r5   Y   r:   zFlashMLAState.__init__max_batch_sizec                 #   sh    t tj|tj| jjd| jd\| _| _t	 
| d V  W d    n1 s)w   Y  | `| `d S )N)Zdtypedevicer@   )r   r.   ZonesZint32r6   rH   r7   &_graph_decoder_tile_scheduler_metadata_graph_decode_num_splitsr)   graph_capture)r+   rG   r,   r   r   rK   _   s   zFlashMLAState.graph_captureFr>   is_encoder_decoder_modelc                    s|   t  ||}|jdksJ t| jd | | jd\}}| j| | jd |d  | | j|_	| jd |d  |_
|S r?   )r)   $graph_capture_get_metadata_for_batchrB   r   Z_graph_seq_lensr7   rI   copy_rJ   r'   r(   )r+   r>   rL   metadataZdecoder_tile_scheduler_metadatar(   r,   r   r   rM   p   s$   z2FlashMLAState.graph_capture_get_metadata_for_batchc                    s*   t  ||}|jj|d< |jj|d< |S Nr'   r(   )r)   get_graph_input_buffersr*   r'   r(   )r+   attn_metadatarL   input_buffersr,   r   r   rQ      s   z%FlashMLAState.get_graph_input_buffersc                    s8   t  ||| |d |jj |d |jj d S rP   )r)   prepare_graph_input_buffersrN   r*   r'   r(   )r+   rS   rR   rL   r,   r   r   rT      s   
z)FlashMLAState.prepare_graph_input_buffers)F)r"   r#   r$   r5   r   rE   rK   boolrM   rQ   rT   r2   r   r   r,   r   r    W   s$    r    c                       s   e Zd Z	ddededededeee  dee ded	ee d
edee ddf fddZde	j
de	j
de	j
dede	j
f
ddZ  ZS )r   N	num_heads	head_sizescalenum_kv_headsalibi_slopessliding_windowkv_cache_dtypelogits_soft_cap	attn_typekv_sharing_target_layer_namer   c                    sv   t  j|||||||||	|
f
i | t sJ d|||g}t|r'td|	tjkr0tdt| jr9tdd S )Nz(FlashMLA is not supported on this devicezaFlashMLAImpl does not support one of the following: alibi_slopes, sliding_window, logits_soft_capz_Encoder self-attention and encoder/decoder cross-attention are not implemented for FlashMLAImplz,FlashMLA with FP8 KV cache not yet supported)	r)   r5   r   anyNotImplementedErrorr   ZDECODERr	   r\   )r+   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   Zmla_argsZunsupported_featuresr,   r   r   r5      s,   


zFlashMLAImpl.__init__q_nopeq_pekv_c_and_k_pe_cacherR   c           	      C   st   |  dksJ |j}|d usJ tj||gddd}t||d|j|j| j|j	|j
| jdd	\}}| |S )Nr   )dimr@   T)	qZk_cacheZblock_tableZcache_seqlensZ
head_dim_vZtile_scheduler_metadataZ
num_splitsZsoftmax_scaleZcausal)Znumelr*   r.   catZ	unsqueezer   Zblock_tablesrC   Zkv_lora_rankr'   r(   rX   Z
_v_up_proj)	r+   rb   rc   rd   rR   Zdecode_metarh   o_r   r   r   _forward_decode   s$   

zFlashMLAImpl._forward_decoder   )r"   r#   r$   rE   floatr   r   r&   r5   r.   r/   r   rl   r2   r   r   r,   r   r      sH    
	
&r   )
contextlibr   dataclassesr   typingr   r   r   r   r.   Z vllm.attention.backends.abstractr   r	   Z"vllm.attention.backends.mla.commonr
   r   r   r   r   Zvllm.attention.ops.flashmlar   r   r   r   r   r   r    r   r   r   r   r   <module>   s   I