o
    )i'                     @   sV  d dl mZmZ d dlmZ d dlmZmZ d dlm	Z	m
Z
mZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ e	rLd dlmZmZmZ G d	d
 d
ZG dd deZeG dd dZededZ G dd deee  Z!G dd deee  Z"G dd deZ#G dd deee  Z$G dd de$e  ee  Z%de&de'fddZ(dS )    )ABCabstractmethod)contextmanager)	dataclassfields)TYPE_CHECKINGAnyDictGenericListOptionalProtocolSetTupleTypeTypeVarN)
GroupShape)MultiModalPlaceholderMap)ModelRunnerBaseModelRunnerInputBaseModelRunnerInputBuilderBasec                   @   s    e Zd ZdZdZdZdZdZdS )AttentionTypezO
    Attention type.
    Use string to be compatible with `torch.compile`.
    decoderencoderZencoder_onlyZencoder_decoderN)__name__
__module____qualname____doc__DECODERZENCODERZENCODER_ONLYZENCODER_DECODER r   r   l/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/attention/backends/abstract.pyr      s    r   c                   @   sZ  e Zd ZU dZdZeed< eede	fddZ
eeded fdd	Zeeded
 fddZeeded fddZed*ddZeeded fddZeedededededeedf f
ddZedeedf fddZeedejdejd ejdd!fd"d#Zeed$eej d%ejdd!fd&d'Zedee	e	f fd(d)Zd!S )+AttentionBackendz&Abstract class for attention backends.Faccept_output_bufferreturnc                   C      t NNotImplementedErrorr   r   r   r    get_name,      zAttentionBackend.get_nameAttentionImplc                   C   r$   r%   r&   r   r   r   r    get_impl_cls1   r)   zAttentionBackend.get_impl_clsAttentionMetadatac                   C   r$   r%   r&   r   r   r   r    get_metadata_cls6   r)   z!AttentionBackend.get_metadata_clsAttentionStatec                   C   r$   r%   r&   r   r   r   r    get_state_cls;   r)   zAttentionBackend.get_state_clsc                 O   s   |   |i |S r%   )r-   )clsargskwargsr   r   r    make_metadata@   s   zAttentionBackend.make_metadataAttentionMetadataBuilderc                   C   r$   r%   r&   r   r   r   r    get_builder_clsD   r)   z AttentionBackend.get_builder_cls
num_blocks
block_sizenum_kv_heads	head_size.c                 C   r$   r%   r&   )r6   r7   r8   r9   r   r   r    get_kv_cache_shapeI      z#AttentionBackend.get_kv_cache_shapec                   C   r$   r%   r&   r   r   r   r    get_kv_cache_stride_orderS      z*AttentionBackend.get_kv_cache_stride_ordersrc_kv_cachedst_kv_cache
src_to_dstNc                 C   r$   r%   r&   )r>   r?   r@   r   r   r    swap_blocksW      zAttentionBackend.swap_blocks	kv_cachessrc_to_distsc                 C   r$   r%   r&   )rC   rD   r   r   r    copy_blocks`      zAttentionBackend.copy_blocksc                 C   s   | j | jfS r%   )r   r   )r0   r   r   r    full_cls_nameh   s   zAttentionBackend.full_cls_name)r#   r,   )r   r   r   r   r"   bool__annotations__staticmethodr   strr(   r   r+   r-   r/   classmethodr3   r5   intr   r:   r<   torchTensorrA   r   rE   tuplerG   r   r   r   r    r!   %   sr   
 
r!   c                   @   s   e Zd ZU dZeed< eed< eed< ejed< ee	e
ejf  ed< eed< eeded  fd	d
Zeeded  fddZ	ddeee
  de	e
ef fddZdS )r,   z;Attention metadata for prefill and decode batched together.Znum_prefillsZnum_prefill_tokensZnum_decode_tokensZslot_mappingZ"multi_modal_placeholder_index_mapsZenable_kv_scales_calculationr#   c                 C      dS )zOReturn the attention metadata that's required to run prefill
        attention.Nr   selfr   r   r    prefill_metadata      z"AttentionMetadata.prefill_metadatac                 C   rQ   )zNReturn the attention metadata that's required to run decode
        attention.Nr   rR   r   r   r    decode_metadata   rU   z!AttentionMetadata.decode_metadataNskip_fieldsc                    s&   du rt   fddt D S )z6Similar to dataclasses.asdict, but avoids deepcopying.Nc                    s&   i | ]}|j vr|j t |j qS r   )namegetattr).0fieldrS   rW   r   r    
<dictcomp>   s    z5AttentionMetadata.asdict_zerocopy.<locals>.<dictcomp>)setr   r\   r   r\   r    asdict_zerocopy   s
   z!AttentionMetadata.asdict_zerocopyr%   )r   r   r   r   rM   rI   rN   rO   r   r	   rK   r   ZIndexMaprH   propertyr   rT   rV   r   r   r_   r   r   r   r    r,   m   s,   
 



r,   T)boundc                
   @   s   e Zd ZdZedddZeedefddZed	ed
dfddZ	e	dd	ede
d
efddZe	ddede
d
eeef fddZe	ddeeef dede
d
dfddZedddZdS ) r.   z\Holds attention backend-specific objects reused during the
    lifetime of the model runner.runnerr   c                 C      d S r%   r   )rS   rc   r   r   r    __init__   r=   zAttentionState.__init__max_batch_sizec                 c   s    dV  dS )z0Context manager used when capturing CUDA graphs.Nr   )rS   rf   r   r   r    graph_capture   s   
zAttentionState.graph_capture
batch_sizer#   zAttentionState[T]c                 C   rQ   )z5Clone attention state to save in CUDA graph metadata.Nr   )rS   rh   r   r   r    graph_clone   r)   zAttentionState.graph_cloneFis_encoder_decoder_modelc                 C   rQ   )z<Get attention metadata for CUDA graph capture of batch_size.Nr   )rS   rh   rj   r   r   r    $graph_capture_get_metadata_for_batch   rF   z3AttentionState.graph_capture_get_metadata_for_batchattn_metadatac                 C   rQ   )z<Get attention-specific input buffers for CUDA graph capture.Nr   )rS   rl   rj   r   r   r    get_graph_input_buffers   rF   z&AttentionState.get_graph_input_buffersinput_buffersNc                 C   rQ   )z9In-place modify input buffers dict for CUDA graph replay.Nr   )rS   rn   rl   rj   r   r   r    prepare_graph_input_buffers   rB   z*AttentionState.prepare_graph_input_buffersmodel_inputr   c                 C   rQ   )zPrepare state for forward pass.Nr   )rS   rp   r   r   r    begin_forward   r)   zAttentionState.begin_forward)rc   r   )F)rp   r   r#   N)r   r   r   r   r   re   r   rM   rg   ri   rH   ra   rk   r	   rK   r   rm   ro   rq   r   r   r   r    r.      sP    

r.   c                   @   sV   e Zd ZdZedddZeddd	Zed
ee dee dedede	f
ddZ
dS )r4   z/Abstract class for attention metadata builders.input_builderr   r#   Nc                 C   r$   )z?Create the builder, remember some configuration and parameters.r&   )rS   rr   r   r   r    re      r)   z!AttentionMetadataBuilder.__init__c                 C   r$   )zPrepare for one batch.r&   rR   r   r   r    prepare   r)   z AttentionMetadataBuilder.prepareseq_lens
query_lenscuda_graph_pad_sizerh   c                 C   r$   )z0Build attention metadata with on-device tensors.r&   )rS   rt   ru   rv   rh   r   r   r    build   s   zAttentionMetadataBuilder.build)rr   r   r#   N)r#   N)r   r   r   r   r   re   rs   r   rM   ra   rw   r   r   r   r    r4      s    r4   c                   @   sr   e Zd ZU ejed< ejed< ejed< eed< eed< ejed< dejdejd	ejd
ejdedejfddZdS )AttentionLayerZ_q_scaleZ_k_scaleZ_v_scaleZ_k_scale_floatZ_v_scale_floatZ_prob_scalequerykeyvaluekv_cacherl   r#   c                 C   rd   r%   r   )rS   ry   rz   r{   r|   rl   r   r   r    forward   r;   zAttentionLayer.forwardN)	r   r   r   rN   rO   rI   floatr,   r}   r   r   r   r    rx      s(   
 



rx   c                   @   s   e Zd Zedddddejdfdedededee dee	e  dee d	e
d
ee de
dee
 ddfddZe		ddedejdejdejdejdedeej deej dejfddZdejdedefddZdS ) r*   Nauto	num_headsr9   scaler8   alibi_slopessliding_windowkv_cache_dtypelogits_soft_cap	attn_typekv_sharing_target_layer_namer#   c                 C   r$   r%   r&   )rS   r   r9   r   r8   r   r   r   r   r   r   r   r   r    re     s   zAttentionImpl.__init__layerry   rz   r{   r|   rl   outputoutput_scalec	           	      C   r$   r%   r&   )	rS   r   ry   rz   r{   r|   rl   r   r   r   r   r    r}        zAttentionImpl.forwarddtypestaticgroup_shapec                 C   rQ   )a  
        Does this attention implementation support fused output quantization.
        This is used by the AttnFusionPass to only fuse output quantization
        onto implementations that support it.

        TODO(luka) merge parameters into QuantDescriptor
        :param dtype: quantized dtype
        :param static: static or dynamic quantization
        :param group_shape: quant group shape.
        :return: is fusion supported for this type of quantization
        Fr   )rS   r   r   r   r   r   r    fused_output_quant_supported#  s   z*AttentionImpl.fused_output_quant_supportedNN)r   r   r   r   r   r   rM   r~   r   r   rK   re   rx   rN   rO   ra   r}   r   rH   r   r   r   r   r   r    r*     sr    
	
		
r*   c                   @   sZ   e Zd Ze		ddedejdejdejdejdedeej d	eej d
ejfddZ	dS )MLAAttentionImplNr   hidden_states_or_cqkv_c_normedk_per|   rl   r   r   r#   c	           	      C   r$   r%   r&   )	rS   r   r   r   r   r|   rl   r   r   r   r   r    r}   5  r   zMLAAttentionImpl.forwardr   )
r   r   r   r   rx   rN   rO   ra   r   r}   r   r   r   r    r   3  s.    		
r   r   r#   c                 C   s   | dkS )Nr   r   )r   r   r   r    is_quantized_kv_cacheD  s   r   ))abcr   r   
contextlibr   dataclassesr   r   typingr   r   r	   r
   r   r   r   r   r   r   r   rN   Z9vllm.model_executor.layers.quantization.utils.quant_utilsr   Zvllm.multimodalr   Zvllm.worker.model_runner_baser   r   r   r   r!   r,   ra   r.   r4   rx   r*   r   rK   rH   r   r   r   r   r    <module>   s(   4H820