o
    )ik                     @   s.  d dl Z d dlZd dlZd dl mZ d dlmZmZ d dlmZm	Z	m
Z
mZmZmZmZ d dlZd dlZd dlmZmZ d dlmZ erVd dlmZ d dlmZ d d	lmZ d dlmZ d d
lmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' e%e(Z)da*eG dd dZ+eG dd dZ,dej-de.dej-fddZ/de,de+de+fddZ0de1e, de+de1e+ fddZ2ed Z3G d!d" d"ej4Z5G d#d$ d$e j6ee3 Z7ej8d%d& Z9d'e:fd(d)Z;eG d*d+ d+Z<d,ed-e1e: d.e=d/ de>e:e<f fd0d1Z?d2e>e:e<f de<fd3d4Z@	 dSd5eAde+d6eAde+fd7d8ZBd9e:d:e=e7e3  d;e
e+ge+f de=e7e3  fd<d=ZCd9e:d>e=e d:e=e7e3  de=e fd?d@ZD	AdTde+dBeAdeEeAeAeAeAf fdCdDZF	AdTdEdFdGdHdBeAdeGfdIdJZHdKeej- dfdLeAd fgZId9e:dMe	dNe1eEe:e	e	f  de	fdOdPZJdMe	de	fdQdRZKdS )U    N)abstractmethod)	dataclassmake_dataclass)TYPE_CHECKINGAnyCallableClassVarGenericOptionalTypeVar)
VllmConfigget_layers_from_vllm_config)cdiv)AttentionImpl)SchedulerOutput)
InputBatch)AttentionBackend)	Attention)get_kv_connector_cache_layout)init_logger)AttentionSpecc                   @   s   e Zd ZU dZejed< ejed< 	 ejed< ejed< 	 ejed< 	 eed< 	 eed< 	 eed	< 	 ejed
< ejed< dZe	ed< dS )CommonAttentionMetadataz
    Per-batch attention metadata, shared across layers and backends.
    AttentionMetadataBuilder instances use it to construct per-layer metadata.
    
    For many of the tensors we keep both GPU and CPU versions.
    query_start_locquery_start_loc_cpuseq_lensseq_lens_cpunum_computed_tokens_cpunum_reqsnum_actual_tokensmax_query_lenblock_table_tensorslot_mappingTcausalN)
__name__
__module____qualname____doc__torchTensor__annotations__intr"   bool r,   r,   l/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/v1/attention/backends/utils.pyr   "   s&   
 






r   c                   @   s   e Zd ZU eed< eed< dS )UbatchSlicerequest_slicetoken_sliceN)r#   r$   r%   slicer)   r,   r,   r,   r-   r.   D   s   
 r.   r   r/   returnc                 C   s   | |j |jd  | |j   S )z
    Creates a new query_start_loc that corresponds to the requests in 
    request_slice.

    Note: This function creates a new tensor to hold the new query_start_locs.
    This will break cudagraph compatibility.
       )startstop)r   r/   r,   r,   r-   slice_query_start_locsJ   s   r6   ubatch_sliceattn_metadatac                 C   s   | j }| j}t|j|}t|dksJ dt| t|j|}|j| }|j| }|j| }|j	|j
 }	|j	|j
 }
ttt|dd |dd   }|j| }|j| }t||||||	|
|||d
S )z|
    This function creates a new CommonAttentionMetadata that corresponds to 
    the requests included in ubatch_slice
       z3query_start_loc must have at least 2 elements, got r3   N)
r   r   r   r   r   r   r   r   r    r!   )r/   r0   r6   r   lenr   r   r   r   r5   r4   r*   r'   maxabsitemr    r!   r   )r7   r8   r/   r0   r   r   r   r   r   num_requestsr   r   r    r!   r,   r,   r-   _make_metadata_with_sliceY   sP   




r@   ubatch_slicescommon_attn_metadatac                 C   s"   g }| D ]
}| t|| q|S )z
    Creates a new CommonAttentionMetadata instance that corresponds to the 
    requests for each UbatchSlice in ubatch_slices.

    Note: This function does not modify common_attn_metadata
    )appendr@   )rA   rB   resultsr7   r,   r,   r-   split_attn_metadata   s   
rE   Mc                   @   s&   e Zd ZdZdZ	 dZ	 dZ	 dZdS )AttentionCGSupportz Constants for the cudagraph support of the attention backend
    Here we do not consider the cascade attention, as currently
    it is never cudagraph supported.   r9   r3   r   N)r#   r$   r%   r&   ZALWAYSZUNIFORM_BATCHZUNIFORM_SINGLE_TOKEN_DECODENEVERr,   r,   r,   r-   rG      s    rG   c                   @   s   e Zd ZU ejZee ed< dZee	e
  ed< ededee dedejfdd	Ze	
dde
dededefddZdedefddZdede
defddZde
dejde
de
dededede
defddZdS ) AttentionMetadataBuildercudagraph_supportNreorder_batch_thresholdkv_cache_speclayer_namesvllm_configdevicec                 C   s
   || _ d S N)rM   )selfrM   rN   rO   rP   r,   r,   r-   __init__   s   
z!AttentionMetadataBuilder.__init__Fcommon_prefix_lenrB   
fast_buildr2   c                 C   s   t )a  
        Central method that builds attention metadata.
        Some builders (MLA) require reorder_batch to be called prior to build.
        
        Args:
            common_prefix_len: The length of the common prefix of the batch.
            common_attn_metadata: The common attention metadata.
            fast_build: The meta-data will prioritize speed of building over
                then speed at execution. Can be used for spec-decode where the
                result of a build call may only be used for few layers/iters.
        )NotImplementedErrorrR   rT   rB   rU   r,   r,   r-   build   s   zAttentionMetadataBuilder.buildc                 C   s   | j d|dS )z
        Build attention metadata for CUDA graph capture. Uses build by default.
        Subclasses that override this method should call self.build or
        super().build_for_cudagraph_capture.
        r   )rT   rB   rX   )rR   rB   r,   r,   r-   build_for_cudagraph_capture   s   z4AttentionMetadataBuilder.build_for_cudagraph_capturedraft_indexc                 C   s   | j d|ddS )a  
        Build attention metadata for draft model. Uses build by default.
        
        Args:
            common_attn_metadata: The common attention metadata.
            draft_index: The index of the current draft operation.
                When speculating a chain of tokens, this index refers to the
                draft attempt for the i-th token.
                For tree-based attention, this index instead refers to the
                draft attempt for the i-th level in the tree of tokens.
        r   T)rT   rB   rU   rY   )rR   rB   r[   r,   r,   r-   build_for_drafting   s   z+AttentionMetadataBuilder.build_for_drafting
query_lensnum_query_headsnum_kv_heads	use_alibiuse_sliding_windowuse_local_attentionnum_smsc	           	      C   s   dS )NFr,   )	rR   rT   r]   r^   r_   r`   ra   rb   rc   r,   r,   r-   use_cascade_attention   s   z.AttentionMetadataBuilder.use_cascade_attentionF)r#   r$   r%   rG   rI   rK   r   r)   rL   r
   r*   r   r   liststrr   r'   rP   rS   r   r+   rF   rX   rZ   r\   npZndarrayrd   r,   r,   r,   r-   rJ      sh   
 


	
rJ   c                  C   sD   t d urt } td|  | S tj} | d u rt } | S td|  | S )NzM`_KV_CACHE_LAYOUT_OVERRIDE` variable detected. Setting KV cache layout to %s.zT`VLLM_KV_CACHE_LAYOUT` environment variable detected. Setting KV cache layout to %s.)_KV_CACHE_LAYOUT_OVERRIDEloggerZ	info_onceenvsZVLLM_KV_CACHE_LAYOUTr   cache_layoutr,   r,   r-   get_kv_cache_layout   s   rn   rm   c                 C   s   | a d S rQ   )ri   rl   r,   r,   r-   set_kv_cache_layout  s   ro   c                   @   s:   e Zd ZU dZeed< ee ed< eed< dZe	ed< dS )PerLayerParametersa  
    Currently, FlashInfer backend only support models in which all layers share
    the same values for the following hyperparameters. Should not be used for
    trtllm-gen backend since it supports different values for the following
    hyperparameters.
    window_leftlogits_soft_capsm_scaleF	has_sinksN)
r#   r$   r%   r&   r*   r)   r
   floatrt   r+   r,   r,   r,   r-   rp     s   
 rp   rO   rN   cls_r   c                 C   s   t | t|}i }| D ]8\}}|j}t||sJ t|dd}|dur(|d nd}	t|dd}
|j}t|dddu}t|	|
||||< q|S )zc
    Scan layers in `layer_names` and determine some hyperparameters
    to use during `plan`.
    Zsliding_windowNr   r:   rr   Zsinks)r   r   itemsimpl
isinstancegetattrscalerp   )rO   rN   rv   Zlayersper_layer_paramskeylayerrx   Zwindow_sizerq   rr   rs   rt   r,   r,   r-   get_per_layer_parameters#  s   
r   r|   c                 C   s`   t | dks
J dt|  }|d }tjs.|D ]}|j|jkr%td||ks-J dq|S )ae  
    Currently, FlashInfer backend other than trtllm-gen 
    only support models in which all layers share
    the same values for the following hyperparameters:
    - `window_left`
    - `logits_soft_cap`
    - `sm_scale`

    So this function asserts that all layers share the same values for these
    hyperparameters and returns the global values.
    r   z'No attention layers found in the model.zcWindow left is not the same for all layers. One potential fix is to set disable_sliding_window=TruezFlashInfer backend currently only supports models in which alllayers share the same values for the following hyperparameters:`window_left`, `logits_soft_cap`, `sm_scale`.)r;   rf   valuesrk   ZVLLM_USE_TRTLLM_ATTENTIONrq   
ValueError)r|   Z
param_setsZglobal_paramsparamsr,   r,   r-   infer_global_hyperparameters@  s   
r   attn_chunk_size
block_sizec                 C   sr  |j  }|j }|j}|jj}|dd  |d d  }|jd }t| || |   |	tj
}	| ||    }
dt||	 |  }t|}|d }t|| |}tj|tj
d| }t||| d }t||	 |}|	||dk< t|| |d   | |dk ||dk< tt|d	tj
}tj|d | tj
d}|
||d < || }t||||  t|
|  }|| }| | dksJ d|  d| | | }ttj|tj
d||ftj|dd }| j|jd d d	}ttj|tj
d|| }|||f |d}t|}t|}t||j|d
d||j|d
dt|t||j| ||jd
dS )Nr3   r:   r   )Zdtype)r3   r   zattn_chunk_size z  is not divisible by block_size )Zaxis)r<   T)rP   Znon_blocking)r   r   r   r   r   r   r   r   r    r!   r"   )r   numpyr   r    r   rP   shaperh   minimumZastypeZint32r   ZcumsumrepeatarangepadfullZbroadcast_toZexpand_dimsflattenZclipviewr'   Z
from_numpyr   tor;   r   r<   r!   )r   rB   r   Zquery_start_loc_npZseq_lens_npZblock_tablerP   Z	q_seqlensZactual_batch_sizeZq_tokens_in_first_blockZtokens_in_last_blockZlocal_blocksZcu_num_blocksZvirtual_batchesZblock_offsetsr   ZrarangeZseqlens_q_localZcu_seqlens_q_localZseqlens_k_localZnum_computed_tokens_localZk_seqstarts_absoluteZblock_startsZpages_per_local_batchZblock_indicesZbatch_indicesZblock_table_localr   r   r,   r,   r-   $make_local_attention_virtual_batches  s   










r   name_prefixbuilder_clsbuild_preprocess_fnc                    s@   | j  }	d	dtdtdtf fdd}t|fd|i}|S )
z
    Return a new subclass of `builder_cls` whose .build(...) method
    first calls build_preprocess_fn(common_attn_metadata) on the metadata.
    FrT   rB   rU   c                    s    | | ||S rQ   rY   rW   r   r   r,   r-   rX   (  s   z2subclass_attention_metadata_builder.<locals>.buildrX   Nre   )r#   r*   r   r+   type)r   r   r   namerX   Wrappedr,   r   r-   #subclass_attention_metadata_builder  s    

r   attention_backend_clsc                    s$   | |j  }t||fd fddiS )zN
    Return a new subclass where `get_builder_cls` returns `builder_cls`.
    Zget_builder_clsc                      s    S rQ   r,   r,   r   r,   r-   <lambda>C  s    z,subclass_attention_backend.<locals>.<lambda>)r#   r   )r   r   r   r   r,   r   r-   subclass_attention_backend9  s   
r   r3   decode_thresholdc                 C   s   | j }| j}| j}| j}||kr|d|dfS |dd |dd  }||k}t|s1|d|dfS | jdd }t	||d |ksHJ t	|d| |ksUJ |}	||	 }
||  }|| }|	|
||fS )a,  
    Assuming a reordered batch, finds the boundary between prefill and decode
    requests.

    Args:
        common_attn_metadata: CommonAttentionMetadata object containing the
            batch metadata.
        decode_threshold: The maximum query length to be considered a decode.

    Returns:
        num_decodes: The number of decode requests.
        num_prefills: The number of prefill requests.
        num_decode_tokens: The number of tokens in the decode requests.
        num_prefill_tokens: The number of tokens in the prefill requests.
    r   r3   Nr:   )dim)
r   r   r   r   r'   anyr*   Zargmaxr>   all)rB   r   r   r   
num_tokensr   r]   Z
is_prefillZfirst_prefillnum_decodesnum_prefillsnum_decode_tokensnum_prefill_tokensr,   r,   r-   split_decodes_and_prefillsF  s$   
r   input_batchr   scheduler_outputr   c                 C   s   g }g }d}d}t | jD ] \}}|j| }	|	|kr$|| ||	7 }q|| ||	7 }qt|}
t|}d}tdt|
|d D ]}||
|  }||
k rQ |S | ||d  | d}qB|S )z
    Reorders the batch to split into prefill and decode requests; places all
    requests with <= decode_threshold tokens at the front of the batch.
    
    Returns:
        True if the batch was modified, False otherwise.
    r   Fr3   T)	enumerateZreq_idsZnum_scheduled_tokensrC   r;   rangeminZswap_states)r   r   r   ZdecodesZprefillsr   r   iZreq_idr   r   r   Zmodified_batchZ
decode_idxr,   r,   r-   +reorder_batch_to_split_decodes_and_prefillsp  s,   




r   Zlogits_indices_paddedZnum_logits_indicesmetadata_clsfieldsc                 C   s   | |j  }t|||fd}|S )zH
    Return a new subclass of `metadata_cls` with additional fields
    )bases)r#   r   )r   r   r   r   r   r,   r,   r-   subclass_attention_metadata  s   
r   c                 C   s   t d| tdS )zB
    Return a new subclass of `metadata_cls` for fast prefill
    ZKVSharingFastPrefill)r   r   r   )r   'KV_SHARING_FAST_PREFILL_METADATA_FIELDS)r   r,   r,   r-   /make_kv_sharing_fast_prefill_attention_metadata  s
   r   )r   )r3   )Labcenum	functoolsr   dataclassesr   r   typingr   r   r   r   r	   r
   r   r   rh   r'   Zvllm.configr   r   Z
vllm.utilsr   Z vllm.attention.backends.abstractr   Zvllm.v1.core.sched.outputr   Zvllm.v1.worker.gpu_input_batchr   Z	vllm.envsrk   r   Zvllm.attention.layerr   Z/vllm.distributed.kv_transfer.kv_connector.utilsr   Zvllm.loggerr   Zvllm.v1.kv_cache_interfacer   r#   rj   ri   r   r.   r(   r1   r6   r@   rf   rE   rF   EnumrG   ABCrJ   	lru_cachern   rg   ro   rp   r   dictr   r   r*   r   r   r   tupler   r+   r   r   r   r   r,   r,   r,   r-   <module>   s  $!

/
L




Z
 





-
@
