o
    )i                     @   sz   d Z ddlmZ ddlZddlmZ ddlmZmZm	Z	m
Z
 ddlmZ ddlmZmZmZmZ eeZG dd	 d	ZdS )
z,CacheEngine class for managing the KV cache.    )ListN)get_attn_backend)CacheConfigDeviceConfigModelConfigParallelConfig)init_logger)STR_DTYPE_TO_TORCH_DTYPELayerBlockTypeget_dtype_sizeis_pin_memory_availablec                
   @   s   e Zd ZdZdededededdf
dd	Zd
e	de
deej fddZdejddfddZdejddfddZdejddfddZedededede	fddZdS )CacheEnginezManages the KV cache.

    This class is responsible for initializing and managing the GPU and CPU KV
    caches. It also provides methods for performing KV cache operations, such
    as swapping and copying.
    cache_configmodel_configparallel_configdevice_configreturnNc                 C   s   || _ || _|| _|| _| | _||tj| _	|
|| _|j| _|j| _| jr2|  j|j  _|j| _| jrA|  j|j  _|jdkrK|j| _nt|j | _t| j|j|j| j|j|jd| _| | j| jj| _| | jd| _d S )Nauto)use_mlacpu)r   r   r   r   get_head_size	head_sizeget_num_layers_by_block_typer
   	attentionnum_attention_layersget_num_kv_headsnum_kv_heads
block_sizeZnum_gpu_blocksZpipeline_parallel_sizeZnum_cpu_blockscache_dtypedtyper	   r   Zis_attention_freer   attn_backend_allocate_kv_cacheZdevice_type	gpu_cache	cpu_cache)selfr   r   r   r    r%   d/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/worker/cache_engine.py__init__   s<   



zCacheEngine.__init__
num_blocksdevicec           	   	      s   | j || j| j| j |dkrt nd}g }z| j  }W n ttfy1   t	t
t }Y nw t	 fdd|D }t
| jD ]}tj|| j||dj| }|| qB|S )z+Allocates KV cache on the specified device.r   Fc                 3   s    | ]} | V  qd S Nr%   ).0iZkv_cache_generic_shaper%   r&   	<genexpr>Y   s    z1CacheEngine._allocate_kv_cache.<locals>.<genexpr>)r   
pin_memoryr)   )r    Zget_kv_cache_shaper   r   r   r   Zget_kv_cache_stride_orderAttributeErrorNotImplementedErrortuplerangelenr   torchZzerosr   Zpermuteappend)	r$   r(   r)   r/   Zkv_cacheZkv_cache_stride_orderZkv_cache_allocation_shape_Zlayer_kv_cacher%   r-   r&   r!   E   s0   zCacheEngine._allocate_kv_cache
src_to_dstc                 C   0   t | jD ]}| j| j| | j| | qd S r*   )r3   r   r    swap_blocksr#   r"   r$   r8   r,   r%   r%   r&   swap_ink   
   zCacheEngine.swap_inc                 C   r9   r*   )r3   r   r    r:   r"   r#   r;   r%   r%   r&   swap_outp   r=   zCacheEngine.swap_outsrc_to_dstsc                 C   s   | j | j| d S r*   )r    Zcopy_blocksr"   )r$   r?   r%   r%   r&   copyu   s   zCacheEngine.copyc                 C   st   |  }||}||tj}| jdkr|j}nt| j }|| }|js'|nd}|| j	 ||  }	t
|}
|
|	 S )Nr   r   )r   r   r   r
   r   r   r   r	   r   r   r   )r   r   r   r   Z	num_headsr   r   Zkey_cache_entryZvalue_cache_entrytotalZ
dtype_sizer%   r%   r&   get_cache_block_sizex   s   


z CacheEngine.get_cache_block_size)__name__
__module____qualname____doc__r   r   r   r   r'   intstrr   r5   ZTensorr!   r<   r>   r@   staticmethodrB   r%   r%   r%   r&   r      sB    
,
&r   )rF   typingr   r5   Zvllm.attentionr   Zvllm.configr   r   r   r   Zvllm.loggerr   Z
vllm.utilsr	   r
   r   r   rC   loggerr   r%   r%   r%   r&   <module>   s   