o
    )i?'                     @   s   d dl Z d dlmZ d dlmZmZmZmZmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ erOd d
lmZ neZeeZed Zed Zed Zed Z eeG dd dZ!dS )    N)field)TYPE_CHECKINGAnyLiteralOptionalget_args)SkipValidationmodel_validator)	dataclass)Self)config)init_logger)	GiB_bytesget_cpu_memory)ParallelConfig)             @      )autoZfp8Zfp8_e4m3Zfp8_e5m2Zfp8_inc)r   Zfloat32)builtinsha256Zsha256_cbor_64bitc                   @   s  e Zd ZU dZdZee ed< 	 dZe	ed< 	 dZ
e	ed< 	 dZeed	< 	 d
Zeed< 	 dZee ed< 	 dZee ed< 	 dZee ed< 	 dZeed< 	 dZe	ed< 	 d
Zeed< 	 dZee ed< 	 dZee ed< 	 dZeed< 	 dZeed< 	 edd
dZee ed< 	 edd
dZee ed< 	 d
Z eed< 	 de!fddZ"d.dd Z#d!d" Z$e%d#d$de&fd%d&Z'd.d'd(Z(d.d)d*Z)d+e*ddfd,d-Z+dS )/CacheConfigzConfiguration for the KV cache.N
block_sizeg?gpu_memory_utilization   
swap_spacer   cache_dtypeFis_attention_freenum_gpu_blocks_overridesliding_windowenable_prefix_cachingr   prefix_caching_hash_algor   cpu_offload_gbcalculate_kv_scalescpu_kvcache_space_bytesmamba_page_size_paddedmamba_cache_dtypemamba_ssm_cache_dtype)defaultinitnum_gpu_blocksnum_cpu_blockskv_sharing_fast_prefillreturnc                 C   sF   g }| | j | | j | | j tjt| dd }|S )a  
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        F)usedforsecurity)	appendr   r)   r*   hashlibmd5strencode	hexdigest)selfZfactorsZhash_str r9   ]/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/config/cache.pycompute_hashz   s   zCacheConfig.compute_hashc                 C   s    | j t | _|   |   d S )N)r   r   swap_space_bytes_verify_cache_dtype_verify_prefix_cachingr8   r9   r9   r:   __post_init__   s   zCacheConfig.__post_init__c                 C   s   dd | j  D S )Nc                 S   s   i | ]	\}}|t |qS r9   )r5   ).0keyvaluer9   r9   r:   
<dictcomp>   s    z,CacheConfig.metrics_info.<locals>.<dictcomp>)__dict__itemsr?   r9   r9   r:   metrics_info   s   zCacheConfig.metrics_infoafter)modec                 C   sJ   | j dk rtd| j  | jdkrtd| j d| jr#td | S )Nr   z0CPU offload space must be non-negative, but got g      ?z2GPU memory utilization must be less than 1.0. Got .zh--kv-sharing-fast-prefill is currently work in progress and not functional yet (i.e. no prefill savings))r%   
ValueErrorr   r/   loggerZwarning_oncer?   r9   r9   r:   _verify_args   s   

zCacheConfig._verify_argsc                 C   s:   | j dkrd S | j ttv rtd d S td| j  )Nr   zUsing fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor.zUnknown kv cache dtype: )r   r   
CacheDTyperL   inforK   r?   r9   r9   r:   r=      s   
zCacheConfig._verify_cache_dtypec                 C   sZ   | j sd S | jd urtjstd| j r)| jttvr+td| j dtt dd S d S )NzmPrefix caching is not supported with sliding window. Run with --disable-sliding-window to use prefix caching.z'Unknown prefix caching hash algorithm: z. Must be one of rJ   )	r#   r"   envsZVLLM_USE_V1NotImplementedErrorr$   r   PrefixCachingHashAlgorK   r?   r9   r9   r:   r>      s"   
z"CacheConfig._verify_prefix_cachingparallel_configc                 C   sj   t  }|j}| j| }|t dd|t dd}|d| kr%td| |d| kr3td| d S d S )Nz.2fz GiB out of the z6 GiB total CPU memory is allocated for the swap space.gffffff?zToo large swap space. g?z!Possibly too large swap space. %s)r   Ztensor_parallel_sizer<   r   rK   rL   warning)r8   rS   Ztotal_cpu_memoryZnum_gpus_per_nodeZcpu_memory_usagemsgr9   r9   r:   verify_with_parallel_config   s   
z'CacheConfig.verify_with_parallel_config)r0   N),__name__
__module____qualname____doc__r   r   	BlockSize__annotations__r   floatr   r   rN   r    boolr!   r   intr"   r#   r$   rR   r%   r&   r'   r(   r)   
MambaDTyper*   r   r-   r.   r/   r5   r;   r@   rG   r	   r   rM   r=   r>   r   rV   r9   r9   r9   r:   r      sd   
 	


r   )"r3   dataclassesr   typingr   r   r   r   r   Zpydanticr   r	   Zpydantic.dataclassesr
   Ztyping_extensionsr   Z	vllm.envsrP   Zvllm.config.utilsr   Zvllm.loggerr   Z
vllm.utilsr   r   Zvllm.config.parallelr   rW   rL   r[   rN   r`   rR   r   r9   r9   r9   r:   <module>   s*   