o
    )i                    @   sR  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZmZmZmZmZ d dlmZmZ d dlmZ d dlmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ d dl%Z&d dl'Z'd dl(m)Z)m*Z*m+Z+m,Z, d d	l-m.Z. d d
l/m0Z1 d dl2m3Z3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z;m<Z<m=Z=m>Z>m?Z? d dl@mAZAmBZBmCZCmDZD d dlEmFZFmGZG d dlHmIZImJZJ d dlKmLZLmMZM d dlNmOZO d dlPmQZQ d dlRmSZS d dlTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZa d dlbmcZc d dldmeZemfZf d dlgmhZhmiZimjZjmkZkmlZl erkd dlmmnZn d dlompZp d dlPmq  mr  msZt d dlumq  mvZw d dlPmQZQ d dlxmyZy d dlzm{Z{ d dl|m}Z} d dl~mZ e"eeegef f Zn-eZneZpeZyeZQeZeZ{eZ}eZe"eeef eegef f Zejde d Ztejde d!ZweOeZe!d"end#Zed$ Zed% Zed& Zed' Zed( Zed) Zd*d+gg d,d-gd'Zeeee f ed.< g g d/g d'Zeeee f ed0< g d1Zeeeeeef f  ed2< d3d4 Zddd5d6ed7ee d8ee d9eeeeeef f  fd:d;Ze5G d<d= d=e ZG d>d? d?e ZG d@dA dAeejZdBee d9eeef fdCdDZdBeLdEed9efdFdGZdBeLdEed9efdHdIZedJ ZedK ZedL ZeMe.e)dMdNdOG dPdQ dQZeMe.G dRdS dSZedT ZeMe.e)dMdNdOG dUdV dVZedW ZeMe.G dXdY dYZedZ ZeMe.e)dMdNdOG d[d\ d\ZeMe.G d]d^ d^ZeMe.G d_d` d`Ze'je'je'je'je'jdaZdbdbdbdbdcZddedee'jfdfdgZddedee'jfdhdiZdjedkepdlee fdmdnZddedoe'jdpefdqdrZddsdjedkepdee"ee'jf dpedlee d9e'jfdtduZ		ddvepdwee dxee dyedzee d{ee d|ee d9efd}d~Zdedee"eee f  fddZed ZeMe.G dd dZed ZeMe.G dd dZed Zed Zeeef ZeMe.G dd dZeMe.G dd dZeMe.e)dMdNdOG dd dZdaee ed< daee ed< e		ddedee fddZedddd Zd9efddZd9efddZdd Zdd Ze!dZ	ddedee deee  d9eeef fddZeMe.G dd dZdkedeeef d9efddZdS )    N)Mapping)contextmanager)MISSINGFieldfieldfieldsis_dataclassreplace)cached_property	lru_cache)	find_spec)TYPE_CHECKINGAnyCallableClassVarLiteralOptionalProtocolTypeVarUnioncastget_args)
ConfigDictSkipValidationfield_validatormodel_validator)	dataclass)_TYPES)Selfassert_neverruntime_checkable)version)	BlockSizeCacheConfig
CacheDType
MambaDTypePrefixCachingHashAlgo)CompilationConfigCompilationLevelCUDAGraphMode
PassConfig)DistributedExecutorBackendParallelConfig)SchedulerConfigSchedulerPolicy)
ConfigTypeconfig)init_logger)QuantizationMethodscurrent_platform)ConfigFormat
get_configget_hf_image_processor_configget_hf_text_configget_pooling_config)get_sentence_transformer_tokenizer_configis_encoder_decoderis_interleaved,maybe_override_with_speculators_target_modeltry_get_generation_configtry_get_safetensors_metadatatry_get_tokenizer_config
uses_mrope)S3Model)is_s3maybe_model_redirect)DEFAULT_MAX_NUM_BATCHED_TOKENSLayerBlockType
LazyLoadercommon_broadcastable_dtyperandom_uuid)DataclassInstance)PretrainedConfig)QuantizationConfig)LoadFormats)TensorizerConfig)LogitsProcessormodel_executorz'vllm.model_executor.layers.quantizationzvllm.model_executor.modelsDataclassInstanceT)bound)	autogenerate	embeddingembedclassifyscorerewardtranscriptiondraft)rT   rZ   encoderV   rW   rY   r[   )rS   rT   poolingr[   )rT   r]   r[   )rS   nonerV   rW   rY   )r^   rV   rW   rY   rT   rZ   )rU   rV   rW   rX   rY   r[   _RUNNER_TASKSrV   rW   rY   _RUNNER_CONVERTS))ZForCausalLMrT   r^   )ZForConditionalGenerationrb   )Z	ChatModelrb   )ZLMHeadModelrb   )ZForTextEncodingr]   rV   )ZEmbeddingModelrc   )ZForSequenceClassificationr]   rW   )ZForAudioClassificationrd   )ZForImageClassificationrd   )ZForVideoClassificationrd   )ZClassificationModelrd   )ZForRewardModelingr]   rY   )ZRewardModelre   )ZModelrc   _SUFFIX_TO_DEFAULTSc                   c   s    t E d H  d S N)rf    rh   rh   `/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/config/__init__.pyiter_architecture_defaults   s   rj   )runner_typeconvert_typearchitecturerk   rl   returnc                C   sR   t  D ]#\}\}}|d u s||kr&|d u s||kr&| |r&|||ff  S qd S rg   )rj   endswith)rm   rk   rl   suffixZdefault_runner_typeZdefault_convert_typerh   rh   ri   try_match_architecture_defaults   s   rq   c                   @   s   e Zd ZdefddZdS )SupportsHashrn   c                 C      d S rg   rh   selfrh   rh   ri   compute_hash      zSupportsHash.compute_hashN)__name__
__module____qualname__strrv   rh   rh   rh   ri   rr      s    rr   c                   @   s"   e Zd Zdeeef fddZdS )SupportsMetricsInforn   c                 C   rs   rg   rh   rt   rh   rh   ri   metrics_info   rw   z SupportsMetricsInfo.metrics_infoN)rx   ry   rz   dictr{   r}   rh   rh   rh   ri   r|      s    r|   c                   @   s   e Zd ZdZdZdZdS )	ModelImplrS   vllmZtransformersN)rx   ry   rz   AUTOZVLLMZTRANSFORMERSrh   rh   rh   ri   r      s    r   clsc           	      C   s   dd }t tt| jd }t|t jst	di }||jD ]G\}}t|t j
t jfrDt|t jrDt|jt jrDt|jjtsEq#t|jj}t|t j
rU|jn|jg}|D ]}t|t jsdq[|||j< q[q#|S )zw
    Get any docstrings placed after attribute assignments in a class body.

    https://davidism.com/mit-license/
    c                 s   s0    t | }t|d}|D ]	}||fV  |}qdS )z
        Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise

        Can be removed when Python 3.9 support is dropped.
        N)iternext)iterableiteratorabrh   rh   ri   pairwise   s   

zget_attr_docs.<locals>.pairwiser   zGiven object was not a class.)astparsetextwrapdedentinspect	getsourcebody
isinstanceClassDef	TypeErrorAssign	AnnAssignExprvalueConstantr{   cleandoctargetstargetNameid)	r   r   Zcls_nodeoutr   r   docr   r   rh   rh   ri   get_attr_docs   s,   
r   namec                 C   s   t | stddd t| D }||vr!td| d| j d|| }|j }tur1t|dS |j }tur=t|dS t| j d| d	)
zrGet the default factory field of a dataclass by name. Used for getting
    default factory fields in `EngineArgs`.z#The given class is not a dataclass.c                 S   s   i | ]}|j |qS rh   r   .0frh   rh   ri   
<dictcomp>       zget_field.<locals>.<dictcomp>zField 'z' not found in .default_factory)defaultz. must have a default value or default factory.)	r   r   r   
ValueErrorrx   r   r   r   r   )r   r   
cls_fieldsZnamed_fieldr   r   rh   rh   ri   	get_field   s   

r   c                    s   t  fddt| D jS )Nc                 3   s    | ]
}|j  kr|V  qd S rg   r   r   r   rh   ri   	<genexpr>   s    z is_init_field.<locals>.<genexpr>)r   r   init)r   r   rh   r   ri   is_init_field   s   r   )rS   ZslowZmistralZcustom)rS   halffloat16bfloat16floatfloat32)raw_logprobsZ
raw_logitsZprocessed_logprobsZprocessed_logitsT)Zarbitrary_types_allowedr0   c                	   @   sB  e Zd ZU dZdZeed< 	 dZeed< 	 dZ	e
ed< 	 dZee ed< 	 dZee ed	< 	 dZeed
< 	 dZeed< 	 dZeeejf ed< 	 dZee ed< 	 dZee ed< 	 dZeed< 	 dZee ed< 	 dZee ed< 	 eedZ eee!f ed< 	 dZ"ee# ed< 	 dZ$ee ed< 	 dZ%ee ed< 	 dZ&ee ed< 	 dZ'eee(  ed< 	 dZ)eed< 	 dZ*eed< 	 dZ+eed< 	 d Z,e-ed!< 	 dZ.eed"< 	 dZ/eed#< 	 dZ0eed$< 	 dZ1eed%< 	 dZ2eeee3e f  ed&< 	 eedZ4eeef ed'< 	 dZ5eed(< 	 dZ6eed)< 	 eedZ7eeeee!f f ed*< 	 d+Z8eed,< 	 e9j:j;Z<eee9f ed-< 	 dZ=eeeef  ed.< 	 eedZ>e?ed/< 	 dZ@eeee!f  ed0< 	 d1ZAeed2< 	 eedZBeee!f ed3< 	 edd4ZCed5 ed6< 	 dZDeeed5f  ed7< 	 dZEee ed8< 	 dZFeed9< 	 eedZGeee!f ed:< 	 dZHeed;< 	 eIj:j;ZJeeeIf ed<< 	 dZKee ed=< 	 dZLee3eeeMeN f   ed>< 	 d?efd@dAZOddBdCZPeQddDdEeRdFe!d?e!fdGdHZSeTdIdEddKdLZUd?efdMdNZVd?efdOdPZWeXdQdR ZYeXd?e3e fdSdTZZeXd?efdUdVZ[ded	ed?dfdWdXZ\d?edY fdZd[Z]dFed?dfd\d]Z^d^d_ Z_d?ed5 fd`daZ`ddbdcZadde3e d?ebfdedfZcdde3e ded?ebfdgdhZddde3e diebd?eefdjdkZfdde3e diebde
d?eefdldmZgdde3e dneed?e3eh fdodpZidde3e d?ejdq fdrdsZkdde3e dneed?e3eh fdtduZldde3e diebdneed?e3eh fdvdwZmdxdy Znddzd{Zodd|d}Zpdd~dZqdddZr			?	dddZs	?	dddZt			?	dddZud?ee fddZvd?efddZwd?efddZxeXd?efddZyd?efddZzd?efddZ{ddd?efddZ|ddd?efddZ}ddd?e~eef fddZddd?efddZejfddded?efddZd?ee fddZdddZd?eee!f fddZd?eee!f fddZeXd?efddZeXd?efddZeXd?efddZeXd?efddZeXd?efddZd?efddZeXd?efddZeXd?efddZeXd?efddZeXd?efddZeXd?efddZeXd?efddZeXddĄ ZeXd?efddƄZeXd?efddȄZeXd?efddʄZeXdd̄ ZeXd?efdd΄ZdefddЄZdS )ModelConfigzConfiguration for the model.zQwen/Qwen3-0.6BmodelrS   runnerconvertNtask	tokenizertokenizer_modeFtrust_remote_codedtypeseedhf_config_path allowed_local_media_pathrevisioncode_revisionr   rope_scaling
rope_thetatokenizer_revisionmax_model_lenspec_target_max_model_lenquantizationenforce_eageri    max_seq_len_to_capture   max_logprobsr   logprobs_modedisable_sliding_windowdisable_cascade_attnskip_tokenizer_initenable_prompt_embedsserved_model_namelimit_mm_per_promptinterleave_mm_stringsskip_mm_profilingmedia_io_kwargsTuse_async_output_procconfig_formathf_tokenhf_overridesmm_processor_kwargs   mm_processor_cache_gboverride_neuron_configr   PoolerConfigpooler_configoverride_pooler_configlogits_processor_patterngeneration_configoverride_generation_configenable_sleep_mode
model_imploverride_attention_dtypelogits_processorsrn   c                 C   s   g }| | j | | j | | j | | j | | j | | j | | j | | j | | j	 | | j
 | | j | | j | | j | | j | | j  t|}t| tt|  S )  
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        )appendr   r   r   r   r   r   r   r   r   r   r   r   r   r   	hf_configZto_json_stringr{   assert_hashablehashlibsha256r\   	hexdigest)ru   factorsZstr_factorsrh   rh   ri   rv     s&   zModelConfig.compute_hashc              	      sN  t jrjd u rd_t jstdj jdkr*tjj	j
jd\__	tjj_tj_j	d u rAj_	jd u rJj
_tj	_	tjtr\tj_tjrgi }j}nj}d }jrdji}|| t|}d| d}tjt|dd	 jd urd
ji}|| t|}d| d}tjt|dd	 jj	 t j }r|dkrtdd u rt dddl!m"} j#d ur|$ stjddd	 j%r|& st dtj'trt(j'_'t)jpjjj
j*j'||d}|_+t,j+_-t.j-dd _/0 _1t2jj3j
d_4j5 j6}	|	7 }
|	8 }dt9dt:f fdd}j;d urd}d}d}d}j;t<d v }j;t<d v }|
r|r|rvd}d}d}n8|rd}d}d}n.	 n,|
s|r|rd}d}d }n|rd}|j;}d!| d"}n	 n	t=d#j;d$|_|_>| d%| }tj|tdd	 ? j_@A j@j>_Bj@dkr|
stCd }jB|vrt d&j@dkr|stCd }jB|vrd'd(D| d) }t d*| d+E j@jB_F|	G \}}|_H|_ItJd,| K _LtMjj+jNj@dkj
d-_NjOsitPj-rit jsit j }d.v ritQd/j-jR|j-jS d0_OjT_UVjT_TW _XjOrd j-_SjYsZ  |[ sj\rt d1d2_]^  _  `  d S )3Nr   zThe global random seed is set to %d. Since VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may affect the random state of the Python process that launched vLLM.r[   )r   r   r   r   r   z[`--rope-scaling` will be removed in a future release. 'Please instead use `--hf-overrides 'z'`   )
stacklevelr   zY`--rope-theta` will be removed in a future release. 'Please instead use `--hf-overrides '
FLASHINFERZ
flashinferzVLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer module was not found. See https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile for instructions on how to install it.r3   z;override-attention-dtype is set but not using ROCm platformz0Sleep mode is not supported on current platform.)hf_overrides_kwhf_overrides_fnattention_chunk_size)r   r   r   rn   c                    sR   | dks| dkr
dS | dkrdS | dkrdS | dkr'  }|dkr%dS dS dS )NrU   rV   rW   rY   rX   r^   )_get_default_pooling_task)r   new_taskarchitecturesru   rh   ri   _task_to_convertm  s   
z3ModelConfig.__post_init__.<locals>._task_to_convertrS   zdThe 'task' option has been deprecated and will be removed in v0.13.0 or v1.0, whichever comes first.zPlease remove this option.rT   r]   zgPlease replace this option with `--runner generate` to continue using this model as a generative model.zcPlease replace this option with `--runner pooling` to continue using this model as a pooling model.zPlease remove this optionz+Please replace this option with `--convert z2` to continue using this model as a pooling model.zFThe model should be a generative or pooling model when task is set to r    z0This model does not support `--runner generate`.<|>zHThis model does not support `--runner pooling`. You can pass `--convert z" to adapt it into a pooling model.zResolved architecture: %s)is_pooling_modelr   )ZXFORMERSr   z%s has interleaved attention, which is currently not supported by the %s backend. Disabling sliding window and capping the max length to the sliding window size (%d).Tz5`override_neuron_config` is only supported on Neuron.F)aenvsVLLM_USE_V1r   ZVLLM_ENABLE_V1_MULTIPROCESSINGloggerwarningr   r=   r   r   r   r   get_served_model_namer   rD   r   r   r   r{   callabler   r   updatejsondumpswarningswarnDeprecationWarningr   !maybe_pull_model_tokenizer_for_s3ZVLLM_ATTENTION_BACKENDr   r   vllm.platformsr4   r   is_rocmr   Zis_sleep_mode_availabler   r5   r6   r   r   r8   hf_text_configgetattrr   _get_encoder_configencoder_configr7   r   Zhf_image_processor_configr   registryis_text_generation_modelr  
TaskOptionConvertTyper   r_   AssertionErrorr   _get_runner_typerk   _get_convert_typerl   ra   join_get_supported_taskssupported_tasksZinspect_model_cls_model_info_architectureinfo_init_pooler_configr   _get_and_verify_dtyper   r   r<   warning_once
model_typesliding_windowr   Zoriginal_max_model_lenget_and_verify_max_len_init_multimodal_configmultimodal_configr   _verify_tokenizer_mode	is_neuronr   config_updated_verify_quantization_verify_cuda_graph_verify_bnb_config)ru   r   r   Zhf_overrideZhf_overrides_strmsgbackendr4   r   r  Zis_generative_modelr  r  r   r   Z
msg_prefixZmsg_hintZis_generative_taskZis_pooling_taskZgenerate_convertsZpooling_convertsZconvert_optionZ
model_infoarchrh   r   ri   __post_init__  s  



















	
zModelConfig.__post_init__beforemoder   c                 C   s   t |tr	| S |S rg   )r   r{   lower)r   r   rh   rh   ri   validate_quantization_before  s   
z(ModelConfig.validate_quantization_beforeafterru   c                 C   s,   t | jts
tdt | jtstd| S )Nz/tokenizer must be a string after __post_init__.z5max_model_len must be an integer after __post_init__.)r   r   r{   r   r   intrt   rh   rh   ri   validate_model_config_after  s   z'ModelConfig.validate_model_config_afterc                 C   s*   t | d| jdkrdS | j| jkrdS dS )zsDetermine which Transformers backend class will be used if
        `model_impl` is set to `transformers` or `auto`.rk   r]   ZTransformersModelZTransformersForMultimodalLMZTransformersForCausalLM)r  r   r   r  rt   rh   rh   ri   _get_transformers_backend_cls
  s
   z)ModelConfig._get_transformers_backend_clsc                 C   s   | j |  kS )z;Check if the model is using the Transformers backend class.)rm   rA  rt   rh   rh   ri   using_transformers_backend     z&ModelConfig.using_transformers_backendc                 C   s   t jS rg   )	me_modelsZModelRegistryrt   rh   rh   ri   r    s   zModelConfig.registryc                 C   s   t | jdg S )Nr   r  r   rt   rh   rh   ri   r     rC  zModelConfig.architecturesc                 C      | j S )z$The architecture vllm actually used.)r%  rt   rh   rh   ri   rm   !  s   zModelConfig.architecturec                 C   s   t |s
t |s
dS t |r4t }|j|g dd || _|j| _||kr4|j|g dd |j| _dS t |rJt }|j|g dd |j| _dS dS )zPull model/tokenizer from S3 to temporary directory when needed.

        Args:
            model: Model name or path
            tokenizer: Tokenizer name or path
        N)z*.modelz*.pyz*.json)Zallow_pattern)z*.ptz*.safetensorsz*.binz	*.tensors)Zignore_pattern)rC   rB   Z
pull_filesZmodel_weightsdirr   r   )ru   r   r   Zs3_modelZs3_tokenizerrh   rh   ri   r  &  s.   z-ModelConfig.maybe_pull_model_tokenizer_for_s3MultiModalConfigc                 C   s,   | j jrt| j| j| j| j| j| jdS d S )N)limit_per_promptr   r   r   r   r   )	r$  Zsupports_multimodalrH  r   r   r   r   r   r   rt   rh   rh   ri   r-  J  s   z#ModelConfig._init_multimodal_configc                 C   s   |   }|| _||_d S rg   )get_multimodal_configr   )ru   r   	mm_configrh   rh   ri   set_mm_processor_cache_gbV  s   
z%ModelConfig.set_mm_processor_cache_gbc                 C   s   t | j| jS rg   )r:   r   r   rt   rh   rh   ri   r  \  s   zModelConfig._get_encoder_configc                 C   s   | j dkrIt| jtrtdi | j| _| jpt }t| j| j}|d ur;| D ]\}}t	||d u r:t
||| q)| jj}|jd u rG||_|S d S )Nr]   rh   )rk   r   r   r~   r   r9   r   r   itemsr  setattrr$  default_pooling_typepooling_type)ru   r   Zbase_configkvrO  rh   rh   ri   r'  `  s"   

zModelConfig._init_pooler_configc                 C   sB   t t| j }|ttvrtd| j dtt d|| _d S )NzUnknown tokenizer mode: . Must be one of r   )r   TokenizerModer   r<  r   r   )ru   r   rh   rh   ri   r/  w  s   

z"ModelConfig._verify_tokenizer_moder   c                 C   sv   | j }t| j| jrdS |D ]*}|| v r(||| r dS ||| r( dS t|}|r8|\}\}}|  S qdS )Nr]   rT   )r  r9   r   r   get_supported_archsr  r  rq   )ru   r   r  r7  match_rk   rh   rh   ri   _get_default_runner_type  s   z$ModelConfig._get_default_runner_typec                 C   s.   |dkr|S |  |}|dkrtd| |S )NrS   rT   z]Resolved `--runner auto` to `--runner %s`. Pass the value explicitly to silence this message.)rX  r	  r&  )ru   r   r   rk   rh   rh   ri   r    s   
zModelConfig._get_runner_typerk   c                 C   s   | j }|D ]4}|| v r'|dkr||| r dS |dkr'||| r' dS t||d}|r9|\}\}}|  S q|dkr@dS dS )NrT   r^   r]   rk   rV   )r  rU  r  r  rq   )ru   r   rk   r  r7  rV  rW  rl   rh   rh   ri   _get_default_convert_type  s.   
z%ModelConfig._get_default_convert_typec                 C   s0   |dkr|S |  ||}|dkrtd| |S )NrS   r^   z_Resolved `--convert auto` to `--convert %s`. Pass the value explicitly to silence this message.)rZ  r	  r&  )ru   r   rk   r   rl   rh   rh   ri   r     s   zModelConfig._get_convert_typerl   c                 C   s^   | j }||| rdgS tt  }||| s|td v r"|d ||| r-|d |S )NrZ   rT   )r  Zis_transcription_only_modellist_ResolvedTaskr  ra   r   Zis_transcription_model)ru   r   rl   r  r#  rh   rh   ri   _get_supported_generation_tasks  s   


z+ModelConfig._get_supported_generation_tasksr`   c                 C   sP   | j || r	dS |D ]}t|dd}|r%|\}\}}|dks!J |  S qdS )NrW   r]   rY  r^   rV   )r  Zis_cross_encoder_modelrq   )ru   r   r7  rV  rW  rl   rh   rh   ri   r     s   z%ModelConfig._get_default_pooling_taskc                 C   sV   | j }tt  }||| s|td v r)|d |dkr"| |n|}|| |S )Nr]   r\   r^   )r  r[  r\  r  ra   r   r   )ru   r   rl   r  r#  Z
extra_taskrh   rh   ri   _get_supported_pooling_tasks  s   



z(ModelConfig._get_supported_pooling_tasksc                 C   sB   |dkr
|  ||S |dkr| ||S |dkrdgS t| d S )NrT   r]   r[   )r]  r^  r   )ru   r   rk   rl   rh   rh   ri   r"    s   z ModelConfig._get_supported_tasksc                 C   s   t | jdd }|d u rt | jdd }|S |di d}|dkrI|di d}|dkr4d|d	< |S |d
kr>d|d	< |S |d urItd| |S )Nquantization_configZcompression_configZproducerr   modeloptr   
quant_algoZFP8quant_methodZNVFP4modelopt_fp4zUnknown ModelOpt quant algo: )r  r   getr   )ru   	quant_cfgZproducer_namera  rh   rh   ri   _parse_quant_hf_config(  s,   z"ModelConfig._parse_quant_hf_configc           
         sx  t j}g d}| jd urtt j| j| _|  }|d ur|dd }|dd}||d< g d  fdd|D }|  }|D ],}t 	|}|
|| j}|d urm|tt jv rf| vrftd	| d
|}|| _ nqA| jd u rw|| _n| j|krtd| d| j d| jd ur| j|vrtd| j d| dddlm}	 |	| j | j|vrtd| j d S d S d S )N)Zfp8marlinr`  gptq_marlin_24gptq_marlin
awq_marlinZ
fbgemm_fp8compressed-tensorsZexperts_int8Zquarkrc  bitblasgptq_bitblasincrb  r   Zcompressed_tensorsrk  )
rg  rl  rh  ri  rm  rj  ZipexZ	moe_wna16r`  rc  c                    s   g | ]}| vr|qS rh   rh   )r   q	overridesrh   ri   
<listcomp>e  s    z4ModelConfig._verify_quantization.<locals>.<listcomp>zQuantization method z is an override but is has not been added to the `overrides` list above. This is necessary to ensure that the overrides are checked in order of preference.z3Quantization method specified in the model config (zS) does not match the quantization method specified in the `quantization` argument ().zUnknown quantization method: rS  r   r   r3   z^%s quantization is not fully optimized yet. The speed can be slower than non-quantized models.)me_quantZQUANTIZATION_METHODSr   r   r2   rf  rd  r<  r	   get_quantization_configZoverride_quantization_methodr   r   r  r4   Zverify_quantizationr	  r
  )
ru   Zsupported_quantizationZoptimized_quantization_methodsre  rb  Zquantization_methodsr   methodZquantization_overrider4   rh   rp  ri   r2  >  sv   









z ModelConfig._verify_quantizationc                 C   s~   | j }| jrt|t| jdd}t| j|| _dg}| jj|v p"| j}|r9| js;t	
 r=td| jj d| _d S d S d S d S )NZmax_source_positionsr   mllamazGCUDA graph is not supported for %s on ROCm yet, fallback to eager mode.T)r   r;   maxr  r   minr   r*  r   r4   r  r	  r
  )ru   Zeffective_max_seq_lenZROCM_UNSUPPORTED_MODELSZunsupported_rocmrh   rh   ri   r3    s0   

zModelConfig._verify_cuda_graphc                 C   s`   | j dk}t| jdddu}|r| jjddnd}t|||| j gr.td d| _dS dS )z
        The current version of bitsandbytes (0.46.1) with 8-bit models does not
        yet support CUDA graph.
        # TODO Remove this when bitsandbytes supports.
        Zbitsandbytesr_  NZload_in_8bitFzQCUDA graph is not supported on BitsAndBytes 8bit yet, fallback to the eager mode.T)	r   r  r   r_  rd  allr   r	  r
  )ru   Zis_bitsandbytesZhas_quantization_configZis_8bitrh   rh   ri   r4    s.   


zModelConfig._verify_bnb_configc                 C   sD   g d}d}|D ]}t | j|d}|dkr nq|dk r tdd S )N)Zmoe_num_expertsnum_expertsZn_routed_expertsZnum_local_expertsr      zYNumber of experts in the model must be greater than 0 when expert parallelism is enabled.)r  r  r   )ru   Znum_expert_namesr{  r   rh   rh   ri   _verify_with_expert_parallelism  s   z+ModelConfig._verify_with_expert_parallelismload_config
LoadConfigc                 C   sZ   t | jdr'ddlm} || |}|r)|| jjd< d| jjvr+d| jjd< d S d S d S d S )Ndual_chunk_attention_configr   )get_sparse_attention_configZsparse_attention_configZsparse_attention_enabledT)hasattrr   -vllm.model_executor.model_loader.weight_utilsr  r  )ru   r~  r  Zsparse_attn_configrh   rh   ri   "verify_dual_chunk_attention_config  s$   
z.ModelConfig.verify_dual_chunk_attention_configc                 C   sr   | j sd S |jdkrd| _ d S ddlm} || js d| _ d S tjr(d| _ d S | jdkr0d| _ |r7d| _ d S d S )Nr|  Fr   r3   r]   )	r   pipeline_parallel_sizer  r4   Zis_async_output_supportedr   r  ZVLLM_USE_RAY_SPMD_WORKERrk   )ru   parallel_configspeculative_configdevice_configr4   rh   rh   ri   verify_async_output_proc  s"   


z$ModelConfig.verify_async_output_procr  r,   c                 C   s   |j dkr| jd usJ dt| jdd}|j}|| dkr)td| d| d|jr0|   |j}|dkrK| j	
| j| sCtd	| jrMd
| _d S d S d S )NZexternal_launcherzpSeed must be set when using external launcher backend to make sure sampling results are the same across workers.num_attention_headsr   z!Total number of attention heads (z-) must be divisible by tensor parallel size (rs  r|  zlPipeline parallelism is not supported for this model. Supported models implement the `SupportsPP` interface.F)distributed_executor_backendr   r  r  tensor_parallel_sizer   Zenable_expert_parallelr}  r  r  Zis_pp_supported_modelr   NotImplementedErrorr   )ru   r  Ztotal_num_attention_headsr  r  rh   rh   ri   verify_with_parallel_config  s8   


z'ModelConfig.verify_with_parallel_configc                 C      t | jddS )z?Get the sliding window size from the HF text config if present.r+  Nr  r  rt   rh   rh   ri   get_sliding_window,  rC  zModelConfig.get_sliding_windowc                 C   r  )NZ
vocab_sizer   r  rt   rh   rh   ri   get_vocab_size0     zModelConfig.get_vocab_sizec                 C   r  )Nhidden_sizer   r  rt   rh   rh   ri   get_hidden_size3  r  zModelConfig.get_hidden_sizec                 C   sR   t | jdsdS | jjdv r| jjd uS | jjdkr'| jjjdv o&| jjd uS dS )Nr*  F)deepseek_v2deepseek_v3deepseek_mtpZkimi_k2eagle)r  r  )r  r  r*  kv_lora_rankr   rt   rh   rh   ri   is_deepseek_mla6  s   
zModelConfig.is_deepseek_mlac                 C   s   | j r"t| jdd}| jr| jj| S t| jdd}|r"|r"|| S t| jdr2| jjdkr2| jjS | jr7dS t| jdd d urD| jj	S | jj
| jj S )Nqk_rope_head_dimr   qk_nope_head_dimr*  zamba2head_dim)r  r  r  use_mlar  r  r*  Zattention_head_dimis_attention_freer  r  r  )ru   r  r  rh   rh   ri   get_head_sizeE  s2   zModelConfig.get_head_sizec                 C   s  g d}| j j|v ot| j dd}|st| jddrdS | j jdkr2d| j jv r.| j jd S | j jS | j jdkrBt| j jd| j jS | j jd	kra| j jD ]}|jjs\| j j|jj	   S qLt
d
| jrfdS g d}|D ]}t| j|d}|dur}|  S ql| jjS )z%Returns the total number of KV heads.)ZfalconZ
RefinedWebZRefinedWebModelZnew_decoder_architectureFZmulti_queryr|  ZmptZ
kv_n_headsZdbrxznemotron-nasz%Couldn't determine number of kv headsr   )Z	n_head_kvnum_kv_headsZnum_key_value_headsZmulti_query_group_numN)r   r*  r  r  Zattn_configr  block_configs	attentionno_opZn_heads_in_groupRuntimeErrorr  )ru   Zfalcon_model_typesZnew_decoder_arch_falconblock
attributesattrr  rh   rh   ri   get_total_num_kv_headsb  sB   

	z"ModelConfig.get_total_num_kv_headsc                 C   s"   | j rdS |  }td||j S )z'Returns the number of KV heads per GPU.r|  )r  r  rx  r  )ru   r  Ztotal_num_kv_headsrh   rh   ri   get_num_kv_heads  s   zModelConfig.get_num_kv_headsc                 C   s   t | jdd}||j S )Nr  r   )r  r  r  )ru   r  Z	num_headsrh   rh   ri   get_num_attention_heads  s   
z#ModelConfig.get_num_attention_headsc                 C   s~   ddl m} | jjdks| jjdks| jjdkr t| jdd}nt| jdd}|j|j |j }|j}||||\}}||fS )Nr   )get_pp_indicesr  mimo_mtpglm4_moe_mtpnum_nextn_predict_layersnum_hidden_layers)	Zvllm.distributed.utilsr  r  r*  r   r  Zrankr  r  )ru   r  r  Ztotal_num_hidden_layersZpp_rankZpp_sizestartendrh   rh   ri   get_layers_start_end_indices  s    
z(ModelConfig.get_layers_start_end_indicesc                 C   s   |  |\}}|| S rg   )r  )ru   r  r  r  rh   rh   ri   get_num_layers  s   zModelConfig.get_num_layers
block_typec           
         s^   t jk}| j o| j o| j }| |\}}|r"|r || S dS | jr-|r)dS || S | jrA| jj}tdd ||| D S t	| jdd }|d ur{t
| jdrl| jjdkrl|rgtdd ||| D S | |S t fdd||| D S t	| jd	d }	|	rtd
d |	|| D S |d u r|	d u rtd j dtdd |	|| D S )Nr   c                 s   s    | ]}|j j V  qd S rg   )r  r  )r   bcrh   rh   ri   r         z;ModelConfig.get_num_layers_by_block_type.<locals>.<genexpr>Zlayers_block_typer*  r  c                 s       | ]}|d kV  qdS )ZhybridNrh   r   trh   rh   ri   r     s    c                 3   s    | ]}| j kV  qd S rg   )r   r  r  rh   ri   r     r  attn_type_listc                 s   r  r|  Nrh   r  rh   rh   ri   r         ztThe model is an hybrid without alayers_block_type or an attn_type_list in the hf_config,cannot determine the num of z layersc                 s   r  r  rh   r  rh   rh   ri   r     r  )rF   r  	is_hybrid	has_noopsr  r  r   r  sumr  r  r  r*  r  r   r   )
ru   r  r  Zattn_block_typeZis_transformerr  r  r  Zlayers_block_type_valuer  rh   r  ri   get_num_layers_by_block_type  sX   




z(ModelConfig.get_num_layers_by_block_typec                 C   s(   t | jdd}|du rt | jdd}|S )z;
        Returns the mamba chunk size if it exists
        Zmamba_chunk_sizeN
chunk_sizer  )ru   r  rh   rh   ri   get_mamba_chunk_size  s   z ModelConfig.get_mamba_chunk_sizec                 C   s   | j du r	td| j S )z
        Get the multimodal configuration of the model.

        Raises:
            ValueError: If the model is not multimodal.
        NzThe model is not multimodal.)r.  r   rt   rh   rh   ri   rJ    s   
z!ModelConfig.get_multimodal_configc                 C   sJ   | j dv rt| jp| j| j| jd}nt| j | jd}|du r!i S | S )a  
        This method attempts to retrieve the non-default values of the
        generation config for this model.

        The generation config can contain information about special tokens, as
        well as sampling parameters. Which is why this method exists separately
        to `get_diff_sampling_param`.

        Returns:
            A dictionary containing the non-default generation config.
        >   rS   r   r   r   )r   N)r   r>   r   r   r   r   Zto_diff_dict)ru   r0   rh   rh   ri   r>     s   

z%ModelConfig.try_get_generation_configc                    s   | j dkri  n|    | j g d}t fdd|D r6 fdd|D }d|v r5|d|d< ni }|r?td	 |S )
a  
        This method returns a dictionary containing the non-default sampling
        parameters with `override_generation_config` applied.

        The default sampling parameters are:

        - vLLM's neutral defaults if `self.generation_config="vllm"`
        - the model's defaults if `self.generation_config="auto"`
        - as defined in `generation_config.json` if
            `self.generation_config="path/to/generation_config/dir"`

        Returns:
            A dictionary containing the non-default sampling parameters.
        r   )Zrepetition_penaltyZtemperatureZtop_kZtop_pZmin_pmax_new_tokensc                 3   s    | ]}| v V  qd S rg   rh   r   pr   rh   ri   r   K  r  z6ModelConfig.get_diff_sampling_param.<locals>.<genexpr>c                    s&   i | ]}  |d ur|  |qS rg   )rd  r  r   rh   ri   r   L  s    
z7ModelConfig.get_diff_sampling_param.<locals>.<dictcomp>r  Z
max_tokenszDefault sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.)r   r>   r  r   anypopr	  r)  )ru   Zavailable_paramsZdiff_sampling_paramrh   r   ri   get_diff_sampling_param,  s(   

z#ModelConfig.get_diff_sampling_paramc                 C   s$   	 t  r| jjdkrdS t| jS )z*Extract the HF encoder/decoder model flag.rw  F)r4   r0  r   r*  r;   rt   rh   rh   ri   r;   `  s
   
zModelConfig.is_encoder_decoderc                 C   s
   t | jS rg   )rA   r   rt   rh   rh   ri   rA   o     
zModelConfig.uses_mropec                 C   
   | j d uS rg   )r.  rt   rh   rh   ri   is_multimodal_models  r  zModelConfig.is_multimodal_modelc                 C      | j }|du r	dS |jdkS )z7Whether the multi-modal processor should output hashes.NFr   r.  r   ru   rK  rh   rh   ri   processor_return_mm_hashesw     
z&ModelConfig.processor_return_mm_hashesc                 C   r  )z:Whether the multi-modal processor cache should be enabled.NFr   r  r  rh   rh   ri   enable_mm_processor_cache  r  z%ModelConfig.enable_mm_processor_cachec                 C   s   | j }|d u r	dS tjS Nr   )r.  r  ZVLLM_MM_INPUT_CACHE_GIBr  rh   rh   ri   get_mm_input_cache_gb  s   z!ModelConfig.get_mm_input_cache_gbc                 C   s   | j jp| jdkS )NrW   )r$  Zsupports_cross_encodingrl   rt   rh   rh   ri   is_cross_encoder  s   zModelConfig.is_cross_encoderc                 C      | j jS rg   )r$  Zsupports_pprt   rh   rh   ri   is_pp_supported     zModelConfig.is_pp_supportedc                 C   r  rg   )r$  Zsupports_multimodal_raw_inputrt   rh   rh   ri   !is_multimodal_raw_input_supported  r  z-ModelConfig.is_multimodal_raw_input_supportedc                 C   r  rg   )r$  r  rt   rh   rh   ri   r    r  zModelConfig.is_attention_freec                 C   r  rg   )r$  r  rt   rh   rh   ri   r    r  zModelConfig.is_hybridc                 C   r  rg   )r$  r  rt   rh   rh   ri   r    r  zModelConfig.has_noopsc                 C   r  rg   )r$  has_inner_statert   rh   rh   ri   r    r  zModelConfig.has_inner_statec                 C   s
   | j j S rg   )r$  Zsupports_v0_onlyrt   rh   rh   ri   is_v1_compatible  r  zModelConfig.is_v1_compatiblec                 C   s   | j otj S rg   )r  r  ZVLLM_MLA_DISABLErt   rh   rh   ri   r    rC  zModelConfig.use_mlac                 C   s    t t| jdd pt| jddS )Nmatryoshka_dimensionsis_matryoshkaF)boolr  r   rt   rh   rh   ri   r    s   zModelConfig.is_matryoshkac                 C   s   t | jdd S )Nr  rE  rt   rh   rh   ri   r    rC  z!ModelConfig.matryoshka_dimensionsc                 C   r  )Nuse_pad_tokenTrE  rt   rh   rh   ri   r    s   zModelConfig.use_pad_tokenc              	   C   sf   d }| j dkrt| jdddkrt| j| j| jd}t| j||| j	| 
 | j| jd}td| |S )Nr]   Zposition_embedding_typer   absoluter  )r   tokenizer_configr   r   r+  r   r  zUsing max model len %s)rk   r  r   r@   r   r   r   _get_and_verify_max_lenr  r   r  r   r  r	  r&  )ru   r   r  rh   rh   ri   r,    s,   z"ModelConfig.get_and_verify_max_lenrn   N)ru   r   rn   r   )r~  r  rn   N)r  r,   rn   N)rn   rH  )rx   ry   rz   __doc__r   r{   __annotations__r   RunnerOptionr   ConvertOptionr   r   r  r   r   r   rT  r   r  r   r   
ModelDTypetorchr   r?  r   r   r   r   r   r~   r   r   r   r   r   r   r   r   r2   r   r   r   r   LogprobsModer   r   r   r   r   r[  r   r   r   r   r   r5   r   r   r   r   r   HfOverridesr   r   r   r   r   r   r   r   r   r   r   r   r   typerO   rv   r8  r   classmethodr=  r   r@  rA  rB  propertyr  r   rm   r  r-  rL  r  r'  r/  
RunnerTyperX  r  r  rZ  r   r\  r]  r   r   r^  r"  rf  r2  r3  r4  r}  r  r  r  r  r  r  r  r  r  r  r  tupler  r  rF   r  r  r  rJ  r>   r  r;   rA   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r,  rh   rh   rh   ri   r      s  
 " 
!  


$










Z




!6



7
4

r   c                   @   s   e Zd ZU dZdZeeef ed< 	 dZ	e
e ed< 	 eedZeeef ed< 	 dZe
e ed< 	 dZe
eee ef  ed	< 	 d
Zeed< 	 dZeeeeef f ed< 	 defddZdd ZdS )r  z,Configuration for loading the model weights.rS   load_formatNdownload_dirr   model_loader_extra_configdeviceignore_patternsTuse_tqdm_on_loadcpupt_load_map_locationrn   c                 C   "   g }t jt| dd }|S r   Fusedforsecurityr   md5r{   r\   r   ru   r   hash_strrh   rh   ri   rv        zLoadConfig.compute_hashc                 C   sB   | j  | _ | jd urt| jdkrtd| j d S dg| _d S )Nr   z<Ignoring the following patterns when downloading weights: %szoriginal/**/*)r  r<  r  lenr	  r&  rt   rh   rh   ri   r8  $  s   zLoadConfig.__post_init__)rx   ry   rz   r  r  r   r{   rM   r  r  r   r   r~   r  rN   r  r  r[  r  r  r  rv   r8  rh   rh   rh   ri   r    s(   
 
r  )rS   cudaneuronr  tpuZxpuc                   @   s\   e Zd ZU dZdZeeeee	jf   e
d< 	 eddZee
d< 	 defdd	Zd
d ZdS )DeviceConfigz7Configuration for the device to use for vLLM execution.rS   r  Fr   device_typern   c                 C   r  r  r  r  rh   rh   ri   rv   @  s   zDeviceConfig.compute_hashc                 C   s   | j dkrddlm} |j| _| jstdnt| j tr"| j | _nt| j tj r.| j j| _| jdv r;t d| _ d S | jdv rEd | _ d S t | j| _ d S )NrS   r   r3   zFailed to infer device type, please set the environment variable `VLLM_LOGGING_LEVEL=DEBUG` to turn on verbose logging to help debug the issue.)r  r  )r	  )	r  r  r4   r  r  r   r{   r  r  )ru   r4   rh   rh   ri   r8  T  s"   





zDeviceConfig.__post_init__N)rx   ry   rz   r  r  r   r   r   Devicer  r  r   r  r{   rv   r8  rh   rh   rh   ri   r
  1  s   
 r
  )ngramr  eagle3medusamlp_speculatordraft_modelr  c                	   @   s6  e Zd ZU dZdZee ed< 	 dZe	e
 ed< 	 dZe	e ed< 	 dZe	e ed< 	 dZeed< 	 dZe	ej ed	< 	 dZe	e ed
< 	 dZe	e
 ed< 	 dZe	e
 ed< 	 dZe	e ed< 	 dZe	e ed< 	 dZe	e ed< 	 dZe	e
 ed< 	 dZee ed< 	 dZee ed< 	 dZee ed< 	 dZee ed< 	 dZ ee ed< 	 dZ!ee ed< 	 de
fddZ"e#de$de$fddZ%dd Z&e#de	e d ed!edefd"d#Z'e#ded$e	e d%e$defd&d'Z(e#ded$edefd(d)Z)e*d*d+de+fd,d-Z,e-defd.d/Z.defd0d1Z/de
fd2d3Z0dS )4SpeculativeConfigz'Configuration for speculative decoding.Nnum_speculative_tokensr   rv  draft_tensor_parallel_sizeTdisable_logprobsr   r   r   r   disable_by_batch_sizeprompt_lookup_maxprompt_lookup_minspeculative_token_treetarget_model_configtarget_parallel_configenable_chunked_prefilldisable_log_statsdraft_model_configdraft_parallel_configrn   c                 C   s2   g }| | jdk tjt| dd }|S )r   r  Fr  )r   rv  r   r  r{   r\   r   r  rh   rh   ri   rv     s   zSpeculativeConfig.compute_hashr   c                 C   s   | j dkrd| _ | j dkrt| dd }| |dgd | jd dkr6d| _ t| dd }| d|d	gd
 | jd dkrPd| _ t| dd }| d|dgd
 | S )Nr  r  r  ZDeepSeekMTPModel)	n_predictr   r   ZMiMoForCausalLMr  ZMiMoMTPModel)r  r   r   ZGlm4MoeForCausalLMr  ZGlm4MoeMTPModel)r*  r  r  r   )r   r   rh   rh   ri   hf_config_override  s2   

z$SpeculativeConfig.hf_config_overridec                 C   sX  | j d u r.| jd ur.| jr!| jjjdks| jjjdkr!| jj | _ n| jdv r*d| _ ntd| jd u r@| j d ur@| j dv r@d| _| jdv rd| _| jd u rY| jd u rYd| _d| _n!| jd u rj| jd useJ | j| _n| jd u rz| jd usvJ | j| _| jdk rtd| j d	| jdk rtd
| j d	| j| jkrtd| j d| j | j| _	| j
| _d S d| _d| _| j d ur*td7i d| j ddd| jjd| jjd| jjd| jjd| jjd| jjd| jd| jd| jjd| jjd| jd| jjd| jjd| jjdtj| _	| jdv rnLd | j	j  v s+d!| j	j  v r/d"| _n6| j	jjd#kr;d#| _n*| j	jjd$krGd$| _n| j	jjd%v r^d&| _| jdkr]t d' nd(| _t!d)| jdv r| j"rwt#j$swtd*dd+l%m&} dd,l'm(} t)| j	j||frn|| j	j| jd"d-}|| j	_| jd urt*| j	jd.r| j| j	j_+t,| j	jd/d }|d ur| jd u r|| _n| j|kr| j| dkrtd0| j d1|| j-d u rt.d2d3 t/| jD | _-nt01| j-}t.t2|d4d5 d6| _-t3| j
| j4| j	j| _4t5| j| j	j| jj| j	_t6| j
| j4| _d S d S )8Nr  Zmimo)r  z[ngram]r  z>num_speculative_tokens was provided without speculative model.   r|  zprompt_lookup_min=z must be > 0zprompt_lookup_max=z must be <= prompt_lookup_max=r   r   r   r[   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r  r  zeagle-zeagle3-r  r  r  )r  r  r  r  zeAll Deepseek MTP models only have one layer. Might need some code changes to support multiple layers.r  zSpeculative decoding with draft model is not supported yet. Please consider using other speculative decoding methods such as ngram, medusa, eagle, or deepseek_mtp.z;Chunked prefill and EAGLE are not compatible when using V0.)SpeculatorsConfig)EAGLEConfig)rv  r*  num_lookahead_tokensr   znum_speculative_tokens:z  must be divisible by n_predict=c                 S   s   g | ]}|d  d qS )r|  r   rh   r   irh   rh   ri   rr    s    
z3SpeculativeConfig.__post_init__.<locals>.<listcomp>c                 S   s   t | | fS rg   )r  )r  rh   rh   ri   <lambda>  s    z1SpeculativeConfig.__post_init__.<locals>.<lambda>)keyrh   )7r   r  r  r  r*  rv  r   r  r  r  r  r  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r!  r<  r   r	  r
  r  r  r  r  Zvllm.transformers_utils.configsr#  Z%vllm.transformers_utils.configs.eagler$  r   r  r%  r  r  r{   ranger   literal_evalsorted_verify_and_get_draft_tpr  #_maybe_override_draft_max_model_lencreate_draft_parallel_config)ru   r#  r$  Zeagle_configr   Ztree_choicesrh   rh   ri   r8    s^  











	


zSpeculativeConfig.__post_init__speculative_max_model_lendraft_max_model_lentarget_max_model_lenc                 C   sN   | dur"| |krt d| d|| |kr t d| d|| S t||S )a  Determine the max sequence len for the draft model. This is usually
        the draft_max_model_len, but may be the target_max_model_len if it is
        less than the draft_max_model_len, or may be speculative_max_model_len
        if it is specified.

        This is necessary so that sequences do not exceed the capacity of the
        draft model or the target model.

        speculative_max_model_len is mainly used for testing that sequences can
        skip speculation.
        Nzspeculative_max_model_len=z+ cannot be larger than draft_max_model_len=z, cannot be larger than target_max_model_len=)r   ry  )r1  r2  r3  rh   rh   ri   r/    s   

z5SpeculativeConfig._maybe_override_draft_max_model_len&speculative_draft_tensor_parallel_sizedraft_hf_configc                 C   s^   |du r|j dkrd}| jdkrtd|j  |S | j}|S |d| jfvr-td|d|S )z
        Verifies and adjusts the tensor parallel size for a draft model
        specified using speculative_draft_tensor_parallel_size.
        Nr  r|  zV%s cannot currently be run with tp>1; setting speculative_draft_tensor_parallel_size=1z'speculative_draft_tensor_parallel_size=zB cannot be other value than 1 or target model tensor_parallel_size)r*  r  r	  r
  r   )r  r4  r5  rh   rh   ri   r.    s&   


z*SpeculativeConfig._verify_and_get_draft_tpc              	   C   s&   t | j|| j| j| j| j| jd}|S )zCreate a parallel config for use by the draft worker.

        This is mostly a copy of the target parallel config, except the tp_size.
        )r  r  r  max_parallel_loading_workersdisable_custom_all_reduceray_workers_use_nsightplacement_group)r,   r  r  r6  r7  r8  r9  )r  r4  r  rh   rh   ri   r0    s   	z.SpeculativeConfig.create_draft_parallel_configr>  r:  c                    s    j d u r	td j dkrtd j  d jr! j j  jd ur3 jdk r3td jddg} jd	krW jrWt fd
d|D sWtd| d jj	j
 S )Nz}num_speculative_tokens must be provided with speculative model unless the draft model config contains an n_predict parameter.r   z9Expected num_speculative_tokens to be greater than zero (rs  r   zmExpect the batch size threshold of disabling speculative decoding is > 1, but got self.disable_by_batch_size=llamaZqwenr  c                 3   s    | ]
}| j jjv V  qd S rg   )r  r  r*  )r   Zsupported_modelrt   rh   ri   r   ,	  s    
z1SpeculativeConfig._verify_args.<locals>.<genexpr>zEagle3 is only supported for z@ models. Got self.target_model_config.hf_text_config.model_type=)r  r   r  r  r  r  rv  r  r  r  r*  )ru   Zeagle3_target_supportedrh   rt   ri   _verify_args	  s6   




zSpeculativeConfig._verify_argsc                 C   rF  )zThe number of additional slots the scheduler should allocate per
        step, in addition to the slots allocated for each known token.

        This is equal to the number of speculative tokens, as each speculative
        token must be scored.
        )r  rt   rh   rh   ri   num_lookahead_slots6	  s   z%SpeculativeConfig.num_lookahead_slotsc                 C   s
   | j dv S )N)r  r  r  )rv  rt   rh   rh   ri   	use_eagle@	  s   
zSpeculativeConfig.use_eaglec                 C   s8   | j }|dkr	d n| jj}| j}d|d|d|dS )Nr  zSpeculativeConfig(method=z, model=z, num_spec_tokens=))rv  r  r   r  )ru   rv  r   Znum_spec_tokensrh   rh   ri   __repr__C	  s   zSpeculativeConfig.__repr__)1rx   ry   rz   r  r  r   r?  r  r   r   r{   rv  SpeculativeMethodr  r  r  r   rt  r2   r   r   r   r  r  r  r  r  r   r  r,   r  r  r  r  rv   staticmethodrK   r!  r8  r/  r.  r0  r   r   r;  r  r<  r=  r?  rh   rh   rh   ri   r  s  s   
  E" 	r  )rS   r   r   c                   @   s   e Zd ZU dZdZeed< 	 dZeed< 	 dZe	ed< 	 dZ
ee ed	< 	 d
Zeejef ed< 	 dZeed< 	 e Zee ed< dZeeeef  ed< 	 dZe	ed< 	 defddZdd ZdefddZdefddZdS )
LoRAConfigzConfiguration for LoRA.   max_lora_rankr|  	max_lorasFfully_sharded_lorasNmax_cpu_lorasrS   
lora_dtype   lora_extra_vocab_sizelora_vocab_padding_sizedefault_mm_lorasbias_enabledrn   c                 C   sv   g }| | j | | j | | j | | j | | j | | j | | j tj	t
| dd }|S r  )r   rD  rE  rF  rH  rJ  rK  rM  r   r  r{   r\   r   r  rh   rh   ri   rv   s	  s   zLoRAConfig.compute_hashc                 C   s   d}d}| j |vrtd| j  d| d| j|vr&td| j d| d| jdk r4td| j d	| jd u r?| j| _d S | j| jk rRtd
| j d| j dd S )N)   rC      @      rI  i@     )rI  rR  zmax_lora_rank (z) must be one of r   zlora_extra_vocab_size (r|  zmax_loras (z) must be >= 1.zmax_cpu_loras (z) must be >= max_loras (r>  )rD  r   rJ  rE  rG  )ru   Zpossible_max_ranksZpossible_lora_extra_vocab_sizerh   rh   ri   r8  	  s2   






zLoRAConfig.__post_init__cache_configc                 C   s    |j dkrtjstdd S d S )Nr   z4V0 LoRA does not support CPU offload, please use V1.)cpu_offload_gbr  r  r   )ru   rS  rh   rh   ri   verify_with_cache_config	  s
   z#LoRAConfig.verify_with_cache_configmodel_configc                 C   s8   | j dv r|j| _ d S t| j trtt| j | _ d S d S )N)NrS   )rH  r   r   r{   r  r  )ru   rV  rh   rh   ri   verify_with_model_config	  s
   
z#LoRAConfig.verify_with_model_config) rx   ry   rz   r  rD  r?  r  rE  rF  r  rG  r   rH  r   r  r   	LoRADTyperJ  r4   Zget_lora_vocab_padding_sizerK  r   rL  r~   r{   rM  rv   r8  r#   rU  r   rW  rh   rh   rh   ri   rB  M	  s2   
 	rB  c                   @   s   e Zd ZU dZeeeef ee	dZ
eeef ed< 	 eedZeeeeef f ed< 	 dZeeeef  ed< 	 dZeed	< 	 d
Zeed< 	 d
Zeed< 	 defddZdedefddZdeeef deeef fddZdS )rH  z+Controls the behavior of multimodal models.r   rI  r   r   Nr   r   r   Fr   r   rn   c                 C   r  r  r  r  rh   rh   ri   rv   	  r  zMultiModalConfig.compute_hashmodalityc                 C   s   | j |tjr
dS dS )zj
        Get the maximum number of input items allowed per prompt
        for the given modality.
        i  r|  )rI  rd  r  r  )ru   rY  rh   rh   ri   get_limit_per_prompt	  s   z%MultiModalConfig.get_limit_per_promptinference_kwargsc                 C   s   | j pi }|t|B S )z
        Get the keyword arguments to pass to the multi-modal processor
        according to the extra arguments passed during inference.
        )r   r~   )ru   r[  kwargsrh   rh   ri   merge_mm_processor_kwargs
  s   
z*MultiModalConfig.merge_mm_processor_kwargs)rx   ry   rz   r  r   r~   r{   r?  r   r   rI  r  r   r   r   r   r   objectr   r   r  r   rv   rZ  r   r]  rh   rh   rh   ri   rH  	  s,   
 "

	


rH  c                   @   s   e Zd ZU dZdZee ed< 	 dZee	 ed< 	 dZ
ee ed< 	 dZee	 ed< 	 dZee	 ed< 	 dZee ed< 	 dZeee  ed	< 	 dZee	 ed
< 	 dZee ed< 	 defddZdS )r   z:Controls the behavior of output pooling in pooling models.NrP  	normalize
dimensions
activationsoftmaxstep_tag_idreturned_token_idsenable_chunked_processingmax_embed_lenrn   c                 C   r  r  r  r  rh   rh   ri   rv   P
  r  zPoolerConfig.compute_hash)rx   ry   rz   r  rP  r   r{   r  r_  r  r`  r?  ra  rb  rc  rd  r[  re  rf  rv   rh   rh   rh   ri   r   
  s*   
 	r   )r   r   r   r   r   z>Numerical instability. Please use bfloat16 or float32 instead.)Zgemma2gemma3Zplamo2Zglm4r*  r   c                 C   s   | t v r|tjkrdS dS )NFT)_FLOAT16_NOT_SUPPORTED_MODELSr  r   )r*  r   rh   rh   ri   _is_valid_dtypeu
  s   ri  c                 C   s2   | t v r|tjkrt |  }td| d| dS )NzThe model type z# does not support float16. Reason: T)rh  r  r   r   )r*  r   reasonrh   rh   ri   _check_valid_dtype|
  s   
rk  model_idr0   r   c                C   s   t |dd }|d u rt | dd }|d u r"t|dr"t |jdd }|d u r2t|dr2t |jdd }|d u rRt| |d}|rR|j }rRdd | D }|rRt|S |d u rYt	j
}|S )Ntorch_dtypevision_configr  r   c                 S   s(   h | ]}|j D ]
}|tv rt| qqS rh   )Zparameter_count_SAFETENSORS_TO_TORCH_DTYPE)r   Zfile_mtZ	dtype_strrh   rh   ri   	<setcomp>
  s    z_find_dtype.<locals>.<setcomp>)r  Zget_text_configr  rn  r  r?   Zfiles_metadatavaluesrH   r  r   )rl  r0   r   config_dtypeZrepo_mtZfiles_mtZparam_dtypesrh   rh   ri   _find_dtype
  s$   rt  rs  r  c          
         s   ddl m}  fdd|jD }|rtj|v rtj}n|d }|tjkr&|}||v r,|S | }| }|d u r<|}n| }	|d|	 d}t	
d||| |S )Nr   r3   c                    s   g | ]	}t  |r|qS rh   )ri  )r   r   r*  rh   ri   rr  
  s    z'_resolve_auto_dtype.<locals>.<listcomp>z (with compute capability r>  zHYour device %s doesn't support %s. Falling back to %s for compatibility.)r  r4   supported_dtypesr  r   r   Zget_device_nameget_device_capabilityZas_version_strr	  r
  )
r*  rs  r  r4   rv  Zpreferred_dtypeZdevice_nameZdevice_capabilityZ
device_strversion_strrh   ru  ri   _resolve_auto_dtype
  s0   

ry  ro  c                C   s   t | ||d}|j}t|tr/| }|dkrt|||d}n |tvr*td|t| }nt|tj	r8|}ntd| t
|| ||krk|tjkrVtd|| |S |tjkrdtd|| |S td|| |S )Nro  rS   )r  zUnknown dtype: zUpcasting %s to %s.zDowncasting %s to %s.zCasting %s to %s.)rt  r*  r   r{   r<  ry  _STR_DTYPE_TO_TORCH_DTYPEr   r  r   rk  r   r	  r&  r
  )rl  r0   r   r  r   rs  r*  rm  rh   rh   ri   r(  
  s4   




r(  r   r  r   r   r+  r   r  c              	   C   s  t d}g d}d}	|D ]}
t| |
d}|dur%||k r|
n|	}	t||}qt| dd }r2d}	|}|r@|dur@||k r@d}	|}|rM|d|}t||}|t dkrj|durY|S |dur_|S d}td|| |}t| dd}|durd	| jvr|d
 }|dvr|rtd|dd}|dkr|d }||9 }|rd|v r|d }|du rt|}t	
 rtd| t|S ||krt| dd}|dur||kr|rtdt|S d| d|	 d| d| d	}tjrtd| t|S t| dt|S )z*Get and verify the model's maximum length.inf)	Zmax_position_embeddingsZn_positionsZmax_seq_lenZ
seq_lengthmodel_max_lengthZmax_target_positionsZmax_sequence_lengthmax_seq_lengthZseq_lenNr|  r+  i   zThe model's config.json does not contain any of the following keys to determine the original maximum length of the model: %s. Assuming the model's maximum length is %d.r   rg  	rope_type)ZsuZlongropeZllama3ztDisabling sliding window is not supported for models with rope_scaling. Please raise an issue so we can investigate.factorg      ?yarnZ original_max_position_embeddingsr}  z--max-model-len is not specified, it's currently using model's default length %s, which might be too large.Please input with --max-model-len based on your request input length and output length, to avoid unnecessary degradation.zDisabling sliding window is not supported for models model_max_length in the config. Please raise an issue so we can investigate.zUser-specified max_model_len (z-) is greater than the derived max_model_len (=z or model_max_length=zR in model's config.json). This may lead to incorrect model outputs or CUDA errors.zD%s Make sure the value is correct and within the model context size.zR To allow overriding this maximum, set the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1)r   r  ry  rd  r	  r
  r*  r  r?  r4   Zis_tpur  ZVLLM_ALLOW_LONG_MAX_MODEL_LENr   )r   r  r   r   r+  r   r  Zderived_max_model_lenZpossible_keysZmax_len_keyr*  max_lenZtmp_max_lenZtokenizer_model_max_lengthZdefault_max_lenr   r~  Zscaling_factorr|  r5  rh   rh   ri   r    s   

#r  r   r   c                 C   s   |s| S t |tr|d S |S )a  
    If the input is a non-empty list, the first model_name in
    `served_model_name` is taken.
    If the input is a non-empty string, it is used directly.
    For cases where the input is either an empty string or an
    empty list, the fallback is to use `self.model`.
    r   )r   r[  )r   r   rh   rh   ri   r    s
   	
r  )rS   xgrammarguidanceZoutlinesc                   @   sn   e Zd ZU dZdZeed< 	 dZeed< 	 dZ	eed< 	 dZ
eed< 	 dZeed	< 	 d
efddZdd ZdS )DecodingConfigz=Dataclass which contains the decoding strategy of the engine.rS   r6  Fdisable_fallbackdisable_any_whitespacedisable_additional_propertiesr   reasoning_backendrn   c                 C   r  r  r  r  rh   rh   ri   rv     r  zDecodingConfig.compute_hashc                 C   s8   | j r| jdvrtd| jr| jdkrtdd S d S )N)r  r  zLdisable_any_whitespace is only supported for xgrammar and guidance backends.r  zIdisable_additional_properties is only supported for the guidance backend.)r  r6  r   r  rt   rh   rh   ri   r8    s   
zDecodingConfig.__post_init__N)rx   ry   rz   r  r6  GuidedDecodingBackendr  r  r  r  r  r  r{   rv   r8  rh   rh   rh   ri   r    s   
 r  )r   workerrz  c                   @   s   e Zd ZU dZdZee ed< 	 ede	fddZ
dZee ed< 	 dZeee  ed< 	 ede	fd	d
Zede	fddZdefddZdd Zdd ZdS )ObservabilityConfigz6Configuration for observability - metrics and tracing.Nshow_hidden_metrics_for_versionrn   c                 C   s   | j du rdS t| j S )z,Check if the hidden metrics should be shown.NF)r  r!   Z_prev_minor_version_wasrt   rh   rh   ri   show_hidden_metrics  s
   
z'ObservabilityConfig.show_hidden_metricsotlp_traces_endpointcollect_detailed_tracesc                 C      | j duod| j v pd| j v S )z6Whether to collect model forward time for the request.Nr   rz  r  rt   rh   rh   ri   collect_model_forward_time     

z.ObservabilityConfig.collect_model_forward_timec                 C   r  )z6Whether to collect model execute time for the request.Nr  rz  r  rt   rh   rh   ri   collect_model_execute_time  r  z.ObservabilityConfig.collect_model_execute_timec                 C   r  r  r  r  rh   rh   ri   rv     r  z ObservabilityConfig.compute_hashc                 C   sd   | j d urt| j dkrd| j d v r|   ddlm}m} | s.| jd ur0td| d S d S )Nr|  ,r   )is_otel_availableotel_import_error_tracebackzOpenTelemetry is not available. Unable to configure 'otlp_traces_endpoint'. Ensure OpenTelemetry packages are installed. Original error:
)r  r  _parse_collect_detailed_tracesZvllm.tracingr  r  r  r   )ru   r  r  rh   rh   ri   r8  ,  s   
z!ObservabilityConfig.__post_init__c                 C   s0   t | jtsJ ttt | jd d| _d S )Nr   r  )r   r  r[  r   DetailedTraceModulessplitrt   rh   rh   ri   r  9  s
   
z2ObservabilityConfig._parse_collect_detailed_traces)rx   ry   rz   r  r  r   r{   r  r
   r  r  r  r  r[  r  r  r  rv   r8  r  rh   rh   rh   ri   r    s"   
 r  )Zkv_producerkv_both)Zkv_consumerr  c                   @   s.  e Zd ZU dZdZee ed< 	 dZee ed< 	 dZ	ee ed< 	 dZ
eed< 	 dZee ed	< 	 dZee ed
< 	 dZeed< 	 dZeed< 	 dZeed< 	 eedZeeef ed< 	 dZee ed< 	 defddZd!ddZedefddZedefddZedefddZdefdd ZdS )"KVTransferConfigz0Configuration for distributed KV cache transfer.Nkv_connector	engine_idr  kv_buffer_deviceg    eAkv_buffer_sizekv_rolekv_rankr|  kv_parallel_sizez	127.0.0.1kv_ipi8  kv_portr   kv_connector_extra_configkv_connector_module_pathrn   c                 C   r  r  r  r  rh   rh   ri   rv   s  r  zKVTransferConfig.compute_hashc                 C   sx   | j d u rtt | _ | jd ur%| jttvr%td| j dtt | jd ur8| jd u r:tdtt d S d S )NzUnsupported kv_role: z. Supported roles are zLPlease specify kv_disagg_role when kv_connector is set, supported roles are )	r  r{   uuiduuid4r  r   KVRoler   r  rt   rh   rh   ri   r8    s   
zKVTransferConfig.__post_init__c                 C      | j d uo| jttv S rg   )r  r  r   r  rt   rh   rh   ri   is_kv_transfer_instance     
z(KVTransferConfig.is_kv_transfer_instancec                 C   r  rg   )r  r  r   
KVProducerrt   rh   rh   ri   is_kv_producer  r  zKVTransferConfig.is_kv_producerc                 C   r  rg   )r  r  r   
KVConsumerrt   rh   rh   ri   is_kv_consumer  r  zKVTransferConfig.is_kv_consumerc                 C   s   | j ||S rg   )r  rd  )ru   r*  r   rh   rh   ri   get_from_extra_config  r  z&KVTransferConfig.get_from_extra_configr  ) rx   ry   rz   r  r  r   r{   r  r  r  r  r   r  r  r  r?  r  r  r  r   r~   r  r   r  rv   r8  r  r  r  r  r  r  rh   rh   rh   ri   r  E  sB   
 
r  c                   @   s   e Zd ZU dZdZeed< 	 dZeed< 	 dZ	eed< 	 dZ
ee ed	< 	 d
Zeed< 	 dZeed< 	 dZeed< 	 dZeed< dS )KVEventsConfigz&Configuration for KV event publishing.Fenable_kv_cache_eventsnull	publisherztcp://*:5557endpointNreplay_endpointi'  buffer_stepsi hwmmax_queue_sizer   topic)rx   ry   rz   r  r  r  r  r  r{   r  r  r   r  r?  r  r  r  rh   rh   rh   ri   r    s$   
 r  c                   @   s   e Zd ZU dZdZeed< 	 eedZ	eed< 	 ee
dZe
ed< 	 eedZeed< 	 eedZeed< 	 eedZeed	< 	 dZee ed
< 	 dZee ed< 	 eedZeed< 	 dZee ed< 	 dZee ed< 	 eedZeed< 	 dZee  ed< 	 dZ!ee" ed< 	 ee#dZ$e%e#e&f ed< 	 dZ'e(ed< 	 de(fddZ)de*de*fddZ+e,ded	edee fddZ-e,ded	edee fddZ.	d1de/d ee0e(  dd fd!d"Z1d#d$ Z2d%e0de0fd&d'Z3d(d) Z4d*e*fd+d,Z5d-d. Z6d/d0 Z7dS )2
VllmConfigzDataclass which contains all vllm-related configuration. This
    simplifies passing around the distinct configurations in the codebase.
    NrV  r   rS  r  scheduler_configr  r~  lora_configr  decoding_configobservability_configquant_configcompilation_configkv_transfer_configkv_events_configadditional_configr   instance_idrn   c                 C   sf  g }g }ddl m} || |tj | jr!|| j  n|d | jr2|| j  n|d | jrC|| j  n|d | j	rT|| j	  n|d | j
re|| j
  n|d | jrv|| j  n|d | jr|| j  |t| j	j n|d | jr|| j  n|d | jr|| j  n|d | jr|| j  n|d | jr	 | jr|| j  n|d | jr|| j  n|d | jrt| j }trtjtj|dd dd }n| }|| n|d || tjt| dd dd	 }|S )
r   r   )__version__NoneT)	sort_keysFr  N
   )r   r  r   r  r  rV  rv   rS  r  r  r  r~  r  r{   max_num_batched_tokensr  r  r  r  r  r  r  r   r~   r   r  r  r  r\   r   )ru   r   Zvllm_factorsr  r  Zadditional_config_hashr  rh   rh   ri   rv     s   
















zVllmConfig.compute_hash
batch_sizec                 C   s   | j j| S rg   )r  Zbs_to_padded_graph_size)ru   r  rh   rh   ri   pad_for_cudagraphg  s   zVllmConfig.pad_for_cudagraphc                 C   s   ddl m} | jdurRddlm} || |}| }|dur9| }|| k r9td| j d|  d| d|	 }| j
|vrPt| j
 d	| j d
| |S dS )zGet the quantization config.r   r3   N)get_quant_configzThe quantization method z; is not supported for the current GPU. Minimum capability: z. Current capability: r   z* is not supported for quantization method z. Supported dtypes: )r  r4   r   r  r  rw  Zto_intZget_min_capabilityr   Zget_supported_act_dtypesr   )rV  r~  r4   r  r  Zcapability_tupleZ
capabilityrv  rh   rh   ri   _get_quantization_confign  s4   



z#VllmConfig._get_quantization_configc                 C   s   dd l }t|| |S r  )copyr  r  deepcopy)rV  r~  r  rh   rh   ri   ru    s   z"VllmConfig.get_quantization_configr   r   c                 C   s6   |d urt |}||_t | j}||_t| |dS )N)rV  )r  r  r   rV  r   r	   )ru   r   r   rV  rh   rh   ri   with_hf_config  s   
zVllmConfig.with_hf_configc                 C   s  |    | jdur"| j| j| j| j | j| j | j| j | j	| j | j
dur<| j
| j	 | j
| j | jdu rO| jdurOt| j| j| _ddlm} | jdurp| jjrp| jjtjkrp| dkrptd | jjdu rtjr| jdur| jjstj | j_ntj!| j_ntj!| j_| jj"j#rd| jj"_$| jj"j$r| jj%&d |' r| jj(du rtjr| jjtj krt)j | j_(nt)j*| j_(| jdur| jjrt+d t)j*| j_(ntjrd	| j_,| -  nt)j*| j_(| j	j.dkr
| jjtj!kr
tjs
t/d
 tj!| j_tjs&| j
dur&| jjtj!kr&t/d tj!| j_g }| jrV| jj0rV| jj0j1}|du sB|2 dkrH|&d nt3| jj4ddsV|&d |rv|D ]}t+| q[d| j_d| j_5| j	durvd| j	_6| j7dur| j7j8r| j	j6st/d | j7dur| j7j9dkr| j7j8st/d |:|  tjr|' r| jj(t)j;kr| jdur| jj<st+d d| j_<| jj(= r| jjtj ksJ d| jj( | j>st? dd | _>tjr| jjtj kr| j@  tjrZ| jjAs\|B s|C sd| j_A| jDdur"d| j_A| j7dur,d| j_A| jdur^| jjEdur`| jdurK| jF rKd| j_AdS tjGsbt/d d| j_AdS dS dS dS dS dS )z?Verify configs are valid & consistent with each other.
        Nr   r3   )   r"  zTuring devices tensor cores do not support float32 matmul. To workaround this limitation, vLLM will set 'ieee' input precision for chunked prefill triton kernels.Tz	+rms_normz&Cudagraph is disabled under eager moder|  zWCPU offload is not supported with `torch.compile` in v0 yet. Disabling `torch.compile`.zQLoRA for V0 is not supported with `torch.compile` yet. Disabling `torch.compile`.lastzPOnly "last" pooling supports chunked prefill and prefix caching; disabling both.Z	is_causalz_Only models using causal attention supports chunked prefill and prefix caching; disabling both.Fz`KV cache events are on, but prefix caching is not enabled.Use --enable-prefix-caching to enable.r  zKV cache events are disabled,but the scheduler is configured to publish them.Modify KVEventsConfig.enable_kv_cache_eventsto True to enable.zaCUDAGraphMode.FULL is not supported with cascade attention currently. Disabling cascadeattention.zxCompilation level should be CompilationLevel.PIECEWISE when cudagraph_mode piecewise cudagraphs is used, cudagraph_mode=r"  zThere is a latency regression when using chunked local attention with the hybrid KV cache manager. Disabling it, by default. To enable it, set the environment VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1.)Htry_verify_and_update_configrV  r  r  r  r  r  r  r~  rS  r  rU  rW  r  r  r  r  r4   r  chunked_prefill_enabledr   r  r   rw  r	  r)  r  levelr  r  r   r(   	PIECEWISEZNO_COMPILATIONpass_configZenable_async_tpenable_sequence_parallelismZ
custom_opsr   Zis_cuda_alikeZcudagraph_moder)   NONEr&  Zcudagraph_num_of_warmups_set_cudagraph_sizesrT  r
  r   rP  r<  r  r   Zlong_prefill_token_thresholdenable_prefix_cachingr  r  r  Zcheck_and_update_configZFULLr   Zrequires_piecewise_compilationr  rI   Zset_splitting_ops_for_v1Zdisable_hybrid_kv_cache_managerZis_cudar  r  r   r=  Z2VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE)ru   r4   Zdisable_chunked_prefill_reasonsrP  rj  rh   rh   ri   r8    s2  



















zVllmConfig.__post_init__possible_sizesc                    s:    fdd|D }|rt d| jj  fdd|D S )Nc                    s    g | ]}| j j d kr|qS r&  r  r  r   sizert   rh   ri   rr  j  
    zDVllmConfig.update_sizes_for_sequence_parallelism.<locals>.<listcomp>zkBatch sizes %s are removed because they are not multiple of tp_size %d when sequence parallelism is enabledc                    s    g | ]}| j j d kr|qS r&  r  r  rt   rh   ri   rr  u  r  )r	  r
  r  r  )ru   r  Zremoved_sizesrh   rt   ri   %update_sizes_for_sequence_parallelismf  s   

z0VllmConfig.update_sizes_for_sequence_parallelismc                    s`  t jsOg }jdurNjdurNjjsNg ddd tddD  }jjdkr1jj	j
r1|}fdd|D }|rA|d  n|d	   fd
d|D }nYg }jdurjjsjj}t|dkrxg ddd td|d d dD  }nt|dkrt|}ntd|djjdkrjj	j
r|}jjfdd|D }j| dS )a  
        cudagraph batchsize padding logic:

        `[1, 2, 4] + [8 * i for i in range(1, 1025)]` is a list of all possible
        batch sizes that cudagraph will capture.

        Depending on the engine's configuration of `max_num_seqs`, the
        candidate batch sizes to capture cudagraph will shrink to the subset
        which just cover the range of `[1, max_num_seqs]`. In the common case,
        `max_num_seqs` is 256, and the cudagraph batch sizes will be
        `[1, 2, 4, 8, 16, 24, 32, 40, ..., 256]`.

        However, if users specify the cudagraph capture sizes through
        compilation config, we will use the specified sizes instead.

        In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
        will be the final sizes to capture cudagraph (in descending order).

        During runtime, if batchsize is larger than
        `vllm_config.compilation_config.cudagraph_capture_sizes`,
        no cudagraph will be used.
        If the batch size is no larger than
        `vllm_config.compilation_config.cudagraph_capture_sizes`,
        we can quickly find the padded graph size for a given batch size by
        looking up `vllm_config.compilation_config.bs_to_padded_graph_size`.
        N)r|  r   r   c                 S   s   g | ]}d | qS )rN  rh   r'  rh   rh   ri   rr    r   z3VllmConfig._set_cudagraph_sizes.<locals>.<listcomp>r|  i  c                    s   g | ]
}| j jkr|qS rh   )r  Zmax_num_seqs)r   xrt   rh   ri   rr    s
    r   c                       g | ]}| kr|qS rh   rh   r  )max_batchsize_to_capturerh   ri   rr    
    c                 S   s   g | ]}|qS rh   rh   r'  rh   rh   ri   rr    s    rN  z#Invalid value for cuda_graph_sizes=r   c                    r  rh   rh   r  )max_num_tokensrh   ri   rr    r  )r  r  r  rV  r   r+  r  r  r  r  r  r  cuda_graph_sizesr  r-  r   r  Zinit_with_cudagraph_sizes)ru   Zbatch_size_capture_listr  Zlarger_sizesr  rh   )r  r  ru   ri   r  z  s^   








zVllmConfig._set_cudagraph_sizesr   c                 C   s$   | j }||}|| j _|| j_d S rg   )rV  r,  r   r  )ru   r   rV  rh   rh   ri   recalculate_max_model_len  s   
z$VllmConfig.recalculate_max_model_lenc                 C   s   | j d u rd S t| j ddrd S d| j _| j j}|d u rd S ddlm}m} ||d }|d ur5||  | j j	r>||  | j j
dkrQddlm} ||  d S d S )Nr1  FTr   )MODELS_CONFIG_MAPHybridAttentionMambaModelConfigrW   )SequenceClassificationConfig)rV  r  r1  rm   Z!vllm.model_executor.models.configr  r  rd  Zverify_and_update_configr  rl   Z#vllm.model_executor.models.adaptersr  )ru   rm   r  r  r   r  rh   rh   ri   r    s$   


z'VllmConfig.try_verify_and_update_configc                 C   s  d g d| jjd| jd| jjd| jj d| jj d| jj d| jj d	| jj	 d
| jj
 d| jj d| jj d| jjd| jj d| jj d| jj d| jj d| jj d| jj d| jj d| jj d| jd| jd| jj d| jj d| jj d| jj  d| jj! d| jj"d| j#S )Nr   zmodel=z, speculative_config=z, tokenizer=z, skip_tokenizer_init=z, tokenizer_mode=z, revision=z, override_neuron_config=z, tokenizer_revision=z, trust_remote_code=z, dtype=z, max_seq_len=z, download_dir=z, load_format=z, tensor_parallel_size=z, pipeline_parallel_size=z, disable_custom_all_reduce=z, quantization=z, enforce_eager=z, kv_cache_dtype=z, device_config=z, decoding_config=z, observability_config=z, seed=z, served_model_name=z, enable_prefix_caching=z, chunked_prefill_enabled=z, use_async_output_proc=z, pooler_config=z, compilation_config=)$r!  rV  r   r  r   r   r   r   r   r   r   r   r   r~  r  r  r  r  r  r7  r   r   rS  Zcache_dtyper  r  r  r  r   r   r  r  r  r   r   r  rt   rh   rh   ri   __str__  st   	
zVllmConfig.__str__rg   )8rx   ry   rz   r  rV  r   r  r   r#   rS  r,   r  r-   r  r
  r  r  r~  r  r   rB  r  r  r  r  r  r  r  rL   r'   r  r  r  r  r  r~   r  r   rr   r  r{   rv   r?  r  rA  r  ru  rK   r[  r  r8  r  r  r  r  r  rh   rh   rh   ri   r    s   
 [

 C
Tr  _current_vllm_config_current_prefixFvllm_configprefixc                 c   s    t }t}ddlm} |j}zdz	| a |adV  W n ty     w td| jj	 td| jj
 |rR| jjtjkr]|j|krhtd| jj W |a |at  dS W |a |at  dS W |a |at  dS W |a |at  dS |a |at  w )a  
    Temporarily set the current vLLM config.
    Used during model initialization.
    We save the current vLLM config in a global variable,
    so that all modules can access it, e.g. custom ops
    can access the vLLM config to determine how to dispatch.
    r   )compilation_counterNzenabled custom ops: %szdisabled custom ops: %sz`torch.compile` is turned on, but the model %s does not support it. Please open an issue on GitHub if you want it to be supported.)r  r  Zvllm.compilation.counterr  num_models_seen	Exceptionr	  debugr  Zenabled_custom_opsZdisabled_custom_opsr  r(   r  r
  rV  r   get_cached_compilation_configcache_clear)r  Zcheck_compiler  Zold_vllm_configZ
old_prefixr  r  rh   rh   ri   set_current_vllm_config  sV   


r  r|  )maxsizec                   C   s   t  jS )zACache config to avoid repeated calls to get_current_vllm_config())get_current_vllm_configr  rh   rh   rh   ri   r  E  s   r  c                  C   s(   t d u rtd ddlm}  |  S t S )NzCurrent vLLM config is not set.r   r  )r  r	  r
  Zvllm.configr  r  rh   rh   ri   r  K  s
   
r  c                   C   s   t dusJ dt S )zI
    Get the prefix of the model that's currently being initialized.
    Nz!Current model prefix is not set. )r  rh   rh   rh   ri   get_current_model_prefixV  s   
r   c                 C   s   d}t || }|duS )ak  
    Check if the text looks like a printed Python object, e.g.
    contains any substring matching the pattern: "at 0xFFFFFFF>"
    We match against 0x followed by 2-16 hex chars (there's
    a max of 16 on a 64 bit system).

    Args:
        text (str): The text to check

    Returns:
        result (bool): `True` if a match is found, `False` otherwise.
    zat 0x[a-fA-F0-9]{2,16}>N)research)textpatternrV  rh   rh   ri   contains_object_print_  s   r  c                 C   s   t | sdS td|  )NTzvLLM tried to hash some configs that may have Python objects ids in them. This is a bug, please file an issue. Text being hashed: )r  r  )r  rh   rh   ri   r   q  s   r   T
layer_typelayer_namesc                    s4   |du rt | jj }| jj  fdd|D S )z
    Get layers from the vLLM config.

    Args:
        vllm_config: The vLLM config.
        layer_type: The type of the layer to get.
        layer_names: The names of the layers to get. If None, return all layers.
    Nc                    s$   i | ]}t  | r| | qS rh   )r   )r   Z
layer_nameZforward_contextr  rh   ri   r     s    z/get_layers_from_vllm_config.<locals>.<dictcomp>)r[  r  Zstatic_forward_contextkeys)r  r  r  rh   r	  ri   get_layers_from_vllm_config}  s   
r  c                   @   s`   e Zd ZU dZdZeed< 	 dZeed< 	 dZ	eed< 	 dZ
ee ed	< 	 ed
efddZdS )SpeechToTextConfigz(Configuration for speech-to-text models.i>  sample_rate   max_audio_clip_sr|  overlap_chunk_secondi@  min_energy_split_window_sizern   c                 C   r  rg   )r  rt   rh   rh   ri   allow_audio_chunking  r  z'SpeechToTextConfig.allow_audio_chunkingN)rx   ry   rz   r  r  r   r  r  r?  r  r  r   r  r  r  rh   rh   rh   ri   r    s   
 r  rq  c                 C   s   i }|  D ]F\}}t| |sJ t|  d| dt| |}t|rHt|sHt|tsCJ dt|  d| dt| dt| t||}|||< qt| fi |S )Nz has no field ``zOverrides to r   z must be a dict  or z
, but got )	rM  r  r  r  r   r   r~   update_configr	   )r0   rq  Zprocessed_overrides
field_namer   current_valuerh   rh   ri   r    s,   

r  )NN)FNrg   )r   r  enumr   r   r  r   r  r  collections.abcr   
contextlibr   dataclassesr   r   r   r   r   r	   	functoolsr
   r   importlib.utilr   typingr   r   r   r   r   r   r   r   r   r   r   regexr  r  Zpydanticr   r   r   r   Zpydantic.dataclassesr   Zsafetensors.torchr   rp  Ztyping_extensionsr   r   r    Z	vllm.envsr  r   r!   Zvllm.config.cacher"   r#   r$   r%   r&   Zvllm.config.compilationr'   r(   r)   r*   Zvllm.config.parallelr+   r,   Zvllm.config.schedulerr-   r.   Zvllm.config.utilsr/   r0   Zvllm.loggerr1   Z'vllm.model_executor.layers.quantizationr2   r  r4   Zvllm.transformers_utils.configr5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   Z vllm.transformers_utils.s3_utilsrB   Zvllm.transformers_utils.utilsrC   rD   Z
vllm.utilsrE   rF   rG   rH   rI   Z	_typeshedrJ   Z transformers.configuration_utilsrK   rP   Zlayersr   rt  Zvllm.model_executor.modelsmodelsrD  Z3vllm.model_executor.layers.quantization.base_configrL   Z vllm.model_executor.model_loaderrM   Z+vllm.model_executor.model_loader.tensorizerrN   Zvllm.v1.sample.logits_processorrO   r~   r  r  ZBaseModelLoaderr{   globalsrx   r	  rQ   r  r\  r  r  r  r  r_   r[  r  ra   rf   r  rj   rq   rr   r|   Enumr   r   r   r  r   rT  r  r  r   r  r  r
  r@  r  rX  rB  rH  r   r   r   r   rz  rh  r   ri  rk  rt  ry  r(  r?  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r   r  r  r  r  rh   rh   rh   ri   <module>   s  
 4<$	
4           fR<   X^`T
	
(
4
3
 
6T^(    E.
		



