o
    )iL                 	   @   s.  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
 d dlmZ d dlmZmZmZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlm Z m!Z! d dl"m#Z#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZL d dlMmNZN d d	lOmPZPmQZQ d d
lRmSZS d dlTmUZU d dlVmWZW d dlXmYZYmZZZ d dl[m\Z\ d dl]m^Z^ d dl_m`Z`maZambZbmcZcmdZd d dlemfZf erd dlgmhZh d dlimjZj d dlkmlZl d dlmmnZn neZheZjeZleZneNeoZpedZqeere esf Zteereq esf Zudeevgeqf deevgeqf fddZwdeevgeqf deevgeeq f fddZxdevdeeeveyevevf f  fddZzd etd!eude#eu fd"d#Z{d$e|et d!eude}fd%d&Z~d$e|et d!eudeufd'd(Zd$e|et deyevef fd)d*Zd etde}fd+d,Zd etde|et fd-d.Zd/ede}fd0d1Zejd2d3d4e-deyevef fd5d6Zd4e-deyevef fd7d8ZeG d9d: d:ZeG d;d< d<eZd=evd>e}fd?d@Zd=evde}fdAdBZdCdD ZdS )E    N)MISSING	dataclassfieldsis_dataclass)permutations)TYPE_CHECKING	AnnotatedAnyCallableDictListLiteralOptionalTypeTypeVarUnioncastget_args
get_origin)TypeAdapterValidationError)TypeIs
deprecated)%	BlockSizeCacheConfig
CacheDTypeCompilationConfigConfigFormat
ConfigTypeConvertOptionDecodingConfigDetailedTraceModulesDeviceDeviceConfigDistributedExecutorBackendGuidedDecodingBackendHfOverridesKVEventsConfigKVTransferConfig
LoadConfigLogprobsMode
LoRAConfig
MambaDTypeModelConfig
ModelDType	ModelImplMultiModalConfigObservabilityConfigParallelConfigPoolerConfigPrefixCachingHashAlgoRunnerOptionSchedulerConfigSchedulerPolicySpeculativeConfig
TaskOptionTokenizerMode
VllmConfigget_attr_docs	get_field)init_logger)CpuArchEnumcurrent_platformload_general_plugins)is_ray_initialized)ReasoningParserManager)MODEL_WEIGHTS_S3_BUCKETMODELS_ON_S3)is_interleaved)check_gguf_file)STR_DUAL_CHUNK_FLASH_ATTN_VALFlexibleArgumentParser	GiB_bytesget_ipis_in_ray_actor)LogitsProcessor)ExecutorBase)QuantizationMethods)LoadFormatsUsageContextTreturn_typereturnc                    s   dt dtf fdd}|S )NvalrV   c              
      s>   z | W S  t y } ztd|  d  d|d }~ww )NzValue z cannot be converted to .)
ValueErrorargparseArgumentTypeError)rW   erU    a/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/engine/arg_utils.py_parse_typeG   s   
zparse_type.<locals>._parse_type)strrT   )rU   r`   r^   r]   r_   
parse_typeE   s   rb   c                    s   dt dtt f fdd}|S )NrW   rV   c                    s    | dks| dkr
d S t  | S )N None)rb   rW   r]   r^   r_   _optional_typeT   s   z%optional_type.<locals>._optional_type)ra   r   rT   )rU   rf   r^   r]   r_   optional_typeQ   s   rg   rW   c                 C   s"   t d| s
t| S ttj| S )Nz(?s)^\s*{.*}\s*$)rematchra   rg   jsonloadsre   r^   r^   r_   union_dict_and_str\   s   rl   	type_hinttypec                 C   s   | |u p	t | |u S )z*Check if the type hint is a specific type.)r   )rm   rn   r^   r^   r_   is_typeb   s   ro   
type_hintsc                    s   t  fdd| D S )z0Check if the type hints contain a specific type.c                 3       | ]}t | V  qd S Nro   ).0rm   rn   r^   r_   	<genexpr>i       z contains_type.<locals>.<genexpr>)anyrp   rn   r^   ru   r_   contains_typeg   s   rz   c                    s   t  fdd| D dS )z*Get the specific type from the type hints.c                 3   s    | ]
}t | r|V  qd S rr   rs   rt   thru   r^   r_   rv   n   s    zget_type.<locals>.<genexpr>N)nextry   r^   ru   r_   get_typel   s   r~   c                    st   t | t}t|}t|d  t fdd|D s)td| ddd |D  t| tr0dnd	}d
 |t|iS )zGet the `type` and `choices` from a `Literal` type hint in `type_hints`.

    If `type_hints` also contains `str`, we use `metavar` instead of `choices`.
    r   c                 3   rq   rr   )
isinstance)rt   optionZoption_typer^   r_   rv   y   rw   z$literal_to_kwargs.<locals>.<genexpr>z*All options must be of the same type. Got z with types c                 S   s   g | ]}t |qS r^   ru   )rt   cr^   r^   r_   
<listcomp>|   s    z%literal_to_kwargs.<locals>.<listcomp>metavarchoicesrn   )	r~   r   r   rn   allrY   rz   ra   sorted)rp   rm   optionskwargr^   r   r_   literal_to_kwargsq   s   
r   c                 C   s
   | j dkS )z*Check if the class is not a built-in type.builtins)
__module__)rm   r^   r^   r_   is_not_builtin   s   
r   c                 C   sf   t  }t| }t| }|tu r|t|d  |S |tu r,|D ]	}|t| q |S ||  |S )z6Extract type hints from Annotated or Union type hints.r   )setr   r   r   updateget_type_hintsr   add)rm   rp   originargsargr^   r^   r_   r      s   
r   quantizationc                 C   s   | dv S )N)incr^   )r   r^   r^   r_   is_online_quantization   s   r      )maxsizeclsc                    sz  t | }i }t| D ]}t|j}dd |D }t|d }|jtur'|j}n	|jtur0| }|j}|| 	 }	|	
dd}	||	d||< d}
|d url|fdtdtfd	d
}||| d< || d  d|
 7  < n
t|trytj|| d< nt|tr|| t| nt|trt|t}t|}|d  t fdd|D sJ d| d || d< t|v rdnt||| d< nt|trt|t}t|}|d }t|tu rd}tt|v sJ |t}||| d< d|| d< nt|trt|| d< |dv rt|| d< nit|trt|| d< n\t|t r7t|ts0t!dd |D r7t"|| d< n?t|t rTt#t$j%|| d< || d  d|
 7  < n"t|tsdt!dd |D rkt|| d< nt&d| d| dt|| 'dtu r|| t|| d h td |v rt|tst(|| d || d< || 'dr|| d )d q
|S )Nc                 s   s    | ]	}t |r|V  qd S rr   )r   r{   r^   r^   r_   rv      s    z"_compute_kwargs.<locals>.<genexpr>%z%%)defaulthelpzFShould either be a valid JSON string or JSON keys passed individually.rW   rV   c              
   S   s:   zt || W S  ty } ztt||d }~ww rr   )r   Zvalidate_jsonr   rZ   r[   repr)rW   r   r\   r^   r^   r_   parse_dataclass   s   z(_compute_kwargs.<locals>.parse_dataclassrn   r   z

actionr   c                 3   s     | ]}|t ur| u V  qd S rr   )Ellipsis)rt   tZ
tuple_typer^   r_   rv      s    z>All non-Ellipsis tuple elements must be of the same type. Got rX   +nargsz,List type must contain str if it is a Union.>   max_model_lenmax_num_batched_tokensc                 s       | ]}t |V  qd S rr   r   r{   r^   r^   r_   rv          c                 s   r   rr   r   r{   r^   r^   r_   rv      r   zUnsupported type z for argument r   rd   )*r<   r   r   rn   r}   r   r   default_factorynamestripreplacera   r	   rz   boolrZ   BooleanOptionalActionr   r   r   tupler~   r   r   r   lenlistr   r   inthuman_readable_intfloatdictrx   rl   rb   rj   rk   rY   getrg   append)r   Zcls_docskwargsfieldrp   	generatorZdataclass_clsr   r   r   Zjson_tipr   rm   typesZ	list_typemsgr^   r   r_   _compute_kwargs   s   










r   c                 C   s   t t| S )zReturn argparse kwargs for the given Config dataclass.

    The heavy computation is cached via functools.lru_cache, and a deep copy
    is returned so callers can mutate the dictionary without affecting the
    cached version.
    )copydeepcopyr   )r   r^   r^   r_   
get_kwargs   s   r   c                   @   sF	  e Zd ZU dZejZeed< ejZe	e
eee f  ed< ejZe	e ed< ejZe	e ed< ejZeed< ejZeed< ejZe	e ed< ejZeed	< ejZeed
< ejZeed< ejZeed< ejZeed< ejZe	e ed< ejZe
eef ed< ejZeed< ej Z e!ed< e"j#Z$e%ed< ej&Z&e	e' ed< ej(Z(e	e' ed< e)e*dZ+e,e' ed< e-j.Z.e	e
e/e0e1 f  ed< e-j2Z2e'ed< e-j3Z3e'ed< e-j4Z4e'ed< dZ5e	e' ed< dZ6e	e' ed< dZ7e	e' ed< dZ8e	e ed< dZ9e	e' ed< d Z:eed!< e-j;Z;eed"< e-j<Z<eed#< e-j=Z=eed$< e-j>Z>e'ed%< e-j?Z?e'ed&< e-j@Z@e'ed'< e-jAZAeed(< e-jBZBe	e' ed)< e"jCZCe	eD ed*< e"jEZEe	e ed+< e"jFZFeGed,< ejHZHeed-< ejIZIeed.< e"jJZJeKed/< e"jLZLeKed0< e"jMZMeKed1< e*jNZNe	e' ed2< e*jOZOe'ed3< e*jPZPe'ed4< e*jQZQe'ed5< e*jRZRe	e' ed6< ejSZSe'ed7< ejTZTeUed8< d ZVeed9< ejWZWe	e ed:< ejXZXe	e ed;< e)ed<ZYeZee[f ed<< ej\Z\e	eK ed=< ej]Z]e	e
eef  ed>< e)ed?Z^e_ed?< ej`Z`e	e ed@< ejaZae	eb edA< ejcZceedB< ejdZde'edC< e-jeZeeedD< e)efdEZgeZee'f edF< efjhZheedG< e)efdHZieZeeZee[f f edH< efjjZje	ekee[f  edI< d ZleedJ< efjmZme'edK< efjnZneedL< d ZoeedM< epjqZreedN< epjsZse'edO< epjtZte'edP< epjuZue	ekeef  edQ< epjvZveedR< epjwZwe	e' edS< epjxZxe	e
eeyj f  edT< epjzZze'edU< e-j{Z{eedV< e"j|Z|e	e' edW< e*j}Z}e'edX< e)edYZ~eZedY< ejZe	e
eee f  edZ< e*jZe	e ed[< e*jZeKed\< e*jZe	e ed]< e*jZeed^< e*jZeed_< ejZeed`< ejZeeda< ejZeedb< ejZeedc< ejZe	e edd< dZe	ekee[f  ede< ejZe	e edf< ejZe	e edg< ejZe	e,e  edh< ej Zeedi< e*jZeedj< e*jZe
ee0e f edk< e)edlZeZee[f edl< ejZe	e
eZef  edm< e)ednZeedn< e-jZeedo< e-jZeedp< dZe	e edq< dZe	e edr< ejZeeds< ejZeedt< e)eduZeZee[f edu< ejZeedv< ejZeedw< e"jZeedx< e"jZeedy< e"jZeedz< e)ed{ZeZee[f ed{< ejZeed|< ejZeed}< ejZeed~< e-jZeed< ejZe	e,e
eee f   ed< 	 e*jZeed< d Zeed< e"jZeed< dd ZededefddZedejfddZdefddZdd ZdefddZdede-d]ed9ede	d f
ddZ		 dde	e dedefddZdedefddZdeddfddZdededdfddZdS )
EngineArgszArguments for vLLM engine.modelserved_model_name	tokenizerhf_config_pathrunnerconverttaskskip_tokenizer_initenable_prompt_embedstokenizer_modetrust_remote_codeallowed_local_media_pathdownload_dirload_formatconfig_formatdtypekv_cache_dtypeseedr   cuda_graph_sizesdistributed_executor_backendpipeline_parallel_sizetensor_parallel_sizedata_parallel_sizeNdata_parallel_rankdata_parallel_start_rankdata_parallel_size_localdata_parallel_addressdata_parallel_rpc_portFdata_parallel_hybrid_lbdata_parallel_backendenable_expert_parallelenable_eplbnum_redundant_expertseplb_window_sizeeplb_step_intervaleplb_log_balancednessmax_parallel_loading_workers
block_sizeenable_prefix_cachingprefix_caching_hash_algodisable_sliding_windowdisable_cascade_attn
swap_spacecpu_offload_gbgpu_memory_utilizationr   max_num_partial_prefillsmax_long_partial_prefillslong_prefill_token_thresholdmax_num_seqsmax_logprobslogprobs_modedisable_log_statsrevisioncode_revisionrope_scaling
rope_thetahf_tokenhf_overridestokenizer_revisionr   enforce_eagermax_seq_len_to_capturedisable_custom_all_reducelimit_per_promptlimit_mm_per_promptinterleave_mm_stringsmedia_io_kwargsmm_processor_kwargsdisable_mm_preprocessor_cachemm_processor_cache_gbskip_mm_profilingenable_loraenable_lora_bias	max_lorasmax_lora_rankdefault_mm_lorasfully_sharded_lorasmax_cpu_loras
lora_dtypelora_extra_vocab_sizeray_workers_use_nsightnum_gpu_blocks_overridenum_lookahead_slotsmodel_loader_extra_configignore_patternspreemption_modescheduler_delay_factorenable_chunked_prefilldisable_chunked_mm_inputdisable_hybrid_kv_cache_managerguided_decoding_backend guided_decoding_disable_fallback&guided_decoding_disable_any_whitespace-guided_decoding_disable_additional_propertieslogits_processor_patternspeculative_configshow_hidden_metrics_for_versionotlp_traces_endpointcollect_detailed_tracesdisable_async_output_procscheduling_policyscheduler_clsoverride_neuron_configoverride_pooler_configcompilation_config
worker_clsworker_extension_clskv_transfer_configkv_events_configgeneration_configenable_sleep_modeoverride_generation_config
model_imploverride_attention_dtypecalculate_kv_scalesmamba_cache_dtypemamba_ssm_cache_dtypeadditional_configreasoning_parseruse_tqdm_on_loadpt_load_map_location'enable_multimodal_encoder_data_parallellogits_processorsasync_schedulingenable_prompt_adapterkv_sharing_fast_prefillc                 C   s4   t | jtrtdi | j| _ddlm} |  d S )Nr   rA   r^   )r   r+  r   r   vllm.pluginsrB   )selfrB   r^   r^   r_   __post_init__  s   
zEngineArgs.__post_init__parserrV   c                 C   s  t t}| jdtjd}dtjdd v rdtjdd v s)|jd2i |d  |jd3i |d
  |jd4i |d  |jd5i |d ddi |jd6i |d  |jd7i |d  |jd8i |d  |jd9i |d  |jd:i |d  |jd;i |d  |jd<i |d  |jd=i |d   |jd>i |d"  |jd?i |d$  |jd@i |d&  |jdAi |d(  |jdBi |d*  |jdCi |d-  |jdDi |d/  |jdEi |d1  |jdFi |d3  |jdGi |d5  |jdHi |d7  |jdIi |d9  |jdJi |d;  |jdKi |d=  |jdLi |d?  |jd@dAtjdBdC |jdMdEdFdG t	D i|dH  |jdIt
dJd|dK dL |dK dM dN |jdNi |dP  |jdOi |dR  |jdPi |dT  |jdQi |dV  |jdRi |dX  |jdSi |dZ  |jdTi |d\  |jdUdEd^dG tD i|d_  |jdVi |da  |jdWi |dc  t t}| jddtjd}|jdXi |df  |jdYi |dh  |jdZi |dj  |jd[i |dl  |jd\i |dn  |jd]i |dp  t t}| jdqtjd}|jd^i |ds  |j	td_i |du  |j	vd`i |dw  |j	xdai |dy  |j	zdbdEttji|d{  t t}| jd|tjd}|j	}dci |d~  |j	ddi |d  |jdei |d  |jdfi |d  |jddtdd |jddtdd |jddtdd |jddt
dd |jddtdd |jddt
ddd |j	dgi |d  |j	dhi |d  |jdii |d  |jdji |d  |jdki |d  |jdli |d  |jdmi |d  |j	dni |d  |j	doi |d  |j	dpi |d  |jdqi |d  |jdri |d  |j	dsi |d  t t}	| jdtjd}
|
jdti |	d  |
jdui |	d  |
jdvi |	d  |
jdwi |	d  |
jdxi |	d  |
jdyi |	d  |
jdzi |	d  |
jd{i |	d  |
jd|i |	d  |
jd}i |	d  |
jd~i |	d  |
jdi |	d  t t}| jdtjd}|jdi |d  |jdi |d  |j	Րdi |d  |j	אdi |d  |jddAddڍ |j	ېdi |d  |jdi |d  t t}| jdtjd}|jdtjdd |jdi |d  |jdi |d  |jdi |d  |jdi |d  |j	di |d  |jdi |d  |jdi |d  |jdi |d  t t}| jdtjd}|j	di |d  |j	di |d  |d dE }dd| d}||d d< |d dE  ddG tttddD 7  < |j	 di |d  t t}| jdtjd}|j	di |d  |jdi |d  |j	di |d  |j	di |d	  |jdi |d  |j	di |d  |jdi |d  |jdi |d  |jdi |d  |jdi |d  |j	di |d  |j	di |d  |jdi |d  |j	di |d  |jdi |d  t t}| jd tjd}tt j!|d! d"< |jdi |d!  |jdi |d%  |jdi |d'  |jdi |d*  |jdi |d,  | jd-dAd.d | jd/dAdd0d1 | S (  z%Shared CLI arguments for vLLM engine.r-   )titledescriptionZserve   Nz--help--modelr   --runnerr   	--convertr   --taskr   r   T--tokenizerr   --tokenizer-moder   --trust-remote-coder   --dtyper   --seedr   --hf-config-pathr   --allowed-local-media-pathr   
--revisionr   --code-revisionr   --rope-scalingr   --rope-thetar   --tokenizer-revisionr   --max-model-lenr   --quantization-qr   --enforce-eagerr   --max-seq-len-to-capturer   --max-logprobsr   --logprobs-moder   --disable-sliding-windowr   --disable-cascade-attnr   --skip-tokenizer-initr   --enable-prompt-embedsr   --served-model-namer   --disable-async-output-proc
store_truezFDisable async output processing. This may result in lower performance.r   r   r   --config-formatr   c                 S      g | ]}|j qS r^   valuert   fr^   r^   r_   r         z+EngineArgs.add_cli_args.<locals>.<listcomp>r   z
--hf-token?r   r   r   )rn   r   constr   r   --hf-overridesr   --override-neuron-configr)  --override-pooler-configr*  --logits-processor-patternr!  --generation-configr0  --override-generation-configr2  --enable-sleep-moder1  --model-implc                 S   rh  r^   ri  rk  r^   r^   r_   r   )  rm  r3  --override-attention-dtyper4  --logits-processorsr=  r)   --load-formatr   --download-dirr   --model-loader-extra-configr  --ignore-patternsr  --use-tqdm-on-loadr:  --pt-load-map-locationr;  r    --guided-decoding-backendbackend"--guided-decoding-disable-fallbackdisable_fallback(--guided-decoding-disable-any-whitespacedisable_any_whitespace/--guided-decoding-disable-additional-propertiesdisable_additional_properties--reasoning-parserreasoning_backendr2   --distributed-executor-backendr   --pipeline-parallel-size-ppr   --tensor-parallel-size-tpr   --data-parallel-size-dpr   z--data-parallel-rankz-dpnzSData parallel rank of this instance. When set, enables external load balancer mode.)rn   r   z--data-parallel-start-rankz-dprz0Starting data parallel rank for secondary nodes.z--data-parallel-size-localz-dplz5Number of data parallel replicas to run on this node.z--data-parallel-addressz-dpaz+Address of data parallel cluster head-node.z--data-parallel-rpc-portz-dppz)Port for data parallel RPC communication.z--data-parallel-backendz-dpbmpz0Backend for data parallel, either "mp" or "ray".)rn   r   r   --data-parallel-hybrid-lbr   --enable-expert-parallelr   --enable-eplbr   --num-redundant-expertsr   --eplb-window-sizer   --eplb-step-intervalr   --eplb-log-balancednessr   --max-parallel-loading-workersr   --ray-workers-use-nsightr  --disable-custom-all-reducer  --worker-clsr,  --worker-extension-clsr-  )--enable-multimodal-encoder-data-parallelr<  r   --block-sizer   --gpu-memory-utilizationr   --swap-spacer   --kv-cache-dtypecache_dtype--num-gpu-blocks-overrider  --enable-prefix-cachingr   --prefix-caching-hash-algor   --cpu-offload-gbr   --calculate-kv-scalesr5  --kv-sharing-fast-prefillr@  --mamba-cache-dtyper6  --mamba-ssm-cache-dtyper7  r0   --limit-mm-per-promptr  --media-io-kwargsr  --mm-processor-kwargsr  --mm-processor-cache-gbr  z--disable-mm-preprocessor-cache)r   r   --interleave-mm-stringsr  --skip-mm-profilingr	  r+   z--enable-loraz*If True, enable handling of LoRA adapters.)r   r   --enable-lora-biasbias_enabled--max-lorasr  --max-lora-rankr  --lora-extra-vocab-sizer  --lora-dtyper  --max-cpu-lorasr  --fully-sharded-lorasr  --default-mm-lorasr  r1   !--show-hidden-metrics-for-versionr#  --otlp-traces-endpointr$  r%  {,}r   c                 S   s   g | ]}d  |qS )r  )join)rt   pr^   r^   r_   r     s       )r--collect-detailed-tracesr6   --max-num-batched-tokensr   --max-num-seqsr   --max-num-partial-prefillsr   --max-long-partial-prefillsr   --cuda-graph-sizesr   --long-prefill-token-thresholdr   --num-lookahead-slotsr  --scheduler-delay-factordelay_factor--preemption-moder  --scheduling-policypolicy--enable-chunked-prefillr  --disable-chunked-mm-inputr  --scheduler-clsr(  !--disable-hybrid-kv-cache-managerr  --async-schedulingr>  r;   r"  rn   --speculative-config--kv-transfer-configr.  --kv-events-configr/  --compilation-config-Or+  --additional-configr8  z--disable-log-statszDisable logging statistics.z--enable-prompt-adapterzp[DEPRECATED] Prompt adapter has been removed. Setting this flag to True or False has no effect on vLLM behavior.)r   r   r   )rH  )rI  )rJ  )rK  )rL  )rM  )rN  )rO  )rP  )rQ  )rR  )rS  )rT  )rU  )rV  )rW  )rX  )rY  rZ  )r[  )r\  )r]  )r^  )r_  )r`  )ra  )rb  )rc  )rg  )rp  )rq  )rr  )rs  )rt  )ru  )rv  )rw  )rx  )ry  )rz  )r{  )r|  )r}  )r~  )r  )r  )r  )r  )r  )r  )r  )r  r  )r  r  )r  r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  r  )r  )"r   r-   add_argument_group__doc__sysargvadd_argumentr   r&  r   ra   r/   r)   r    r   rD   Zreasoning_parsersr2   r   r   r0   r+   rZ   r   r1   r  r   r   r!   r6   r;   rg   rj   rk   )rD  Zmodel_kwargsZmodel_groupZload_kwargsZ
load_groupZguided_decoding_kwargsZguided_decoding_groupZparallel_kwargsZparallel_groupZcache_kwargsZcache_groupZmultimodal_kwargsZmultimodal_groupZlora_kwargsZ
lora_groupZobservability_kwargsZobservability_groupr   r   Zscheduler_kwargsZscheduler_groupZvllm_kwargsZ
vllm_groupr^   r^   r_   add_cli_args  s  $















































































zEngineArgs.add_cli_argsr   c                    s4   dd t | D }| di  fdd|D }|S )Nc                 S   rh  r^   )r   rt   attrr^   r^   r_   r   g  rm  z,EngineArgs.from_cli_args.<locals>.<listcomp>c                    s   i | ]}|t  |qS r^   )getattrr  r   r^   r_   
<dictcomp>i  s    z,EngineArgs.from_cli_args.<locals>.<dictcomp>r^   )dataclassesr   )r   r   attrsZengine_argsr^   r  r_   from_cli_argsd  s   zEngineArgs.from_cli_argsc                 C   s  t | jrd | _| _t| ts)tjr)| jtv r)| jdkr)t	 d| j | _d| _| j
r5td d| _ntjdkrEtdtj tj| _td7i d	| jd
| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| j d| j!d| jd| j"d| j#d| j$d | j%d!| j&d"| j'd#| j(d$| j)d%| j*d&| j+d'| j,d(| j-d)| j.d*| j/ d+| j0d,| j1d-| jd.| j2d/| j3d0| j4d1| j5d2| j6d3| j7d4| j8d5| j9d6| j:S )8NZggufauto/Zrunai_streamerz}`--disable-mm-preprocessor-cache` is deprecated and will be removed in v0.13. Please use `--mm-processor-cache-gb 0` instead.r      zuVLLM_MM_INPUT_CACHE_GIB` is deprecated and will be removed in v0.13. Please use `--mm-processor-cache-gb %d` instead.r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r	  use_async_output_procr   r  r  r)  r*  r!  r0  r2  r1  r3  r4  r=  r^   );rH   r   r   r   r   AsyncEngineArgsenvsZVLLM_CI_USE_S3rF   rE   r  loggerwarningr  ZVLLM_MM_INPUT_CACHE_GIBr-   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r	  r&  r   r  r)  r*  r!  r0  r2  r1  r3  r4  r=  rB  r^   r^   r_   create_model_configl  s   

	
 !"#$%&'()*+,-.zEngineArgs.create_model_configc                 C   s:   ddl m} | jD ]}||jv r| j| | jd |< q	d S )Nr   )TensorizerConfigtensorizer_config)Z+vllm.model_executor.model_loader.tensorizerr  r  _fields)rB  r  keyr^   r^   r_   validate_tensorizer_args  s   

z#EngineArgs.validate_tensorizer_argsc              	   C   s   | j dkrd| _| jdkr*t| jdr| j | _i | jd< | j| jd d< |   t| j| jt	| j r6dnd | j| j
| j| jdS )NbitsandbytesZ
tensorizerto_serializabler  Ztensorizer_dircpu)r   r   devicer  r  r:  r;  )r   r   hasattrr  r  r   r  r)   r   r   r  r:  r;  r  r^   r^   r_   create_load_config  s.   


zEngineArgs.create_load_configtarget_model_configtarget_parallel_configr8   c                 C   s   ddl m} ddlm} | jdu r>|| jp| j| j| j| j	| j
}t||r<i | _|j| jd< | j| jd< |j| jd< ndS | j||||d td	i | jS )
a[  Initializes and returns a SpeculativeConfig object based on
        `speculative_config`.

        This function utilizes `speculative_config` to create a
        SpeculativeConfig object. The `speculative_config` can either be
        provided as a JSON string input via CLI arguments or directly as a
        dictionary from the engine.
        r   )
get_config)SpeculatorsConfigNZnum_speculative_tokensr   methodr  r   r  r   r^   )vllm.transformers_utils.configr  Z0vllm.transformers_utils.configs.speculators.baser  r"  r   r   r   r   r   r   r   Znum_lookahead_tokensr  r   r8   )rB  r  r   r  r   r  r  	hf_configr^   r^   r_   create_speculative_config  s.   

z$EngineArgs.create_speculative_configusage_contextheadlessc                 C   s  t   tttt jd}|  }d}tjpt	d }|r%| 
|r%d}t	dr2|tjks1J nt| |rV| || t  rUt  tjtjfv rUtd d| _n| | | jdusbJ tjtfv r}| jsoJ dt  swJ d|r}J d	d}t|js| }t| j| j| j | j!|j"| j#|| j$| j%| j&| j'| j(| j)| j*d
}d}	t+ rddl,}
|
- j.}	td|	 d}t/ rddl,}
|
j01 }|r| j2rJ d| j3du}|r| j4dv sJ dd}d| _2n;| j4dur| j4}| j5r|sd| _2| j2r|dkrd}d| _2|| j6krd| _2| j5pd| _3n| j2r J d| j6}| j7du rL| j8dkr;t9 }td| |}n| j8dksHJ d| j8ft:j;}n| j7}| j<durX| j<nt:j<}| j=r| j>du rmd| _>td | j>dkrwt?d| j@dkrt?d| jAdurt?dt:dOi d| j@d| jBd| j6d| j3pdd|d |d!|d"|d#| j8d$| j2d%| jCd&| jDd'| jEd(| jFd)| jGd*| jHd+| jId,| jJd-| jKd.|	d/|d0| j>d1| jLd2| jMd3| jN}|jOr| j6dkp|}|s|jPdkrtQd4 |Rd | jS||| j| jTd5}| jU}|dur&|jU}tVdOi d6|jWd7| jXd8| jYd9|jZd:| j[d;|d<| j\d=| jd>| j]d?|jOd@| j^dAtj_o[|j`dB| jadC| jbdD| jcdE| jddF| jedG| jfdH| j=}|jOs| jgrt?dI| jhrti| jj| jk| jl| jg| jm| jn| jo| jpr| jpdkr| jpnddJnd}|jqdKkrdK | _q| _r| s }tt| ju| jv| jw| jx| jydL}tz| j{| j|| j}dM}t~||||||||||| j| j| j| jdN}|S )Pa  
        Create the VllmConfig.

        NOTE: for autoselection of V0 vs V1 engine, we need to
        create the ModelConfig first, since ModelConfig's attrs
        (e.g. the model arch) are needed to make the decision.

        This function set VLLM_USE_V1=X if VLLM_USE_V1 is
        unspecified by the user.

        If VLLM_USE_V1 is specified by the user but the VllmConfig
        is incompatible, we raise an error.
        )r  FVLLM_USE_V1TzUChunked prefill is not supported for ARM and POWER CPUs; disabling it for V1 backend.NzCuda graph is not supported with DualChunkFlashAttention. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.z;DualChunkFlashAttention is only supported on CUDA platform.zmDualChunkFlashAttention is not supported on V1 engine. To run the model in V0 engine, try set 'VLLM_USE_V1=0')r   r   r   r  is_attention_freer  sliding_windowr   r   r   r5  r@  r6  r7  r   zUsing ray runtime env: %sz:data_parallel_hybrid_lb is not applicable in headless mode)rG  NzAdata_parallel_size_local must be 1 when data_parallel_rank is setrG  zDdata_parallel_size_local must be set to use data_parallel_hybrid_lb.rayz3Using host IP %s as ray-based data parallel addressr  z3data_parallel_backend can only be ray or mp, got %szAUsing mp-based distributed executor backend for async scheduling.uniz;Async scheduling is not supported with uni-process backend.zBAsync scheduling is not supported with pipeline-parallel-size > 1.zGCurrently, speculative decoding is not supported with async scheduling.r   r   r   r   data_parallel_external_lbr   data_parallel_master_ipr   r   r   r   r   r   r   r   r   r   r  r  ray_runtime_envplacement_groupr   r,  r-  r<  zMulti-modal processor cache is disabled because it is not compatible with data parallelism when there does not exist a one-to-one correspondance between API and engine core processes.r  runner_typer   r   r   r   r  r  r  r  is_multimodal_modelr  Zsend_delta_datar  r(  r   r   r   r  r>  zJDefault modality-specific LoRA(s) were provided for a non multimodal model)r  r  r  r  r  r  r  r  r  )r  r  r  r  r  )r#  r$  r%  )model_configcache_configparallel_configscheduler_configdevice_configlora_configr"  load_configdecoding_configobservability_configr+  r.  r/  r8  r^   )r@   pre_register_and_updater#   r   r"   device_typer  r  r
  is_set_is_v1_supported_oracleZset_vllm_use_v1_set_default_args_v1is_cpuZget_cpu_architecturer?   ZPOWERPCZARMr  infor  _set_default_args_v0VLLM_ATTENTION_BACKENDrI   r   is_cudarG   Zhf_text_configget_sliding_windowr   r   r   r   r   r  r  r   r   r   r5  r@  r6  r7  rC   r  Zget_runtime_contextZruntime_envrM   utilZget_current_placement_groupr   r   r   r   r   r   r   rL   r2   r  r   r>  r   rY   r   r"  r   r   r   r   r   r   r   r   r  r  r,  r-  r<  r  r  r  Zset_mm_processor_cache_gbr  r   r  r6   r  r   r   r   r   r  r  r  ZVLLM_USE_RAY_SPMD_WORKERZuse_rayr'  r(  r   r   r   r  r  r
  r+   r  r  r  r  r  r  r  r   r   r  r    r  r  r  r   r9  r1   r#  r$  r%  r;   r+  r.  r/  r8  )rB  r  r	  r  r  Zuse_v1Ztry_v1r  r  r  r  r  r  r   host_ipr   r   r  Zdp_supports_mm_processor_cacher"  r  r  r  r  r  r  configr^   r^   r_   create_engine_config  s:  












	

	
	
		zEngineArgs.create_engine_configr  c                 C   s  | j dkrtd| j  dd dS | jtjkrtddd dS | jtjkr-tddd dS | jtjkr;tddd dS | jtjkrItd	dd dS t	
 r`t	 r`t	 jd
k r`tddd dS | jdkrut	| j}|sutddd dS | jrtddd dS |jst|jdd dS |jrtddrdS | jtjks| jtjkrtddd dS | js| jrtddd dS | jdur| jddkrtdg d}tdrtj|vrdtj }t|dd dS t	j|dstdt	j dd dS t ! t " krtdrdS | j#dkr,t$| j%dd}|s,| j%t&j%dd d!fvr,d"}t|dd dS t	j'|ds;tt	j(r;dS t	) rO|* durOtd#dd dS dS )$z5Oracle for whether to use V0 or V1 Engine by default.Zsharded_statez--load_format F)feature_namerecommend_to_removers  r  Trd  r     zCompute Capability < 8.0r  r  rb  ZMamba)r-  zConcurrent Partial Prefillr  Nr  Zdraft_modelzSpeculative decoding with draft model is not supported yet. Please consider using other speculative decoding methods such as ngram, medusa, eagle, or deepseek_mtp.)ZFLASH_ATTN_VLLM_V1Z
FLASH_ATTNZPALLASZPALLAS_VLLM_V1ZTRITON_ATTN_VLLM_V1Z
TRITON_MLAZCUTLASS_MLAZFLASHMLAZ
FLASHINFERZFLASHINFER_VLLM_V1ZROCM_AITER_MLAZTORCH_SDPA_VLLM_V1ZFLEX_ATTENTIONZ	TREE_ATTNZXFORMERS_VLLM_V1r&  zVLLM_ATTENTION_BACKEND=)r  zdevice type=zEngine in background threadrG  supports_ppr  r  Zexternal_launcherzfPipeline Parallelism without Ray distributed executor or multiprocessing executor or external launcherzsliding window (CPU backend))+r   _raise_or_fallbackr!  r   r  r6   r&  r  r  r@   r'  Zget_device_capabilitymajorr   Zis_kv_cache_dtype_supportedr   Zis_v1_compatibleZarchitecturesZhas_inner_state_warn_or_fallbackr   r   r$  r%  r"  r   NotImplementedErrorr  r   r&  Zsupports_v1r  	threadingcurrent_threadmain_threadr   r  r   r2   Z
default_v1device_namer#  r(  )rB  r  	supportedZV1_BACKENDSr   r0  r^   r^   r_   r!  R  s   








z"EngineArgs._is_v1_supported_oraclec                 C   s  |j }|dk}| jdu rE|js|jrd| _n'|r=t }| du}| jdu}|r=|s=|s=| js=|j	dkr=d| _t
d | jdu rEd| _| jsQ|rQt
d| n| jr_|j	dkr_d}t|| jrv|jrmt
d	 d| _| jd
krvtd| jdu rd| _dS dS )z$Set Default Arguments for V0 Engine.i   NFpoolingTzChunked prefill is enabled by default for models with max_model_len > 32K. Chunked prefill might not work with some features or models. If you encounter any issues, please disable by launching with --enable-chunked-prefill=False.zThe model has a long context length (%s). This may causeOOM during the initial memory profiling phase, or result in low performance due to small KV cache size. Consider setting --max-model-len to a smaller value.z3Chunked prefill is not supported for pooling modelsz[--enable-prefix-caching is not supported for multimodal models in V0 and has been disabled.sha256zNsha256 is not supported for prefix caching in V0 engine. Please use 'builtin'.   )r   r  r  Zuse_mlar@   r'  r(  r"  r
  r  r  r  rY   r   r   r   )rB  r  r   Zuse_long_contextZis_gpuZuse_sliding_windowZuse_spec_decoder   r^   r^   r_   r%    sV   








zEngineArgs._set_default_args_v0c                 C   sz  |j dkrd| _| jdu rd| _n9|jj}t|jdd}|duo'| dko'|}|r,dnd}| jdu r<|| _t	d| | jdu rJ|| _t	d	| | j
tj
krSd
| _
zt }t  }W n tyj   d}Y nw ddlm}	 |dt krd|vr|	jd|	jdi}
|	jd|	jdi}n|	jd|	jdi}
|	jd|	jdi}t r|	jdddd|	jddddi}t r| j| j }|	jd| |	jd| i}
|	jd| |	jd| i}|r|jnd}| jdu r||
v rt rt }||| v r|| | | _n|
| | _n| js
|j| _n|
| | _td| j| | jdu r9||v r;t|| | jp,tj | _td| j| dS dS dS )z$Set Default Arguments for V1 Engine.r:  TN	is_causallastZEnablingZ	Disablingz(%s) chunked prefill by defaultz(%s) prefix caching by defaultz&vllm.v1.core.sched.scheduler.Schedulerr   rR   F   Za100i @  i       i   r<  i   )ZV6EZV5EZV5Pi      z:Setting max_num_batched_tokens to %d for %s usage context.z0Setting max_num_seqs to %d for %s usage context.)!r  r  r   Zpooler_configpooling_typer  r  lowerr  r$  r(  r   r@   Zget_device_total_memoryZget_device_name	Exceptionvllm.usage.usage_librS   rK   Z	LLM_CLASSZOPENAI_API_SERVERZis_tpur#  r   r   rj  r   r   debugr   minr  r   )rB  r  r  rB  r=  Zincremental_prefill_supportedr   Zdevice_memoryr8  rS   Zdefault_max_num_batched_tokensZdefault_max_num_seqsZ"default_max_num_batched_tokens_tpuZ
world_sizeZuse_context_valueZ	chip_namer^   r^   r_   r"  0  s   












zEngineArgs._set_default_args_v1)NF)__name__r   __qualname__r  r-   r   ra   __annotations__r   r   r   r   r   r   r   r5   r   r   r   r9   r   r   r   r   r:   r   r   r)   r   r   rQ   r   r   r.   r   r  r   r   r   r   r   r=   r6   r   r   r2   r   r$   r   rO   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r4   r   r   r   r   r   r   r   r   r   r   r   r   r   r*   r   r   r   r   r   r	   r   r   r   r&   r   r   rP   r   r   r  r0   r  r  r  r  r   r  r  r	  r
  r+   r  r  r  r  r  r  r  r  torchr  r  r  r  r  r  r  r  r  r  r  r  r    r  r  r%   r  r  r  r  r  r   r!  r"  r1   r#  r$  r%  r!   r  r&  r  r'  r7   r(  objectr)  r*  r3   r;   r+  r   r,  r-  r.  r(   r/  r'   r0  r1  r2  r3  r4  r5  r6  r,   r7  r8  r  r9  r:  r;  r<  r=  rn   rN   r>  r?  r@  rC  staticmethodrJ   r  classmethodrZ   	Namespacer  r  r  r  r  rS   r,  r!  r%  r"  r^   r^   r^   r_   r     s  
 













   M
3
  I !>r   c                   @   sr   e Zd ZU dZdZeed< eeddefddZ	e	j
eddefd	dZ	e	dd
ededefddZdS )r  z'Arguments for asynchronous vLLM engine.Fenable_log_requestsz`disable_log_requests` is deprecated and has been replaced with `enable_log_requests`. This will be removed in v0.12.0. Please use `enable_log_requests` instead.rV   c                 C   s   | j  S rr   rP  r  r^   r^   r_   disable_log_requests  s   z$AsyncEngineArgs.disable_log_requestsrj  c                 C   s   | | _ d S rr   rQ  )rB  rj  r^   r^   r_   rR    s   rD  async_args_onlyc                 C   sR   t   |s
t| } | jdtjtjdd | jdtjtj ddd t	|  | S )Nz--enable-log-requestszEnable logging requests.rf  z--disable-log-requestsz&[DEPRECATED] Disable logging requests.T)r   r   r   r   )
rB   r   r  r  rZ   r   r  rP  r@   r  )rD  rS  r^   r^   r_   r    s    

zAsyncEngineArgs.add_cli_argsN)F)rH  r   rI  r  rP  r   rJ  propertyr   rR  setterrM  rJ   r  r^   r^   r^   r_   r    s(   
 r  r-  r.  c                 C   s\   t drt jrtd|  d|  d}|d7 }|r'|d|  d7 }|d7 }t| d S )	Nr
  z$VLLM_USE_V1=1 is not supported with rX   z$ is not supported by the V1 Engine. zFalling back to V0. zWe recommend to remove z from your config zin favor of the V1 Engine.)r  r   r
  r4  r  r  )r-  r.  r   r^   r^   r_   r1    s   

r1  c                 C   s8   t drt jrtd|  d}|S td|  d}|S )Nr
  zlDetected VLLM_USE_V1=1 with %s. Usage should be considered experimental. Please report any issues on Github.Fz?%s is experimental on VLLM_USE_V1=1. Falling back to V0 Engine.T)r  r   r
  r  r  r$  )r-  Zshould_exitr^   r^   r_   r3    s   r3  c                 C   s   |   } td| }|r[dddd}dddd	}| \}}||v r.|| }tt|| S ||v r[|| }zt|| W S  tyZ } ztd
| d| |	  d|d}~ww t| S )zParse human-readable integers like '1k', '2M', etc.
    Including decimal values with decimal multipliers.

    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
    z(\d+(?:\.\d+)?)([kKmMgGtT])i  i@B i ʚ;)kmgr@  i   i   @)KMGz3Decimals are not allowed with binary suffixes like z. Did you mean to use z	 instead?N)
r   rh   	fullmatchgroupsr   r   rY   rZ   r[   rC  )rj  ri   Zdecimal_multiplierZbinary_multipliernumbersuffixZmultr\   r^   r^   r_   r     s@   	r   )rZ   r   r  	functoolsrj   r  r5  r   r   r   r   	itertoolsr   typingr   r   r	   r
   r   r   r   r   r   r   r   r   r   r   regexrh   rK  Zpydanticr   r   Ztyping_extensionsr   r   Z	vllm.envsr  Zvllm.configr   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   Zvllm.loggerr>   Zvllm.platformsr?   r@   rA  rB   Zvllm.ray.lazy_utilsrC   Zvllm.reasoningrD   Zvllm.test_utilsrE   rF   r  rG   Zvllm.transformers_utils.utilsrH   Z
vllm.utilsrI   rJ   rK   rL   rM   Zvllm.v1.sample.logits_processorrN   Zvllm.executor.executor_baserO   Z'vllm.model_executor.layers.quantizationrP   Z vllm.model_executor.model_loaderrQ   rE  rS   rH  r  rT   rn   rL  ZTypeHintZ	TypeHintTra   rb   rg   r   rl   ro   r   r   rz   r~   r   r   r   r   	lru_cacher   r   r   r  r1  r3  r   r^   r^   r^   r_   <module>   s   @&
&
b
           8*