o
    )iU>                    @   s*  d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZ d dlmZ d d	lmZmZmZmZ d dlZd d
lm Z  d dl!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE d dlFmGZG d dlHmIZI d dlJmKZK d dlLmMZMmNZN d dlOmPZP d dlQmRZRmSZSmTZT d dlUmVZV d dlWmXZXmYZY d dlZm[Z[m\Z\m]Z]mZm^Z^m_Z_m`Z`maZambZb d d lcmdZdmeZemfZfmgZg d d!lhmiZi d d"ljmkZk d d#llmmZmmnZn d d$lompZpmqZqmrZr d d%lsmZmtZtmuZumvZv d d&lwmxZy d d'lzm{Z{ eEe|Z}d(Z~e d)eSeRZe d*ed+ZeG d,d- d-ZG d.d/ d/eZG d0d1 d1ZG d2d3 d3Ze"d4re"jrd d5lmZ eZdS dS dS )6    N)Counter)deque)contextmanager)	dataclass)partial)TYPE_CHECKINGAnyCallableClassVarDequeDictIterableListLiteralMapping
NamedTupleOptional)Sequence)SetTypeUnioncast)TypeVar)DecodingConfig
LoRAConfigModelConfigObservabilityConfigParallelConfigSchedulerConfig
VllmConfig)ScheduledSequenceGroupSchedulerOutputs)
EngineArgs)StatLoggerBaseStats)SequenceGroupOutputProcessor)StopChecker)get_logits_processors)ExecutorBase)ProcessorInputs
PromptTypeSingletonInputs)split_enc_dec_inputs)InputPreprocessor)init_logger)get_bad_words_logits_processors)LoRARequest)SamplerOutput)MULTIMODAL_REGISTRYMultiModalRegistry)EncDecMultiModalProcessor)PoolingRequestOutputRequestOutputRequestOutputFactory)PoolingParams)RequestOutputKindSamplingParams)	ExecuteModelRequestParallelSampleSequenceGroupPoolingSequenceGroupOutputr   SequenceGroupSequenceGroupBaseSequenceGroupMetadataSequenceGroupOutputSequenceStatus)SpanAttributesSpanKindextract_trace_contextinit_tracer)Detokenizer)AnyTokenizer)TokenizerGroupinit_tokenizer_from_configs)UsageContextis_usage_stats_enabledusage_message)r   Deviceresolve_obj_by_qualname	weak_bind)__version__)InputProcessingError   _O_Rdefaultc                   @   sR   e Zd ZU dZdZeee  ed< dZ	ee
 ed< dZeed< dZee ed< dS )SchedulerOutputStatezFCaches the scheduler outputs for a virtual engine. Used for Multi-StepNseq_group_metadata_listscheduler_outputsFallow_async_output_proclast_output)__name__
__module____qualname____doc__rY   r   r   r@   __annotations__rZ   r!   r[   boolr\   r1    rc   rc   b/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/engine/llm_engine.pyrX   C   s   
 rX   c                   @   sV   e Zd ZU ee ed< ee ed< eed< eed< eed< e	e ed< ee
 ed< dS )	
OutputDataoutputsrY   rZ   is_asyncis_last_stepis_first_step_outputskipN)r]   r^   r_   r   r1   ra   r@   r!   rb   r   intrc   rc   rc   rd   re   L   s   
 re   c                   @   sD   e Zd ZdddZdee dee deded	ed
e	e fddZ
dS )SchedulerContextreturnNc                 C   s   t  | _g | _d | _d | _d S N)r   output_queuerequest_outputsrY   rZ   selfrc   rc   rd   __init__]   s   
zSchedulerContext.__init__rf   rY   rZ   rg   rh   ri   c                 C   s"   | j t||||||g d d S )N)rf   rY   rZ   rg   rh   ri   rj   )ro   appendre   )rr   rf   rY   rZ   rg   rh   ri   rc   rc   rd   append_outpute   s   zSchedulerContext.append_outputrm   N)r]   r^   r_   rs   r   r1   r@   r!   rb   r   ru   rc   rc   rc   rd   rl   [   s    

rl   c                   @   s  e Zd ZU dZdZee ed< 	 ee	dd Z
ededee defd	d
Zedee dee dee fddZee ed< ejdedfdedee dededeeeef  dededdfddZdddZededee fddZeejddfdededeeeef  dedd f
dd Z eejdfd!e!dedeeeef  dd fd"d#Z"d$d% Z#d&d' Z$defd(d)Z%	dd*ee& de'fd+d,Z(defd-d.Z)dd/d0Z*		1dd2ed3e+d4e,e-e.f d5e/d*ee& d6ee0eef  d7e1dee2 fd8d9Z3dd:d;Z4					1dd2ed<e5d4e,e-e.f d5ee/ d*ee& d=ee6ee7f  d6ee0eef  d7e1ddfd>d?Z8			1dd2ed@e9dAe-d5e/d*ee& d6ee0eef  dBee9 d7e1de2fdCdDZ:		1dd2ed@e9dEe.d5e/d*ee& dBee9 d7e1de2fdFdGZ;d2e,ee<e f ddfdHdIZ=defdJdKZ>de?fdLdMZ@deAfdNdOZBdeCfdPdQZDdeEfdRdSZFdeGfdTdUZHde1fdVdWZIdefdXdYZJdZe1defd[d\ZKdefd]d^ZLdd_eeM defd`daZNeOdbe2deeP ddfdcddZQ	ddeeRd2ee ddfdfdgZSdeTdheeU dieeV ddfdjdkZWdee,eXeYf  fdldmZZd2edZe1dheeU dne[doeddfdpdqZ\dheeeU  defdrdsZ]dZe1dheeeU  dne[doeddf
dtduZ^dZe1deeeT  ddfdvdwZ_dZe1dee`ja fdxdyZbdzed{eddfd|d}Zcdzeddfd~dZd				ddnee[ deeeT  deee1  deee1  ddf
ddZe			ddnee[ deeeT  deee1  deee1  deff
ddZgd*e&defddZhde1defddZideje1 fddZkde1defddZldddZmdddZndde1ddfddZoddeepe  ddfddZqdefddZrdddZsdefddZt	ddne[deee1  ddfddZudbe2ddfddZvde+d*ee& fddZwdexd*ee& deyd fddZzdAe-d*ee& de-fddZ{			dde,ee|de}f f dee/ de~dee6ee7f  depe} f
ddZdS )	LLMEnginea!  An LLM engine that receives requests and generates texts.

    This is the main class for the vLLM engine. It receives requests
    from clients and generates texts from the LLM. It includes a tokenizer, a
    language model (possibly distributed across multiple GPUs), and GPU memory
    space allocated for intermediate states (aka KV cache). This class utilizes
    iteration-level scheduling and efficient memory management to maximize the
    serving throughput.

    The [`LLM`][vllm.LLM] class wraps this class for offline batched inference
    and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine]
    class wraps this class for online serving.

    The config arguments are derived from [`EngineArgs`][vllm.EngineArgs].

    Args:
        vllm_config: The configuration for initializing and running vLLM.
        executor_class: The model executor class for managing distributed
            execution.
        log_stats: Whether to log statistics.
        usage_context: Specified entry point, used for usage info collection.
    FDO_VALIDATE_OUTPUTc                 c   s    d| _ d V  d| _ d S )NTF)rx   )clsrc   rc   rd   enable_output_validation   s   
z"LLMEngine.enable_output_validationoutputoutput_typerm   c                 C   s:   | j }ts|rt||std| dt| tt|S NzExpected output of type z, but found type )rx   r   
isinstance	TypeErrortyper   rT   )ry   r{   r|   do_validaterc   rc   rd   validate_output   s   

zLLMEngine.validate_outputrf   c                 C   sT   | j }ts|r&g }|D ]}t||std| dt| || q|S |}|S r}   )rx   r   r~   r   r   rt   )ry   rf   r|   r   Zoutputs_r{   rc   rc   rd   validate_outputs   s   

zLLMEngine.validate_outputs	tokenizerNvllm_configexecutor_class	log_statsusage_contextstat_loggersmm_registryuse_cached_outputsc                    s  t jrtd|_|j_|j_|j_|j_|j_|j	_	|j
_
|j_|jp/t _|jp6t _tdt|| |_|_jjrSd _d _d n _tj_ dtdtffdd}t _j _ t!jj|_"||d_#jj$dkr%  t& rd	d
l'm(}	 t)j*|	j|t+jj,jj-jj.jj/jj0t+jj1t2jjj3jj4jj5d
d dd t6jj7D _8dd t6jj7D _9jj:rt;j<fddt6jj7D _=ng _=d _>t?jjj@t+rtAjjj@ njjj@  fddt6jj7D _BjrW|d ur1|_Cn&d	dlDmE}
mF} |
tG|d|tGtHjjId|dd_CjCd dj d _JjjKrgtLdjjK_JtMjNjjjBj|tOjjP|d_Qi _Rd_ST  d S )NzUsing V0 LLMEngine, but envs.VLLM_USE_V1=True. This should not happen. As a workaround, try using LLMEngine.from_vllm_config(...) or explicitly set VLLM_USE_V1=0 or 1 and report this issue on Github.zKInitializing a V0 LLM engine (v%s) with config: %s, use_cached_outputs=%s, sequencerm   c                    s    sJ d  | jS )NzFtokenizer_group cannot be None, make sure skip_tokenizer_init is False)get_lora_tokenizerlora_request)r   )tokenizer_grouprc   rd   get_tokenizer_for_seq   s   z1LLMEngine.__init__.<locals>.get_tokenizer_for_seqr   poolingr   )get_architecture_class_name)
dtypetensor_parallel_size
block_sizegpu_memory_utilizationquantizationZkv_cache_dtypeZenable_loraenable_prefix_cachingenforce_eagerdisable_custom_all_reduce)Z	extra_kvsc                 S      g | ]}t  qS rc   )rX   .0_rc   rc   rd   
<listcomp>)      z&LLMEngine.__init__.<locals>.<listcomp>c                 S   r   rc   )rl   r   rc   rc   rd   r   .  r   c                    s   g | ]}t  j| d qS )ctx)r   scheduler_contextsr   Zv_id)process_model_outputsrr   rc   rd   r   6  s    c              	      s8   g | ]} j jjjjjjrj| nd qS rn   )scheduler_configcache_configlora_configparallel_configpipeline_parallel_sizemodel_configuse_async_output_procasync_callbacksr   )	Schedulerrr   rc   rd   r   J  s    )LoggingStatLoggerPrometheusStatLogger)local_intervalr   )Z
model_name)r   labelsr   )logging
prometheusr   r   zvllm.llm_engine)Zstop_checkerF)UenvsVLLM_USE_V1
ValueErrorr   r   r   r   r   r   device_configspeculative_configZload_configdecoding_configr   Zobservability_configr   loggerinfoVLLM_VERSIONr   r   Zskip_tokenizer_initr   Zdetokenizer_init_tokenizerrG   get_tokenizer_groupr   rH   r   seq_counterZtry_get_generation_configgeneration_config_fieldsr-   input_preprocessormodel_executorrunner_type_initialize_kv_cachesrL   Z vllm.model_executor.model_loaderr   rM   Zreport_usagestrr   r   r   r   r   Zcache_dtyperb   r   r   r   ranger   cached_scheduler_outputsr   r   rP   _process_model_outputsr    process_request_outputs_callbackr~   Zscheduler_clsrO   	schedulerr   Zvllm.engine.metricsr   r   _LOCAL_LOGGING_INTERVAL_SECdictZserved_model_nametracerZotlp_traces_endpointrF   r%   Zcreate_output_processorr&   max_model_lenoutput_processorseq_id_to_seq_group_skip_scheduling_next_stepreset_mm_cache)rr   r   r   r   r   r   r   r   r   r   r   r   rc   )r   r   rr   r   rd   rs      s   













zLLMEngine.__init__c                 C   sv   t   }| j \}}| jjdur| jj}td|| |}|| j_|| j_| j	|| t   | }td| dS )zInitialize the KV cache in the worker(s).

        The workers will determine the number of blocks in both the GPU cache
        and the swap CPU cache.
        Nz<Overriding num_gpu_blocks=%d with num_gpu_blocks_override=%dzFinit engine (profile, create kv cache, warmup model) took %.2f seconds)
timer   Zdetermine_num_available_blocksr   num_gpu_blocks_overrider   r   num_gpu_blocksnum_cpu_blocksZinitialize_cache)rr   startr   r   r   elapsedrc   rc   rd   r     s$   zLLMEngine._initialize_kv_cachesengine_configc                 C   s   |j j}t|trt|tstd| d|}|S |dkr(ddlm} |}|S |dkr=ddl	m
} tjr9J d|}|S |d	krKdd
lm} |}|S |dkrYddlm} |}|S td| )NzEdistributed_executor_backend must be a subclass of ExecutorBase. Got .Zrayr   )RayDistributedExecutormp)"MultiprocessingDistributedExecutorzXmultiprocessing distributed executor backend does not support VLLM_USE_RAY_SPMD_WORKER=1uni)UniProcExecutorZexternal_launcher)ExecutorWithExternalLauncherz+unrecognized distributed_executor_backend: )r   distributed_executor_backendr~   r   
issubclassr(   r   Z&vllm.executor.ray_distributed_executorr   Z%vllm.executor.mp_distributed_executorr   r   ZVLLM_USE_RAY_SPMD_WORKERZvllm.executor.uniproc_executorr   r   r   )ry   r   r   r   r   r   r   r   rc   rc   rd   _get_executor_cls  sB   

	zLLMEngine._get_executor_clsdisable_log_statsc                 C   s   | ||  || ||dS )N)r   r   r   r   r   )r   )ry   r   r   r   r   rc   rc   rd   from_vllm_config  s   zLLMEngine.from_vllm_configengine_argsc                 C   s8   | |}| }tjrddlm} |}|j||||jdS )z0Creates an LLM engine from the engine arguments.r   rw   )r   r   r   r   )Zcreate_engine_configr   r   vllm.v1.engine.llm_enginerw   r   r   )ry   r   r   r   r   Z
engine_clsV1LLMEnginerc   rc   rd   from_engine_args  s   
	zLLMEngine.from_engine_argsc                 C   s   t d)Nz LLMEngine should not be pickled!)RuntimeErrorrq   rc   rc   rd   
__reduce__  s   zLLMEngine.__reduce__c                 C   s    t | dd  }r|  d S d S )Nr   )getattrshutdown)rr   r   rc   rc   rd   __del__  s   zLLMEngine.__del__c                 C   s   | j d u r	td| j S )Nz;Unable to get tokenizer because skip_tokenizer_init is True)r   r   rq   rc   rc   rd   r     s   
zLLMEngine.get_tokenizer_groupr   c                 C   s   |   |S rn   )r   r   rr   r   rc   rc   rd   get_tokenizer  s   zLLMEngine.get_tokenizerc                 C   s   t | j| j| jdS )N)r   r   r   )rJ   r   r   r   rq   rc   rc   rd   r   	  s
   zLLMEngine._init_tokenizerc                 C   sF   | j | j | j| j | jr!| j| j  | j| j d S d S rn   )r   Zverify_with_parallel_configr   r   r   Zverify_with_model_configZverify_with_scheduler_configr   rq   rc   rc   rd   _verify_args  s   zLLMEngine._verify_argsr   
request_idprocessed_inputsparamsarrival_timetrace_headerspriorityc              
   C   s  t |tr|jdkrtj|| ||||||d dS | || | jj}t| j	}	| j
|}
t|\}}t|	|||
|}|du rBdnt|	|||
|}t |tr]| j||||||||d}nt |tro| j|||||||d}ntddd | jD }| j|t| }|| |S )	ziAdd a processed request to the engine's request pool.
        return the created sequence group.
           )r   r   r   r   r   N)r   r   r   encoder_seqr   )r   r   r   r   z8Either SamplingParams or PoolingParams must be provided.c                 S      g | ]}|  qS rc   Zget_num_unfinished_seq_groupsr   r   rc   rc   rd   r   X      z4LLMEngine._add_processed_request.<locals>.<listcomp>)r~   r:   nr<   add_request_validate_model_inputsr   r   nextr   r   Zget_eos_token_idr,   r   $_create_sequence_group_with_samplingr8   #_create_sequence_group_with_poolingr   r   indexminZadd_seq_group)rr   r   r   r   r   r   r   r   r   Zseq_ideos_token_idencoder_inputsdecoder_inputsseqr   	seq_groupZcostsZmin_cost_schedulerrc   rc   rd   _add_processed_request  sj   





		
z LLMEngine._add_processed_requestc                 C      | j   d S rn   )r   !stop_remote_worker_execution_looprq   rc   rc   rd   r  a     z+LLMEngine.stop_remote_worker_execution_loopprompttokenization_kwargsc	              	   C   s   t |tstdt| |dur| jstd| d|dkr/| jjdks/td| dt |tr;|j	r;td	|du rCt

 }t |trd|d
ddurd|ddsd|d
 jd }	dg|	 |d< | jj|||d}
| j||
|||||d dS )a&  Add a request to the engine's request pool.

        The request is added to the request pool and will be processed by the
        scheduler as `engine.step()` is called. The exact scheduling policy is
        determined by the scheduler.

        Args:
            request_id: The unique ID of the request.
            prompt: The prompt to the LLM. See
                [PromptType][vllm.inputs.PromptType]
                for more details about the format of each input.
            params: Parameters for sampling or pooling.
                [SamplingParams][vllm.SamplingParams] for text generation.
                [PoolingParams][vllm.PoolingParams] for pooling.
            arrival_time: The arrival time of the request. If None, we use
                the current monotonic time.
            lora_request: The LoRA request to add.
            trace_headers: OpenTelemetry trace headers.
            priority: The priority of the request.
                Only applicable with priority scheduling.

        Details:
            - Set arrival_time to the current time if it is None.
            - Set prompt_token_ids to the encoded prompt if it is None.
            - Create `n` number of [Sequence][vllm.Sequence] objects.
            - Create a [SequenceGroup][vllm.SequenceGroup] object
              from the list of [Sequence][vllm.Sequence].
            - Add the [SequenceGroup][vllm.SequenceGroup] object to the
              scheduler.

        Example:
            >>> # initialize engine
            >>> engine = LLMEngine.from_engine_args(engine_args)
            >>> # set request arguments
            >>> example_prompt = "Who is the president of the United States?"
            >>> sampling_params = SamplingParams(temperature=0.0)
            >>> request_id = 0
            >>>
            >>> # add the request to the engine
            >>> engine.add_request(
            >>>    str(request_id),
            >>>    example_prompt,
            >>>    SamplingParams(temperature=0.0))
            >>> # continue the request processing
            >>> ...
        z!request_id must be a string, got NzGot lora_request z but LoRA is not enabled!r   r   zGot priority z( but Priority scheduling is not enabled.z:Logits processors are not supported in multi-step decodingZprompt_embedsprompt_token_ids)r  r   )r   r   r   r   r   r   r   )r~   r   r   r   r   r   r   policyr:   logits_processorsr   r   getshaper   
preprocessr  )rr   r   r  r   r   r   r  r   r   Zseq_lenr   rc   rc   rd   r  d  sH   
9



zLLMEngine.add_requestr  sampling_paramsr   c	                 C   s   |   j}	|jr|j|	ks|jr|j|	krtd|	 d| ||}| }|| j|j	 d}
| j
jdur>| j
jjd }
t||g|||||||
d	}|S )z,Creates a SequenceGroup with SamplingParams.zCannot request more than z
 logprobs.r   N)	r   seqsr   r  r   r   r   r   
draft_size)get_model_configmax_logprobslogprobsZprompt_logprobsr   _build_logits_processorscloneZupdate_from_generation_configr   r	  r   r   Znum_speculative_tokensr>   )rr   r   r  r  r   r   r   r   r   r  r  r  rc   rc   rd   r    s>   




z.LLMEngine._create_sequence_group_with_samplingpooling_paramsc           	   	   C   s$   |  }t||g|||||d}|S )z+Creates a SequenceGroup with PoolingParams.)r   r  r   r   r"  r   r   )r!  r>   )	rr   r   r  r"  r   r   r   r   r  rc   rc   rd   r    s   z-LLMEngine._create_sequence_group_with_poolingc                 C   s    | j D ]
}|j|| jd qdS )a  Aborts a request(s) with the given ID.

        Args:
            request_id: The ID(s) of the request to abort.

        Details:
            - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][].

        Example:
            >>> # initialize engine and add a request with request_id
            >>> request_id = str(0)
            >>> # abort the request
            >>> engine.abort_request(request_id)
        )r   N)r   Zabort_seq_groupr   )rr   r   r   rc   rc   rd   abort_request  s
   
zLLMEngine.abort_requestc                 C      | j S )zGets the vllm configuration.r   rq   rc   rc   rd   get_vllm_config      zLLMEngine.get_vllm_configc                 C   r$  )zGets the model configuration.)r   rq   rc   rc   rd   r  $  r&  zLLMEngine.get_model_configc                 C   r$  )z Gets the parallel configuration.)r   rq   rc   rc   rd   get_parallel_config(  r&  zLLMEngine.get_parallel_configc                 C   r$  )z Gets the decoding configuration.)r   rq   rc   rc   rd   get_decoding_config,  r&  zLLMEngine.get_decoding_configc                 C   r$  )z!Gets the scheduler configuration.)r   rq   rc   rc   rd   get_scheduler_config0  r&  zLLMEngine.get_scheduler_configc                 C   r$  )zGets the LoRA configuration.)r   rq   rc   rc   rd   get_lora_config4  r&  zLLMEngine.get_lora_configc                 C      t dd | jD S )z'Gets the number of unfinished requests.c                 s       | ]}|  V  qd S rn   r   r   rc   rc   rd   	<genexpr>:      z8LLMEngine.get_num_unfinished_requests.<locals>.<genexpr>)sumr   rq   rc   rc   rd   get_num_unfinished_requests8     z%LLMEngine.get_num_unfinished_requestsc                 C   r+  )z.Returns True if there are unfinished requests.c                 s   r,  rn   )has_unfinished_seqsr   rc   rc   rd   r-  ?  r.  z4LLMEngine.has_unfinished_requests.<locals>.<genexpr>)anyr   rq   rc   rc   rd   has_unfinished_requests=  r1  z!LLMEngine.has_unfinished_requestsvirtual_enginec                 C   s   | j |  S )zW
        Returns True if there are unfinished requests for the virtual engine.
        )r   r2  rr   r5  rc   rc   rd   *has_unfinished_requests_for_virtual_engineB  s   z4LLMEngine.has_unfinished_requests_for_virtual_enginec                 C   s   | j j| jS )zReset the multi-modal cache.)r   r   Zreset_processor_cacher   rq   rc   rc   rd   r   I  s   zLLMEngine.reset_mm_cachedevicec                 C   s"   d}| j D ]	}|o||}q|S )z#Reset prefix cache for all devices.T)r   reset_prefix_cache)rr   r8  successr   rc   rc   rd   r9  N  s   
zLLMEngine.reset_prefix_cacher  c                 C   s&   |d j | _|  D ]}tj|_q
d S Nr   )dataZpooled_dataget_seqsrB   ZFINISHED_STOPPEDstatus)r  rf   r  rc   rc   rd   _process_sequence_group_outputsV  s   
z)LLMEngine._process_sequence_group_outputsr   c                 C   s  t   }t|jdkrdS |r|jd \}}}}}}	}
n|j \}}}}}}	}
t|t|jks3J t|dk}|r=J |}|rcg }t|D ]\}}|j|kr]||
vsVJ ||  nqG|sbdS ntt|}g }g }|D ]}||
v rvqo|| }|j| }|j	}|
 r|| qo|r|| }n|d | g}|s||jpd |r|D ]:}t|tr|jdur|jjdur|j j|jpd7  _n|j|j_|jjdur|j j|jpd7  _q|j|j_q| jjdkr| || n| j|| |jr| j||| |
 r|| qo|D ]-}|j| }|j	}|| | s)|| tj|| j| jd}|r<|j | q|rgt|dksJJ |
|d  |re| j!dure| !|j  |j "  dS |ru| j#D ]}|$  qm|D ]>}||
v s||v s||v rqw|j| }|j	}|| | s|| tj|| j| jd}|r|j | qw|j%D ],}|j&}|dur|j't(j)kr|
 sѐqtj|| j| jd}|r|j | q|j r| j!dur| !|j  |j "  |r| *||||
 | +|| dS )zApply the model output to the sequences in the scheduled seq groups
        and return responses.

        ctx: The virtual engine context to work on
        request_id: If provided, then only this request is going to be processed
        r   Nr   r   )Z	use_cache),r   lenro   popleftscheduled_seq_groups	enumerater   rt   r   r  is_finishedupdate_num_computed_tokenstoken_chunk_sizer~   r1   metricsmodel_forward_timemodel_execute_timer   r   r?  r   Zprocess_prompt_logprob	do_sampleZprocess_outputsZmaybe_set_first_token_time
is_prefillZset_last_token_timer7   creater   r   rp   r   clearr   Zfree_finished_seq_groupsZignored_seq_groupsr  Zoutput_kindr9   ZDELTAdo_log_stats
do_tracing)rr   r   r   nowrf   rY   rZ   rg   rh   ri   rj   Zhas_multiple_outputsZoutputs_by_sequence_groupindicesiZseq_group_metafinished_beforeZfinished_nowscheduled_seq_groupr  r{   oZrequest_outputr   r   rc   rc   rd   r   b  s  

























z LLMEngine._process_model_outputsrY   rB  c                 C   s   t |||D ]G\}}}|j}| rq|jdur|jnd}|| |jrMt|jdks0J d|jd }	t|jdks>J |jd }
|
	|	j
|	j|	j qdS )zGiven model output from a single run, append the tokens to the
        sequences. This is normally done inside output processor, but it is
        required if the worker is to perform async forward pass to next step.
        Nr   r   zKAsync output processor expects a single sample (i.e sampling_params.n == 1))zipr  rD  rF  rE  rJ  r@  Zsamplesr  Zappend_token_idZoutput_tokenr  Zoutput_embed)rr   r{   rY   rB  Zseq_group_metadataZsequence_group_outputsrT  r  rF  sampler  rc   rc   rd   _advance_to_next_step   s0   
	


zLLMEngine._advance_to_next_stepc              
   C   s~  | j jdkr
tdd}| j| }|j}|j}|j}| j| }|j	  | 
|sa| jsa| j|  \}}}||_||_| j|  }|D ]}|| jv rP| j|= qE|s`t|jdkr`| j|d nt }|dusjJ |duspJ | s| |}	t||j|j|j|j|j||	d}
|r| j| |
_z| jj|
d}d| _W n) t y } z|j!}| j"|||||d	  d}~ww t|jdkr| j|d g }| 
|s|sdn|d j#j$dk}|j%||||d
|d |r|rt|dksJ d| &|d ||j' |s| j|d | (|| | )| n|jS | * s<t|jdkr(| j|d t|jdks2J t+,d | j-  |jS )a  Performs one decoding iteration and returns newly generated results.

        <figure markdown="span">
        ![Overview of the step function](https://i.imgur.com/sv2HssD.png)
        <figcaption>Overview of the step function</figcaption>
        </figure>

        Details:
        - Step 1: Schedules the sequences to be executed in the next
            iteration and the token blocks to be swapped in/out/copy.

            - Depending on the scheduling policy,
                sequences may be `preempted/reordered`.
            - A Sequence Group (SG) refer to a group of sequences
                that are generated from the same prompt.

        - Step 2: Calls the distributed executor to execute the model.
        - Step 3: Processes the model output. This mainly includes:

            - Decodes the relevant outputs.
            - Updates the scheduled sequence groups with model outputs
                based on its `sampling parameters` (`use_beam_search` or not).
            - Frees the finished sequence groups.

        - Finally, it creates and returns the newly generated results.

        Example:
        ```
        # Please see the example/ folder for more detailed examples.

        # initialize engine and request arguments
        engine = LLMEngine.from_engine_args(engine_args)
        example_inputs = [(0, "What is LLM?",
        SamplingParams(temperature=0.0))]

        # Start the engine with an event loop
        while True:
            if example_inputs:
                req_id, prompt, sampling_params = example_inputs.pop(0)
                engine.add_request(str(req_id),prompt,sampling_params)

            # continue the request processing
            request_outputs = engine.step()
            for request_output in request_outputs:
                if request_output.finished:
                    # return or show the request output

            if not (engine.has_unfinished_requests() or example_inputs):
                break
        ```
        r   zqPipeline parallelism is only supported through AsyncLLMEngine as performance will be severely degraded otherwise.r   r   N)rY   blocks_to_swap_inblocks_to_swap_outblocks_to_copynum_lookahead_slotsrunning_queue_sizefinished_requests_idslast_sampled_token_ids)execute_model_reqF)r   r5  rY   rZ   r[   T)rf   rY   rZ   rg   rh   ri   z4Async postprocessor expects only a single output setz&Stopping remote worker execution loop.).r   r   NotImplementedErrorr   rY   rZ   r[   r   rp   rM  _has_remaining_stepsr   r   ZscheduleZ#get_and_reset_finished_requests_idsr   r@  ro   r   listZis_empty_get_last_sampled_token_idsr;   rY  rZ  r[  r\  r]  r   Zasync_callbackr   Zexecute_modelrR   r   _abort_and_cache_schedulestate	num_stepsru   rX  rB  rN  rO  r4  r   debugr  )rr   r5  Zcached_outputsrY   rZ   r[   r   r^  Zfinished_request_idr_  r`  rf   eZinvalid_request_idri   rc   rc   rd   step@  s   4








zLLMEngine.steprZ   r[   c           	      C   s   |  | t|D ]\}}|j|kr||=  nq	t|jD ]\}}|jj|kr-|j|=  nqt|dkrBd| _| j||||d dS dS )zAborts a single request, and caches the scheduler outputs minus that
        request. This allows the next step to continue processing the remaining
        requests without having to re-run the scheduler.r   T)r5  rZ   rY   r[   N)r#  rC  r   rB  r  r@  r   '_cache_scheduler_outputs_for_multi_step)	rr   r   r5  rY   rZ   r[   rR  metadatagrouprc   rc   rd   re    s(   


z#LLMEngine._abort_and_cache_schedulec                 C   s   dS )NFrc   )rr   rY   rc   rc   rd   rb  1  s   zLLMEngine._has_remaining_stepsc                 C   s&   | j | }||_||_||_d |_d S rn   )r   rY   rZ   r[   r\   )rr   r5  rY   rZ   r[   corc   rc   rd   rk  6  s
   

z1LLMEngine._cache_scheduler_outputs_for_multi_stepc                 C   s~   | j jdkr9t|dkr;|d d ur=|d }|d usJ |jd us#J |jd u s*J |jd u s1J || j| _d S d S d S d S )Nr   r   )r   r   r@  Zsampled_token_ids_cpuZsampled_token_idsZsampled_token_probsr   r\   )rr   r5  r{   r\   rc   rc   rd   _update_cached_scheduler_outputB  s   z)LLMEngine._update_cached_scheduler_outputc                 C   s   d S rn   rc   r6  rc   rc   rd   rd  O  s   z%LLMEngine._get_last_sampled_token_idslogger_namer   c                 C   s6   | j std|| jv rtd| d|| j|< d S )NKStat logging is disabled. Set `disable_log_stats=False` argument to enable.Logger with name z already exists.r   r   r   KeyError)rr   rq  r   rc   rc   rd   
add_loggerS  s   
zLLMEngine.add_loggerc                 C   s4   | j std|| jvrtd| d| j|= d S )Nrr  rs  z does not exist.rt  )rr   rq  rc   rc   rd   remove_logger\  s   
zLLMEngine.remove_loggermodel_outputrS  rj   c                 C   s8   | j r| ||||}| j D ]	}|| qdS dS )z#Forced log when no requests active.N)r   
_get_statsr   valueslog)rr   rZ   rx  rS  rj   statsr   rc   rc   rd   rN  e  s   zLLMEngine.do_log_statsc           ,      C   sH  t   }tdd | jD }tdd | jD }tdd | jD }| jj}	d}
|	r:tdd | jD }d||	  }
| jj}d}|rRtdd | jD }d||  }| jd	 tj}| jd	 tj	}| j
jd
kry||	}	}||
}
}||}}d	}d	}d	}g }g }|du rd	n|j}g }g }g }g }g }g }g }g }g }g } g }!ttdd | jD }"ttdd | jD }#d}$| jrt| jj}$|dur|j}%d	}&t|jD ]\}'}(|r|'|v r|%d8 }%q|r|'|v rq|'|jk })|(j}*|)r||(j7 }|* s|* }+||+ |&|* 7 }&n!|* }+||+ |*jjd	kr(|%|*jjd 7 }%n|%|*jjd 7 }%|* r|||*j j!  |*j j"durt|*j j#durt||*j j"|*j j!  ||*j j#|*j j"  |||*j j#  |||*j j"  |t$|*j% |&dd |*' D  |t(dd |*) D  |*j*dur||*j*j+ | |*j*j, |!&dd |*' D  q|%| |& }|| }t-d0i d|d|d|d|d|
d|d|d|d|d|d|d|d |d!|d"|d#|d$|d%|d&|d'|d(|d)|d*|d+| d,|!d-t|$d.t.|#/ d/t.|"/ S )1a.  Get Stats to be Logged to Prometheus.

        Args:
            scheduler_outputs: Optional, used to populate metrics related to
                the scheduled batch,
            model_output: Optional, used to emit speculative decoding metrics
                which are created by the workers.
            finished_before: Optional, indices of sequences that were finished
                before. These sequences will be ignored.
            skip: Optional, indices of sequences that were preempted. These
                sequences will be ignored.
        c                 s       | ]}t |jV  qd S rn   )r@  runningr   rc   rc   rd   r-        

z'LLMEngine._get_stats.<locals>.<genexpr>c                 s   r}  rn   )r@  Zswappedr   rc   rc   rd   r-    r  c                 s   r}  rn   )r@  waitingr   rc   rc   rd   r-    r  g        c                 s       | ]}|j  V  qd S rn   )block_managerZget_num_free_gpu_blocksr   rc   rc   rd   r-    
    
g      ?c                 s   r  rn   )r  Zget_num_free_cpu_blocksr   rc   rc   rd   r-    r  r   cpuNc                 S   &   g | ]}|j D ]	}|jr|jjqqS rc   )r~  r   	lora_name)r   r   Zrunning_requestrc   rc   rd   r         z(LLMEngine._get_stats.<locals>.<listcomp>c                 S   r  rc   )r  r   r  )r   r   Zwaiting_requestrc   rc   rd   r     r  0r   c                 S   r   rc   Zget_output_lenr   r  rc   rc   rd   r   1  r   c                 s   r,  rn   r  r  rc   rc   rd   r-  6  r.  c                 S   s   g | ]}t |jqS rc   )rB   Zget_finished_reasonr>  r  rc   rc   rd   r   <  s    
rP  num_running_sysnum_swapped_sysnum_waiting_sysgpu_cache_usage_syscpu_cache_usage_syscpu_prefix_cache_hit_rategpu_prefix_cache_hit_ratenum_prompt_tokens_iternum_generation_tokens_iternum_tokens_itertime_to_first_tokens_itertime_per_output_tokens_iternum_preemption_itertime_e2e_requeststime_queue_requeststime_inference_requeststime_prefill_requeststime_decode_requestsnum_prompt_tokens_requestsnum_generation_tokens_requests"max_num_generation_tokens_requests
n_requestsmax_tokens_requestsfinished_reason_requestsZmax_lorawaiting_lora_adaptersrunning_lora_adaptersrc   )0r   r/  r   r   r   r   Zget_prefix_cache_hit_raterN   ZCPUZGPUr   Zdevice_typeZ	preemptedr   collectionsCounterr   r   Z	max_lorasZnum_batched_tokensrC  rB  Znum_prefill_groupsr  rF  rK  Zget_last_token_latencyrt   num_seqsrf  Zcurrent_steprg  rD  rG  r   Zfirst_scheduled_timefirst_token_timer@  r  extendget_finished_seqsmaxr=  r  r  
max_tokensr$   rc  keys),rr   rZ   rx  rS  rj   rP  r  r  r  Znum_total_gpur  Znum_free_gpuZnum_total_cpur  Znum_free_cpur  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  Zmax_lora_statZactual_num_batched_tokensZ)num_generation_tokens_from_prefill_groupsidxrT  Zgroup_was_prefillr  Zlatencyrc   rc   rd   ry  q  s  












	!"#$%&'zLLMEngine._get_statsc                 C      | j |S rn   )r   add_lorar   rc   rc   rd   r  v     zLLMEngine.add_loralora_idc                 C   r  rn   )r   remove_lorarr   r  rc   rc   rd   r  y  r  zLLMEngine.remove_lorac                 C   s
   | j  S rn   )r   
list_lorasrq   rc   rc   rd   r  |     
zLLMEngine.list_lorasc                 C   r  rn   )r   pin_lorar  rc   rc   rd   r    r  zLLMEngine.pin_lorac                 C   r  rn   )r   start_profilerq   rc   rc   rd   r    r  zLLMEngine.start_profilec                 C   r  rn   )r   stop_profilerq   rc   rc   rd   r    r  zLLMEngine.stop_profiler   levelc                 C   s$   | j jjs	J d| jj|d d S )N-Sleep mode is not enabled in the model config)r  )r   r   enable_sleep_moder   sleep)rr   r  rc   rc   rd   r    s   zLLMEngine.sleeptagsc                 C   s"   | j jjs	J d| j| d S )Nr  )r   r   r  r   wake_up)rr   r  rc   rc   rd   r    s   zLLMEngine.wake_upc                 C   s   | j jS rn   )r   is_sleepingrq   rc   rc   rd   r    s   zLLMEngine.is_sleepingc                 C   r  rn   )r   check_healthrq   rc   rc   rd   r    r  zLLMEngine.check_healthc                 C   s
   | j d uS rn   )r   rq   rc   rc   rd   is_tracing_enabled  r  zLLMEngine.is_tracing_enabledc                 C   sL   | j d u rd S t|jD ]\}}|r||v rq|j}| r#| | qd S rn   )r   rC  rB  r  rD  create_trace_span)rr   rZ   rS  r  rT  r  rc   rc   rd   rO    s   

zLLMEngine.do_tracingc                 C   s   | j d u s
|jd u rd S t|jjd }t|j}| j jdtj	||d}|j}|j
d ur3|j
|j nd }|jd ur@|j|j nd }|tj| jj |tj|j |tj|jj |tj|jj |tj|jj |tj|jj |tj|  |tjt|j |tj t!dd |" D  |j#d ur|tj$|j# |d ur|tj%| |d ur|tj&| |j'd ur|tj(|j' |j)d ur|tj*|j)d  |j+d ur|tj,|j+ W d    d S W d    d S 1 sw   Y  d S )Ng    eAZllm_request)kindcontext
start_timec                 S   r   rc   r  r  rc   rc   rd   r     r   z/LLMEngine.create_trace_span.<locals>.<listcomp>g     @@)-r   r  rk   rG  r   rE   r   Zstart_as_current_spanrD   ZSERVERr  finished_timeZset_attributerC   ZGEN_AI_RESPONSE_MODELr   modelZGEN_AI_REQUEST_IDr   ZGEN_AI_REQUEST_TEMPERATUREZtemperatureZGEN_AI_REQUEST_TOP_PZtop_pZGEN_AI_REQUEST_MAX_TOKENSr  ZGEN_AI_REQUEST_Nr  ZGEN_AI_USAGE_NUM_SEQUENCESr  ZGEN_AI_USAGE_PROMPT_TOKENSr@  r  ZGEN_AI_USAGE_COMPLETION_TOKENSr/  r  Ztime_in_queueZGEN_AI_LATENCY_TIME_IN_QUEUEZ"GEN_AI_LATENCY_TIME_TO_FIRST_TOKENZGEN_AI_LATENCY_E2EZscheduler_timeZ GEN_AI_LATENCY_TIME_IN_SCHEDULERrH  Z$GEN_AI_LATENCY_TIME_IN_MODEL_FORWARDrI  Z$GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE)rr   r  Zarrival_time_nano_secondsZtrace_contextZseq_spanrG  ZttftZe2e_timerc   rc   rd   r    s   






8"zLLMEngine.create_trace_spaninputsc                 C   s8   t |\}}|d ur| j||dd | j||dd d S )Nencoder)prompt_typedecoder)r,   _validate_model_input)rr   r  r   r
  r  rc   rc   rd   r    s   
z LLMEngine._validate_model_inputsprompt_inputsr  )r  r  c             	   C   s  | j }| jd u r
d n| j|}|dg }|s/|dkr |jr n|d dkr'ntd| d|d urFt|dd}||jkrFtd	| d
| j j}t	||kr|dkrq|jrq| j
j}	|	j||pbt d}
t|
tslJ |
jrqd S |jrwd}nd}td| dt	| d| d| d S )Nr  r  r   ZembedszThe z prompt cannot be emptyr   rV   z	Token id z is out of vocabulary)r   zMake sure that `max_model_len` is no smaller than the number of text tokens plus multimodal tokens. For image inputs, the number of image tokens depends on the number of images, and possibly their aspect ratios as well.zLMake sure that `max_model_len` is no smaller than the number of text tokens.z prompt (length z-) is longer than the maximum model length of z. )r   r   r   r  Zis_multimodal_modelr   r  Zmax_token_idr   r@  r   r   Zcreate_processorobjectr~   r4   Zpad_dummy_encoder_prompt)rr   r  r   r  r   r   Z
prompt_idsZmax_input_idZmax_prompt_lenr   Zmm_processorZ
suggestionrc   rc   rd   r    sR   


zLLMEngine._validate_model_inputc                 C   s   g }|j s|jr"| j|d}t|j |j|d}|| d|_ d|_t|jdkr:| |}t|j|d}|| |rL|jdu rF||_|S |j| |S )a  Constructs logits processors based on the logits_bias, and
        allowed_token_ids fields in sampling_params. Deletes those fields and
        adds the constructed logits processors to the logits_processors field.
        Returns the modified sampling params.)r   )
logit_biasallowed_token_idsr   Nr   )	bad_wordsr   )	r  r  r   get_openai_logits_processorsr  r@  r  r/   r  )rr   r  r   r  r   Z
processorsrc   rc   rd   r   7  s.   



z"LLMEngine._build_logits_processorsrc   method.timeoutargskwargsc                 C   s   | j ||||S rn   )r   collective_rpc)rr   r  r  r  r  rc   rc   rd   r  \  s   zLLMEngine.collective_rpcrv   rn   r;  )NNNNr   )NNr   )NNNN)NNN)r   )Nrc   N)r]   r^   r_   r`   rx   r
   rb   ra   classmethodr   rz   r  r   rT   r   GenericSequencer   r   r   rI   rK   ZENGINE_CONTEXTr2   r   r(   r   r   r#   r3   rs   r   r   r   r"   r   r   r   r   r0   rH   r   r   r   r)   r   r:   r8   floatr   rk   r>   r  r  r*   r   r   r  r   r  r  r   r#  r%  r   r  r   r'  r   r(  r   r)  r   r*  r0  r4  r7  r   rN   r9  staticmethodr=   r?  rl   r   r1   r@   r    rX  r6   r5   rj  r!   re  rb  rk  rp  torchZTensorrd  rv  rw  rN  r$   ry  r  r  r   r  r  r  r  r  rc  r  r  r  r  rO  r  r  r+   r   r  r   r	   rU   tupler  rc   rc   rc   rd   rw   t   s  
 	
 
L%	


	

J
	

j	

6	


 ?
  R
 






	








  




D

;
'rw   r   r   )r   collectionsr   r  r   
contextlibr   dataclassesr   	functoolsr   typingr   r   r	   r
   r   r   r   r   r   r   r   r   r   r  r   r   r   r   r  Ztyping_extensionsr   Z	vllm.envsr   Zvllm.configr   r   r   r   r   r   r   Zvllm.core.schedulerr    r!   Zvllm.engine.arg_utilsr"   Zvllm.engine.metrics_typesr#   r$   Z'vllm.engine.output_processor.interfacesr%   Z)vllm.engine.output_processor.stop_checkerr&   Z)vllm.entrypoints.openai.logits_processorsr'   r  Zvllm.executor.executor_baser(   Zvllm.inputsr)   r*   r+   Zvllm.inputs.parser,   Zvllm.inputs.preprocessr-   Zvllm.loggerr.   Zvllm.logits_processr/   Zvllm.lora.requestr0   Z"vllm.model_executor.layers.samplerr1   Zvllm.multimodalr2   r3   Zvllm.multimodal.processingr4   Zvllm.outputsr5   r6   r7   Zvllm.pooling_paramsr8   Zvllm.sampling_paramsr9   r:   Zvllm.sequencer;   r<   r=   r>   r?   r@   rA   rB   Zvllm.tracingrC   rD   rE   rF   Z#vllm.transformers_utils.detokenizerrG   Z!vllm.transformers_utils.tokenizerrH   Z'vllm.transformers_utils.tokenizer_grouprI   rJ   Zvllm.usage.usage_librK   rL   rM   Z
vllm.utilsrN   rO   rP   Zvllm.versionrQ   r   Zvllm.worker.model_runner_baserR   r]   r   r   rT   rU   rX   re   rl   rw   is_setr   r   r   rc   rc   rc   rd   <module>   s   8$,             ~