o
    )i|d                    @   s8  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dl mZ d dl	m
Z
mZmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z% d d	l&m'Z' d d
l(m)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZDmEZE d dlFmGZG d dlHmIZImJZJmKZK d dlLmMZM d dlNmOZO d dlPmQZQmRZR d dlSmTZT d dlUmVZVmWZWmXZXmYZYmZZZ d dl[m\Z\ d dl]m^Z^m_Z_ d dl`maZambZbmcZcmdZdmeZemfZfmgZgmhZh d d limjZjmkZkmlZlmmZmmnZnmoZompZpmqZq e
rAd d!l"mrZr e<esZtd"Zud#Zved$d%d&Zwd'ejxjy_zd'ejxjy_{ed(d)G d*d% d%elZ|ed(d)G d+d, d,e|Z}G d-d. d.eme| Z~G d/d0 d0ekew ZG d1d2 d2ee} ZG d3d4 d4ejZdS )5    N)contextmanager)	dataclass)TYPE_CHECKINGAnyCallableDictListOptionalSetTupleTypeTypeVarUnion)tqdm)AttentionMetadataget_attn_backend)AttentionState)CommonAttentionState)compilation_counter)CompilationLevel
VllmConfig)SchedulerOutputs)broadcast_tensor_dictget_pp_group)get_kv_transfer_group)get_tensor_model_parallel_rankgraph_capture)get_forward_contextset_forward_context)INPUT_REGISTRYInputRegistry)init_logger)LoRAMapping)LoRARequest)LRUCacheWorkerLoRAManager)SamplingMetadataSamplingMetadataCache)MRotaryEmbedding)SamplerSamplerOutputget_sampler)	get_model)TensorizerConfig)supports_lorasupports_multimodal)set_cpu_offload_max_bytes)MULTIMODAL_REGISTRYBatchedTensorInputsMultiModalKwargsMultiModalPlaceholderMapMultiModalRegistry)SamplingParams)IntermediateTensorsSequenceGroupMetadata)DeviceMemoryProfiler	GiB_bytesPyObjectCacheasync_tensor_h2dflatten_2d_listsis_pin_memory_availablesupports_dynamoweak_ref_tensor)InputProcessingErrorModelRunnerBaseModelRunnerInputBaseModelRunnerInputBuilderBase%_add_attn_metadata_broadcastable_dict)_add_sampling_metadata_broadcastable_dict$_init_attn_metadata_from_tensor_dict(_init_sampling_metadata_from_tensor_dict)AttentionBackend      TModelInputForGPUModelInputForGPU)bound   T)frozenc                
   @   s  e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeee  ed< dZeee  ed< dZed	 ed
< dZeee  ed< dZed ed< dZee ed< dZeeeee f  ed< dZeee  ed< dZeed< dZee ed< dZee ed< dZ eej ed< deee!f fddZ"e#	d#de$e% deee!f ded de%fddZ&dd  Z'd!d" Z(dS )$rL   z
    This base class contains metadata needed for the base model forward pass
    but not metadata for possible additional steps, e.g., sampling. Model
    runners that run additional steps should subclass this method to add
    additional fields.
    Ninput_tokensinputs_embedsinput_positionstoken_typesseq_lens
query_lensr"   lora_mappinglora_requestsr   attn_metadatamulti_modal_kwargsrequest_ids_to_seq_idsfinished_requests_idsr   virtual_engineasync_callbackscheduler_outputsprevious_hidden_statesreturnc              
   C   s:   | j | j| j| j| j| j| j| j| jd	}t	|| j
 |S N)	rP   rQ   rR   rW   rV   rY   r\   rZ   r[   )rP   rQ   rR   rW   rV   rY   r\   rZ   r[   rD   rX   selftensor_dict re   d/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/worker/model_runner.pyas_broadcastable_tensor_dictg   s   z-ModelInputForGPU.as_broadcastable_tensor_dictclsrd   attn_backendrH   c                 C   s    |d ur	t ||}| di |S Nre   )rF   rh   rd   ri   re   re   rf   from_broadcasted_tensor_dictv   s
   z-ModelInputForGPU.from_broadcasted_tensor_dictc                 C   s   | j  }|d= |S Nr]   )__dict__copyrc   statere   re   rf   __getstate__   s   
zModelInputForGPU.__getstate__c                 C   s    | j | | j dd i d S rm   )rn   updaterp   re   re   rf   __setstate__   s   zModelInputForGPU.__setstate__N))__name__
__module____qualname____doc__rP   r	   torchTensor__annotations__rQ   rR   rS   rT   r   intrU   rV   rW   r
   r#   rX   rY   r1   rZ   r   strr[   r\   r]   r   r^   r   r_   r   rg   classmethodr   rK   rl   rr   rt   re   re   re   rf   rL   N   s@   
 
c                   @   sr   e Zd ZU dZdZed ed< dZee ed< de	e
ef fddZe	dd	e	e
ef d
ed dd fddZdS )$ModelInputForGPUWithSamplingMetadataz"
    Used by the ModelRunner.
    Nr%   sampling_metadata	is_promptr`   c              
   C   sF   | j | j| j| j| j| j| j| j| jd	}t	|| j
 t|| j |S ra   )rP   rQ   rR   rW   rV   rY   r\   rZ   r[   rD   rX   rE   r   rb   re   re   rf   rg      s   zAModelInputForGPUWithSamplingMetadata.as_broadcastable_tensor_dictrd   ri   rH   c                 C   s(   t |}|d urt||}| di |S rj   )rG   rF   rk   re   re   rf   rl      s   zAModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dictru   )rv   rw   rx   ry   r   r	   r|   r   boolr   r~   r   rg   r   rl   re   re   re   rf   r      s   
 
r   c                       sF  e Zd ZdZG dd dZdefddZdd Zd	d
 Z	d.ddde	e
e  f fddZ	d.de	e
e  ddfddZdededefddZdededefddZdededefddZdededefddZdedefdd Zdefd!d"Z	#d/d$ed%ed&ed'edef
d(d)Z	#d/ded&ed'edefd*d+Zdefd,d-Z  ZS )0ModelInputForGPUBuilderz2Build ModelInputForGPU from SequenceGroupMetadata.c                5   @   s  e Zd ZdZdd Zddddddddddddddddddddddded	ee d
ede	e
eee f  dee dede	eee   de	ej de	eee   de	eee   de	eeee    de	ee  de	ee  de	ee  de	ee  de	ee  de	ee  de	eee   de	eee   de	ee  de	e de	e
eef  deded ed!ef4d"d#Zd$d% Zd&efd'd(ZdS ))z,ModelInputForGPUBuilder.InterDataForSeqGroupz1Intermediate data for the current sequence group.c                 C   s   | j d   d | _| jd   | jd   d | _d| jd< d| jd< d| jd< d| j	d< d| j
d< d| jd< | j  | j  | j  d S )Nr   )rP   clearrQ   rR   rS   mrope_input_positionsrT   orig_seq_lensprompt_lensrU   context_lenscurr_sliding_window_blockslora_index_mappinglora_prompt_mappingrW   rc   re   re   rf   simple_reinit   s   







z:ModelInputForGPUBuilder.InterDataForSeqGroup.simple_reinitr   NF)n_seqsrP   rQ   rR   rS   r   rT   r   r   rU   r   r   r   r   rW   rY   multi_modal_placeholder_mapsprefix_cache_hitreinitreinit_use_defaultsencoder_seq_len
request_idseq_idsr   block_tablescomputed_block_numsr   rP   rQ   rR   rS   r   rT   r   r   rU   r   r   r   r   rW   rY   r   r   r   r   r   c                C   s<  |rt | jt |ksJ t|D ]	\}}|| j|< qn|| _|| _|| _|| _|| _|| _|| _|r2t | jdkrC|rC| 	  nA|rI|| _
ntt | jD ]	}| j
|   qP|| _|	rc|	| _ntt | jD ]	}| j|   qj|
rz|
| _ntt | jD ]	}| j|   qd | _|r|| _ntt | jD ]}d| j|< q|r|| _ntt | jD ]}d| j|< q|r|| _ntt | jD ]}d| j|< q|r|| _ntt | jD ]}d| j|< q|r|| _ntt | jD ]}d| j|< q|r|| _ntt | jD ]}d| j|< q|r|| _n| j  |r || _n| j  |r,|| _nX| j  nR|p6g | _
|| _|	p?g | _|
pEg | _|pKd | _|pQg | _|pWg | _|p]g | _|pcg | _|pig | _|pog | _|pug | _|p{g | _|pt | _|| _|| _|| _t | j| _|s|   d S d S )N   r   )lenr   	enumerater   r   r   r   r   r   r   rP   ranger   rQ   rR   rS   r   rT   r   r   rU   r   r   r   r   rW   setrY   r   r   __post_init__)rc   r   r   r   r   r   r   rP   rQ   rR   rS   r   rT   r   r   rU   r   r   r   r   rW   rY   r   r   r   r   r   iZseq_idre   re   rf   __init__   s   0

z5ModelInputForGPUBuilder.InterDataForSeqGroup.__init__c                 C   s   t | j| _dd t| jD | _dd t| jD | _dd t| jD | _d | _dg| j | _dg| j | _	dg| j | _
dg| j | _dg| j | _dg| j | _g | _g | _d S )Nc                 S      g | ]}g qS re   re   .0_re   re   rf   
<listcomp>y      zNModelInputForGPUBuilder.InterDataForSeqGroup.__post_init__.<locals>.<listcomp>c                 S   r   re   re   r   re   re   rf   r   z  r   c                 S   r   re   re   r   re   re   rf   r   {  r   r   )r   r   r   r   rP   rR   rS   r   rT   r   r   rU   r   r   r   r   r   re   re   rf   r   v  s   
z:ModelInputForGPUBuilder.InterDataForSeqGroup.__post_init__r`   c                 C   s   d g d| j d| j d| j d| j d| j d| j d| j d	t| j	d
d  d| j
 d| j d| j d| j d| j d| j d| j d| j S )N z InterDataForSeqGroup(request_id=z
, seq_ids=z, is_prompt=z, block_tables=z, computed_block_nums=z	, n_seqs=z, input_tokens=z, inputs_embeds.shape=shapez, input_positions=z, token_types=z, mrope_input_positions=z, seq_lens=z, orig_seq_lens=z, query_lens=z, context_lens=z, multi_modal_kwargs=)joinr   r   r   r   r   r   rP   getattrrQ   rR   rS   r   rT   r   rU   r   rY   r   re   re   rf   __repr__  sB   
	
z5ModelInputForGPUBuilder.InterDataForSeqGroup.__repr__)rv   rw   rx   ry   r   r~   r   r}   r   r	   r   rz   r{   r
   r#   r2   r3   r   r   r   re   re   re   rf   InterDataForSeqGroup   s    	





!"
#&'+,-.
 (r   num_seqsc                    s    fddS )Nc                      s   t jddg  dd g dS )Nr   r   T)r   r   r   r   r   )r   r   re   r   re   rf   <lambda>  s    z@ModelInputForGPUBuilder.gen_inter_data_builder.<locals>.<lambda>re   )rc   r   re   r   rf   gen_inter_data_builder  s   z.ModelInputForGPUBuilder.gen_inter_data_builderc                 O   sn   t |dksJ d|v sJ |d }t |}| jj}||vr't| |||< ||  }|j|i | |S )Nr   r   )r   runnerinter_data_cacher:   r   Z
get_objectr   )rc   argskwargsr   r   r   objre   re   rf   init_cached_inter_data  s   z.ModelInputForGPUBuilder.init_cached_inter_datac                 C   s   | j j D ]}|  qd S ru   )r   r   valuesreset)rc   cachere   re   rf   reset_cached_inter_data  s   
z/ModelInputForGPUBuilder.reset_cached_inter_dataNr   GPUModelRunnerBaser[   c                    s   t    | j| j| j| jg| _| jg| _|| _	| j	j
| _| j	j| _| j	j| _| j	j| _| j	j| _| j	jd u| _| jd urH| j t| | _| jd uoP| jj| _| jd url| j| j d | j | _| j| j | _d S d S Nr   )superr   _compute_lens_compute_for_prefix_cache_hit_compute_for_sliding_window_compute_lora_inputper_seq_compute_fns_compute_multi_modal_inputper_seq_group_compute_fnsr   _model_input_clsmodel_input_clsri   scheduler_configsliding_window
block_sizelora_configenable_loraZget_builder_clsweakrefproxyattn_metadata_builderchunked_prefill_enabledsliding_window_blocksblock_aligned_sliding_window)rc   r   r[   	__class__re   rf   r     s8   
	








z ModelInputForGPUBuilder.__init__r`   c                 C   s    || _ d| _g | _| j  d S )NT)r[   decode_onlyinter_data_listr   prepare)rc   r[   re   re   rf   r     s
   zModelInputForGPUBuilder.prepare
inter_dataseq_idxseq_group_metadatac                 C   sZ  |j |j|  }|j}| }|jr| }t||| }n| jjj	r(|d }n| }|j
du r<| || }d}	ndg||  }| || }	|j}
||j|< ||j|< | |j|< ||j|< |j| | |	|_|j| t|| |j| |
r|
ng  || |j|< |jdur|jdu rdg|j |_t|j|||j|< dS dS )z`Compute context length, sequence length and tokens
        for the given sequence data.
        r   Nr   )seq_datar   token_chunk_sizeget_lenr   Zget_num_computed_tokensminr   model_configis_encoder_decoderprompt_embedsget_token_idsZget_token_embeddingsZtoken_type_idsrT   r   Zget_prompt_lenr   r   rP   extendrQ   rR   r   rS   rU   mrope_position_deltar   r   r'   Zget_next_input_positions)rc   r   r   r   r   r   seq_lencontext_lentokensr   rS   re   re   rf   r     sP   









z%ModelInputForGPUBuilder._compute_lensc           
      C   s  |j }|duot|dko| jdu o|j}||_|sdS |dus"J t|| j }|j|j|  | |j	| }|j
| }||krDdS ||  k rN|k rn n;|| }	|j| |	d |j|< |j| |	d |j|< |j| |	d |j|< |}||j	|< |j
| | |j|< dS ||kr|j| dd |j|< |j| dd |j|< |j| dd |j|< d|j|< |j
| d |j	|< dS dS )zCheck if hit prefix cache (i.e., some blocks are already computed).
        If hit, update input tokens and positions to only compute the
        remaining blocks.
        Nr   r   )r   r   r   r   r   r   r   r   Zupdate_num_cached_tokensr   rT   rP   rR   rS   rU   )
rc   r   r   r   r   r   Zprefix_cache_lenr   r   Zuncomputed_startre   re   rf   r   %  sv   










z5ModelInputForGPUBuilder._compute_for_prefix_cache_hitc                 C   sr   d}|j | }|js-| jdur-| j}|j | | j }t|j | | j| }|dkr-|d7 }||j|< ||j |< dS )zUpdate seq_len and curr_sliding_window_block for the given
        sequence data (only required by decoding) if sliding window is enabled.
        r   Nr   )rT   r   r   r   r   r   r   r   )rc   r   r   r   Zcurr_sliding_window_blockZsliding_seq_lenZsuff_lenre   re   rf   r   f  s   

z3ModelInputForGPUBuilder._compute_for_sliding_windowc                 C   s   | j sdS |j}|dkr|j|j |j| }|j|g|  |j}|r6|j	dur6|j
|g|  dS | jr<|jrE|j
|g dS |j
g  dS )z:If LoRA is enabled, compute LoRA index and prompt mapping.Nr   )r   lora_int_idrW   addlora_requestrU   r   appendsampling_paramsZprompt_logprobsr   r   Z	do_sample)rc   r   r   r   lora_idZ	query_lenr   re   re   rf   r   ~  s   
z+ModelInputForGPUBuilder._compute_lora_inputc                 C   s  |j d }t|t|d |d t| \}}|s|jsdS ||_||_| jj	j
r|dd}|dd}|dd}|dd}	|dd}
| jj	j}dg|j |_t|jD ].}|j|j|  }| }tj|||||	|j| |j| ||
d		\}}||_||j|< qYdS dS )
z2If multi-modal data is given, add it to the input.r   Nimage_grid_thwvideo_grid_thwaudio_feature_lengthssecond_per_grid_tsuse_audio_in_videoF)	hf_configr   r   r   r   r   r   r   )rR   r3   Zfrom_seq_groupr   r   r   rY   r   r   r   
uses_mropegetr   r   r   r   r   r   r'   Zget_input_positionsr   rT   r   )rc   r   r   	positionsZ	mm_kwargsZplaceholder_mapsr   r   r   r   r   r   r   r   Z	token_idsr   r   re   re   rf   r     sV   



z2ModelInputForGPUBuilder._compute_multi_modal_inputc           
   
   C   s   |j  }t|}|j}|r|dksJ d| _d}| jjjr#|j	 }| j
|j|||j|jdd|d}| j| t|D ]}| jD ]}|||| qBq=| jD ]}	|	|| qOdS )z$Add a sequence group to the builder.r   Fr   T)r   r   r   r   r   r   r   r   N)r   keysr   r   r   r   r   r   Zencoder_seq_datar   r   r   r   r   r   r   r   r   r   )
rc   r   r   r   r   r   r   r   Z
per_seq_fnZper_seq_group_fnre   re   rf   add_seq_group  s6   





z%ModelInputForGPUBuilder.add_seq_groupr   
batch_sizer   max_decode_seq_lenmax_encoder_seq_lenc                 C   s4   |o| j jj o|| j jko|| j jko|| j jkS ru   )r   r   enforce_eagermax_seq_len_to_capturemax_batchsize_to_capture)rc   r   r   r   r   re   re   rf   _use_captured_graph  s   


z+ModelInputForGPUBuilder._use_captured_graphc                 C   sH   | j }|sdS |}| ||||sdS | jj|}||ks J || S )a,  
        Determine the number of padding sequences required for running in
        CUDA graph mode. Returns -1 if CUDA graphs cannot be used.

        In the multi-step + chunked-prefill case, only the first step
        has Prefills (if any). The rest of the steps are guaranteed to be all
        decodes. In this case, we set up the padding as if all the sequences
        are decodes so we may run all steps except the first step in CUDA graph
        mode.

        Args:
            num_seqs (int): Number of sequences scheduled to run.
            max_decode_seq_len (int): Greatest of all the decode sequence
                lengths. Used only in checking the viablility of using
                CUDA graphs.
            max_encoder_seq_len (int, optional): Greatest of all the encode
                sequence lengths. Defaults to 0. Used only in checking the
                viability of using CUDA graphs.
        Returns:
            int: Returns the determined number of padding sequences. If
                CUDA graphs is not viable, returns -1.
        r   )r   r   r   vllm_configZpad_for_cudagraph)rc   r   r   r   r   r   graph_batch_sizere   re   rf   _get_cuda_graph_pad_size  s   z0ModelInputForGPUBuilder._get_cuda_graph_pad_sizec                  C   s  t t  }t tj  }t t  }| jD ]-}|jD ]}|| q|jD ]}|| q#|jdur@|	|jj
| jjj| jjd qt|dkrJd}ntj|ddj
| jjj| jjd}t|t|kseJ |so|du ro|  S d}tdd | jD rdd td	D }td	D ]+}	| jD ]%}|j}
|
du r|jD ]	}||	 | qq|
D ]}||	 ||	  qqqd}ng }| jD ]}|jD ]}|| qqg }g }d}d}| jD ]$}||j ||j |jst|t|j}| jjjrt||j}qd
d | jD }| jt|||d}t|}|dkr||7 }|r%|td| | jjdus.J t|tj | jj| jj!}|rIt|tj | jj| jj!nd}|durptd	D ]}	||	 td| qTt|tj | jj| jj!}n|td| t|tj | jj| jj!}|r|td| | j"#||||}t$ }d}| j%rt$dd | jD }t&dd | jD }|r|td| t&dd | jD }t'di t(||| j) d}dd | jD }t*+|}| j|||||||||||| j,dS )zUFinalize the builder intermediate data and
        create on-device tensors.
        Ndtypedevicer   )dimc                 s   s    | ]}|j d uV  qd S ru   )r   r   r   re   re   rf   	<genexpr>?  s    z0ModelInputForGPUBuilder.build.<locals>.<genexpr>c                 S   r   re   re   r   re   re   rf   r   A  r   z1ModelInputForGPUBuilder.build.<locals>.<listcomp>   c                 S   s   i | ]}|j |jqS re   )r   r   r   datare   re   rf   
<dictcomp>d      z1ModelInputForGPUBuilder.build.<locals>.<dictcomp>)r   r   r   r   r   c                 s   s     | ]}|j D ]}|V  qqd S ru   )rW   )r   r  rre   re   rf   r    s    c                 S      g | ]}t |jqS re   )r<   r   r  re   re   rf   r     r  c                 S   r  re   )r<   r   r  re   re   rf   r     r  Zindex_mappingZprompt_mappingZ
is_prefillc                 S   s   g | ]
}|j d ur|j qS ru   )rY   r
  re   re   rf   r     s
    
)rP   rQ   rR   rS   rX   rT   rU   rV   rW   rY   rZ   r[   re   )-listr}   rz   r{   r   rP   r   rS   rQ   r   tor   r   r  r  r   catr   anyr   r   rR   rT   rU   r   maxr   r   r  	itertoolsrepeatr;   long
pin_memoryr   buildr   r   r<   r"   dictr   r2   batchr[   ) rc   rP   Zinputs_embeds_listrS   r   Zcur_input_tokensZcur_token_typesrQ   r   idxZ	msectionsZ_seq_input_positionsZ_seq_mrope_input_positionsrR   Zcur_input_positionsrT   rU   r   r   rZ   Zcuda_graph_pad_sizer   Zinput_tokens_tensorZtoken_types_tensorZinput_positions_tensorrX   rW   rV   r   r   Zmulti_modal_kwargs_listrY   re   re   rf   r    s.  

















zModelInputForGPUBuilder.buildru   )r   )rv   rw   rx   ry   r   r}   r   r   r   r	   r   r~   r   r   r7   r   r   r   r   r   r   r   r   r  rL   r  __classcell__re   re   r   rf   r      s     `
+



5
A


4%

-r   c                   @   s  e Zd ZU dZee ed< ee ed< eed< dddee	fde
dee d	ed
ededefddZdIddZdejfddZ		dJdedee dee ddfddZdeddfddZdefddZ	dKdee d eee  defd!d"Zed#d$ Ze  dId%d&Z!d'ede"e# fd(d)Z$d*d+ Z%	,dLd-ed.eddfd/d0Z&d1d2 Z'd3e(e# d4e)ddfd5d6Z*d7e#defd8d9Z+d:edefd;d<Z,d:edefd=d>Z-de(e fd?d@Z.e  dAeeej/  ddfdBdCZ0dDe1ee2f fdEdFZ3e4defdGdHZ5dS )Mr   zD
    Helper class for shared methods between GPU model runners.
    r   _builder_clsbuilderautoFr   kv_cache_dtypeis_driver_workerreturn_hidden_statesinput_registrymm_registryc                 C   s  t | | | j}| j}|| _|| _| jj| _t | _	|| _
| | _|j| _| jj| _| jjj| _dd t| jjD | _d | _|j| _d| _tj| j|  ftjd| _tj| j|  ftjd| _| j | j}	|	dkpr| jj!}
|
rt"| j# | jj$| j
| j| jj!| jj%dnd | _&| j&r| j&' t()| | _*nt+t()| | _*|| _,|| _-|  d | _.t/ | _0t1t2| jj3d  i | _4| jjdkrt5 nd | _6t7| d	r| 8t()| | _9d S d S )
Nc                 S   s   g | ]}i qS re   re   r   re   re   rf   r     s    z/GPUModelRunnerBase.__init__.<locals>.<listcomp>F)r  r   )use_mlai   @r   r  ):rA   r   r   cache_configr#  r$  Zdevice_configr  r=   r  r"  Zget_sliding_windowr   r   r   r   compilation_configZmax_capture_sizer   r   parallel_configpipeline_parallel_sizegraph_runnersgraph_memory_poolhas_inner_statein_profile_runnpzerosget_max_block_per_batchZint32Zgraph_block_tablesZ%cross_layer_shared_graph_block_tablesZget_num_attention_headsZis_attention_freer   Zget_head_sizer  r'  ri   Zget_state_clsr   r   
attn_stater   r%  r&  lora_managerr*   samplerr/   r}   Zcpu_offload_gbr   r&   sampling_metadata_cachehasattrr  r   )rc   r   r"  r#  r$  r%  r&  r   r(  Znum_attn_headsZneeds_attn_backendre   re   rf   r     s   




	


zGPUModelRunnerBase.__init__r`   Nc                 C   sT  t d| jj t| jZ}t }t| j	d| _| j
r\t| js+J | jjj dt| jr5t d | jj }t| jj| jj| j| j
| j| jj| jj|jd| _| j| j| _t }W d    n1 sjw   Y  |j| _t d| jt ||  | j	jj t!j"krt# r| j	j$| j	}t% j&d7  _&t'j(| jt)j*|d| _d S d S d S )	NzStarting to load model %s...)r   z does not support LoRA yet.zXRegarding multimodal models, vLLM currently only supports adding LoRA to language model.)max_position_embeddingsz,Model loading took %.4f GiB and %.6f secondsr   )Z	fullgraphbackend)+loggerinfor   modelr8   r  timeperf_counterr+   r   r   r-   r   rv   r.   warningr   Zget_text_configr$   r   max_num_seqsmax_num_batched_tokens
vocab_sizeZembedding_modulesZembedding_padding_modulesr8  r4  Zcreate_lora_managerZconsumed_memoryZmodel_memory_usager9   r)  levelr   ZDYNAMO_AS_ISr>   Zinit_backendr   Zdynamo_as_is_countrz   compileenvsZ"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE)rc   mZtime_before_loadZtext_configZtime_after_loadr9  re   re   rf   
load_model7  sb   

zGPUModelRunnerBase.load_modelc                 C   s   | j S ru   )r<  r   re   re   rf   r+   g  s   zGPUModelRunnerBase.get_modelpathpatternmax_sizec                 C   s$   ddl m} |j| j|||d d S )Nr   )ShardedStateLoader)rI  rJ  ) vllm.model_executor.model_loaderrK  
save_modelr<  )rc   rH  rI  rJ  rK  re   re   rf   save_sharded_statej  s   
z%GPUModelRunnerBase.save_sharded_statetensorizer_configc                 C   s$   ddl m} |j| j|| jd d S )Nr   )TensorizerLoader)rO  r   )rL  rP  rM  r<  r   )rc   rO  rP  re   re   rf   save_tensorized_modelx  s   
z(GPUModelRunnerBase.save_tensorized_modelc                 C   s   | j }| j| d | S r   )r   r   )rc   r   re   re   rf   r2    s   z*GPUModelRunnerBase.get_max_block_per_batchseq_group_metadata_listr[   c                 C   sd   | j | |D ]}z| j | W q ty' } z	t|jt||d}~ww | j   | j  S )aq  Helper method to prepare the model input based on a given sequence
        group. Prepares metadata needed for the base model forward pass but not
        metadata for possible additional steps, e.g., sampling.

        The API assumes seq_group_metadata_list is sorted by prefill -> decode.

        The result tensors and data structure also batches input in prefill
        -> decode order. For example,

        - input_tokens[:num_prefill_tokens] contains prefill tokens.
        - input_tokens[num_prefill_tokens:] contains decode tokens.

        If cuda graph is required, this API automatically pads inputs.
        N)	r   r   r   	Exceptionr@   r   r~   r   r  )rc   rR  r[   r   ere   re   rf   _prepare_model_input_tensors  s   

z/GPUModelRunnerBase._prepare_model_input_tensorsc                 c   s$    d| _ z	d V  W d| _ d S d| _ w )NTF)r/  r   re   re   rf   set_in_profile_run  s
   z%GPUModelRunnerBase.set_in_profile_runc                 C   s    | j j}| j j}| || d S ru   )r   rA  r@  
_dummy_run)rc   rA  r@  re   re   rf   profile_run  s   zGPUModelRunnerBase.profile_run	num_lorasc                 C   s   |dksJ | j d usJ g }| j  + t|D ]}|d }td| |dd}| j j|td || qW d    |S 1 sBw   Y  |S )Nr   r   Zwarmup_z/not/a/real/path)Z	lora_namer   Z	lora_path)Zrank)r4  Zdummy_lora_cacher   r#   Zadd_dummy_loraLORA_WARMUP_RANKr   )rc   rY  dummy_lora_requestsr  r   dummy_lora_requestre   re   rf   _add_dummy_loras  s(   
z#GPUModelRunnerBase._add_dummy_lorasc                 C   s   | j d usJ |   d S ru   )r4  remove_all_lorasr   re   re   rf   _remove_dummy_loras  s   z&GPUModelRunnerBase._remove_dummy_lorasr   rA  r@  c                    s     tdjd d}g  g }jr1jj t jjks&J  fddt|D }g }j	j
}|dkr_|}t||| }|dk r_d| d| d	| d
}td| d}d}	t|D ]8}
|| |
|| k  }|	|7 }	jj
|j}tt|
d|
|ji|d |r||
 nd |j|jd}|| qej
j}fddt|D }dd |D }j||d}d }t jsЈjj|	j
jjd}|jd urd|j_ !||| t"j#$  jr%  	 W d    d S 1 sw   Y  d S )NgGz?r   )Ztop_pZtop_kc                    s   g | ]
} |t    qS re   )r   )r   r  )r[  re   rf   r     s    z1GPUModelRunnerBase._dummy_run.<locals>.<listcomp>r   zmin(z, z // )zSComputed max_num_seqs (%s) to be less than 1. Setting it to the minimum value of 1.T)r   r   r   r   r   r   multi_modal_datamulti_modal_placeholdersc                    s    g | ]}t jg t j jd qS )r  )rz   tensorZfloat32r  r   r   re   rf   r     s    c                 S   s   g | ]}|j qS re   )r   )r   seqre   re   rf   r   #  s    )r[   r   r  r  F)&rV  r5   rB  r   r]  Z	max_lorasr   r   r&  Zget_max_multimodal_tokensr   r   r:  r?  r%  Zdummy_data_for_profilingr7   r~   r   ra  rb  r   Zget_num_layersr*  prepare_model_inputr   is_first_rankr<  make_empty_intermediate_tensorsr  r  rX   enable_kv_scales_calculationexecute_modelrz   cudasynchronizer_  )rc   rA  r@  r   Zdummy_lora_requests_per_seqZseqsZmax_mm_tokensZmax_num_seqs_origexprr   Zgroup_idr   Z
dummy_datard  Z
num_layers	kv_cachesr[   model_inputintermediate_tensorsre   )r[  rc   rf   rW    s   






$zGPUModelRunnerBase._dummy_runc                 C   s   | j std| j   d S NzLoRA is not enabled.)r4  RuntimeErrorZremove_all_adaptersr   re   re   rf   r^  9  s   z#GPUModelRunnerBase.remove_all_lorasrW   rV   c                 C   s    | j std| j || d S rq  )r4  rr  Zset_active_adapters)rc   rW   rV   re   re   rf   set_active_loras>  s   z#GPUModelRunnerBase.set_active_lorasr   c                 C      | j std| j |S rq  )r4  rr  Zadd_adapter)rc   r   re   re   rf   add_loraD     zGPUModelRunnerBase.add_lorar   c                 C   rt  rq  )r4  rr  Zremove_adapterrc   r   re   re   rf   remove_loraI  rv  zGPUModelRunnerBase.remove_lorac                 C   rt  rq  )r4  rr  Zpin_adapterrw  re   re   rf   pin_loraN  rv  zGPUModelRunnerBase.pin_lorac                 C   s   | j std| j  S rq  )r4  rr  Zlist_adaptersr   re   re   rf   
list_lorasS  s   
zGPUModelRunnerBase.list_lorasrn  c                 C   s  | j jrJ td t }tj d }| j	}tj
|tj| jd}tj
|tj| jd}tj
|| j  f| j j| jd}| j jrLt|dj| jd}d}dt| jjjv rhtj|| j  g| j j| jd}d}	t jsz| jj|| j j| jd}	d}
g }| jr| jd	d
}t|d	ksJ |d }|j}
| j| t| j}t | j!j"D ]}| j#j$j%}| j j&rdnd}t'(||}t) dkrt*t+|| j,j- dd}|D ]\}}| jj.|| j j/d}d|_0| jrt1di t2|
g| |
g| dd}| 3t4|g| t5| j| j67 | j8|| j j/}|d| |r!|d| nd|dd|f |	dur4|	d| nd|| || j9|j:d}|durM|d| |d< | j;r\|<d| j=|i | j j/rf| >| t?|| j#| |j@di | W d   n	1 sw   Y  |jAB | _9|| jC| ||f< qqW d   n	1 sw   Y  W d   n	1 sw   Y  | jr| D  t }tj d }|| }|| }td||tE  dS )a"  Cuda graph capture a model.

        Note that CUDA graph's performance gain is negligible if number
        of batched tokens are larger than 200. And since CUDA graph
        requires fixed sized tensors, supporting large/variable batch
        size requires high GPU memory overhead. Thus, vLLM only captures
        decoding requests. Mixed batch (chunked prefill + decoding) or
        prefill requests are not captured.

        Since it is used for decoding-only, it assumes there's only 1 token
        per sequence in the batch.
        a  Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.r   r  )r	  r   r  Nr_   re  r   )rY  )TF)FzCapturing CUDA graph shapes)disableZdesc)is_encoder_decoder_modelFr  .)	input_idsrQ   r   intermediate_inputsrn  rX   memory_poolstreamseqlen_agnostic_capture_inputsz4Graph capturing finished in %.0f secs, took %.2f GiBre   )Fr   r   r:  r;  r=  r>  rz   rk  Zmem_get_infor   r1  r  r  Zget_hidden_sizer  r   Ztileinspect	signaturer<  forward
parametersemptyr   rg  rh  r   r]  r   r   r3  r   r   r*  r+  r   r)  cudagraph_capture_sizesZenable_prompt_embedsr  productr   r   r  Zload_configZuse_tqdm_on_loadZ$graph_capture_get_metadata_for_batchr   ri  r"   r  rs  r   CUDAGraphRunnerri   get_nameZgraph_cloner-  r  r.  rs   Z"get_seqlen_agnostic_capture_inputs+_update_inputs_to_capture_for_enc_dec_modelr   capturegraphpoolr,  r_  r9   )rc   rn  
start_timeZstart_free_gpu_memoryZmax_batch_sizerP   rR   rQ   r_   r  Zdummy_lora_idr\  r[  Zgraph_capture_contextr\   r  Zcudagraph_inputs_embedsZcompilation_casesr   use_inputs_embedsrX   rV   Zgraph_runnercapture_inputsend_timeZend_free_gpu_memoryelapsed_timeZcuda_graph_sizere   re   rf   capture_modelX  s2  







 Zz GPUModelRunnerBase.capture_modelr  c                 C   s4   t jg t j| jd|d< t jg t j| jd|d< dS )a:  
        Updates the set of input tensors needed for CUDA graph capture in an
        encoder-decoder model.

        This method modifies the provided `capture_inputs` dictionary by
        adding tensors specific to encoder-decoder specific models that
        need to be captured for CUDA Graph replay.
        r  encoder_input_idsencoder_positionsN)rz   rc  r  r  )rc   r  re   re   rf   r    s   
z>GPUModelRunnerBase._update_inputs_to_capture_for_enc_dec_modelc                 C   s
   | j  S ru   )r   Zget_vocab_sizer   re   re   rf   rB    s   
zGPUModelRunnerBase.vocab_size)r`   N)NNru   )r   )6rv   rw   rx   ry   r   rK   r|   r   r   r0   r   r	   r~   r   r    r4   r   rG  nnModuler+   r}   rN  r,   rQ  r2  r   r7   rU  r   rV  rz   inference_moderX  r  r#   r]  r_  rW  r^  r
   r"   rs  ru  rx  ry  rz  r{   r  r   r   r  propertyrB  re   re   re   rf   r     s   
 

j0



 

l

 *
r   c                   @   s   e Zd ZU dZeZee ed< eZ	ee ed< de
eef defddZ			dd
ee dedeee  defddZe 			ddedeej dee dedeeee ef  f
ddZdefddZdefddZd	S )ModelRunnerz.
    GPU model runner with sampling step.
    r   r  rd   r`   c                 C   s   t j|| jd}|S )N)ri   )r   rl   ri   )rc   rd   ro  re   re   rf   -make_model_input_from_broadcasted_tensor_dict#  s   z9ModelRunner.make_model_input_from_broadcasted_tensor_dictr   NrR  r\   r[   c              	   C   sh   |  ||}t jr | |}t||j|j| j| j	|| j
}nd}|r)|d jnd}tj||||dS )a	  Prepare the model input based on a given sequence group, including
        metadata for the sampling step.

        The API assumes seq_group_metadata_list is sorted by prefill -> decode.

        The result tensors and data structure also batches input in prefill
        -> decode order. For example,

        - input_tokens[:num_prefill_tokens] contains prefill tokens.
        - input_tokens[num_prefill_tokens:] contains decode tokens.

        If cuda graph is required, this API automatically pads inputs.
        Nr   )r   r   r\   )rU  r   is_last_rankZget_generatorsr%   r   rT   rU   r  r  r6  r   dataclassesreplace)rc   rR  r\   r[   ro  
generatorsr   r   re   re   rf   rf  .  s(   
zModelRunner.prepare_model_inputr   ro  rn  rp  	num_stepsc                  K   s  |dkrt d| jr!|jd usJ |jd usJ | |j|j | j| |jd us.J |jj}|jj	}|j
}|d}	|d u r|jr|jd usLJ |jjd }
|jd u}| j| |
|f }|	d urt|	tj|
|	jd  g|	jdd  |	j|	jdg}	n| j}d}| ||rt j|||d\}}}|jpi }| jr|j|jdni }i }|	d ur|	|d< | jd ur| jjrtj j!d	d
}tj j!d	d
}|"  |st#|j| j$|! |d|j|j|j%|dt&j'|| jd||}W d    n1 sw   Y  | jd ur| jjr|"  | (||r!t )|||| t* j+sh| j,rf|d urft-|t.rf| jd urf| jjrf|/  |0|}d}|d ur\|j1dt2d3 }t2|| |j1d< |S | j4||j5}| j,r|j6d ur~|6  t-| j7t8sJ | j7j9}|jd urd	| j7_9| j7||j5d}| jd ur| jjr|d ur|/  |0|}d}|d ur|j1dt2d3 }|| |_:|jd urC| j,rg }g }|j;D ]%}t<|j=dkrqt<|j=dksJ |>| |>|j=d j? qt2|@| j}tAd|id }ntA d }t<|dkrC| jB|}| j,rC|| j7_9tC|D ]\}}|| |j=d _Dq5| j,sIg S | jEry|j5d usUJ |j5jF}|jGrg|Hd|}||_In|jrt|d t<| }n|}||_J|gS )Nr   z-num_steps > 1 is not supported in ModelRunnerr_   r   r  F)rn  )r[   rZ   T)Zenable_timingr~  rQ   r   rp  r{  g        model_forward_time)logitsr   sampled_token_idsre   )K
ValueErrorr   rW   rV   rs  r3  Zbegin_forwardrX   prefill_metadataZdecode_metadatar\   r   Zuse_cuda_graphrP   r   rQ   r,  rz   r  r  r  r  r<  need_recv_kvr   Z recv_kv_caches_and_hidden_statesrY   r.  r[   rZ   Zobservability_configZcollect_model_forward_timerk  Eventrecordr   r   rR   r2   Z	as_kwargsneed_send_kvZ send_kv_caches_and_hidden_statesr   r  r#  
isinstancer6   rl  r  tensorsrc  itemZcompute_logitsr   r]   r5  r(   Zinclude_gpu_probs_tensorr  Zoutputsr   Zsamplesr   Zoutput_tokenr  r   Zget_input_embeddingsr   Zoutput_embedr$  Zselected_token_indicesr   Zindex_selectZprefill_hidden_stateshidden_states) rc   ro  rn  rp  r  r   prefill_metaZdecode_metar\   r_   r  r  Zmodel_executableZbypass_model_exechidden_or_intermediate_statesrY   Zseqlen_agnostic_kwargsZmodel_kwargsZmodel_forward_startZmodel_forward_endr  Zorig_model_forward_timer  Zorig_include_gpu_probsoutputr  Zvalid_outputsZsequence_group_outputZsampled_token_embedsr   indicesr  re   re   rf   rj  S  s  	





	












zModelRunner.execute_modelc                 C   D   | j jdu rdS |jj}|d  dk}|du}| j jjo!| o!|S )a  Check if we need to receive kv-cache from the other worker.
        We need to receive KV when
            1. current vLLM instance is KV cache consumer/decode vLLM instance
            2. this batch is not a profiling run
            3. this batch is a prefill run

        Args:
            model_input: input to the model executable
            kv_caches: vLLM's paged memory
        NFr   )r   kv_transfer_configrX   r  numelZis_kv_consumerrc   ro  rn  r  Zis_profile_runZis_prefill_runre   re   rf   r  )     
zModelRunner.need_recv_kvc                 C   r  )a|  Check if we need to send kv-cache to the other worker.
        We need to send KV when
            1. current vLLM instance is KV cache producer/prefill vLLM instance
            2. this batch is not a profiling run
            3. this batch is a prefill run

        Args:
            model_input: input to the model executable
            kv_caches: vLLM's paged memory
        NFr   )r   r  rX   r  r  Zis_kv_producerr  re   re   rf   r  B  r  zModelRunner.need_send_kv)r   Nr   )rv   rw   rx   ry   r   r   r   r|   r   r  r   r~   r   r  r   r7   r}   r	   rf  rz   r  r{   r6   r   r)   rj  r   r  r  re   re   re   rf   r    sL   
 



% Vr  c                       s   e Zd Zdejdededef fddZe	dd Z
d	ejd
eej dejdee deej dedeeeef  dejjfddZd	ejd
eej dejdee dejf
ddZ  ZS )r  r<  backend_namer3  r}  c                    s8   t    || _|| _|| _i | _i | _d | _|| _d S ru   )	r   r   r<  r  r3  input_buffersoutput_buffers_graph_is_encoder_decoder_model)rc   r<  r  r3  r}  r   re   rf   r   `  s   

zCUDAGraphRunner.__init__c                 C   s   | j d usJ | j S ru   )r  r   re   re   rf   r  m  s   zCUDAGraphRunner.graphr~  rQ   r   r  rn  rX   r  r  c	                 K   sr  | j d u sJ ttD ]}
| jd||||d|	 qtj  tj | _ tjj| j ||dC | jdd|i|d ur?d|ini ||d|	}t	|tj
rUt|}nt	|trgtdd |j D d}~t  W d    n1 svw   Y  tj  d|i|d urd|ini ||d	| j|| j|	| _|d ur| j|j t jrd
|i| _d S || _d S )Nr  )r  r  r~  rQ   )r   rp  c                 S   s   i | ]	\}}|t |qS re   )r?   )r   keyvaluere   re   rf   r    s    z+CUDAGraphRunner.capture.<locals>.<dictcomp>)r  )r   rn  r  re   )r  r   _NUM_WARMUP_ITERSr<  rz   rk  rl  Z	CUDAGraphr  r  r{   r?   r6   r  itemsgcZcollectr3  Zget_graph_input_buffersr  r  rs   r   r  r  )rc   r~  rQ   r   r  rn  rX   r  r  r   r   Z$output_hidden_or_intermediate_statesr  re   re   rf   r  r  s   
	





zCUDAGraphRunner.capturerp  r`   c                 K   sz  t  j}| jd j|dd |d ur#| jd d |jd  j|dd |d ur8| jd d |jd  j|dd | jdkrH| jd j|jdd | j| j|| j	 d	| jv rb| j
j| jfi | d
| jv rs| jd
 j|d
 dd |d ur|jD ]}|dkr|dkr| j| j|| dd qz| j	r| jd j|d dd | jd j|d dd | j  t jr| jd S | jS )Nr~  T)Znon_blockingr   r   rQ   ZNO_ATTENTIONslot_mappingr  r_   Zmodel_execute_timer  r  r  r  )r   rX   r  Zcopy_r   r  r  r3  Zprepare_graph_input_buffersr  r<  Zcopy_inputs_before_cuda_graphsr  r  Zreplayr   r  r  )rc   r~  rQ   r   rp  r   rX   r  re   re   rf   r    sV   










zCUDAGraphRunner.forward)rv   rw   rx   r  r  r~   r   r   r   r  r  rz   r{   r	   r6   r   r   r   r}   rk  ZStreamr  r  r  re   re   r   rf   r  ^  sH    
	
Sr  )r  r  r  r  r=  r   
contextlibr   r   typingr   r   r   r   r   r	   r
   r   r   r   r   numpyr0  rz   Ztorch.distributedZtorch.nnr  Z	tqdm.autor   Z	vllm.envsrE  Zvllm.attentionr   r   Z vllm.attention.backends.abstractr   Zvllm.attention.backends.utilsr   Zvllm.compilation.counterr   Zvllm.configr   r   Zvllm.core.schedulerr   Zvllm.distributedr   r   Zvllm.distributed.kv_transferr   Zvllm.distributed.parallel_stater   r   Zvllm.forward_contextr   r   Zvllm.inputsr   r    Zvllm.loggerr!   Zvllm.lora.layersr"   Zvllm.lora.requestr#   Zvllm.lora.worker_managerr$   Zvllm.model_executorr%   r&   Z+vllm.model_executor.layers.rotary_embeddingr'   Z"vllm.model_executor.layers.samplerr(   r)   r*   rL  r+   Z+vllm.model_executor.model_loader.tensorizerr,   Zvllm.model_executor.modelsr-   r.   Z vllm.model_executor.models.utilsr/   Zvllm.multimodalr0   r1   r2   r3   r4   Zvllm.sampling_paramsr5   Zvllm.sequencer6   r7   Z
vllm.utilsr8   r9   r:   r;   r<   r=   r>   r?   Zvllm.worker.model_runner_baser@   rA   rB   rC   rD   rE   rF   rG   rH   rv   r:  rZ  r  rK   Z_dynamoconfigcache_size_limitZaccumulated_cache_size_limitrL   r   r   r   r  r  r  re   re   re   rf   <module>   s   4((

?'          Z  E