o
    )if                     @   s  d Z ddlZddlZddlmZmZmZmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z: ddl;m<Z<m=Z=m>Z> ee?Z@G dd de<ZA		d%dedeBdeeC deBddf
ddZDd ejEfd!d"ZF		d&d#d$ZGdS )'zA GPU worker class.    N)DictListOptionalSetTupleTypeUnion)	Attention)
VllmConfigget_layers_from_vllm_config)CuMemAllocator)!ensure_model_parallel_initializedinit_distributed_environmentset_custom_all_reduce)ensure_kv_transfer_initialized)init_logger)LoRARequest)set_random_seed)SamplerOutput)TensorizerConfig)current_platform)ExecuteModelRequestIntermediateTensorsSequenceGroupMetadataSequenceGroupMetadataDelta)	GiB_bytesMemorySnapshotbind_kv_cachememory_profiling)CacheEngine)EncoderDecoderModelRunner)GPUModelRunnerBaseModelRunner)PoolingModelRunner)LocalOrDistributedWorkerBase
WorkerBaseWorkerInputc                       sV  e Zd ZdZ		dQdededededed	ee	e
  d
dfddZdd Zdd ZdRded
dfddZdSdeee  d
dfddZdTddZdd Z		dUdedee dee d
dfdd Zd!ed
dfd"d#Ze d
eeef fd$d%Zd&d' Zd(ed)ed
dfd*d+Zd,d- ZdTd.d/Zed
efd0d1Zed
ee e ej!   fd2d3Z"e d4e#d
e$fd5d6Z%e d7e$d
dfd8d9Z&d:e e'e(e)f  d;e e d
e e( fd<d=Z*	dSd4e#d>ee+ d
ee e,  f fd?d@Z-dAe.d
efdBdCZ/dDed
efdEdFZ0dDed
efdGdHZ1d
e2e fdIdJZ3ed
efdKdLZ4ed
efdMdNZ5d
efdOdPZ6  Z7S )VWorkera/  A worker class that executes (a partition of) the model on a GPU.

    Each worker is associated with a single GPU. The worker is responsible for
    maintaining the KV cache and executing the model on the GPU. In case of
    distributed inference, each worker is assigned a partition of the model.
    FNvllm_config
local_rankrankdistributed_init_methodis_driver_workermodel_runner_clsreturnc                 C   sD  t | | || j_|| _|| _|| _|| _| jjr#ddl	m
} |  | j}| j}	|d u s=|jjj|	jjks=|jjjdvr?i nddi}
t}|	jdkrMt}n| jjrSt}|d| j| jj|d|
| _|d url|| j| _|  d | _i | _i | _tjrtj}td| tj j!tj j"j#tj j"j$gdtj j%|dd	d
| _ d S d | _ d S )Nr   )init_cached_hf_modules)ZmedusaZmlp_speculatoreagleZdeepseek_mtpZglm4_moe_mtpZmimo_mtpZreturn_hidden_statesTZpooling)r(   Zkv_cache_dtyper,   z.Profiling enabled. Traces will be saved to: %s)Zuse_gzip)Z
activitiesZ
with_stackZon_trace_ready )&r%   __init__parallel_configr*   r)   r+   r,   model_configZtrust_remote_code
vllm.utilsr/   speculative_configZdraft_model_configZ	hf_configZ
model_typer"   Zrunner_typer#   Zis_encoder_decoderr    r(   cache_configZcache_dtypemodel_runner	gpu_cache_seq_group_metadata_cache_sleep_saved_buffersenvsZVLLM_TORCH_PROFILER_DIRloggerinfotorchprofilerZprofileZProfilerActivityZCPUCUDAZtensorboard_trace_handler)selfr(   r)   r*   r+   r,   r-   r/   r6   r4   Zspeculative_argsZModelRunnerClassZtorch_profiler_trace_dirr1   r1   ^/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/worker/worker.pyr2   /   sl   	


	zWorker.__init__c                 C   s    | j d u r	td| j   d S )NProfiler is not enabled.)r@   RuntimeErrorstartrB   r1   r1   rC   start_profile|   s   
zWorker.start_profilec                 C   s6   | j d u r	td| j   t| j  jdd d S )NrD   Zself_cuda_time_total)Zsort_by)r@   rE   stopprintZkey_averagestablerG   r1   r1   rC   stop_profile   s   

zWorker.stop_profile   levelc           	      C   s   t j d }|dkr| jj}dd | D | _t }|j	|dkr%dnt
 d t j \}}|| }|| }|dksAJ dtd	|t |t  d S )
Nr      c                 S   s   i | ]\}}||   qS r1   )cpuclone).0namebufferr1   r1   rC   
<dictcomp>   s    z Worker.sleep.<locals>.<dictcomp>rM   )weights)Zoffload_tagsz&Memory usage increased after sleeping.zBSleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.)r?   cudamem_get_infor8   modelnamed_buffersr;   r   get_instancesleeptupler=   r>   r   )	rB   rN   Zfree_bytes_before_sleeprY   	allocatorZfree_bytes_after_sleeptotalZfreed_bytesZ
used_bytesr1   r1   rC   r\      s"   zWorker.sleeptagsc                 C   sd   t  }|j|d t| jr0| jj}| D ]\}}|| jv r*|j	| j| j qi | _d S d S )N)r`   )
r   r[   wake_uplenr;   r8   rY   rZ   dataZcopy_)rB   r`   r^   rY   rS   rT   r1   r1   rC   ra      s   


zWorker.wake_upc                 C   s   | j jjdkr=dtjd< tjdd  td| j | _tj	| j t
| jj t  tj  tj  t | _n	td| j j t| j| j| j| j t| jj d S )NrW   1ZTORCH_NCCL_AVOID_RECORD_STREAMSZNCCL_ASYNC_ERROR_HANDLINGzcuda:zNot support device type: )device_configdevicetypeosenvironpopr?   r)   rW   Z
set_device_check_if_gpu_supports_dtyper4   dtypegccollectempty_cachereset_peak_memory_statsr   baseline_snapshotrE   #init_worker_distributed_environmentr(   r*   r+   r   seedrG   r1   r1   rC   init_device   s$   




zWorker.init_devicec                 C   sz   | j jjrt }| dksJ d|jdd}n	ddlm} | }| | j	
  W d    d S 1 s6w   Y  d S )Nr   z9Sleep mode can only be used for one instance per process.rV   tagnullcontext)r(   r4   enable_sleep_moder   r[   Zget_current_usageuse_memory_pool
contextlibrx   r8   
load_model)rB   r^   contextrx   r1   r1   rC   r|      s   
"zWorker.load_modelpathpatternmax_sizec                 C   s   | j j|||d d S )N)r   r   )r8   save_sharded_state)rB   r~   r   r   r1   r1   rC   r      s
   
zWorker.save_sharded_statetensorizer_configc                 C   s   | j j|d d S )N)r   )r8   save_tensorized_model)rB   r   r1   r1   rC   r      s   
zWorker.save_tensorized_modelc           
      C   sJ  t j  t j  t j \}}t| j| jjd}| j	  W d   n1 s*w   Y  | 
  || jj }||j }|  }|dkrKd}d}nt|| }t| jj| }t|d}t|d}d|jdd|t dd| jjdd|t dd	|jt dd
|jt dd|jt dd|t dd}	t|	 t  ||fS )a  Profiles the peak memory usage of the model to determine how many
        KV blocks may be allocated without OOMs.

        The engine will first conduct a profiling of the existing memory usage.
        Then, it calculate the maximum possible number of GPU and CPU blocks
        that can be allocated with the remaining free memory.

        Tip:
            You may limit the usage of GPU memory
            by adjusting the `gpu_memory_utilization` parameter.
        )weights_memoryNr   zMemory profiling takes z.2fz= seconds
the current vLLM instance can use total_gpu_memory (zGiB) x gpu_memory_utilization (z) = zGiB
model weights take zGiB; non_torch_memory takes z*GiB; PyTorch activation peak memory takes z5GiB; the rest of the memory reserved for KV Cache is zGiB.)r?   rW   ro   rp   rX   r   rq   r8   Zmodel_memory_usageZprofile_run3_assert_memory_footprint_increased_during_profilingr7   Zgpu_memory_utilizationZnon_kv_cache_memoryget_cache_block_size_bytesintZswap_space_bytesmaxZprofile_timer   r   Znon_torch_increaseZtorch_peak_increaser=   r>   rm   rn   )
rB   Zfree_memory_pre_profileZtotal_gpu_memoryresultZmemory_for_current_instanceZavailable_kv_cache_memoryZcache_block_sizenum_gpu_blocksnum_cpu_blocksmsgr1   r1   rC   determine_num_available_blocks   sZ   








z%Worker.determine_num_available_blocksc                 C   s@   t j \}}|| }| jj|k sJ d| jj d| dd S )Nz/Error in memory profiling. Initial used memory z, currently used memory ze. This happens when the GPU memory was not properly cleaned up before initializing the vLLM instance.)r?   rW   rX   rq   cuda_memory)rB   Zfree_gpu_memoryr_   r   r1   r1   rC   r   -  s   z:Worker._assert_memory_footprint_increased_during_profilingr   r   c                 C   s   t || jj| jj| jj| jj || j_|| j_	| j
jjr(t }|jdd}n	ddlm} | }| |   W d   n1 sBw   Y  |   dS )zAllocate GPU and CPU KV cache with the specified number of blocks.

        This also warms up the model, which may record CUDA graphs.
        kv_cacheru   r   rw   N)raise_if_cache_size_invalidr7   
block_sizeis_attention_freer4   max_model_lenr3   pipeline_parallel_sizer   r   r(   ry   r   r[   rz   r{   rx   _init_cache_engine_warm_up_model)rB   r   r   r^   r}   rx   r1   r1   rC   initialize_cache9  s"   

zWorker.initialize_cachec                    s    j jd usJ  fddt jjD  _ fddt jjD  _i }t jt	}|
 D ]\}}|j }d ur?|||< q0t jj j| d S )Nc                    s"   g | ]}t  j j j jqS r1   )r   r7   r4   r3   re   )rR   _rG   r1   rC   
<listcomp>T  s    
z-Worker._init_cache_engine.<locals>.<listcomp>c                    s   g | ]} j | jqS r1   )cache_enginer9   )rR   verG   r1   rC   r   Y  s    
)r7   r   ranger3   r   r   r9   r   r(   r	   itemsZkv_sharing_target_layer_namer   compilation_configZstatic_forward_context)rB   Zshared_kv_cache_layersZattn_layersZ
layer_nameZattn_moduleZkv_tgt_layerr1   rG   rC   r   R  s&   



		zWorker._init_cache_enginec                    sx    j jj } jjs fdd|D }t|ddD ]}td|  j	
| q jjs4 j	 j t jj d S )Nc                    s   g | ]}| j jjvr|qS r1   )r(   r   Zcudagraph_capture_sizes)rR   xrG   r1   rC   r   {  s    z)Worker._warm_up_model.<locals>.<listcomp>T)reversez(Compile and warming up model for size %d)r(   r   Zcompile_sizescopyr4   Zenforce_eagersortedr=   r>   r8   Z
_dummy_runZcapture_modelr9   r   rs   )rB   Zwarmup_sizessizer1   rG   rC   r   u  s   
zWorker._warm_up_modelc                 C   s   | j jdkS )NrM   )r3   tensor_parallel_sizerG   r1   r1   rC   do_metadata_broadcast  s   zWorker.do_metadata_broadcastc                 C   s   | j S N)r9   rG   r1   r1   rC   r     s   zWorker.kv_cacheexecute_model_reqc                 C   s   |j }|j}t|j}tj|jdtjddd}tj|j	dtjddd}tj|j
| jtjddd}t||||||dS )NrP   )rf   rl   rO   )num_seq_groupsblocks_to_swap_inblocks_to_swap_outblocks_to_copyvirtual_engine	num_steps)r   r   rb   seq_group_metadata_listr?   Ztensorr   Zint64viewr   r   rf   r&   )rB   r   r   r   r   r   r   r   r1   r1   rC   prepare_worker_input  s:   
zWorker.prepare_worker_inputworker_inputc                 C   s   |j }|jd ur|j dkr| j| |j |jd ur-|j dkr-| j| |j |jd urD|j dkrF| j| |j d S d S d S )Nr   )	r   r   Znumelr   Zswap_inr   Zswap_outr   r   )rB   r   r   r1   r1   rC   execute_worker  s    




zWorker.execute_workerr   finished_request_idsc                 C   s   g }|D ]9}|j }|| jvrt|tsJ || j|< nt|tr)| j| | nt|ts0J || j|< || j|  q|D ]}| j|= q@|S )a  Return a list of cached Sequence Group Metadata after updating its
        state.

        It is used because scheduler only sends delta to workers to reduce
        the data payload size. The function also cleans up cache based on
        a given `finished_request_ids`.
        )
request_idr:   
isinstancer   r   Zapply_deltaappend)rB   r   r   new_seq_group_metadata_listZmetadata_or_deltar   Zfinished_idr1   r1   rC   _get_cached_seq_group_metadata  s*   



z%Worker._get_cached_seq_group_metadataintermediate_tensorsc                    s0   |d ur|  |j|j}||_t ||}|S r   )r   r   Zfinished_requests_idssuper_execute_model_spmd)rB   r   r   r   output	__class__r1   rC   r     s   zWorker._execute_model_spmdlora_requestc                 C      | j |S r   )r8   add_lora)rB   r   r1   r1   rC   r        zWorker.add_loralora_idc                 C   r   r   )r8   remove_lorarB   r   r1   r1   rC   r     r   zWorker.remove_lorac                 C   r   r   )r8   pin_lorar   r1   r1   rC   r     r   zWorker.pin_lorac                 C   s
   | j  S r   )r8   
list_lorasrG   r1   r1   rC   r      s   
zWorker.list_lorasc                 C      | j jS r   )r4   r   rG   r1   r1   rC   r        zWorker.max_model_lenc                 C   r   r   )r8   
vocab_sizerG   r1   r1   rC   r     r   zWorker.vocab_sizec                 C   s   t | j| j| jS )z:Get the size of the KV cache block size in bytes.
        )r   Zget_cache_block_sizer7   r4   r3   rG   r1   r1   rC   r     s   z!Worker.get_cache_block_size_bytes)FN)rM   r   r.   N)NN)8__name__
__module____qualname____doc__r
   r   strboolr   r   r!   r2   rH   rL   r\   listra   rt   r|   r   r   r   r?   Zinference_moder   r   r   r   r   r   propertyr   r   ZTensorr   r   r&   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   __classcell__r1   r1   r   rC   r'   '   s    

M


D

#
,
r'   r   r(   r*   r+   r)   r.   c                 C   s@   | j }t|j  t|j|||tj t|j|j	 t
|  dS )z'Initialize the distributed environment.N)r3   r   Zdisable_custom_all_reducer   Z
world_sizer   Zdist_backendr   r   r   r   )r(   r*   r+   r)   r3   r1   r1   rC   rr     s   rr   torch_dtypec                 C   sb   | t jkr-tds/t }t }|d u rd}n	| }d| }td| d| dd S d S )NP   z"does not have a compute capabilityzhas compute capability zQBfloat16 is only supported on GPUs with compute capability of at least 8.0. Your z GPU zg. You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.)r?   Zbfloat16r   Zhas_device_capabilityZget_device_capabilityZget_device_nameZas_version_str
ValueError)r   Z
capabilityZgpu_nameZcompute_strversion_strr1   r1   rC   rk   &  s"   


rk   c                 C   sf   |r| dkrt d|  d|s| dkrt d|| |  }|s/||kr1t d| d| dd S d S )Nr   zTNo memory should be allocated for the cache blocks for an attention-free model, but z blocks are allocated.zoNo available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.zThe model's max seq len (zN) is larger than the maximum number of tokens that can be stored in KV cache (zf). Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine.)r   )r   r   r   r   r   Zmax_seq_lenr1   r1   rC   r   :  s   
r   )Nr   r   )Hr   rm   rh   typingr   r   r   r   r   r   r   r?   Ztorch.distributedZ	vllm.envsr<   Zvllm.attention.layerr	   Zvllm.configr
   r   Zvllm.device_allocator.cumemr   Zvllm.distributedr   r   r   Zvllm.distributed.kv_transferr   Zvllm.loggerr   Zvllm.lora.requestr   Zvllm.model_executorr   Z"vllm.model_executor.layers.samplerr   Z+vllm.model_executor.model_loader.tensorizerr   Zvllm.platformsr   Zvllm.sequencer   r   r   r   r5   r   r   r   r   Zvllm.worker.cache_enginer   Z vllm.worker.enc_dec_model_runnerr    Zvllm.worker.model_runnerr!   r"   Z vllm.worker.pooling_model_runnerr#   Zvllm.worker.worker_baser$   r%   r&   r   r=   r'   r   r   rr   rl   rk   r   r1   r1   r1   rC   <module>   s^   $   r
