o
    )i6+                     @   s  U d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ erMd dlmZ eeZejd kZee d	< d a!e"e d
< d a#e"e d< eja$e"e d< ee%Z&ee d< G dd de
Z'de%e( de(de(de%e( fddZ)eG dd dZ*eG dd dZ+da,ee+ e d< de+fddZ-ed ddej.dfde	dede(d ee( d!eej/ d"ed#ee' fd$d%Z0dS )&    N)defaultdict)contextmanager)	dataclass)TYPE_CHECKINGAny
NamedTupleOptionalUnion)CUDAGraphModeParallelConfig
VllmConfig)init_logger)AttentionMetadatatrack_batchsizelast_logging_timeforward_start_timebatchsize_logging_intervalbatchsize_forward_timec                   @   s6   e Zd ZU dZeed< dZeed< 	 ed	ddZ	dS )
BatchDescriptorz
    Batch descriptor for cudagraph dispatching. We should keep the num of
    items as minimal as possible to properly and uniquely describe the padded
    batch for cudagraph.
    
num_tokensFuniform_decodereturnc                 C   s   t | jddS )zK
        Return a non-uniform version of current batch descriptor.
        F)r   )r   r   self r   `/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/forward_context.pynon_uniform*   s   zBatchDescriptor.non_uniformN)r   r   )
__name__
__module____qualname____doc__int__annotations__r   boolpropertyr   r   r   r   r   r      s   
 r   num_tokens_across_dp_cpumax_num_tokens	chunk_idxr   c                 C   sV   t | }dg| }t|D ]}| | }t||||  ||< || dkr(d||< q|S )Nr      )lenrangemin)r%   r&   r'   dp_sizeZ
local_sizeiZ	dp_tokensr   r   r   !_compute_chunked_local_num_tokens2   s   

r/   c                   @   s   e Zd ZU ejed< ejed< dZeee	  ed< e
de	de	de	dejfd	d
Ze
	ddedede	deej dd f
ddZede	de	fddZdeee	  fddZdS )
DPMetadatamax_tokens_across_dp_cpucu_tokens_across_dp_cpuNlocal_sizesr   r-   dp_rankr   c                 C   sF   dg| }| ||< t j|dt jd}ddlm} tj|| jd |S )zw
        Gather the num_tokens across all DP ranks and return results in a
        CPU tensor of size dp_size.
        r   cpu)ZdeviceZdtype)get_dp_group)group)torchtensorZint32Zvllm.distributed.parallel_stater6   distZ
all_reduceZ	cpu_group)r   r-   r4   num_tokens_across_dpZnum_tokens_tensorr6   r   r   r   r;   G   s   
zDPMetadata.num_tokens_across_dpparallel_configattn_metadatar;   c           	      C   s   | j dksJ | j }| j}|d urt|dr|j|j }n|}|d u s+|| |ks+J |d u r6t|||}t|}tj	|dd}t||S )Nr)   num_prefill_tokensr   )dim)
data_parallel_sizeZdata_parallel_rankhasattrr>   num_decode_tokensr0   r;   r8   maxZcumsum)	r<   r=   r   r;   r-   r4   	batchsizer1   r2   r   r   r   makeW   s&   

zDPMetadata.makemax_chunk_size_per_rankr'   c                 #   sN    | j   fddtt D }t|||| _z
| jV  W d| _dS d| _w )a  
        Context manager to compute and temporarily set the per-rank local token
        sizes for a specific chunk during chunked forward execution.

        This is necessary to ensure each DP (data parallel) rank processes its
        designated portion of tokens in lockstep with others, even when the
        token counts are uneven or some ranks have completed their input early.

        For chunked execution, we break up the total tokens on each rank into
        multiple chunks (of at most `max_chunk_size_per_rank`), and for a given
        `chunk_idx`, this context manager sets `self.local_sizes` to the number
        of tokens to process in that chunk on each rank.

        It uses cumulative sizes (`cu_tokens_across_dp_cpu`) to derive the
        number of tokens per rank, and calls `_compute_chunked_local_num_tokens`
        to determine the chunk-wise split.

        `self.local_sizes` is only valid inside the context.

        Args:
            max_chunk_size_per_rank: The max number of tokens each rank is 
                                     allowed to process in this chunk.
            chunk_idx: The index of the chunk to compute sizes for.
        c                    s8   g | ]}|d kr |  |d     n d    qS )r   r)   )item).0r.   Zcu_sizesr   r   
<listcomp>   s    
z,DPMetadata.chunked_sizes.<locals>.<listcomp>N)r2   r+   r*   r/   r3   )r   rF   r'   r%   r   rI   r   chunked_sizesv   s   


zDPMetadata.chunked_sizesc                 C   s   | j S N)r3   r   r   r   r   get_chunk_sizes_across_dp_rank   s   z)DPMetadata.get_chunk_sizes_across_dp_rankrL   )r   r   r   r8   Tensorr"   r3   r   listr!   staticmethodr;   r   r   rE   r   rK   rM   r   r   r   r   r0   A   s4   
 


&r0   c                   @   sv   e Zd ZU eeef ed< 	 edeedf f ed< eed< dZ	e
e ed< ejZeed< dZe
e ed< d	d
 ZdS )ForwardContextno_compile_layersr   r=   virtual_engineNdp_metadatacudagraph_runtime_modebatch_descriptorc                 C   s*   | j tjtjtjfv sJ d| j  d S )Nz Invalid cudagraph runtime mode: )rU   r
   NONEZ	PIECEWISEZFULLr   r   r   r   __post_init__   s
   
zForwardContext.__post_init__)r   r   r   dictstrr   r"   r	   r!   rT   r   r0   r
   rW   rU   rV   r   rX   r   r   r   r   rQ      s   
 rQ   _forward_contextc                   C   s   t dusJ dt S )z Get the current forward context.NzXForward context is not set. Please use `set_forward_context` to set the forward context.)r[   r   r   r   r   get_forward_context   s   
r\   r=   vllm_configrS   r   r;   rU   rV   c                 c   s^   t o| du}|rt ad}|jjdkr(| dus|dur(t|j| |p%d|}t}	t	|j
j|| |||daz}dV  W |rt| drI| j| j }
n|}
ddlm} |j}|dur[|  t }t|
 |t d  |t tkr|ag }t D ]&\}}t|dkrqxtjt|dd	 }t|d
}||t||f qx|jdd dd |rtd| |	adS |r,t| dr| j| j }
n|}
ddlm} |j}|dur|  t }t|
 |t d  |t tkr,|ag }t D ]&\}}t|dkrqtjt|dd	 }t|d
}||t||f q|jdd dd |r,td| |	aw )zA context manager that stores the current forward context,
    can be attention metadata, etc.
    Here we can inject common logic for every model forward pass.
    Nr)   r   )rR   rS   r=   rT   rU   rV   r>   )current_platformi  g      ?)q   c                 S   s   | d S )Nr)   r   )xr   r   r   <lambda>  s    z%set_forward_context.<locals>.<lambda>T)keyreversezDBatchsize forward time stats (batchsize, count, median_time(ms)): %s) r   timeperf_counterr   r<   r@   r0   rE   r[   rQ   Zcompilation_configZstatic_forward_contextrA   r>   rB   Zvllm.platformsr^   synchronizer   appendr   r   itemsr*   r8   Zquantiler9   rG   roundsortloggerinfo)r=   r]   rS   r   r;   rU   rV   Zneed_to_track_batchsizerT   Zprev_contextrD   r^   rg   nowZforward_statsbstimesZmediumr   r   r   set_forward_context   s   






rq   )1re   collectionsr   
contextlibr   dataclassesr   typingr   r   r   r   r	   r8   Ztorch.distributeddistributedr:   Z	vllm.envsZenvsZvllm.configr
   r   r   Zvllm.loggerr   Z vllm.attention.backends.abstractr   r   rl   ZVLLM_LOG_BATCHSIZE_INTERVALr   r#   r"   r   floatr   r   rO   r   r   r!   r/   r0   rQ   r[   r\   rW   rN   rq   r   r   r   r   <module>   sj   


_