o
    á)i6+  ã                   @   s¨  U d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ erMd dlmZ eeƒZejd kZee d	< d a!e"e d
< d a#e"e d< eja$e"e d< ee%ƒZ&ee d< G dd„ de
ƒZ'de%e( de(de(de%e( fdd„Z)eG dd„ dƒƒZ*eG dd„ dƒƒZ+da,ee+ e d< de+fdd„Z-ed ddej.dfde	dede(d ee( d!eej/ d"ed#ee' fd$d%„ƒZ0dS )&é    N)Údefaultdict)Úcontextmanager)Ú	dataclass)ÚTYPE_CHECKINGÚAnyÚ
NamedTupleÚOptionalÚUnion)ÚCUDAGraphModeÚParallelConfigÚ
VllmConfig)Úinit_logger)ÚAttentionMetadataÚtrack_batchsizeÚlast_logging_timeÚforward_start_timeÚbatchsize_logging_intervalÚbatchsize_forward_timec                   @   s6   e Zd ZU dZeed< dZeed< 	 ed	dd„ƒZ	dS )
ÚBatchDescriptorz¶
    Batch descriptor for cudagraph dispatching. We should keep the num of
    items as minimal as possible to properly and uniquely describe the padded
    batch for cudagraph.
    Ú
num_tokensFÚuniform_decodeÚreturnc                 C   s   t | jddS )zK
        Return a non-uniform version of current batch descriptor.
        F)r   )r   r   ©Úself© r   ú`/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/forward_context.pyÚnon_uniform*   s   zBatchDescriptor.non_uniformN)r   r   )
Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚintÚ__annotations__r   ÚboolÚpropertyr   r   r   r   r   r      s   
 r   Únum_tokens_across_dp_cpuÚmax_num_tokensÚ	chunk_idxr   c                 C   sV   t | ƒ}dg| }t|ƒD ]}| | }t||||  ƒ||< || dkr(d||< q|S )Néÿÿÿÿr   é   )ÚlenÚrangeÚmin)r%   r&   r'   Údp_sizeZ
local_sizeÚiZ	dp_tokensr   r   r   Ú!_compute_chunked_local_num_tokens2   s   

ÿ€r/   c                   @   s®   e Zd ZU ejed< ejed< dZeee	  ed< e
de	de	de	dejfd	d
„ƒZe
	ddedede	deej dd f
dd„ƒZede	de	fdd„ƒZdeee	  fdd„ZdS )Ú
DPMetadataÚmax_tokens_across_dp_cpuÚcu_tokens_across_dp_cpuNÚlocal_sizesr   r-   Údp_rankr   c                 C   sF   dg| }| ||< t j|dt jd}ddlm} tj||ƒ jd |S )zw
        Gather the num_tokens across all DP ranks and return results in a
        CPU tensor of size dp_size.
        r   Úcpu)ZdeviceZdtype)Úget_dp_group)Úgroup)ÚtorchÚtensorZint32Zvllm.distributed.parallel_stater6   ÚdistZ
all_reduceZ	cpu_group)r   r-   r4   Únum_tokens_across_dpZnum_tokens_tensorr6   r   r   r   r;   G   s   
þzDPMetadata.num_tokens_across_dpÚparallel_configÚattn_metadatar;   c           	      C   sŽ   | j dksJ ‚| j }| j}|d urt|dƒr|j|j }n|}|d u s+|| |ks+J ‚|d u r6t |||¡}t |¡}tj	|dd}t||ƒS )Nr)   Únum_prefill_tokensr   )Údim)
Údata_parallel_sizeZdata_parallel_rankÚhasattrr>   Únum_decode_tokensr0   r;   r8   ÚmaxZcumsum)	r<   r=   r   r;   r-   r4   Ú	batchsizer1   r2   r   r   r   ÚmakeW   s&   ÿÿÿ

zDPMetadata.makeÚmax_chunk_size_per_rankr'   c                 #   sN    | j ‰ ‡ fdd„ttˆ ƒƒD ƒ}t|||ƒ| _z
| jV  W d| _dS d| _w )až  
        Context manager to compute and temporarily set the per-rank local token
        sizes for a specific chunk during chunked forward execution.

        This is necessary to ensure each DP (data parallel) rank processes its
        designated portion of tokens in lockstep with others, even when the
        token counts are uneven or some ranks have completed their input early.

        For chunked execution, we break up the total tokens on each rank into
        multiple chunks (of at most `max_chunk_size_per_rank`), and for a given
        `chunk_idx`, this context manager sets `self.local_sizes` to the number
        of tokens to process in that chunk on each rank.

        It uses cumulative sizes (`cu_tokens_across_dp_cpu`) to derive the
        number of tokens per rank, and calls `_compute_chunked_local_num_tokens`
        to determine the chunk-wise split.

        `self.local_sizes` is only valid inside the context.

        Args:
            max_chunk_size_per_rank: The max number of tokens each rank is 
                                     allowed to process in this chunk.
            chunk_idx: The index of the chunk to compute sizes for.
        c                    s8   g | ]}|d krˆ | ˆ |d     ¡ nˆ d    ¡ ‘qS )r   r)   )Úitem)Ú.0r.   ©Zcu_sizesr   r   Ú
<listcomp>‘   s    ÿÿ
ÿþz,DPMetadata.chunked_sizes.<locals>.<listcomp>N)r2   r+   r*   r/   r3   )r   rF   r'   r%   r   rI   r   Úchunked_sizesv   s   €

ýÿ
zDPMetadata.chunked_sizesc                 C   s   | j S ©N)r3   r   r   r   r   Úget_chunk_sizes_across_dp_rank   s   z)DPMetadata.get_chunk_sizes_across_dp_rankrL   )r   r   r   r8   ÚTensorr"   r3   r   Úlistr!   Ústaticmethodr;   r   r   rE   r   rK   rM   r   r   r   r   r0   A   s4   
 


ÿÿüÿþýüû&r0   c                   @   sv   e Zd ZU eeef ed< 	 edeedf f ed< eed< dZ	e
e ed< ejZeed< dZe
e ed< d	d
„ ZdS )ÚForwardContextÚno_compile_layersr   r=   Úvirtual_engineNÚdp_metadataÚcudagraph_runtime_modeÚbatch_descriptorc                 C   s*   | j tjtjtjfv sJ d| j › ƒ‚d S )Nz Invalid cudagraph runtime mode: )rU   r
   ÚNONEZ	PIECEWISEZFULLr   r   r   r   Ú__post_init__µ   s
   ÿ
þzForwardContext.__post_init__)r   r   r   ÚdictÚstrr   r"   r	   r!   rT   r   r0   r
   rW   rU   rV   r   rX   r   r   r   r   rQ   ¡   s   
 rQ   Ú_forward_contextc                   C   s   t dusJ dƒ‚t S )z Get the current forward context.NzXForward context is not set. Please use `set_forward_context` to set the forward context.)r[   r   r   r   r   Úget_forward_context¾   s   
ÿr\   r=   Úvllm_configrS   r   r;   rU   rV   c                 c   s^   t o| du}|rt ¡ ad}|jjdkr(| dus|dur(t |j| |p%d|¡}t}	t	|j
j|| |||daz}dV  W |r°t| dƒrI| j| j }
n|}
ddlm} |j}|dur[|ƒ  t ¡ }t|
  |t d ¡ |t tkr°|ag }t ¡ D ]&\}}t|ƒdkrƒqxtjt |¡dd	 ¡ }t|d
ƒ}| |t|ƒ|f¡ qx|jdd„ dd |r°t d|¡ |	adS |r,t| dƒrÃ| j| j }
n|}
ddlm} |j}|durÕ|ƒ  t ¡ }t|
  |t d ¡ |t tkr,|ag }t ¡ D ]&\}}t|ƒdkrþqótjt |¡dd	 ¡ }t|d
ƒ}| |t|ƒ|f¡ qó|jdd„ dd |r,t d|¡ |	aw )z¥A context manager that stores the current forward context,
    can be attention metadata, etc.
    Here we can inject common logic for every model forward pass.
    Nr)   r   )rR   rS   r=   rT   rU   rV   r>   )Úcurrent_platformiè  g      à?)Úqé   c                 S   s   | d S )Nr)   r   )Úxr   r   r   Ú<lambda>  s    z%set_forward_context.<locals>.<lambda>T)ÚkeyÚreversezDBatchsize forward time stats (batchsize, count, median_time(ms)): %s) r   ÚtimeÚperf_counterr   r<   r@   r0   rE   r[   rQ   Zcompilation_configZstatic_forward_contextrA   r>   rB   Zvllm.platformsr^   Úsynchronizer   Úappendr   r   Úitemsr*   r8   Zquantiler9   rG   ÚroundÚsortÚloggerÚinfo)r=   r]   rS   r   r;   rU   rV   Zneed_to_track_batchsizerT   Zprev_contextrD   r^   rg   ÚnowZforward_statsÚbsÚtimesZmediumr   r   r   Úset_forward_contextÆ   s¢   €þù

ÿ
ÿ
þÝ
ÿ
ÿ
þrq   )1re   Úcollectionsr   Ú
contextlibr   Údataclassesr   Útypingr   r   r   r   r	   r8   Ztorch.distributedÚdistributedr:   Z	vllm.envsZenvsZvllm.configr
   r   r   Zvllm.loggerr   Z vllm.attention.backends.abstractr   r   rl   ZVLLM_LOG_BATCHSIZE_INTERVALr   r#   r"   r   Úfloatr   r   rO   r   r   r!   r/   r0   rQ   r[   r\   rW   rN   rq   r   r   r   r   Ú<module>   sj   

ÿþ
þ_ùÿþýüûúù