o
    )iis                  
   @   sd  d dl Z d dlZd dlmZmZ d dlmZmZmZ d dl	Z	d dl
mZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ eeZeeegdf ZG dd deZG dd deZ G dd deZ!ee	j"e	j#e	j$f Z%de%de&e de'de(ee%f fddZ)de&e dede&e fddZ*dede&e fddZ+G dd dZ,dS )    N)ABCabstractmethod)CallableOptionalUnion)SupportsMetricsInfo
VllmConfig)init_logger)PrefixCachingMetrics)FinishReason)unregister_vllm_metrics)IterationStatsSchedulerStats)SpecDecodingLoggingSpecDecodingPromStatLoggerBasec                   @   sb   e Zd ZdZeddedefddZe	ddee	 dee
 d	efd
dZedd Zdd ZdS )r   a   Interface for logging metrics.

    API users may define custom loggers that implement this interface.
    However, note that the `SchedulerStats` and `IterationStats` classes
    are not considered stable interfaces and may change in future versions.
    r   vllm_configengine_indexc                 C      d S N selfr   r   r   r   c/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/v1/metrics/loggers.py__init__       zStatLoggerBase.__init__scheduler_statsiteration_stats
engine_idxc                 C   r   r   r   r   r   r   r   r   r   r   record$   s   zStatLoggerBase.recordc                 C   r   r   r   r   r   r   r   log_engine_initialized+   r   z%StatLoggerBase.log_engine_initializedc                 C   r   r   r   r!   r   r   r   log/   s   zStatLoggerBase.logNr   )__name__
__module____qualname____doc__r   r   intr   r   r   r   r    r"   r#   r   r   r   r   r      s     
c                   @   s~   e Zd ZddedefddZdd Zdefd	d
Zdede	de	fddZ
	ddee dee defddZdd Zdd ZdS )LoggingStatLoggerr   r   r   c                 C   sB   || _ || _| t  t | _t | _t	 | _
d| _d| _d S Ng        )r   r   _resettime	monotonicr   last_scheduler_statsr
   prefix_caching_metricsr   spec_decoding_logginglast_prompt_throughputlast_generation_throughputr   r   r   r   r   5   s   
zLoggingStatLogger.__init__c                 C   s   || _ d| _d| _d S Nr   )last_log_timenum_prompt_tokensnum_generation_tokens)r   nowr   r   r   r,   A   s   
zLoggingStatLogger._resetr   c                 C   s$   |  j |j 7  _ |  j|j7  _d S r   )r6   r7   )r   r   r   r   r   _track_iteration_statsH   s   z(LoggingStatLogger._track_iteration_statstracked_statsr8   returnc                 C   s"   || j  }|dkrdS t|| S r+   )r5   float)r   r:   r8   Z
delta_timer   r   r   _get_throughputM   s   
z!LoggingStatLogger._get_throughputr   r   c                 C   sJ   |r|  | |dur#| j|j |jdur| j|j || _dS dS )zLog Stats to standard output.N)r9   r0   observeprefix_cache_statsspec_decoding_statsr1   r/   r   r   r   r   r    T   s   


zLoggingStatLogger.recordc              
   C   s   t  }| | j|}| | j|}| | | j}tj}t	||| j
| jfs*tj}|| _|| _
|d| j|||j|j|jd | jjd  | jj|d d S )NzEngine %03d: Avg prompt throughput: %.1f tokens/s, Avg generation throughput: %.1f tokens/s, Running: %d reqs, Waiting: %d reqs, GPU KV cache usage: %.1f%%, Prefix cache hit rate: %.1f%%d   )log_fn)r-   r.   r=   r6   r7   r,   r/   loggerinfoanyr2   r3   debugr   num_running_reqsnum_waiting_reqskv_cache_usager0   Zhit_rater1   r#   )r   r8   Zprompt_throughputZgeneration_throughputr   rB   r   r   r   r#   g   s6   

zLoggingStatLogger.logc                 C   s(   | j jjrtd| j| j jj d S d S )NzSEngine %03d: vllm cache_config_info with initialization after num_gpu_blocks is: %d)r   cache_configZnum_gpu_blocksrC   rD   r   r!   r   r   r   r"      s   
z(LoggingStatLogger.log_engine_initializedNr$   )r%   r&   r'   r   r)   r   r,   r   r9   r<   r=   r   r   r    r#   r"   r   r   r   r   r*   3   s    

%r*   c                   @   s|   e Zd ZejZejZejZ	e
Z	ddedeee  fddZdedefdd	Z	
ddee dee defddZdd ZdS )PrometheusStatLoggerNr   engine_indexesc                    sR  |d u rdg}|| _ t  || _|jj| _ddg}|jj|jj}t| j dkr1|j	d ur1t
d|jjt| j d g}| |j	||| _| jddd|d	}t||| _| jd
dd|d	}t||| _| jddd|d	}t||| _| jdd|d}	t|	|| _| jdd|d}
t|
|| _| jdd|d}t||| _| jdd|d}t||| _| jdd|d}t||| _| jdd|d}t||| _| jdd|d}t||| _| jdd|d}t||| _i | _| jdd |d!g d tD ] fd"d#|D | j< q| jd$dt||d%}t||| _ | jd&dt||d%}t||| _!| jd'd(g d)|d%}t||| _"| jd*d+t||d%}t||| _#| jd,d-g d.|d%}t||| _$| jd/d0t||d%}t||| _%| jd1d2g d3|d%}t||| _&| jd4d5g d6|d%}t||| _'g d7}| jd8d9||d%}t||| _(| jd:d;||d%}t||| _)| jd<d=||d%}t||| _*| jd>d?||d%}t||| _+| jd@dA||d%}t||| _,d | _-|j.d ur't| j dkrt
dBdC| _/dD| _0dE| _1|j.j2| _3| jdFdGdH| j/| j0| j1gd	| _-d S d S )INr   
model_nameengine   z[Prometheus metrics with Spec Decoding with >1 EngineCore per AsyncLLM is not supported yet.zvllm:num_requests_runningz.Number of requests in model execution batches.
mostrecentnamedocumentationZmultiprocess_mode
labelnameszvllm:num_requests_waitingz+Number of requests waiting to be processed.zvllm:gpu_cache_usage_percz_GPU KV-cache usage. 1 means 100 percent usage.DEPRECATED: Use vllm:kv_cache_usage_perc instead.zvllm:gpu_prefix_cache_querieszqGPU prefix cache queries, in terms of number of queriedtokens. DEPRECATED: Use vllm:prefix_cache_queries instead.)rR   rS   rT   zvllm:gpu_prefix_cache_hitszkGPU prefix cache hits, in terms of number of cached tokens. DEPRECATED: Use vllm:prefix_cache_hits instead.zvllm:kv_cache_usage_percz*KV-cache usage. 1 means 100 percent usage.zvllm:prefix_cache_queriesz;Prefix cache queries, in terms of number of queried tokens.zvllm:prefix_cache_hitsz7Prefix cache hits, in terms of number of cached tokens.zvllm:num_preemptionsz0Cumulative number of preemption from the engine.zvllm:prompt_tokensz#Number of prefill tokens processed.zvllm:generation_tokensz&Number of generation tokens processed.zvllm:request_successz)Count of successfully processed requests.Zfinished_reasonc              	      s$   i | ]}|  t|tqS r   labelsstr.0idxZcounter_request_success_baserM   reasonr   r   
<dictcomp>+  s    z1PrometheusStatLogger.__init__.<locals>.<dictcomp>zvllm:request_prompt_tokens)rR   rS   bucketsrT   zvllm:request_generation_tokenszvllm:iteration_tokens_totalz.Histogram of number of tokens per engine_step.)rO             @         i   i   i   i   i    i @  z&vllm:request_max_num_generation_tokensz;Histogram of maximum number of requested generation tokens.zvllm:request_params_nz%Histogram of the n request parameter.)rO         
      zvllm:request_params_max_tokensz.Histogram of the max_tokens request parameter.z vllm:time_to_first_token_secondsz,Histogram of time to first token in seconds.)gMbP?g{Gzt?{Gz?g{Gz?g{Gz?gQ?g{Gz?皙?g      ?      ?      ?      ?      @      @      @      $@      4@      D@      T@g      d@g      @g      @z"vllm:time_per_output_token_secondsz.Histogram of time per output token in seconds.)ri   g?g?g333333?rj   g333333?g?333333?g?rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   )ru   rk   g?rm   g      ?g       @rn   ro   rq   g      .@rr   g      >@rs   g      I@g      N@g      ^@g      n@g      ~@g      @g      @g      @z vllm:e2e_request_latency_secondsz,Histogram of e2e request latency in seconds.zvllm:request_queue_time_secondsz5Histogram of time spent in WAITING phase for request.z#vllm:request_inference_time_secondsz5Histogram of time spent in RUNNING phase for request.z!vllm:request_prefill_time_secondsz5Histogram of time spent in PREFILL phase for request.z vllm:request_decode_time_secondsz4Histogram of time spent in DECODE phase for request.z%LoRA in DP mode is not supported yet.max_lorawaiting_lora_adaptersrunning_lora_adapterszvllm:lora_requests_infozRunning stats on lora requests.sum)4rL   r   r   Zobservability_configZshow_hidden_metricsZmodel_configZserved_model_namemax_model_lenlenZspeculative_configNotImplementedErrorrW   _spec_decoding_clsspec_decoding_prom
_gauge_clsmake_per_enginegauge_scheduler_runninggauge_scheduler_waitinggauge_gpu_cache_usage_counter_cls counter_gpu_prefix_cache_queriescounter_gpu_prefix_cache_hitsgauge_kv_cache_usagecounter_prefix_cache_queriescounter_prefix_cache_hitscounter_num_preempted_reqscounter_prompt_tokenscounter_generation_tokenscounter_request_successr   _histogram_clsbuild_1_2_5_buckets#histogram_num_prompt_tokens_request'histogram_num_generation_tokens_requesthistogram_iteration_tokens+histogram_max_num_generation_tokens_requesthistogram_n_requesthistogram_max_tokens_requesthistogram_time_to_first_tokenhistogram_time_per_output_tokenhistogram_e2e_time_requesthistogram_queue_time_request histogram_inference_time_requesthistogram_prefill_time_requesthistogram_decode_time_requestgauge_lora_infoZlora_configlabelname_max_loralabelname_waiting_lora_adapterslabelname_running_lora_adaptersZ	max_lorasrv   )r   r   rL   rT   rz   Zspec_decode_labelvaluesr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Zrequest_latency_bucketsr   r   r   r   r   r   r[   r   r      s  
	
		
zPrometheusStatLogger.__init__type
config_objc                 C   s   |  }d|d< d\}}|dkrd}d}|d usJ d| | j||d| d	}| jD ]}|  }t||d< |jdi |d
 q-d S )N rN   NNrJ   zvllm:cache_config_infoz(Information of the LLMEngine CacheConfigzUnknown metrics info type rP   rQ   rO   r   )metrics_infor   keysrL   rW   rV   set)r   r   r   r   rR   rS   Z
info_gauger   r   r   r   log_metrics_info  s$   
z%PrometheusStatLogger.log_metrics_infor   r   r   r   c                 C   s  |dur\| j | |j | j| |j | j| |j | j| |j | j| 	|j
j | j| 	|j
j | j| 	|j
j | j| 	|j
j |jdur\| j|j |du rbdS | j| 	|j | j| 	|j | j| 	|j | j| |j|j  |jD ]
}| j| | q|jD ]
}| j| | q|jD ]
}| j| | q|j D ]
}| j!| | q|j"D ]X}| j#|j$ | 	  | j%| |j& | j'| |j( | j)| |j* | j+| |j, | j-| |j. | j/| |j | j0| |j |j1r| j2| |j1 q| j3durLd4|j56 }	d4|j76 }
| j8|	| j9|
| j:| j;i}| j3j<di |=  dS dS )zLog to prometheus.N,r   )>r   r   rG   r   rH   r   rI   r   r   incr?   Zqueriesr   hitsr   r   r@   r~   r>   r   Znum_preempted_reqsr   r6   r   r7   r   Zmax_num_generation_tokens_iterr   Zn_params_iterr   Ztime_to_first_tokens_iterr   Ztime_per_output_tokens_iterr   Zfinished_requestsr   Zfinish_reasonr   Ze2e_latencyr   Zqueued_timer   Zprefill_timer   Zinference_timer   Zdecode_timer   r   Zmax_tokens_paramr   r   joinrx   r   rw   r   r   r   rv   rV   Zset_to_current_time)r   r   r   r   Zmax_gen_tokensZn_paramZttftZtpotZfinished_requestrx   rw   Zlora_info_labelsr   r   r   r      s   



























zPrometheusStatLogger.recordc                 C   s   |  d| jj d S )NrJ   )r   r   rJ   r!   r   r   r   r"   A  s   z+PrometheusStatLogger.log_engine_initializedr   r$   )r%   r&   r'   prometheus_clientGauger   Counterr   	Histogramr   r   r}   r   r   listr)   r   rW   r   r   r   r   r    r"   r   r   r   r   rK      s.    

  :
WrK   metricengine_idxsrM   r;   c                    s    fdd|D S )Nc                    s   i | ]}|  t|qS r   rU   rX   r   rM   r   r   r]   N  s    z#make_per_engine.<locals>.<dictcomp>r   )r   r   rM   r   r   r   r   L  s   r   mantissa_lst	max_valuec                 C   sD   d}g }	 | D ]}|d|  }||kr| | q|  S |d7 }q)z
    Builds a list of buckets with increasing powers of 10 multiplied by
    mantissa values until the value exceeds the specified maximum.

    r   Trg   rO   )append)r   r   exponentr^   mvaluer   r   r   build_bucketsQ  s   r   c                 C   s   t g d| S )zR
    Example:
    >>> build_1_2_5_buckets(100)
    [1, 2, 5, 10, 20, 50, 100]
    )rO   re   rf   )r   )r   r   r   r   r   c  s   r   c                	   @   sr   e Zd ZdZ		ddedeee  deee  fddZ		ddee
 d	ee d
ee fddZdd Zdd ZdS )StatLoggerManagera  
    StatLoggerManager:
        Logging happens at the level of the EngineCore (per scheduler).
         * DP: >1 EngineCore per AsyncLLM - loggers for each EngineCore.
         * With Local Logger, just make N copies for N EngineCores.
         * With Prometheus, we need a single logger with N "labels"

        This class abstracts away this implementation detail from
        the AsyncLLM, allowing the AsyncLLM to just call .record()
        and .log() to a simple interface.
    Nr   r   custom_stat_loggersc           	      C   s   |r|ndg| _ |d ur|}ng }ttjr|t i | _t}| j D ]#}g }|D ]}t	|t
r9t|tr9|}q*|||| q*|| j|< q$|||| _d S r4   )r   rC   isEnabledForloggingINFOr   r*   per_engine_logger_dictrK   
isinstancer   
issubclassprometheus_logger)	r   r   r   r   Z	factoriesZprometheus_factoryr   loggersZlogger_factoryr   r   r   r   y  s*   


zStatLoggerManager.__init__r   r   r   c                 C   sB   |d u rd}| j | }|D ]	}|||| q| j||| d S r4   )r   r    r   )r   r   r   r   per_engine_loggersrC   r   r   r   r      s   

zStatLoggerManager.recordc                 C   s&   | j  D ]}|D ]}|  q	qd S r   )r   valuesr#   r   r   rC   r   r   r   r#     s
   
zStatLoggerManager.logc                 C   s0   | j   | j D ]}|D ]}|  qq
d S r   )r   r"   r   r   r   r   r   r   r"     s   

z(StatLoggerManager.log_engine_initializedr   r   )r%   r&   r'   r(   r   r   r   r)   StatLoggerFactoryr   r   r   r    r#   r"   r   r   r   r   r   l  s*    


(
r   )-r   r-   abcr   r   typingr   r   r   r   Zvllm.configr   r   Zvllm.loggerr	   Zvllm.v1.core.kv_cache_utilsr
   Zvllm.v1.enginer   Zvllm.v1.metrics.prometheusr   Zvllm.v1.metrics.statsr   r   Zvllm.v1.spec_decode.metricsr   r   r%   rC   r)   r   r   r*   rK   r   r   r   Z
PromMetricr   rW   dictr   r   r   r   r   r   r   r   <module>   sD   a   4

	