o
    )i                  /   @   s
  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	m
Z
mZ ddlmZ ddlmZ ddlmZmZmZ ddlZddlZddlmZ ddlmZ dd	lmZmZmZ dd
lmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) dZ*eG dd dZ+deed  dee, dee, de,de,de-de-fddZ.				dHde/e de-de-deed  dee, dee, de
e0ee-f df fddZ1de/e d e/e" d!e-d"ed#e/e- d$e2e3e-f de0e+e/e, f fd%d&Z4				'dId(e3d)e3d*e3d+e3d,e3d"ede/e d-ee, de-de-d.e5d/e5d0e/e3 d#e/e- d1e5d$e2e3e-f d2ee, d3eee3  d4ee2 deed  dee, dee, d5e,f.d6d7Z6d8d9 Z7d:d; Z8d<ej9d=e2e3ef d>e3ddfd?d@Z:dAej;fdBdCZ<d<ej9de2e3ef fdDdEZ=d<ej9de2e3ef fdFdGZ>dS )Ja.  Benchmark online serving throughput.

On the server side, run one of the following commands
to launch the vLLM OpenAI API server:
    vllm serve <your_model> <engine arguments>        

On the client side, run:
    vllm bench serve \
        --endpoint-type <endpoint_type. Default 'openai'> \
        --label <benchmark result label. Default using endpoint_type> \
        --model <your_model> \
        --dataset-name <dataset_name. Default 'random'> \
        --request-rate <request_rate. Default inf> \
        --num-prompts <num_prompts. Default 1000>
    N)AsyncGeneratorIterable)	dataclass)datetime)AnyLiteralOptional)tqdm)PreTrainedTokenizerBase)SampleRequestadd_dataset_parserget_samples)ASYNC_REQUEST_FUNCSOPENAI_COMPATIBLE_BACKENDSRequestFuncInputRequestFuncOutput)wait_for_endpoint)#convert_to_pytorch_benchmark_formatwrite_to_json)get_tokenizer  c                   @   s   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< eed
< eeeef  ed< eed< eed< eed< eeeef  ed< eed< eed< eed< eeeef  ed< eed< eed< eed< eeeef  ed< dS )BenchmarkMetrics	completedtotal_inputtotal_outputrequest_throughputrequest_goodputoutput_throughputtotal_token_throughputmean_ttft_msmedian_ttft_msstd_ttft_mspercentiles_ttft_msmean_tpot_msmedian_tpot_msstd_tpot_mspercentiles_tpot_msmean_itl_msmedian_itl_ms
std_itl_mspercentiles_itl_msmean_e2el_msmedian_e2el_msstd_e2el_mspercentiles_e2el_msN)__name__
__module____qualname__int__annotations__floatlisttuple r7   r7   a/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/benchmarks/serve.pyr   1   s0   
 r   ramp_up_strategy)linearexponentialramp_up_start_rpsramp_up_end_rpsrequest_indextotal_requestsrequest_ratereturnc           	      C   sp   | r6|d ur6|d ur6|t |d d }| dkr!|| | }|| S | dkr/|| }|||  S td|  |S )N   r:   r;   zUnknown ramp-up strategy: )max
ValueError)	r9   r<   r=   r>   r?   r@   progressZincreaseratior7   r7   r8   _get_current_request_rateO   s   rG         ?input_requests
burstinessc                   s  |dksJ d| dt | trt | tst| } t| }|dks'J dg }g }t| D ]/\}	}
t||||	||}|| |tdkrM|d q/d||  }|tj	j
||d q/tdt|D ]}||  ||d  7  < qf|d	u r|d
 dkr|| }||d
    fdd|D }t }t| D ])\}	}
||	 dkrt }|||	  | }|dkrt|I d	H  |
||	 fV  qd	S )a  
    Asynchronously generates requests at a specified rate
    with OPTIONAL burstiness and OPTIONAL ramp-up strategy.

    Args:
        input_requests:
            A list of input requests, each represented as a SampleRequest.
        request_rate:
            The rate at which requests are generated (requests/s).
        burstiness (optional):
            The burstiness factor of the request generation.
            Only takes effect when request_rate is not inf.
            Default value is 1, which follows a Poisson process.
            Otherwise, the request intervals follow a gamma distribution.
            A lower burstiness value (0 < burstiness < 1) results
            in more bursty requests, while a higher burstiness value
            (burstiness > 1) results in a more uniform arrival of requests.
         ramp_up_strategy (optional):
            The ramp-up strategy. Can be "linear" or "exponential".
            If None, uses constant request rate (specified by request_rate).
        ramp_up_start_rps (optional):
            The starting request rate for ramp-up.
        ramp_up_end_rps (optional):
            The ending request rate for ramp-up.
    r   z4A positive burstiness factor is expected, but given .zNo requests provided.infrH   )shapescalerB   Nc                    s   g | ]}|  qS r7   r7   ).0delayZnormalize_factorr7   r8   
<listcomp>   s    zget_request.<locals>.<listcomp>)
isinstancer   r5   len	enumeraterG   appendr4   nprandomgammarangetimeasynciosleep)rI   r@   rJ   r9   r<   r=   r?   Zrequest_ratesZdelay_tsr>   requestcurrent_request_ratethetaiZtarget_total_delay_sZstart_tsZ
current_tsZsleep_interval_sr7   rR   r8   get_requeste   sP   
!

rc   outputsdur_s	tokenizerselected_percentilesgoodput_config_dictc                    st  g }d}d}d}	g g g }
g g  t t|D ]g}|| jrz|| j}|s2t||| jddj}|| || | j7 }d}|dkrY|| j|| j	 }||d  }| |
| || j
7 || j	  || j |d7 }q|d q|rg }g }d|v r| ||d t  d|v r||
 ||d t  d|v r|  ||d t  t| D ]}tdd	 t||D }|r|	d7 }	q|dkrtjd
dd td*i d|d|dt|d|| d|	| dt|| d|t| | dtpdd dtpdd dtp%dd dfdd	|D dtp:dd dtpEdd dtpPdd dfdd	|D dtpedd d tppdd d!tp{dd d"fd#d	|D d$t pdd d%t pdd d&t pdd d' fd(d	|D }||fS )+a  Calculate the metrics for the benchmark.

    Args:
        input_requests: The input requests.
        outputs: The outputs of the requests.
        dur_s: The duration of the benchmark.
        tokenizer: The tokenizer to use.
        selected_percentiles: The percentiles to select.
        goodput_config_dict: The goodput configuration.

    Returns:
        A tuple of the benchmark metrics and the actual output lengths.
    r   F)Zadd_special_tokensrB   ttfttpote2elc                 S   s   g | ]\}}||kqS r7   r7   )rP   srr7   r7   r8   rS     s    z%calculate_metrics.<locals>.<listcomp>zYAll requests failed. This is likely due to a misconfiguration on the benchmark arguments.   )
stacklevelr   r   r   r   r   r   r   r   r   r!   r    r"   c                    $   g | ]}|t  p
d |d fqS r   r   rX   Z
percentilerP   p)ttftsr7   r8   rS   "      r#   r%   r$   r&   c                    rp   rq   rr   rs   )tpotsr7   r8   rS   '  rv   r'   r)   r(   r*   c                    rp   rq   rr   rs   )itlsr7   r8   rS   ,  rv   r+   r-   r,   r.   c                    rp   rq   rr   rs   )e2elsr7   r8   rS   1  rv   Nr7   )r[   rU   successZoutput_tokensgenerated_textZ	input_idsrW   
prompt_lenZlatencyri   itl"MILLISECONDS_TO_SECONDS_CONVERSIONzipallwarningswarnr   sumrX   meanZstdZmedian)rI   rd   re   rf   rg   rh   actual_output_lensr   r   Zgood_completedZ	all_tpotsrb   
output_lenrj   Zlatency_minus_ttftZvalid_metricsZ
slo_valuesZ
req_metricZis_good_reqmetricsr7   )ry   rx   rw   ru   r8   calculate_metrics   s   

















r   X  endpoint_typeapi_urlbase_urlmodel_id
model_namelogprobsdisable_tqdmprofileselected_percentile_metrics
ignore_eosmax_concurrencylora_modules
extra_bodyready_check_timeout_secc           9         s  | t v r
t |  ntd|  tj|pd|pddddddd|v d}tj|dtjd	d
d}td |d j|d j|d j	|d j
f\}}}}|d u set|tset|tratdd |D seJ dt||||||||||d
}t|||dI d H }|jstd|j td  rt fddtt|D  |rtd t||||d ||||||d
}||dI d H } | jrtd |	dkrdnd}!|d urtd| d td| d | d! ntd"|  td#|	 d$|! d% td&|  |
rd ntt|d
}"|rt|nd fd'd(}#t }$g }%g }&d)}'|d ur<|d ur<|}'|&|'t  d* t|||	|||2 zn3 d H W \}(})|d urxt |)}*|*|'krxt  }+t|'d+ |*d+ D ]},|&|,|+d* qj|*}'|(j|(j|(j	|(j
f\}-}.}/}0||}1}2 rt! }3|3|3}1}2t|1|2|-||.|/||0||d
}4|%t"|#|4||"d, qE6 tj#|% I d H }5|"d ur|"$  t |$ }6t%||5|6|||d-\}7td.j&d/d0d1d2 td3&d4j' |d urtd3&d5| |t(d6krtd7&d8| td7&d9|6 td3&d:j) td3&d;j* td7&d<j+ |r6td7&d=j, td7&d>j- td7&d?j. |6j'j)j*j+|rWj,nd j-j.d@d |5D |7dAd |5D dBd |5D dCd |5D dDd |5D dE|&r|&dF< dGt/dHt/dIt/ffdJdK}8|8dLdMdN |8dOdPdQ |8dRdSdT |8dUdVdW tdX |rtdY t|||dZ |||d[}||dI d H } | jrtd\ |$ I d H  S )]NzUnknown endpoint_type: r   i,  T<   Fzhttps://)limitZlimit_per_hostZttl_dns_cacheZuse_dns_cacheZkeepalive_timeoutZenable_cleanup_closedZforce_closessli`T  )total)	connector	trust_envtimeoutz*Starting initial single prompt test run...c                 s   s    | ]}t |tV  qd S N)rT   dict)rP   itemr7   r7   r8   	<genexpr>u  s    zbenchmark.<locals>.<genexpr>z-multi_modal_data must be a dict or list[dict])
modelr   promptr   r|   r   r   Zmulti_modal_contentr   r   )Ztimeout_secondsz_Initial test run failed - Please make sure benchmark arguments are correctly specified. Error: z:Initial test run completed. Starting main benchmark run...c                    s   g | ]}t  qS r7   )rY   choice)rP   _)r   r7   r8   rS     s    zbenchmark.<locals>.<listcomp>zStarting profiler...z/start_profile)request_func_inputsessionzProfiler startedrH   zPoisson processzGamma distributionzTraffic ramp-up strategy: rK   zWill increase RPS from z to z( RPS over the duration of the benchmark.zTraffic request rate: zBurstiness factor: z ()zMaximum request concurrency: c              	      sn   d u r | ||dI d H S 4 I d H   | ||dI d H W  d   I d H  S 1 I d H s0w   Y  d S )Nr   r   pbarr7   r   )request_func	semaphorer7   r8   limited_request_func  s   
0z'benchmark.<locals>.limited_request_funcrO   )Zrps	timestamprB   r   )rI   rd   re   rf   rg   rh   {s:{c}^{n}}z Serving Benchmark Result 2   =rl   ncz{:<40} {:<10}zSuccessful requests:zMaximum request concurrency:rL   {:<40} {:<10.2f}zRequest rate configured (RPS):zBenchmark duration (s):zTotal input tokens:zTotal generated tokens:zRequest throughput (req/s):zRequest goodput (req/s):z Output token throughput (tok/s):zTotal Token throughput (tok/s):c                 S      g | ]}|j qS r7   )r|   rP   outputr7   r7   r8   rS   ,      c                 S   r   r7   )ri   r   r7   r7   r8   rS   .  r   c                 S   r   r7   )r}   r   r7   r7   r8   rS   /  r   c                 S   r   r7   )r{   r   r7   r7   r8   rS   0  r   c                 S   r   r7   )errorr   r7   r7   r8   rS   1  r   )durationr   Ztotal_input_tokensZtotal_output_tokensr   r   r   r   
input_lensoutput_lensru   rx   generated_textserrorsrps_change_eventsmetric_attribute_namemetric_namemetric_headerc              	      s:  | vrd S t dj|ddd t dd| dt d|  d	 t dd
| dt d|  d	 t d|  d	d|  d	< t d|  d	d|  d	< t d|  d	d|  d	< t d|  d	D ].\}}t||kr|tt|nt|}t dd| d| d| |d| d|  d	< qld S )Nr   r   -r   r   zMean z (ms):Zmean_Z_mszMedian Zmedian_Zstd_Zpercentiles_P rt   r   )printformatgetattrr2   str)r   r   r   rt   valueZp_word)r   resultr   r7   r8   process_one_metric7  s:   



 z%benchmark.<locals>.process_one_metricri   ZTTFTzTime to First Tokenrj   ZTPOTz'Time per Output Token (excl. 1st token)r}   ZITLzInter-token Latencyrk   ZE2ELzEnd-to-end Latencyz2==================================================zStopping profiler...z/stop_profile)r   r   r   r|   r   r   zProfiler stopped)0r   rD   aiohttpZTCPConnectorZClientSessionZClientTimeoutr   r   r|   Zexpected_output_lenZmulti_modal_datarT   r   r5   r   r   r   rz   r   iterr[   rU   r	   r]   	Semaphorer\   perf_counterrW   r   now	isoformatrc   r2   nextcreate_taskgathercloser   r   r   r4   r   r   r   r   r   r   r   )9r   r   r   r   r   rf   rI   r   r@   rJ   r   r   r   rg   r   rh   r   r   r   r9   r<   r=   r   r   r   Ztest_promptZtest_prompt_lenZtest_output_lenZtest_mm_contentZ
test_inputZtest_outputZprofile_inputZprofile_outputdistributionr   r   Zbenchmark_start_timetasksr   Zlast_int_rpsr_   r`   Zcurrent_int_rpsr   Zrps_valr   r|   r   Z
mm_contentZreq_model_idZreq_model_nameZreq_lora_moduler   rd   Zbenchmark_durationr   r   r7   )r   r   r   r   r   r   r8   	benchmark8  s  










'

	
 r   c              	   C   sx   i }g d}| j r:t| j }| D ]'\}}||vr*td| d| dt| d|dk r9td| d| dq|S )	N)ri   rj   rk   zInvalid metric name found, z: z4. The service level objective name should be one of z. r   zInvalid value found, z;. The service level objective value should be non-negative.)Zgoodputparse_goodputitemsrD   r   )argsrh   ZVALID_NAMESslo_nameslo_valr7   r7   r8   check_goodput_argsr  s"   
r   c              
   C   sT   i }z| D ]}| d\}}t|||< qW |S  ty) } ztd|d }~ww )N:zInvalid format found for service level objectives. Specify service level objectives for goodput as "KEY:VALUE" pairs, where the key is a metric name, and the value is a number in milliseconds.)splitr4   rD   argparseArgumentTypeError)Z	slo_pairsrh   Zslo_pairr   r   errr7   r7   r8   r     s   	r   r   results	file_namec                    sj   g dg d t | fddD  fddD d}|r3tj|d  d}t|| d S d S )	N)r    r   r!   Zp99_ttft_msr#   r$   r%   Zp99_tpot_msr(   r'   r)   Z
p99_itl_ms)ru   rx   r   r   c                    s    i | ]}| v r| | gqS r7   r7   rP   k)r   r7   r8   
<dictcomp>  s    

z4save_to_pytorch_benchmark_format.<locals>.<dictcomp>c                    s&   i | ]}|vr| vr|| qS r7   r7   r   Zignored_metricsr   r   r7   r8   r     s    )r   r   Z
extra_infor   z.pytorch.json)r   ospathsplitextr   )r   r   r   Z
pt_recordsZpt_filer7   r   r8    save_to_pytorch_benchmark_format  s   
r   parserc                 C   s  t |  | jdtdtt d | jdtd dd | jdtdtt d | jd	td d
d | jdtdd | jdtdd | jdtddd | jdtd dd | jdtddd | jdtdd | jddd | jdtd d d | jd!ttd"d#d | jd$td%d&d | jd'dd(d) | jd*dd+d) | jd,dd-d) | jd.dd/d) | jd0dd1d) | jd2dd3d) | jd4d5d6d7d8 | jd9td d:d | jd;td d<d | jd=dd>d) | jd?td@dAd | jdBtdCdDd | jdEdFdGdHdI | dJ}|jdKtd dLd |jdMtd dNd |jdOtd dPd |jdQtd dRd | jdStdTg dUdVdW | jdXtd dYd | jdZdFd d[d\ | jd]td d^d_gd`dW | jdatd dbd | jdctd ddd | jdetdfdgd d S )hNz--endpoint-typeZopenai)typedefaultchoicesz--labelzkThe label (prefix) of the benchmark results. If not specified, the endpoint type will be used as the label.)r   r   helpz	--backendZvllmz
--base-urlz7Server or API base url if not using http host and port.z--hostz	127.0.0.1)r   r   z--porti@  z
--endpointz/v1/completionszAPI endpoint.z--max-concurrencya  Maximum number of concurrent requests. This can be used to help simulate an environment where a higher level component is enforcing a maximum number of concurrent requests. While the --request-rate argument controls the rate at which requests are initiated, this argument will control how many are actually allowed to execute at a time. This means that when used in combination, the actual request rate may be lower than specified with --request-rate, if the server is not processing requests fast enough to keep up.z--modelTzName of the model.)r   requiredr   z--tokenizerzBName or path of the tokenizer, if not using the default tokenizer.)r   r   z--use-beam-search
store_true)actionz
--logprobsa  Number of logprobs-per-token to compute & return as part of the request. If unspecified, then either (1) if beam search is disabled, no logprobs are computed & a single dummy logprob is returned for each token; or (2) if beam search is enabled 1 logprob per token is computedz--request-raterL   zNumber of requests per second. If this is inf, then all the requests are sent at time 0. Otherwise, we use Poisson process or gamma distribution to synthesize the request arrival times.z--burstinessrH   au  Burstiness factor of the request generation. Only take effect when request_rate is not inf. Default value is 1, which follows Poisson process. Otherwise, the request intervals follow a gamma distribution. A lower burstiness value (0 < burstiness < 1) results in more bursty requests. A higher burstiness value (burstiness > 1) results in a more uniform arrival of requests.z--trust-remote-codez"Trust remote code from huggingface)r   r   z--disable-tqdmz%Specify to disable tqdm progress bar.z	--profilezbUse Torch Profiler. The endpoint must be launched with VLLM_TORCH_PROFILER_DIR to enable profiler.z--save-resultz0Specify to save benchmark results to a json filez--save-detailedznWhen saving the results, whether to include per request information such as response, error, ttfs, tpots, etc.z--append-resultz6Append the benchmark result to the existing json file.z
--metadataz	KEY=VALUE*zKey-value pairs (e.g, --metadata version=0.3.3 tp=1) for metadata of this run to be saved in the result JSON file for record keeping purposes.)metavarnargsr   z--result-dirznSpecify directory to save benchmark json results.If not specified, results are saved in the current directory.z--result-filenamezSpecify the filename to save benchmark json results.If not specified, results will be saved in {label}-{args.request_rate}qps-{base_model_id}-{current_dt}.json format.z--ignore-eoszuSet ignore_eos flag when sending the benchmark request.Warning: ignore_eos is not supported in deepspeed_mii and tgi.z--percentile-metricszttft,tpot,itlzComma-separated list of selected metrics to report percentils. This argument specifies the metrics to report percentiles. Allowed metric names are "ttft", "tpot", "itl", "e2el". z--metric-percentilesZ99zComma-separated list of percentiles for selected metrics. To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". Default value is "99".Use "--percentile-metrics" to select metrics.z	--goodput+Fa  Specify service level objectives for goodput as "KEY:VALUE" pairs, where the key is a metric name, and the value is in milliseconds. Multiple "KEY:VALUE" pairs can be provided, separated by spaces. Allowed request level metric names are "ttft", "tpot", "e2el". For more context on the definition of goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 and the blog: https://hao-ai-lab.github.io/blogs/distserve)r   r   r   zsampling parametersz--top-pzHTop-p sampling parameter. Only has effect on openai-compatible backends.z--top-kzHTop-k sampling parameter. Only has effect on openai-compatible backends.z--min-pzHMin-p sampling parameter. Only has effect on openai-compatible backends.z--temperaturezTemperature sampling parameter. Only has effect on openai-compatible backends. If not specified, default to greedy decoding (i.e. temperature==0.0).z--tokenizer-modeauto)r   ZslowZmistralZcustomzThe tokenizer mode.

* "auto" will use the fast tokenizer if available.
* "slow" will always use the slow tokenizer. 
* "mistral" will always use the `mistral_common` tokenizer. 
*"custom" will use --tokenizer to select the preregistered tokenizer.)r   r   r   r   z--served-model-namezoThe model name used in the API. If not specified, the model name will be the same as the ``--model`` argument. z--lora-moduleszA subset of LoRA module names passed in when launching the server. For each request, the script chooses a LoRA module at random.)r   r   r   z--ramp-up-strategyr:   r;   zThe ramp-up strategy. This would be used to ramp up the request rate from initial RPS to final RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps.) over the duration of the benchmark.z--ramp-up-start-rpszcThe starting request rate for ramp-up (RPS). Needs to be specified when --ramp-up-strategy is used.z--ramp-up-end-rpszaThe ending request rate for ramp-up (RPS). Needs to be specified when --ramp-up-strategy is used.z--ready-check-timeout-secr   zeMaximum time to wait for the endpoint to become ready in seconds (default: 600 seconds / 10 minutes).)	r   add_argumentr   r5   r   keysr2   r4   add_argument_group)r   Zsampling_groupr7   r7   r8   add_cli_args  s  


			
	

r  c                 C   s   t t| S r   )r]   run
main_async)r   r7   r7   r8   main  s   r  c                    s  t |  t| j tj| j | jd urV| jtdkr"td| jd u s,| j	d u r0td| jdk s:| j	dk r>td| j| j	krHtd| jdkrV| jdkrVtd| j
}| j}| j}| j}| jd urj| jn| j}| j}| jd ur| j | j }| j }nd	| j d
| j | j }d	| j d
| j }t||| jd}	| jd u rtdt| |	}
t| }dd | j| j| j| jd D }|r| jtvrtdd|vrd|d< t !  t "  t#dFi d| j
d|d|d|d|d|	d|
d| j$d| jd| j%d| j&d| j'd| j()d d!d"d# | j*)d D d$| j+d%|d&| j,d'| j-d(|d)| jd*| jd+| j	d,| j.I d H }i }t/0 1d-}||d.< | j
|d< ||d/< ||d< ||d0< | j2|d1< | j3r| j3D ]}d2|v r|)d2}|d3 4 ||d 4 < qqtd4| jtdk r| jnd|d< | j%|d< | j,|d&< | jd ur| j|d)< | j|d*< | j	|d+< i ||}| j5sd5D ]}||v r||= ||v r||= q| j6s| j7r|)d6d7 }| j,d urd8| j, nd9}|p|}| jd ur | d:| j d;| j d<| j	 d=| d;| d;| d>}n| d;| j d=| d;| d;| d>
}| j8r9| j8}| j9rMt:j;| j9d?d@ t:j<=| j9|}t>|| j7rUdAndBdCdD}| j7rk|? dkrk|@dE tAB|| W d    n	1 s|w   Y  tC| || |S )GNrL   zWhen using ramp-up, do not specify --request-rate. The request rate will be controlled by ramp-up parameters. Please remove the --request-rate argument.z_When using --ramp-up-strategy, both --ramp-up-start-rps and --ramp-up-end-rps must be specifiedr   z.Ramp-up start and end RPS must be non-negativez+Ramp-up start RPS must be less than end RPSr;   z3For exponential ramp-up, the start RPS cannot be 0.zhttp://r   )tokenizer_modetrust_remote_codezSPlease specify '--dataset-name' and the corresponding '--dataset-path' if required.c                 S   s   i | ]\}}|d ur||qS r   r7   )rP   r   vr7   r7   r8   r     s
    zmain_async.<locals>.<dictcomp>)top_ptop_kmin_ptemperaturezESampling parameters are only supported by openai-compatible backends.r  g        r   r   r   r   r   rf   rI   r   r@   rJ   r   r   r   ,rg   c                 S   s   g | ]}t |qS r7   )r4   rs   r7   r7   r8   rS     s    zmain_async.<locals>.<listcomp>r   rh   r   r   r   r9   r<   r=   r   z%Y%m%d-%H%M%Sdatelabeltokenizer_idnum_promptsr   rB   z5Invalid metadata format. Please use KEY=VALUE format.)r   r   ru   rx   r   r   /rO   z-concurrency z	-ramp-up-r   zqps-Zqpsz.jsonT)exist_okza+wzutf-8)modeencoding
r7   )Dr   rY   seedrX   r9   r@   r4   rD   r<   r=   r   r  r   Zserved_model_namerf   r  r   Zendpointhostportr   r  Zdataset_namer   r   r  r	  r
  r  r   backendr   gcZcollectfreezer   r   rJ   r   r   Zpercentile_metricsr   Zmetric_percentilesr   r   r   r   r   r   strftimer  metadatastripZsave_detailedZsave_resultZappend_resultZresult_filenameZ
result_dirr   makedirsr   joinopentellwritejsondumpr   )r   r   r  r   r   r  r  r   r   rf   rI   rh   Zsampling_paramsZbenchmark_resultZresult_jsonZ
current_dtr   ZkvstringfieldZbase_model_idZmax_concurrency_strr   outfiler7   r7   r8   r    sN  






	















6$
r  )rH   NNN)NNNr   )?__doc__r   r]   r  r&  r   rY   r\   r   collections.abcr   r   dataclassesr   r   typingr   r   r   r   numpyrX   Ztqdm.asyncior	   Ztransformersr
   Zvllm.benchmarks.datasetsr   r   r   Z)vllm.benchmarks.lib.endpoint_request_funcr   r   r   r   Z!vllm.benchmarks.lib.ready_checkerr   Zvllm.benchmarks.lib.utilsr   r   Z!vllm.transformers_utils.tokenizerr   r~   r   r2   r4   rG   r5   r6   rc   r   r   r   boolr   r   r   	Namespacer   ArgumentParserr  r  r  r7   r7   r7   r8   <module>   s  



[

 	




  <

   