"""Benchmark offline inference throughput."""

import argparse
import dataclasses
import json
import os
import random
import time
import warnings
from typing import Any, Optional, Union

import torch
import uvloop
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedTokenizerBase,
)

from vllm.benchmarks.datasets import (
    AIMODataset,
    BurstGPTDataset,
    ConversationDataset,
    InstructCoderDataset,
    PrefixRepetitionRandomDataset,
    RandomDataset,
    SampleRequest,
    ShareGPTDataset,
    SonnetDataset,
    VisionArenaDataset,
)
from vllm.benchmarks.lib.utils import (
    convert_to_pytorch_benchmark_format,
    write_to_json,
)
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.utils import merge_async_iterators


def run_vllm(
    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]:
    from vllm import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
        llm.llm_engine.model_config.max_model_len
        >= (request.prompt_len + request.expected_output_len)
        for request in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of"
        " prompt_len and expected_output_len for all requests."
    )

    # Add the requests to the engine.
    prompts: list[Union[TextPrompt, TokensPrompt]] = []
    sampling_params: list[SamplingParams] = []
    for request in requests:
        prompts.append(
            TokensPrompt(
                prompt_token_ids=request.prompt["prompt_token_ids"],
                multi_modal_data=request.multi_modal_data,
            )
            if "prompt_token_ids" in request.prompt
            else TextPrompt(
                prompt=request.prompt,
                multi_modal_data=request.multi_modal_data,
            )
        )
        sampling_params.append(
            SamplingParams(
                n=n,
                temperature=1.0,
                top_p=1.0,
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                detokenize=not disable_detokenize,
            )
        )
    lora_requests: Optional[list[LoRARequest]] = None
    if engine_args.enable_lora:
        lora_requests = [request.lora_request for request in requests]

    use_beam_search = False

    outputs = None
    if not use_beam_search:
        start = time.perf_counter()
        outputs = llm.generate(
            prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
        )
        end = time.perf_counter()
    else:
        assert lora_requests is None, "BeamSearch API does not support LoRA"
        prompts = [request.prompt for request in requests]
        # The output length should be the same for all requests.
        output_len = requests[0].expected_output_len
        for request in requests:
            assert request.expected_output_len == output_len
        start = time.perf_counter()
        llm.beam_search(
            prompts,
            BeamSearchParams(
                beam_width=n,
                max_tokens=output_len,
                ignore_eos=True,
            ),
        )
        end = time.perf_counter()
    return end - start, outputs


def run_vllm_chat(
    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
) -> tuple[float, list[RequestOutput]]:
    """
    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
    multimodal models as it properly handles multimodal inputs and chat
    formatting. For non-multimodal models, use run_vllm() instead.
    """
    from vllm import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))

    assert all(
        llm.llm_engine.model_config.max_model_len
        >= (request.prompt_len + request.expected_output_len)
        for request in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of"
        " prompt_len and expected_output_len for all requests."
    )

    prompts = []
    sampling_params: list[SamplingParams] = []
    for request in requests:
        prompts.append(request.prompt)
        sampling_params.append(
            SamplingParams(
                n=n,
                temperature=1.0,
                top_p=1.0,
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                detokenize=not disable_detokenize,
            )
        )
    start = time.perf_counter()
    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
    end = time.perf_counter()
    return end - start, outputs


async def run_vllm_async(
    requests: list[SampleRequest],
    n: int,
    engine_args: AsyncEngineArgs,
    disable_frontend_multiprocessing: bool = False,
    disable_detokenize: bool = False,
) -> float:
    from vllm import SamplingParams
    from vllm.entrypoints.openai.api_server import (
        build_async_engine_client_from_engine_args,
    )

    async with build_async_engine_client_from_engine_args(
        engine_args,
        disable_frontend_multiprocessing=disable_frontend_multiprocessing,
    ) as llm:
        model_config = await llm.get_model_config()
        assert all(
            model_config.max_model_len
            >= (request.prompt_len + request.expected_output_len)
            for request in requests
        ), (
            "Please ensure that max_model_len is greater than the sum of"
            " prompt_len and expected_output_len for all requests."
        )

        # Add the requests to the engine.
        prompts: list[Union[TextPrompt, TokensPrompt]] = []
        sampling_params: list[SamplingParams] = []
        lora_requests: list[Optional[LoRARequest]] = []
        for request in requests:
            prompts.append(
                TokensPrompt(
                    prompt_token_ids=request.prompt["prompt_token_ids"],
                    multi_modal_data=request.multi_modal_data,
                )
                if "prompt_token_ids" in request.prompt
                else TextPrompt(
                    prompt=request.prompt,
                    multi_modal_data=request.multi_modal_data,
                )
            )
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=1.0,
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                    detokenize=not disable_detokenize,
                )
            )
            lora_requests.append(request.lora_request)

        generators = []
        start = time.perf_counter()
        for i, (prompt, sp, lr) in enumerate(
            zip(prompts, sampling_params, lora_requests)
        ):
            generator = llm.generate(
                prompt, sp, lora_request=lr, request_id=f"test{i}"
            )
            generators.append(generator)
        all_gens = merge_async_iterators(*generators)
        async for i, res in all_gens:
            pass
        end = time.perf_counter()
        return end - start


def run_hf(
    requests: list[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    max_batch_size: int,
    trust_remote_code: bool,
    disable_detokenize: bool = False,
) -> float:
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
    )
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: list[str] = []
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt = requests[i].prompt
        prompt_len = requests[i].prompt_len
        output_len = requests[i].expected_output_len
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            next_prompt_len = requests[i + 1].prompt_len
            next_output_len = requests[i + 1].expected_output_len
            if (
                max(max_prompt_len, next_prompt_len)
                + max(max_output_len, next_output_len)
            ) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=True,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        if not disable_detokenize:
            # Include the decoding time.
            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))

        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    return end - start


def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={
            "requests_per_second": [results["requests_per_second"]],
            "tokens_per_second": [results["tokens_per_second"]],
        },
        extra_info={
            k: results[k]
            for k in ["elapsed_time", "num_requests", "total_num_tokens"]
        },
    )
    if pt_records:
        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
        write_to_json(pt_file, pt_records)


def get_requests(args, tokenizer):
    # Common parameters for all dataset types.
    common_kwargs = {
        "dataset_path": args.dataset_path,
        "random_seed": args.seed,
    }
    sample_kwargs = {
        "tokenizer": tokenizer,
        "lora_path": args.lora_path,
        "max_loras": args.max_loras,
        "num_requests": args.num_prompts,
        "input_len": args.input_len,
        "output_len": args.output_len,
    }

    if args.dataset_path is None or args.dataset_name == "random":
        sample_kwargs["range_ratio"] = args.random_range_ratio
        sample_kwargs["prefix_len"] = args.prefix_len
        dataset_cls = RandomDataset
    elif args.dataset_name == "sharegpt":
        dataset_cls = ShareGPTDataset
        if args.backend == "vllm-chat":
            sample_kwargs["enable_multimodal_chat"] = True
    elif args.dataset_name == "sonnet":
        assert tokenizer.chat_template or tokenizer.default_chat_template, (
            "Tokenizer/model must have chat template for sonnet dataset."
        )
        dataset_cls = SonnetDataset
        sample_kwargs["prefix_len"] = args.prefix_len
        sample_kwargs["return_prompt_formatted"] = True
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
    elif args.dataset_name == "hf":
        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = VisionArenaDataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = InstructCoderDataset
            common_kwargs["dataset_split"] = "train"
        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = ConversationDataset
            common_kwargs["dataset_subset"] = args.hf_subset
            common_kwargs["dataset_split"] = args.hf_split
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = AIMODataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
    elif args.dataset_name == "prefix_repetition":
        dataset_cls = PrefixRepetitionRandomDataset
        sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len
        sample_kwargs["suffix_len"] = args.prefix_repetition_suffix_len
        sample_kwargs["num_prefixes"] = args.prefix_repetition_num_prefixes
        sample_kwargs["output_len"] = args.prefix_repetition_output_len
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values.
    sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
    return dataset_cls(**common_kwargs).sample(**sample_kwargs)


def validate_args(args):
    """
    Validate command-line arguments.
    """
    # === Deprecation and defaulting ===
    if args.dataset is not None:
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next release. "
            "Please use '--dataset-name' and '--dataset-path' instead.",
            stacklevel=2,
        )
        args.dataset_path = args.dataset

    if not getattr(args, "tokenizer", None):
        args.tokenizer = args.model

    # === Backend validation ===
    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
    if args.backend not in valid_backends:
        raise ValueError(f"Unsupported backend: {args.backend}")

    # === Dataset configuration ===
    if not args.dataset and not args.dataset_path:
        print("When dataset path is not set, it will default to random dataset")
        args.dataset_name = "random"
        if args.input_len is None:
            raise ValueError("input_len must be provided for a random dataset")

    # === Dataset-name-specific checks ===
    # --hf-subset and --hf-split: only used when dataset_name is 'hf'.
    if args.dataset_name != "hf" and (
        getattr(args, "hf_subset", None) is not None
        or getattr(args, "hf_split", None) is not None
    ):
        warnings.warn(
            "--hf-subset and --hf-split will be ignored since "
            "--dataset-name is not 'hf'.",
            stacklevel=2,
        )
    elif args.dataset_name == "hf":
        if args.dataset_path in (
            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
            | ConversationDataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm-chat", (
                f"{args.dataset_path} needs to use vllm-chat as the backend."
            )
        elif args.dataset_path in (
            InstructCoderDataset.SUPPORTED_DATASET_PATHS
            | AIMODataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm", (
                f"{args.dataset_path} needs to use vllm as the backend."
            )
        else:
            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")

    # --random-range-ratio: only used when dataset_name is 'random'.
    if args.dataset_name != "random" and args.random_range_ratio is not None:
        warnings.warn(
            "--random-range-ratio will be ignored since "
            "--dataset-name is not 'random'.",
            stacklevel=2,
        )

    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or unset.
    if (
        args.dataset_name not in {"random", "sonnet", None}
        and args.prefix_len is not None
    ):
        warnings.warn(
            "--prefix-len will be ignored since --dataset-name "
            "is not 'random', 'sonnet', or not set.",
            stacklevel=2,
        )

    # === LoRA settings ===
    if getattr(args, "enable_lora", False):
        if args.backend != "vllm":
            raise ValueError("LoRA benchmarking is only supported for vLLM backend")
        if args.lora_path is None:
            raise ValueError("LoRA path must be provided when enable_lora is True")

    # === Backend-specific validations ===
    if args.backend == "hf" and args.hf_max_batch_size is None:
        raise ValueError("HF max batch size is required for HF backend")
    if args.backend != "hf" and args.hf_max_batch_size is not None:
        raise ValueError("HF max batch size is only for HF backend.")

    if args.backend in {"hf", "mii"} and getattr(args, "quantization", None) is not None:
        raise ValueError("Quantization is only for vLLM backend.")

    if args.backend == "mii" and args.dtype != "auto":
        raise ValueError("dtype must be auto for MII backend.")
    if args.backend == "mii" and args.n != 1:
        raise ValueError("n must be 1 for MII backend.")
    if args.backend == "mii" and args.tokenizer != args.model:
        raise ValueError("Tokenizer must be the same as the model for MII backend.")


def add_cli_args(parser: argparse.ArgumentParser):
    parser.add_argument(
        "--backend",
        type=str,
        choices=["vllm", "hf", "mii", "vllm-chat"],
        default="vllm",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf", "prefix_repetition"],
        help="Name of the dataset to benchmark on.",
        default="sharegpt",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="Path to the ShareGPT dataset, will be deprecated in "
        "the next release. The dataset is expected to be a json in form of "
        "list[dict[..., conversations: list[dict[..., value: "
        "<prompt_or_response>]]]]",
    )
    parser.add_argument(
        "--dataset-path", type=str, default=None, help="Path to the dataset"
    )
    parser.add_argument(
        "--input-len",
        type=int,
        default=None,
        help="Input prompt length for each request",
    )
    parser.add_argument(
        "--output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output length "
        "from the dataset.",
    )
    parser.add_argument(
        "--n", type=int, default=1, help="Number of generated sequences per prompt."
    )
    parser.add_argument(
        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
    )
    parser.add_argument(
        "--hf-max-batch-size",
        type=int,
        default=None,
        help="Maximum batch size for HF backend.",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Path to save the throughput results in JSON format.",
    )
    parser.add_argument(
        "--async-engine",
        action="store_true",
        default=False,
        help="Use vLLM async engine rather than LLM class.",
    )
    parser.add_argument(
        "--disable-frontend-multiprocessing",
        action="store_true",
        default=False,
        help="Disable decoupled async engine frontend.",
    )
    parser.add_argument(
        "--disable-detokenize",
        action="store_true",
        help="Do not detokenize the response (i.e. do not include "
        "detokenization time in the measurement)",
    )
    parser.add_argument(
        "--lora-path",
        type=str,
        default=None,
        help="Path to the lora adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.",
    )
    parser.add_argument(
        "--prefix-len",
        type=int,
        default=None,
        help="Number of fixed prefix tokens before the random context in a "
        "request (default: 0).",
    )
    parser.add_argument(
        "--random-range-ratio",
        type=float,
        default=None,
        help="Range ratio for sampling input/output length, used only for "
        "RandomDataset. Must be in the range [0, 1) to define a symmetric "
        "sampling range [length * (1 - range_ratio), "
        "length * (1 + range_ratio)].",
    )
    parser.add_argument(
        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
    )
    parser.add_argument(
        "--hf-split", type=str, default=None, help="Split of the HF dataset."
    )

    prefix_repetition_group = parser.add_argument_group(
        "prefix repetition dataset options"
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-prefix-len",
        type=int,
        default=None,
        help="Number of prefix tokens per request, used only for prefix "
        "repetition dataset.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-suffix-len",
        type=int,
        default=None,
        help="Number of suffix tokens per request, used only for prefix "
        "repetition dataset. Total input length is prefix_len + suffix_len.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-num-prefixes",
        type=int,
        default=None,
        help="Number of prefixes to generate, used only for prefix repetition "
        "dataset. Prompts per prefix is num_requests // num_prefixes.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-output-len",
        type=int,
        default=None,
        help="Number of output tokens per request, used only for prefix "
        "repetition dataset.",
    )

    parser = AsyncEngineArgs.add_cli_args(parser)


def main(args: argparse.Namespace):
    if args.tokenizer is None:
        args.tokenizer = args.model
    validate_args(args)

    if args.seed is None:
        args.seed = 0
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code
    )
    requests = get_requests(args, tokenizer)
    is_multi_modal = any(
        request.multi_modal_data is not None for request in requests
    )
    request_outputs: Optional[list[RequestOutput]] = None
    if args.backend == "vllm":
        if args.async_engine:
            elapsed_time = uvloop.run(
                run_vllm_async(
                    requests,
                    args.n,
                    AsyncEngineArgs.from_cli_args(args),
                    args.disable_frontend_multiprocessing,
                    args.disable_detokenize,
                )
            )
        else:
            elapsed_time, request_outputs = run_vllm(
                requests,
                args.n,
                EngineArgs.from_cli_args(args),
                args.disable_detokenize,
            )
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(
            requests,
            args.model,
            tokenizer,
            args.n,
            args.hf_max_batch_size,
            args.trust_remote_code,
            args.disable_detokenize,
        )
    elif args.backend == "vllm-chat":
        elapsed_time, request_outputs = run_vllm_chat(
            requests,
            args.n,
            EngineArgs.from_cli_args(args),
            args.disable_detokenize,
        )
    else:
        raise ValueError(f"Unknown backend: {args.backend}")

    if request_outputs:
        # Note: with the vllm and vllm-chat backends, we have request_outputs,
        # which we use to count the actual prompt and output tokens.
        total_prompt_tokens = 0
        total_output_tokens = 0
        for ro in request_outputs:
            if not isinstance(ro, RequestOutput):
                continue
            total_prompt_tokens += (
                len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
            )
            total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
        total_num_tokens = total_prompt_tokens + total_output_tokens
    else:
        total_num_tokens = sum(
            r.prompt_len + r.expected_output_len for r in requests
        )
        total_output_tokens = sum(r.expected_output_len for r in requests)
        total_prompt_tokens = total_num_tokens - total_output_tokens

    if is_multi_modal and args.backend != "vllm-chat":
        print(
            "\033[91mWARNING\033[0m: Multi-modal request with "
            f"{args.backend} backend detected. The following metrics are "
            "not accurate because image tokens are not counted. See "
            "vllm-project/vllm/issues/9778 for details."
        )

    print(
        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
    )
    print(f"Total num prompt tokens:  {total_prompt_tokens}")
    print(f"Total num output tokens:  {total_output_tokens}")

    # Output JSON results if specified.
    if args.output_json:
        results = {
            "elapsed_time": elapsed_time,
            "num_requests": len(requests),
            "total_num_tokens": total_num_tokens,
            "requests_per_second": len(requests) / elapsed_time,
            "tokens_per_second": total_num_tokens / elapsed_time,
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)
        save_to_pytorch_benchmark_format(args, results)