o
    )i                     @   s  U d Z ddlZddlZddlZddlZddlZddlmZmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlZdd	lmZ dd
lmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) zddl*m+Z+ W n e,y   e)dZ*e*-dZ+Y nw zddl.Z/W n e,y   e)dZ/Y nw zddl0Z0W n e,y   e)dZ0Y nw zddl(m1Z1 W n e,y   ddl2m3Z1 Y nw e4e5Z6eG dd dZ7G dd deZ8				 d[d!e9d"e9d#e9d$e9d%e9d&e:d'e:fd(d)Z;ed*e<d'e<fd+d,Z=i Z>e?e9e&f e@d-< d.ed'e
e<ef fd/d0ZAG d1d2 d2e8ZBG d3d4 d4e8ZCd5e1fd6d7ZDd'eEe7 fd8d9ZFG d:d; d;e8ZGed<G d=d> d>e8ZHG d?d@ d@e8ZIG dAdB dBe8ZJG dCdD dDeJZKG dEdF dFeJZLG dGdH dHeJZMG dIdJ dJeJZNG dKdL dLeJZOdMZP	Nd\dOe?dPe<d'e?fdQdRZQG dSdT dTeJZRG dUdV dVeJZSG dWdX dXeJZTG dYdZ dZe8ZUdS )]a!  
This module defines a framework for sampling benchmark requests from various
datasets. Each dataset subclass of BenchmarkDataset must implement sample
generation. Supported dataset types include:
  - ShareGPT
  - Random (synthetic)
  - Sonnet
  - BurstGPT
  - HuggingFace
  - VisionArena
    N)ABCabstractmethod)Mapping)	dataclass)cache)BytesIO)AnyCallableOptionalUnion)Image)PreTrainedTokenizerBase)
deprecated)LoRARequestget_adapter_absolute_path)MultiModalDataDict)convert_image_mode)AnyTokenizerget_lora_tokenizer)PlaceholderModule)load_datasetdatasetsr   pandaslibrosa)FlexibleArgumentParser)ArgumentParserc                   @   s`   e Zd ZU dZeeef ed< eed< eed< dZ	e
eeeee f  ed< dZe
e ed< dS )SampleRequestzA
    Represents a single inference request for benchmarking.
    prompt
prompt_lenexpected_output_lenNmulti_modal_datalora_request)__name__
__module____qualname____doc__r   strr   __annotations__intr!   r
   r   dictlistr"   r    r,   r,   d/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/benchmarks/datasets.pyr   B   s   
 r   c                   @   s   e Zd ZdZdZdefdee deddfddZ	dd	ed
ee	 de
e fddZdddZ		ddedee dee deee ef fddZededede
e fddZde
e deddfddZdS )BenchmarkDatasetr   FNdataset_pathrandom_seedreturnc                 C   s$   || _ |dur	|n| j| _d| _dS )a  
        Initialize the BenchmarkDataset with an optional dataset path and random
        seed.  
        
        Args:
            dataset_path (Optional[str]): Path to the dataset. If None, it
            indicates that a default or random dataset might be used.
            random_seed (int): Seed value for reproducible shuffling or
            sampling. Defaults to DEFAULT_SEED.
        N)r/   DEFAULT_SEEDr0   data)selfr/   r0   r,   r,   r-   __init__Z   s   
zBenchmarkDataset.__init__r   
mm_contentc                 C   s*   |ddg}|dur| | d|dgS )z
        Transform a prompt and optional multimodal content into a chat format.
        This method is used for chat models that expect a specific conversation
        format.
        text)r7   typeNuserZrolecontent)append)r4   r   r6   r;   r,   r,   r-   $apply_multimodal_chat_transformationp   s   	
z5BenchmarkDataset.apply_multimodal_chat_transformationc                 C      t d)a3  
        Load data from the dataset path into self.data.

        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.

        Raises:
            NotImplementedError: If a subclass does not implement this method.
        z,load_data must be implemented in subclasses.NotImplementedErrorr4   r,   r,   r-   	load_data~   s   zBenchmarkDataset.load_data	tokenizer	max_loras	lora_pathc                 C   s^   |du s|du rd|fS t d|}tt||t|d}|tvr't|t|< |t| p-|fS )a  
        Optionally select a random LoRA request and return its associated
        tokenizer.

        This method is used when LoRA parameters are provided.  It randomly
        selects a LoRA based on max_loras and retrieves a cached tokenizer for
        that LoRA if available. Otherwise, it returns the base tokenizer.

        Args:
            tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
                LoRA is selected.
            max_loras (Optional[int]): The maximum number of LoRAs available.
                If `None`, LoRA is not used.
            lora_path (Optional[str]): Path to the LoRA parameters on disk.
                If `None`, LoRA is not used.

        Returns:
            A tuple with the following elements:
                - A new [LoRARequest][] (or `None` if not applicable).
                - The tokenizer associated with the LoRA request
                  (or the base tokenizer).
        N   )Z	lora_nameZlora_int_idrE   )randomrandintr   r'   lora_path_on_disklora_tokenizer_cacher   )r4   rC   rD   rE   Zlora_idr"   r,   r,   r-   get_random_lora_request   s   z(BenchmarkDataset.get_random_lora_requestnum_requestsc                 C   r>   )a%  
        Abstract method to generate sample requests from the dataset.

        Subclasses must override this method to implement dataset-specific logic
        for generating a list of SampleRequest objects.

        Args:
            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
                for processing the dataset's text.
            num_requests (int): The number of sample requests to generate.

        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        z)sample must be implemented in subclasses.r?   )r4   rC   rL   r,   r,   r-   sample   s   zBenchmarkDataset.samplerequestsc                 C   sL   t ||k r$t| j tj||t | d}|| td| dS dS )a  
        Oversamples the list of requests if its size is less than the desired
        number.

        Args:
            requests (List[SampleRequest]): The current list of sampled
                requests.
            num_requests (int): The target number of requests.
        kz/Oversampled requests to reach %d total samples.N)lenrG   seedr0   choicesextendloggerinfo)r4   rN   rL   
additionalr,   r,   r-   maybe_oversample_requests   s   

z*BenchmarkDataset.maybe_oversample_requestsNr1   NNN)r#   r$   r%   r2   IS_MULTIMODALr
   r'   r)   r5   r   r+   r*   r=   rB   r   tupler   r   rK   r   r   rM   rX   r,   r,   r,   r-   r.   V   sV    



,
r.            Fr   
output_lenmin_lenmax_prompt_lenmax_total_lenskip_min_output_len_checkr1   c           
      C   s<   | |k }| o
||k }| |k}| | |k}	|p|p|p|	 S )a  
    Validate a sequence based on prompt and output lengths.

    Default pruning criteria are copied from the original `sample_hf_requests`
    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
    from `sample_requests` in benchmark_throughput.py.
    r,   )
r   ra   rb   rc   rd   re   Zprompt_too_shortZoutput_too_shortZprompt_too_longZcombined_too_longr,   r,   r-   is_valid_sequence   s   rf   rE   c                 C   s   t | S rY   r   )rE   r,   r,   r-   rI      s   rI   rJ   imagec                 C   s   t | trd| v rtt| d } t | tjrLt| d} t }| j|dd t	|
 d}W d   n1 s=w   Y  ddd	| id
S t | trd| drX| nd|  }dd|id
S td|  d)a  
    Process a single image input and return a multimedia content dictionary.

    Supports three input types:

    1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
       containing raw image data.  - Loads the bytes as a PIL.Image.Image.

    2. PIL.Image.Image input: - Converts the image to RGB.  - Saves the image as
       a JPEG in memory.  - Encodes the JPEG data as a base64 string.  - Returns
       a dictionary with the image as a base64 data URL.

    3. String input: - Treats the string as a URL or local file path.  -
       Prepends "file://" if the string doesn't start with "http://" or
       "file://".  - Returns a dictionary with the image URL.

    Raises:
        ValueError: If the input is not a supported type.
    bytesRGBZJPEG)formatutf-8N	image_urlurlzdata:image/jpeg;base64,)r8   rl   )zhttp://file://rn   zInvalid image input zF. Must be a PIL.Image.Image or str or dictionary with raw image bytes.)
isinstancer*   r   openr   r   iosavebase64	b64encodegetvaluedecoder'   
startswith
ValueError)rg   Z
image_dataZimage_base64rl   r,   r,   r-   process_image	  s0   



ry   c                       sf   e Zd ZdZdZdZdZ		d fddZeeeefd	ed
e	de	de
de	de	dee fddZ  ZS )RandomDatasetr           r_      r1   Nc                    0   t  jdi | t| j tj| j d S Nr,   superr5   rG   rR   r0   npr4   kwargs	__class__r,   r-   r5   A     zRandomDataset.__init__rC   rL   
prefix_lenrange_ratio	input_lenra   c              	   K   sv  |dk sJ d|j }| }	||	 }
|dkr"tjjd||d ng }t|
d|  }t|
d|  }t|d|  }t|d|  }td|||| tjj||d |d}tjj||d |d}tjjd||d}g }t	|D ]F}|| | t
||  |  }|| }||}|t||  }|j|ddd | }||}t|}|t||t|| d	 qr|S )
Ng      ?zArandom_range_ratio must be < 1.0 to ensure a valid sampling ranger   sizerF   z=Sampling input_len from [%s, %s] and output_len from [%s, %s]FZadd_special_tokensr   r   r    )
vocab_sizeZnum_special_tokens_to_addr   rG   rH   tolistr)   rU   rV   rangeZarangerv   encoderQ   r<   r   )r4   rC   rL   r   r   r   ra   r   r   Znum_special_tokensZreal_input_lenZprefix_token_idsZ	input_lowZ
input_highZ
output_lowZoutput_highZ
input_lensZoutput_lensoffsetsrN   iZ	inner_seqZtoken_sequencer   Ztotal_input_lenZre_encoded_sequencer,   r,   r-   rM   I  sl   

	

zRandomDataset.samplerZ   )r#   r$   r%   DEFAULT_PREFIX_LENZDEFAULT_RANGE_RATIODEFAULT_INPUT_LENDEFAULT_OUTPUT_LENr5   r   r)   floatr+   r   rM   __classcell__r,   r,   r   r-   rz   :  s6    	rz   c                       sh   e Zd ZdZd fddZdddZ				dd	ed
edee	 dee dee de
defddZ  ZS )ShareGPTDatasetz
    Implements the ShareGPT dataset.  Loads data from a JSON file and generates
    sample requests based on conversation turns.
    r1   Nc                       t  jdi | |   d S r~   r   r5   rB   r   r   r,   r-   r5        zShareGPTDataset.__init__c                 C   sz   | j d u r	tdt| j dd}t|| _W d    n1 s!w   Y  dd | jD | _t| j t	| j d S )N/dataset_path must be provided for loading data.rk   encodingc                 S   s(   g | ]}d |v rt |d  dkr|qS )conversations   rQ   ).0entryr,   r,   r-   
<listcomp>  s
    z-ShareGPTDataset.load_data.<locals>.<listcomp>)
r/   rx   rp   jsonloadr3   rG   rR   r0   shuffler4   fr,   r,   r-   rB     s   
zShareGPTDataset.load_dataFrC   rL   rE   rD   ra   enable_multimodal_chatc              
   K   s   g }| j D ]i}	t||kr n`|	d d d |	d d d }
}| j|||d\}}||
j}||j}t|}|d u r@t|n|}t|||d udsLq|	d }rXt|}nd }|rb| |
|}
|t	|
||||d q| 
|| |S )	Nr   r   valuerF   rC   rD   rE   )re   rg   )r   r   r    r"   r!   )r3   rQ   rK   	input_idsrf   getry   r=   r<   r   rX   )r4   rC   rL   rE   rD   ra   r   r   samplesr   r   
completionr"   
prompt_idscompletion_idsr   Znew_output_lenZ
image_pathr6   r,   r,   r-   rM     sT   





zShareGPTDataset.samplerZ   )NNNFr#   r$   r%   r&   r5   rB   r   r)   r
   r'   boolr+   rM   r   r,   r,   r   r-   r     s.    
	r   parserc                 C   s  | j dtdd | j dtddd | j dtd	g d
dd | j dddd | j dtd dd | d}|j dtddd |j dddd | d}|j dtddd |j dtddd |j d td!d"d | d#}|j d$td d%d | d&}|j d'td(d)d |j d*td+d,d |j d-td.d/d |j d0tdd1d | d2}|j d3td d4d |j d5td d6d |j d7td d8d | d9}|j d:tdd;d |j d<tdd=d |j d>td?d@d |j dAtd+dBd d S )CNz--seedr   )r8   defaultz--num-promptsi  zNumber of prompts to process.)r8   r   helpz--dataset-namerG   )sharegptburstgptsonnetrG   hfcustomprefix_repetitionz$Name of the dataset to benchmark on.)r8   r   rS   r   z--no-stream
store_truez*Do not load the dataset in streaming mode.)actionr   z--dataset-pathzWPath to the sharegpt/sonnet dataset. Or the huggingface dataset ID if using HF dataset.zcustom dataset optionsz--custom-output-len   zBNumber of output tokens per request, used only for custom dataset.z--custom-skip-chat-templatezDSkip applying chat template to prompt, used only for custom dataset.zsonnet dataset optionsz--sonnet-input-len&  zANumber of input tokens per request, used only for sonnet dataset.z--sonnet-output-len   zBNumber of output tokens per request, used only for sonnet dataset.z--sonnet-prefix-len   zBNumber of prefix tokens per request, used only for sonnet dataset.zsharegpt dataset optionsz--sharegpt-output-lenzVOutput length for each request. Overrides the output length from the ShareGPT dataset.zrandom dataset optionsz--random-input-lenr_   zBNumber of input tokens per request, used only for random sampling.z--random-output-lenr|   zCNumber of output tokens per request, used only for random sampling.z--random-range-ratior{   zRange ratio for sampling input/output length, used only for random sampling. Must be in the range [0, 1) to define a symmetric sampling range[length * (1 - range_ratio), length * (1 + range_ratio)].z--random-prefix-lenzNumber of fixed prefix tokens before the random context in a request. The total input length is the sum of `random-prefix-len` and a random context length sampled from [input_len * (1 - range_ratio), input_len * (1 + range_ratio)].zhf dataset optionsz--hf-subsetzSubset of the HF dataset.z
--hf-splitzSplit of the HF dataset.z--hf-output-lenzYOutput length for each request. Overrides the output lengths from the sampled HF dataset.z!prefix repetition dataset optionsz--prefix-repetition-prefix-lenzMNumber of prefix tokens per request, used only for prefix repetition dataset.z--prefix-repetition-suffix-lenz|Number of suffix tokens per request, used only for prefix repetition dataset. Total input length is prefix_len + suffix_len.z --prefix-repetition-num-prefixes
   z|Number of prefixes to generate, used only for prefix repetition dataset. Prompts per prefix is num_requests // num_prefixes.z--prefix-repetition-output-lenzMNumber of output tokens per request, used only for prefix repetition dataset.)add_argumentr)   r'   add_argument_groupr   )r   Zcustom_groupZsonnet_groupZsharegpt_groupZrandom_groupZhf_groupZprefix_repetition_groupr,   r,   r-   add_dataset_parser  s  

	


	

r   c              
      s`   j dkrt jd}|j j j jd}|S  j dkrUt jd} jdkr:|j j j	 j
 jdd}|S jsDjsDJ d|j j j	 j
 jd	d}|S  j d
kr jtjv rit}d _d  _nh jtjv rut}d _n\ jtjv rt}d _nP jtjv rt}nG jtjv rt}d _n; jtjv rt}d _n/ jtjv rt}d _n# jtjv rt}d _ntdd t D }td j d| d|jr݈ jdvrtd| j j j j jdj j j d}|S  fdd fdd fdd fddd}z	| j   }W |S  t!y/ } z	td j  |d }~ww )Nr   )r/   )rL   rC   ra   skip_chat_templater   openai-chatF)rL   r   ra   r   rC   return_prompt_formattedz;Tokenizer/model must have chat template for sonnet dataset.Tr   trainc                 S   s   g | ]
}|j D ]}|qqS r,   )SUPPORTED_DATASET_PATHS)r   clsdataset_namer,   r,   r-   r     s    zget_samples.<locals>.<listcomp>Unsupported dataset path: zH. Huggingface dataset only supports dataset_path from one of following: z_. Please consider contributing if you would like to add support for additional dataset formats.)r   zopenai-audiozRMulti-modal content is only supported on 'openai-chat' and 'openai-audio' backend.)r/   dataset_subsetdataset_splitr0   	no_stream)rL   rC   ra   c                      s    t  j jdj j jdS )Nr0   r/   )rC   rL   ra   )r   rR   r/   rM   num_promptsZsharegpt_output_lenr,   argsrC   r,   r-   <lambda>  s    zget_samples.<locals>.<lambda>c                      s   t  j jdj jdS )Nr   )rC   rL   )BurstGPTDatasetrR   r/   rM   r   r,   r   r,   r-   r     s
    c                      ,   t  j jdj j j j j jdS )Nr   )rC   rL   r   r   ra   r   )	rz   rR   r/   rM   r   Zrandom_prefix_lenZrandom_input_lenZrandom_output_lenZrandom_range_ratior,   r   r,   r-   r     s    c                      r   )Nr   )rC   rL   r   
suffix_lennum_prefixesra   )	PrefixRepetitionRandomDatasetrR   r/   rM   r   Zprefix_repetition_prefix_lenZprefix_repetition_suffix_lenZprefix_repetition_num_prefixesZprefix_repetition_output_lenr,   r   r,   r-   r     s    )r   r   rG   r   zUnknown dataset: )"r   CustomDatasetr/   rM   r   Zcustom_output_lenZcustom_skip_chat_templateSonnetDatasetZendpoint_typeZsonnet_input_lenZsonnet_output_lenZsonnet_prefix_lenZchat_templateZdefault_chat_templateVisionArenaDatasetr   Zhf_splitZ	hf_subsetInstructCoderDatasetMTBenchDatasetConversationDatasetAIMODatasetNextEditPredictionDataset
ASRDatasetMLPerfDatasetsetHuggingFaceDataset__subclasses__rx   r\   rR   r   Zhf_output_lenKeyError)r   rC   ZdatasetZinput_requestsZdataset_classZsupported_datasetsZdataset_mappingerrr,   r   r-   get_samples  s   
 

yn

6
#r   c                       sn   e Zd ZdZd fddZdddZ					dd	ed
edee	 dee dee de
de
defddZ  ZS )r   a*  
    Implements the Custom dataset.  Loads data from a JSONL file and generates
    sample requests based on conversation turns. E.g.,
    ```
    {"prompt": "What is the capital of India?"}
    {"prompt": "What is the capital of Iran?"}
    {"prompt": "What is the capital of China?"}
    ```
    r1   Nc                    r   r~   r   r   r   r,   r-   r5     r   zCustomDataset.__init__c                 C   s   | j d u r	tdg | _| j dr5tj| j dd}d|jvr#td| D ]\}}| j|	  q'nt
dt| j t| j d S )Nr   z.jsonlT)Zpath_or_buflinesr   z*JSONL file must contain a 'prompt' column.z1Only JSONL format is supported for CustomDataset.)r/   rx   r3   endswithpd	read_jsoncolumnsZiterrowsr<   to_dictr@   rG   rR   r0   r   )r4   Z
jsonl_data_rowr,   r,   r-   rB      s"   

zCustomDataset.load_dataFrC   rL   rE   rD   ra   r   r   c                 K   sv   g }	| j D ]-}
t|	|kr n$|
d }|s!|jd|dgddd}t||j}|	t|||d q| |	| |	S )Nr   r9   r:   TFZadd_generation_prompttokenizer   )r3   rQ   apply_chat_templater   r<   r   rX   )r4   rC   rL   rE   rD   ra   r   r   r   sampled_requestsitemr   r   r,   r,   r-   rM   A  s.   
	zCustomDataset.samplerZ   )NNNFFr   r,   r,   r   r-   r     s4    

%
r   zDSonnetDataset is deprecated and will be removed in a future version.c                       sh   e Zd ZdZdZdZdZ		d fddZdd	d
Zeeedfde	de	de	de	de
defddZ  ZS )r   z
    Simplified implementation of the Sonnet dataset.  Loads poem lines from a
    text file and generates sample requests.  Default values here copied from
    `benchmark_serving.py` for the sonnet dataset.
    r   r   r   r1   Nc                    r   r~   r   r   r   r,   r-   r5   {  s   zSonnetDataset.__init__c                 C   sL   | j stdt| j dd}| | _W d    d S 1 sw   Y  d S )Nzdataset_path must be provided.rk   r   )r/   rx   rp   	readlinesr3   r   r,   r,   r-   rB     s
   "zSonnetDataset.load_dataFrL   r   r   ra   r   c                    s@   fdd| j D }tdd |D t| }	d}
d|
dg} j|dd	d
}t |j}||kr:td| dt|| |	 }tt|| |	 d}| j d | }g }t||k rtj	| j || d}|
 d
||  }d|dg} j|dd	d
}t |j}||kr|t|r|n|||d t||k s\|S )Nc                    s   g | ]} |j qS r,   )r   )r   linerC   r,   r-   r     s    z(SonnetDataset.sample.<locals>.<listcomp>c                 s   s    | ]}t |V  qd S rY   r   )r   tokensr,   r,   r-   	<genexpr>  s    z'SonnetDataset.sample.<locals>.<genexpr>z5Pick as many lines as you can from these poem lines:
r9   r:   TFr   z8'input_len' must be higher than the base prompt length (z).r   rO    r   )r3   sumrQ   r   r   rx   roundmaxrG   rS   joinr<   r   )r4   rC   rL   r   r   ra   r   r   Ztokenized_linesZavg_lenZbase_promptZbase_msgZbase_fmtZbase_offsetZnum_input_linesZnum_prefix_linesZprefix_linesr   extra_linesr   msgprompt_formattedr   r,   r   r-   rM     sZ   zSonnetDataset.samplerZ   )r#   r$   r%   r&   r   r   r   r5   rB   r)   r   r+   rM   r   r,   r,   r   r-   r   m  s4    

	r   c                       sl   e Zd ZdZd fddZdd Zdedefd	d
Z		dde	dede
e de
e dee f
ddZ  ZS )r   z
    Implements the BurstGPT dataset.  Loads data from a CSV file and generates
    sample requests based on synthetic prompt generation. Only rows with Model
    "GPT-4" and positive response tokens are used.
    r1   Nc                    r   r~   r   r   r   r,   r-   r5     r   zBurstGPTDataset.__init__c                 C   sH   | j d u r	tdt| j }||d dk }||d dk }|| _d S )Nr   ZModelzGPT-4zResponse tokensr   )r/   rx   r   Zread_csvr3   )r4   dfZgpt4_dfr,   r,   r-   rB     s   

zBurstGPTDataset.load_datarL   c                 C   s@   |t | jkr| jj|| jd}n
| jj|| jdd}|j S )N)nrandom_stateT)r	  r
  replace)rQ   r3   rM   r0   valuesr   )r4   rL   r3   r,   r,   r-   _sample_loaded_data  s   
z#BurstGPTDataset._sample_loaded_datarC   rD   rE   c              	      s   g }| j |d}t|D ]; t|  d }t|  d }	| j|||d\}
}|j fddt|D }||}|t|||	|
d q|S )N)rL   r      r   c                    s   g | ]} |  qS r,   r,   )r   jr   r   r,   r-   r     s    z*BurstGPTDataset.sample.<locals>.<listcomp>)r   r   r    r"   )r  r   r)   rK   r   rv   r<   r   )r4   rC   rL   rD   rE   r   r   r3   r   ra   Zlora_reqZ	token_idsr   r,   r  r-   rM     s(   

zBurstGPTDataset.samplerZ   r[   )r#   r$   r%   r&   r5   rB   r)   r+   r  r   r
   r'   r   rM   r   r,   r,   r   r-   r     s$    r   c                       sn   e Zd ZU dZe Zeee eee	f f e
d< 		ddedededee d	df
 fd
dZdddZ  ZS )r   z.Base class for datasets hosted on HuggingFace.r   FNr/   r   r   r   r1   c                    s6   t  jdd|i| || _|| _| | _|   d S )Nr/   r,   )r   r5   r   r   load_streamrB   )r4   r/   r   r   r   r   r   r,   r-   r5     s
   zHuggingFaceDataset.__init__c                 C   s0   t | j| j| j| jd| _| jj| jd| _dS )z$Load data from HuggingFace datasets.)namesplitZ	streaming)rR   N)r   r/   r   r   r  r3   r   r0   rA   r,   r,   r-   rB     s   zHuggingFaceDataset.load_data)FNrZ   )r#   r$   r%   r&   r   r   r   r'   r*   r	   r(   r   r
   r5   rB   r   r,   r,   r   r-   r     s"   
 "r   c                   @   sD   e Zd ZdZddhZdZ		ddeded	ee d
e	de
f
ddZdS )r   z6Dataset for conversation data with multimodal support.zlmms-lab/LLaVA-OneVision-Dataz Aeala/ShareGPT_Vicuna_unfilteredTNFrC   rL   ra   r   r1   c              	   K   s   | j dd }g }|d u }|D ]e}	t||kr n\|	d }
|
d d |
d d }}||j}||j}t|}t|}|rA|n|}t|trL|dksNJ |rVt||sVqd|	v r`t|	d nd }|rj| ||}|	t
||||d q| || |S )	Nc                 S   s   t | d dkS )Nr   r   r   xr,   r,   r-   r   9  s    z,ConversationDataset.sample.<locals>.<lambda>r   r   r   rF   rg   r   r   r    r!   )r3   filterrQ   r   ro   r)   rf   ry   r=   r<   r   rX   )r4   rC   rL   ra   r   r   Zfiltered_datar   dynamic_outputr   convr   r   r   r   r   completion_lenr6   r,   r,   r-   rM   1  sP   

zConversationDataset.sampleNF)r#   r$   r%   r&   r   r\   r   r)   r
   r   r+   rM   r,   r,   r,   r-   r   *  s$    r   c                   @   sR   e Zd ZdZdZdd dd dZdZ			dd
edede	e de
def
ddZdS )r   z
    Vision Arena Dataset.
    r|   c                 C      | d d d d S )NZconversationr   r;   r,   r  r,   r,   r-   r   l      zVisionArenaDataset.<lambda>c                 C   r  )Nturnsr   r;   r,   r  r,   r,   r-   r   n  r  )zlmarena-ai/VisionArena-Chatz"lmarena-ai/vision-arena-bench-v0.1TNFrC   rL   ra   r   r1   c              	   K   s   |d ur|n| j }g }| jD ]C}t||kr n:| j| j}|d u r+td| j ||}	t|d d }
t||	j}|rF| 	|	|
}	|
t|	|||
d q| || |S )Nr   Zimagesr   r  )r   r3   rQ   r   r   r/   rx   ry   r   r=   r<   r   rX   )r4   rC   rL   ra   r   r   r   r   Z	parser_fnr   r6   r   r,   r,   r-   rM   r  s<   	

zVisionArenaDataset.sampler  )r#   r$   r%   r&   r   r   r\   r   r)   r
   r   r+   rM   r,   r,   r,   r-   r   d  s(    r   c                   @   B   e Zd ZdZdZdhZ		ddededee d	e	d
e
f
ddZdS )r   a  
    InstructCoder Dataset.
    https://huggingface.co/datasets/likaixin/InstructCoder

    InstructCoder is the dataset designed for general code editing.  It consists
    of 114,239 instruction-input-output triplets, and covers multiple distinct
    code editing scenario.
    r   zlikaixin/InstructCoderNFrC   rL   ra   r   r1   c           
      K   s   |d ur|n| j }g }| jD ]3}t||kr n*|d  d|d  d}|jd|dgddd	}t||j}	|t||	|d
 q| || |S )Ninputz

ZinstructionzB Just output             the code, do not include any explanation.r9   r:   TFr   r   r   r3   rQ   r   r   r<   r   rX   
r4   rC   rL   ra   r   r   r   r   r   r   r,   r,   r-   rM     s4   
	zInstructCoderDataset.sampler  r#   r$   r%   r&   r   r   r   r)   r
   r   r+   rM   r,   r,   r,   r-   r     s$    	r   c                   @   r  )r   a'  
    MT-Bench Dataset.
    https://huggingface.co/datasets/philschmid/mt-bench

    We create a single turn dataset for MT-Bench.
    This is similar to Spec decoding benchmark setup in vLLM
    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
    r   zphilschmid/mt-benchNFrC   rL   ra   r   r1   c           
      K   s   |d ur|n| j }g }| jD ]-}t||kr n$|d d }|jd|dgddd}t||j}	|t||	|d q| || |S )	Nr  r   r9   r:   TFr   r   r!  r"  r,   r,   r-   rM     s4   	
	zMTBenchDataset.sampler  r#  r,   r,   r,   r-   r     s$    	r   c                	   @   s:   e Zd ZdZh dZ	d
dededee defdd	Z	dS )r   zO
    Dataset class for processing a AIMO dataset with reasoning questions.
    >   zAI-MO/aimo-validation-aimezAI-MO/NuminaMath-CoTzAI-MO/NuminaMath-1.5NrC   rL   ra   r1   c              	   K   s   g }|d u }| j D ]L}t||kr nC|d |d }}	||j}
||	j}t|
}t|}|r2|n|}t|tr=|dks?J |rJt||dddsJq	|t|||d d q	| || |S )NproblemZsolutionr   r`   i }  )rc   rd   r  )	r3   rQ   r   ro   r)   rf   r<   r   rX   )r4   rC   rL   ra   r   r   r  r   r   r   r   r   r   r  r,   r,   r-   rM     s8   


zAIMODataset.samplerY   )
r#   r$   r%   r&   r   r   r)   r
   r+   rM   r,   r,   r,   r-   r     s    r   a*  ### Instruction:
You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.

### User Edits:

{}

### User Excerpt:

{}

### Response:

<|editable_region_start|>rM   original_start_markerc           	      C   sH   | d }| d }| d }t ||}||}||d }|}||dS )a4  Format the zeta prompt for the Next Edit Prediction (NEP) dataset.

    This function formats examples from the NEP dataset
    into prompts and expected outputs. It could be
    further extended to support more NEP datasets.

    Args:
        sample: The dataset sample containing events,
            inputs, and outputs.
        original_start_marker: The marker indicating the
            start of the editable region. Defaults to
            "<|editable_region_start|>".

    Returns:
        A dictionary with the formatted prompts and expected outputs.
    eventsr   outputN)r   expected_output)zeta_promptrj   find)	rM   r&  r'  r   r(  r   Zoutput_start_indexZoutput_focused_regionr)  r,   r,   r-   _format_zeta_promptP  s   

r,  c                   @   s0   e Zd ZdZdhZdeiZdedefddZ	dS )r   zF
    Dataset class for processing a Next Edit Prediction dataset.
    zzed-industries/zetarC   rL   c              
   K   s   | j | j}|d u rtd| j g }| jD ](}||}|t|d t||d jt||d jd t||kr@ nq| 	|| |S )Nr   r   r)  r   )
MAPPING_PROMPT_FUNCSr   r/   rx   r3   r<   r   rQ   r   rX   )r4   rC   rL   r   Zformatting_prompt_funcr   rM   r,   r,   r-   rM   }  s,   
z NextEditPredictionDataset.sampleN)
r#   r$   r%   r&   r   r,  r-  r   r)   rM   r,   r,   r,   r-   r   q  s    r   c                	   @   sT   e Zd ZU dZh dZdZdZdZdZe	e
d< 	dded	ed
ee defddZdS )r   a  
    Dataset class for processing a ASR dataset for transcription.
    Tested on the following set:

    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
    |                |                                        |                          | release3-speaker-adaptation |
    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr,  ...        |
    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
    +----------------+----------------------------------------+--------------------------+-----------------------------+

    >   zfacebook/voxpopulizopenslr/librispeech_asrzLIUM/tedliumzspeechcolab/gigaspeechzkensho/spgispeechzedinburghcstr/amir|   Tz9<|startoftranscript|><|en|><|transcribe|><|notimestamps|>skip_long_audiosNrC   rL   ra   r1   c              	   K   s   |d ur|n| j }tj}t||j}g }d}| jD ];}	t||kr$ n2|	d }
|
d |
d }}tj||d}| jrD|dkrD|d7 }qd||fi}|	t
||||d q|r^td	| | || |S )
Nr   audioarrayZsampling_rate)ysr   rF   r  z_%d samples discarded from dataset due to their length being greater than what Whisper supports.)r   r   TRANSCRIPTION_PREAMBLErQ   r   r3   r   Zget_durationr.  r<   r   rU   warningrX   )r4   rC   rL   ra   r   r   r   r   Zskippedr   r/  r1  r2  Z
duration_sr6   r,   r,   r-   rM     sB   
zASRDataset.samplerY   )r#   r$   r%   r&   r   r   r\   r4  r.  r   r(   r   r)   r
   r+   rM   r,   r,   r,   r-   r     s$   
 	r   c                
   @   s>   e Zd ZdZddhZ	ddededee dee	 fd	d
Z
dS )r   a2  
    MLPerf Inference Dataset.

    Dataset on HF:
    https://huggingface.co/datasets/mgoin/mlperf-inference-llama2-data
    https://huggingface.co/datasets/mgoin/mlperf-inference-llama3.1-data

    Each record contains:
      - "system_prompt": system role instruction.
      - "question": user question.
      - "output": reference answer.

    We combine the system prompt and question into a chat-formatted prompt
    (using the tokenizer's chat template) and set the expected output length to
    the tokenized length of the provided reference answer.
    z"mgoin/mlperf-inference-llama2-dataz$mgoin/mlperf-inference-llama3.1-dataNrC   rL   ra   r1   c                 K   s   |d u }g }| j D ]N}t||kr nE|d }|d }	|d }
d|dd|	dg}|j|ddd	}t||j}t||
dd
j}|rE|n|}t||sMq	|t|||d q	| || |S )Nsystem_promptquestionr(  systemr:   r9   TFr   r   r   )r3   rQ   r   r   rf   r<   r   rX   )r4   rC   rL   ra   r   r  r   r   r6  r7  Zreference_answermessagesr  r   Zref_out_lenr    r,   r,   r-   rM     s<   

zMLPerfDataset.samplerY   )r#   r$   r%   r&   r   r   r)   r
   r+   r   rM   r,   r,   r,   r-   r     s    	r   c                       sf   e Zd ZdZdZdZdZ		d fddZeeeefded	e	d
e	de	de	de	de
e fddZ  ZS )r   r   r   r|   r1   Nc                    r}   r~   r   r   r   r,   r-   r5   C  r   z&PrefixRepetitionRandomDataset.__init__rC   rL   r   r   r   ra   c              	      s   j || }|dkrtd| d| ddtdtt f fdd g }	t|D ](}
 |}t|D ]}
 |}|| }|}t|}|	t|||d	 q5q+t	
|	 |	S )
Nr   znum_requests (z1) must be greater than or equal to num_prefixes ()target_lengthr1   c                    sr   t jjd| d }|}j|dd}t|| kr|S t|| k r3| t| } |}|| S |d|  S )zOGenerate tokens that decode and re-encode to exactly
            target_length.r   r   Fr   N)r   rG   rH   r   rv   r   rQ   )r;  r   r7   Z
re_encodedneededZextra_tokens_generate_exact_length_tokensrC   r   r,   r-   r>  ]  s   
zKPrefixRepetitionRandomDataset.sample.<locals>._generate_exact_length_tokensr   )r   rx   r)   r+   r   rv   rQ   r<   r   rG   r   )r4   rC   rL   r   r   r   ra   r   Zprompts_per_prefixrN   r   Zprefix_tokensZsuffix_tokensZcombined_tokensr   r   r,   r=  r-   rM   K  s6   


z$PrefixRepetitionRandomDataset.samplerZ   )r#   r$   r%   r   ZDEFAULT_SUFFIX_LENZDEFAULT_NUM_PREFIXESr   r5   r   r)   r+   r   rM   r   r,   r,   r   r-   r   ;  s6    	r   )r^   r_   r`   F)r%  )Vr&   rs   rq   r   loggingrG   abcr   r   collections.abcr   dataclassesr   	functoolsr   r   typingr   r	   r
   r   numpyr   ZPILr   Ztransformersr   Ztyping_extensionsr   Zvllm.lora.requestr   Zvllm.lora.utilsr   Zvllm.multimodalr   Zvllm.multimodal.imager   Z!vllm.transformers_utils.tokenizerr   r   Z
vllm.utilsr   r   r   ImportErrorZplaceholder_attrr   r   r   r   argparser   	getLoggerr#   rU   r   r.   r)   r   rf   r'   rI   rJ   r*   r(   ry   rz   r   r   r+   r   r   r   r   r   r   r   r   r   r   r*  r,  r   r   r   r   r,   r,   r,   r-   <module>   s   
 
1YK % \QE$:98:1
!'TO