o
    )iW                     @   sV   d dl mZmZ d dlmZ d dlZd dlmZ eddd ddZ	eG d	d
 d
Z
dS )    )	dataclassfield)OptionalN)
InputBatchg      g        g      ?)temperaturemin_ptop_ktop_pc                   @   s.  e Zd ZU dZejed< dZejed< dZejed< dZ	ejed< dZ
eed< dZeed	< dZeed
< dZdZdZdZedd dZeee  ed< dZedd dZeeeeef   ed< dZdZedd dZeeejf ed< edeeejf fddZ e!	dde"dedej#dedd f
ddZ$dS )TPUSupportedSamplingMetadataNr   r   r   r	   T
all_greedyFlogprobsno_penaltiesc                   C      t  S Nlist r   r   g/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/v1/sample/tpu/metadata.py<lambda>/       z%TPUSupportedSamplingMetadata.<lambda>)default_factoryoutput_token_idsc                   C   r   r   r   r   r   r   r   r   4   r   
logit_biasc                   C   r   r   )dictr   r   r   r   r   ;   r   _generatorsreturnc                 C   s   | j S r   )r   )selfr   r   r   
generators=   s   z'TPUSupportedSamplingMetadata.generatorsinput_batchpadded_num_reqs
xla_devicegenerate_params_if_all_greedyc                    s   |j r|j dknd}|jdu r|du r| d|dS |j dtjdtjf fdd}||jtd	  ||jtd
  ||jtd  ||j	td  | |jd 
||j|j	d 
||jd 
||jd 
||dS )a/  
        Copy sampling tensors slices from `input_batch` to on device tensors.

        `InputBatch._make_sampling_metadata` causes recompilation on XLA as it 
        slices dynamic shapes on device tensors. This impl moves the dynamic 
        ops to CPU and produces tensors of fixed `padded_num_reqs` size.

        Args:
            input_batch: The input batch containing sampling parameters.
            padded_num_reqs: The padded number of requests.
            xla_device: The XLA device.
            generate_params_if_all_greedy: If True, generate sampling parameters
                even if all requests are greedy. this is useful for cases where
                we want to pre-compile a graph with sampling parameters, even if
                they are not strictly needed for greedy decoding.
        r   FT)r   r   
cpu_tensorr   c                    s   ||  < d S r   r   )r"   Zfill_valnum_reqsr   r   r   
fill_slicec   s   zATPUSupportedSamplingMetadata.from_input_batch.<locals>.fill_slicer   r   r   r	   N)r   r   r	   r   r   r   )Zmax_num_logprobsr   r$   torchTensorZtemperature_cpu_tensorDEFAULT_SAMPLING_PARAMSZmin_p_cpu_tensorZtop_k_cpu_tensorZtop_p_cpu_tensorto)clsr   r   r    r!   Zneeds_logprobsr%   r   r#   r   from_input_batchB   sH   
z-TPUSupportedSamplingMetadata.from_input_batch)F)%__name__
__module____qualname__r   r&   r'   __annotations__r   r   r	   r   boolr   r   Zprompt_token_idsZfrequency_penaltiesZpresence_penaltiesZrepetition_penaltiesr   r   r   intZ
min_tokensr   r   r   floatZallowed_token_ids_maskZbad_words_token_idsr   	Generatorpropertyr   classmethodr   Zdevicer+   r   r   r   r   r
      sL   
 
r
   )dataclassesr   r   typingr   r&   Zvllm.v1.worker.tpu_input_batchr   r   r(   r
   r   r   r   r   <module>   s   