o
    )i=                     @   sr  U d dl mZmZ d dlmZ d dlmZmZmZm	Z	 d dl
mZmZmZmZmZmZmZmZ d dlZd dlZd dlmZmZmZ d dlmZmZ dd	lmZ dd
lm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) edZ*edZ+erxd dl,m-Z. nede/ dZ.G dd deee*e+f Z0G dd de0e	e* e*f Z1G dd de0eej2e3ej2 f ej2f Z4G dd de0ee5ej2f ee5ej2f f Z6G dd de1e! Z7G dd de4Z8G dd deZ9G dd de1e" Z:G dd  d e4Z;G d!d" d"e1e# Z<G d#d$ d$e4Z=ed%e0eef d&Z>G d'd( d(ee5e0eef f Z?ee%e gee0eef  f Z@eeAd)< G d*d+ d+ZBdS ),    )ABCabstractmethod)UserDict)CallableIteratorMappingSequence)TYPE_CHECKINGAnyGenericLiteral
NamedTupleOptionalTypeVarUnionN)	TypeAlias	TypeGuardassert_never)
LazyLoader
is_list_of   )AudioResampler)
	AudioItemHfAudioItemHfImageItemHfVideoItem	ImageItemModalityDataMultiModalDataDictMultiModalFieldConfigMultiModalKwargs	VideoItem_T_IPILImagez	PIL.Imagec                       s   e Zd ZdZdededdf fddZdefdd	Zdefd
dZ	dede
fddZer5dee
 fddZedefddZedede
fddZdee
 fddZedeeef fddZedeeef fddZ  ZS )ModalityDataItemszy
    Represents data items for a modality in
    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
    datamodalityreturnNc                    s   t    || _|| _d S N)super__init__r&   r'   )selfr&   r'   	__class__ a/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/multimodal/parse.pyr+   $   s   

zModalityDataItems.__init__c                 C   s"   t | j d| jdt|  dS )Nz
(modality=z, len=))type__name__r'   lenr,   r/   r/   r0   __repr__*   s   zModalityDataItems.__repr__c                 C   s   |   S r)   	get_countr5   r/   r/   r0   __len__.   s   zModalityDataItems.__len__indexc                 C   s
   |  |S r)   getr,   r:   r/   r/   r0   __getitem__1      
zModalityDataItems.__getitem__c                 C   s   d S r)   r/   r5   r/   r/   r0   __iter__6      zModalityDataItems.__iter__c                 C      t )zGet the number of data items.NotImplementedErrorr5   r/   r/   r0   r8   9      zModalityDataItems.get_countc                 C   rB   )zGet a data item by its index.rC   r=   r/   r/   r0   r<   >   rE   zModalityDataItems.getc                    s    fddt   D S )zGet all data items.c                    s   g | ]}  |qS r/   r;   ).0idxr5   r/   r0   
<listcomp>E   s    z-ModalityDataItems.get_all.<locals>.<listcomp>)ranger8   r5   r/   r5   r0   get_allC   s   zModalityDataItems.get_allc                 C   rB   )z)Get the data to pass to the HF processor.rC   r5   r/   r/   r0   get_processor_dataG   rE   z$ModalityDataItems.get_processor_datac                 C   rB   )z+Get the data to pass directly to the model.rC   r5   r/   r/   r0   get_passthrough_dataL   rE   z&ModalityDataItems.get_passthrough_data)r3   
__module____qualname____doc__r"   strr+   r6   intr9   r#   r>   r	   r   r@   r   r8   r<   listrJ   r   objectrK   rL   __classcell__r/   r/   r-   r0   r%      s"     r%   c                   @   s\   e Zd ZdZdefddZdedefddZdee	e
f fdd	Zdee	e
f fd
dZdS )ProcessorBatchItemsz6Base class for data items that are arranged in a list.r(   c                 C   
   t | jS r)   r4   r&   r5   r/   r/   r0   r8   U   r?   zProcessorBatchItems.get_countr:   c                 C   
   | j | S r)   r&   r=   r/   r/   r0   r<   X   r?   zProcessorBatchItems.getc                 C      | j  d| jiS )Nsr'   r&   r5   r/   r/   r0   rK   [      z&ProcessorBatchItems.get_processor_datac                 C      i S r)   r/   r5   r/   r/   r0   rL   ^   rA   z(ProcessorBatchItems.get_passthrough_dataN)r3   rM   rN   rO   rQ   r8   r"   r<   r   rP   rS   rK   rL   r/   r/   r/   r0   rU   R   s    rU   c                   @   sp   e Zd ZdZdefddZdedejfddZde	e
ef fdd	Zde	e
ef fd
dZdedefddZdS )EmbeddingItemsz
    Base class for data items that are expressed as a batched embedding tensor,
    or a list of embedding tensors (one per item).
    r(   c                 C   rV   r)   rW   r5   r/   r/   r0   r8   i   r?   zEmbeddingItems.get_countr:   c                 C   rX   r)   rY   r=   r/   r/   r0   r<   l   r?   zEmbeddingItems.getc                 C   r^   r)   r/   r5   r/   r/   r0   rK   o   rA   z!EmbeddingItems.get_processor_datac                 C   rZ   )NZ_embedsr\   r5   r/   r/   r0   rL   r   r]   z#EmbeddingItems.get_passthrough_dataitem_idxc                 C      t | |S r)   r4   r<   r,   r`   r/   r/   r0   get_feature_sizeu      zEmbeddingItems.get_feature_sizeN)r3   rM   rN   rO   rQ   r8   torchTensorr<   r   rP   rS   rK   rL   rd   r/   r/   r/   r0   r_   b   s    r_   c                       s   e Zd ZdZdeeejf dedee de	eeejf geee
f f ddf
 fdd	Zdefd
dZdedeeejf fddZdeeef fddZdeeef fddZ  ZS )DictEmbeddingItemsz
    Base class for data items that are expressed as a dictionary of tensors.

    Usually, the dictionary keys correspond to the outputs of HF processor.
    r&   r'   required_fieldsfields_factoryr(   Nc                    s   ddl m} t || ||  }|r't| }d| d| }t|||}	||	  }
|
rEt|	 }d|d|}t||	| _|| _t	
|t||	| _d S )Nr   )BatchFeaturez$The data should contain the fields: z%, but only found the following keys: zrequired_fields=z should be a subset of fields=)Z%transformers.feature_extraction_utilsrk   r*   r+   keysset
ValueErrorfields_configri   r    Zfrom_hf_inputsdict_kwargs)r,   r&   r'   ri   rj   rk   Zmissing_required_data_keysZ	data_keysmsgro   Zmissing_required_fieldsfieldsr-   r/   r0   r+      s*   


zDictEmbeddingItems.__init__c                 C   s   | j | jS r)   )rq   Zget_item_countr'   r5   r/   r/   r0   r8      re   zDictEmbeddingItems.get_countr:   c                 C   s   dd | j | j| D S )Nc                 S   s   i | ]\}}||j qS r/   rY   )rF   kvr/   r/   r0   
<dictcomp>   s    z*DictEmbeddingItems.get.<locals>.<dictcomp>)rq   Zget_itemr'   itemsr=   r/   r/   r0   r<      s   zDictEmbeddingItems.getc                 C   r^   r)   r/   r5   r/   r/   r0   rK      rA   z%DictEmbeddingItems.get_processor_datac                 C   s   | j S r)   rY   r5   r/   r/   r0   rL      s   z'DictEmbeddingItems.get_passthrough_data)r3   rM   rN   rO   r   rP   rf   rg   rm   r   r   r+   rQ   r8   r<   rS   rK   rL   rT   r/   r/   r-   r0   rh   y   s*    
	$rh   c                       s<   e Zd Zdee ddf fddZdedefddZ  ZS )	AudioProcessorItemsr&   r(   Nc                       t  |d d S Naudior*   r+   r,   r&   r-   r/   r0   r+      r]   zAudioProcessorItems.__init__r`   c                 C   s   |  |}t|S r)   )r<   r4   )r,   r`   r{   r/   r/   r0   get_audio_length   s   
z$AudioProcessorItems.get_audio_length)	r3   rM   rN   r   r   r+   rQ   r~   rT   r/   r/   r-   r0   rx          rx   c                       6   e Zd Zdeejeej f ddf fddZ  ZS )AudioEmbeddingItemsr&   r(   Nc                    ry   rz   r|   r}   r-   r/   r0   r+      r]   zAudioEmbeddingItems.__init__	r3   rM   rN   r   rf   rg   rR   r+   rT   r/   r/   r-   r0   r          .r   c                   @   s   e Zd ZU eed< eed< dS )	ImageSizewidthheightN)r3   rM   rN   rQ   __annotations__r/   r/   r/   r0   r      s   
 r   c                       s<   e Zd Zdee ddf fddZdedefddZ  Z	S )	ImageProcessorItemsr&   r(   Nc                    ry   Nimager|   r}   r-   r/   r0   r+      r]   zImageProcessorItems.__init__r`   c                 C   sT   |  |}t|tjrt|j S t|tjtj	fr$|j
\}}}t||S t| d S r)   r<   
isinstancer$   Imager   sizenpndarrayrf   rg   shaper   r,   r`   r   _hwr/   r/   r0   get_image_size   s   


z"ImageProcessorItems.get_image_size)
r3   rM   rN   r   r   r+   rQ   r   r   rT   r/   r/   r-   r0   r      r   r   c                       r   )ImageEmbeddingItemsr&   r(   Nc                    ry   r   r|   r}   r-   r/   r0   r+      r]   zImageEmbeddingItems.__init__r   r/   r/   r-   r0   r      r   r   c                       sz   e Zd Z	ddee deeeee	f e
eeee	f   f  ddf fddZdedefdd	Zdedefd
dZ  ZS )VideoProcessorItemsNr&   metadatar(   c                    s   t  |d || _d S Nvideo)r*   r+   r   )r,   r&   r   r-   r/   r0   r+      s   
zVideoProcessorItems.__init__r`   c                 C   ra   r)   rb   rc   r/   r/   r0   get_num_frames   re   z"VideoProcessorItems.get_num_framesc                 C   sX   |  |d }t|tjrt|j S t|tjtj	fr&|j
\}}}t||S t| d S )Nr   r   r   r/   r/   r0   get_frame_size   s   

z"VideoProcessorItems.get_frame_sizer)   )r3   rM   rN   r   r   r   r   rp   rP   r
   rR   r+   rQ   r   r   r   rT   r/   r/   r-   r0   r      s    	r   c                       r   )VideoEmbeddingItemsr&   r(   Nc                    ry   r   r|   r}   r-   r/   r0   r+      r]   zVideoEmbeddingItems.__init__r   r/   r/   r-   r0   r      r   r   _D)boundc                   @   sp   e Zd ZdZdddededefddZdeeef fd	d
Z	dede
ee eee df f defddZdS )MultiModalDataItemsz
    As [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict], but
    normalized such that each entry corresponds to a list.
    T)strictr'   r   r(   c                C   s<   || vr|rt |  }td|d| dS | |  S )z
        Get the number of data items belonging to a modality.

        If `strict=False`, return `0` instead of raising [`KeyError`][]
        even if the modality is not found.
        	Modality " not found. Available modalities: r   )rm   rl   KeyErrorr8   )r,   r'   r   available_modalitiesr/   r/   r0   r8   
  s   
zMultiModalDataItems.get_countc                 C   s   dd |   D S )z3Get the number of items belonging to each modality.c                 S   s   i | ]	\}}||  qS r/   r7   )rF   mrw   r/   r/   r0   rv     s    z6MultiModalDataItems.get_all_counts.<locals>.<dictcomp>)rw   r5   r/   r/   r0   get_all_counts  s   z"MultiModalDataItems.get_all_countstyp.c                 C   s\   || vrt |  }td|d| | | }t||s,td|d| dt| |S )zs
        Get the data items belonging to a modality,
        requiring that they belong to a certain type.
        r   r   z(Invalid type of data items for modality=z. Expected type: z, but found type: )rm   rl   r   r   	TypeErrorr2   )r,   r'   r   r   rw   r/   r/   r0   	get_items  s   	


zMultiModalDataItems.get_itemsN)r3   rM   rN   rO   rP   boolrQ   r8   r   r   r   r2   r   tupler   r/   r/   r/   r0   r     s    r   ModalityDataParserc                	       sV  e Zd ZdZdddddee ded d	ed
df fddZde	d
e
eejeej f  fddZde	d
e
d fddZded
eejee f fddZded
eejeeeef  f fddZdee d
eeeef  fddZdee d
eeeef  fddZdee d
eeeef  fddZ d
e!ee"f fddZ#d e$d
e%fd!d"Z&  Z'S )#MultiModalDataParsera.  
    Parses [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict]
    into [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].

    Args:
        target_sr (float, optional): Enables automatic resampling of audio
            items to the model's expected sampling rate.
    NlibrosaF)	target_sraudio_resample_methodvideo_needs_metadatar   r   )r   Zscipyr   r(   c                   s"   t    t||d| _|| _d S )N)r   method)r*   r+   r   audio_resamplerr   )r,   r   r   r   r-   r/   r0   r+   D  s   

zMultiModalDataParser.__init__r&   c                 C   s4   t |tjr|jdkS t|tjr|d jdkS dS )N   r      F)r   rf   rg   ndimr   r}   r/   r/   r0   _is_embeddingsS  s
   
z#MultiModalDataParser._is_embeddingsc                 C   s6   t |trt|dkS t |tjtjfr|jdkS dS )Nr   F)r   rR   r4   r   r   rf   rg   r   r}   r/   r/   r0   	_is_empty]  s
   

zMultiModalDataParser._is_emptyr{   c                 C   ^   t |tr|S t |trt|d fS t |tjr|d fS t |tjr)| d fS t	| d S r)   
r   r   rR   r   arrayr   rf   rg   numpyr   )r,   r{   r/   r/   r0   _get_audio_with_sre     

z'MultiModalDataParser._get_audio_with_srr   c                 C   r   r)   r   )r,   r   r/   r/   r0   _get_video_with_metadatat  r   z-MultiModalDataParser._get_video_with_metadatac                 C   s   |  |st|tr|  |d rd S | |rt|S t|ts4t|tjt	j
fr/|jdks4t|tr8|g}nt|tjt	j
frIdd |D }n|}ttj  }|D ]}| |\}}|d u rc|}n| jj||d}|| qSt|S )Nr   r   c                 S      g | ]}|qS r/   r/   rF   elemr/   r/   r0   rH         z:MultiModalDataParser._parse_audio_data.<locals>.<listcomp>)orig_sr)r   r   r   r   r   r   floatr   r   rf   rg   r   rR   r   r   Zresampleappendrx   )r,   r&   
data_itemsZ
new_audios	data_itemr{   r   Z	new_audior/   r/   r0   _parse_audio_data  s8   



z&MultiModalDataParser._parse_audio_datac                 C   s   |  |rd S | |rt|S t|tjs$t|tjtj	fr+|j
dkr+|g}t|S t|tjtj	fr?dd |D }t|S |}t|S )Nr   c                 S   r   r/   r/   r   r/   r/   r0   rH     r   z:MultiModalDataParser._parse_image_data.<locals>.<listcomp>)r   r   r   r   r$   r   r   r   rf   rg   r   r   )r,   r&   r   r/   r/   r0   _parse_image_data  s"   



z&MultiModalDataParser._parse_image_datac                 C   s  |  |rd S | |rt|S t|tjs$t|tjt	j
fr(|jdkr(|g}n"t|tjt	j
fr9dd |D }nt|trHt|dkrH|g}n|}tttjttttf  f   }g }|D ]}| |\}}| jrw|||f || q^|| q^| jsd }t||dS )N   c                 S   r   r/   r/   r   r/   r/   r0   rH     r   z:MultiModalDataParser._parse_video_data.<locals>.<listcomp>r   )r   )r   r   r   r   r$   r   r   r   r   rf   rg   r   r   r4   rR   r   rp   rP   r
   r   r   r   r   )r,   r&   r   Z
new_videosZmetadata_lstr   r   r   r/   r/   r0   _parse_video_data  s6   



 z&MultiModalDataParser._parse_video_datac                 C   s   | j | j| jdS )N)r{   r   r   )r   r   r   r5   r/   r/   r0   _get_subparsers  s   z$MultiModalDataParser._get_subparsersmm_datac                 C   sV   |   }t }| D ]\}}||vrtd| || | }d ur(|||< q|S )NzUnsupported modality: )r   r   rw   rn   )r,   r   Z
subparsersZmm_itemsrt   ru   Zparsed_datar/   r/   r0   parse_mm_data  s   z"MultiModalDataParser.parse_mm_data)(r3   rM   rN   rO   r   r   r   r   r+   rS   r   r   rf   rg   rR   r   r   r   r   r   r   r   r!   rp   rP   r
   r   r   r%   r   r   r   r   r   r   r   r   r   r   rT   r/   r/   r-   r0   r   :  sf    




#

$r   )Cabcr   r   collectionsr   collections.abcr   r   r   r   typingr	   r
   r   r   r   r   r   r   r   r   rf   Ztyping_extensionsr   r   r   Z
vllm.utilsr   r   r{   r   Zinputsr   r   r   r   r   r   r   r   r    r!   r"   r#   Z	PIL.Imager   r$   globalsr%   rU   rg   rR   r_   rP   rh   rx   r   r   r   r   r   r   r   r   r   r   r   r/   r/   r/   r0   <module>   sH   (04<
 
2