o
    )ix                     @   st  U d dl mZmZ d dlmZmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d dlZd d	lmZmZ d d
lmZm Z m!Z! d dl"m#Z#m$Z$ er|d dl%Z%d dl&Z%d dl'm(Z( d dl)m*Z* ddl+m,Z, nede- dZ%edZ.edej/df Z0ee1d< 	 ee2d ej/de2ej/ e2d f Z3ee1d< 	 ee2e4 ej/df Z5ee1d< 	 ee0df Z6ee1d< 	 ee3de7e3e8e9ef f f Z:ee1d< 	 ee5e7ej/e4f df Z;ee1d< 	 ee.e2e. f Z<ee1d< 	 eG dd deddZ=ee9e<e f Z>ee1d< 	 e
d d!G d"d# d#Z?ee2d$ e2d de7d% f Z@ee1d$< 	 d&e@d'e@d(eAfd)d*ZBee9e@f ZCee1d+< 	 e
G d,d- d-ZDe
d d!G d.d/ d/eZEe
d d!G d0d1 d1eEZFe
d d!G d2d3 d3eEZGe
d d!G d4d5 d5eEZHG d6d7 d7ZIG d8d9 d9ee9eDf ZJG d:d; d;ZKee9ee? f ZLee1d<< 	 G d=d> d>eZMG d?d@ d@eMZNdS )A    )ABCabstractmethod)UserDictdefaultdict)MappingSequence)	dataclass)partial)
accumulate)	TYPE_CHECKINGAnyLiteralOptional	TypedDictTypeVarUnioncastfinalN)NotRequired	TypeAlias)
LazyLoaderfull_groupby
is_list_of)JSONTreejson_map_leaves)Image)BatchFeature   )MultiModalHashDicttorch_Tr   torch.TensorHfImageItemHfVideoItemHfAudioItem	ImageItem	VideoItem	AudioItemModalityDatac                   @   s:   e Zd ZU dZee ed< 	 ee ed< 	 ee ed< dS )MultiModalDataBuiltinsz7Type annotations for modality types predefined by vLLM.imageZvideoZaudioN)	__name__
__module____qualname____doc__r(   r%   __annotations__r&   r'    r0   r0   b/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/multimodal/inputs.pyr)   `   s   
 r)   F)totalMultiModalDataDictT)frozenc                   @   sX   e Zd ZU dZeed< 	 eed< 	 dZed ed< 	 defdd	Zd
e	de
fddZdS )PlaceholderRangea  
    Placeholder location information for multi-modal data.

    Example:

    Prompt: `AAAA BBBB What is in these images?`

    Images A and B will have:

    ```
    A: PlaceholderRange(offset=0, length=4)
    B: PlaceholderRange(offset=5, length=4)
    ```
    offsetlengthNr!   is_embedreturnc                 C   s"   | j d u r| jS t| j   S N)r8   r7   intsumitemselfr0   r0   r1   get_num_embeds   s   
zPlaceholderRange.get_num_embedsotherc                 C   sb   t || jsdS | j| jf|j|jfksdS | jd u r |jd u S |jd u r*| jd u S t| j|jS NF)
isinstance	__class__r6   r7   r8   nested_tensors_equalr?   rA   r0   r0   r1   __eq__   s   



zPlaceholderRange.__eq__)r+   r,   r-   r.   r;   r/   r8   r   r@   objectboolrG   r0   r0   r0   r1   r5   w   s   
 r5   NestedTensors)r!   .abr9   c                 C   s   t | tjrt |tjot| |S t |tjr$t | tjo#t|| S t | tr:t |to9tdd t| |D S t |trPt | toOtdd t|| D S | |kS )z[Equality check between
    [`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects.c                 s       | ]
\}}t ||V  qd S r:   rE   ).0a_b_r0   r0   r1   	<genexpr>       z'nested_tensors_equal.<locals>.<genexpr>c                 s   rM   r:   rN   )rO   rQ   rP   r0   r0   r1   rR      rS   )rC   r   Tensorequallistallzip)rK   rL   r0   r0   r1   rE      s   



rE   BatchedTensorInputsc                   @   sL   e Zd ZU dZeed< 	 eed< 	 eed< 	 ded< 	 dedefd	d
Z	dS )MultiModalFieldElemz
    Represents a keyword argument corresponding to a multi-modal item
    in [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs].
    modalitykeydataBaseMultiModalFieldfieldrA   r9   c                 C   sz   t || jsdS | jd u r|jd u }n|jd u r| jd u }nt| j|j}| j| jf|j|jfko<|o<t| jt|jkS rB   )rC   rD   r]   rE   r[   r\   typer_   )r?   rA   Z
data_equalr0   r0   r1   rG      s   

zMultiModalFieldElem.__eq__N)
r+   r,   r-   r.   strr/   rJ   rH   rI   rG   r0   r0   r0   r1   rZ      s   
 	rZ   c                
   @   s   e Zd ZdZdedefddZededededee	 fdd	Z
ed
ee dedefddZdddee	 dedefddZdS )r^   z
    Defines how to interpret tensor data belonging to a keyword argument in
    [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] for multiple
    multi-modal items, and vice versa.
    r[   r\   c                   s*   t t||| d dtdtf fdd}|S )N)r[   r\   r_   r]   r9   c                    s
    | dS )Nr]   r0   rb   fr0   r1   factory  s   
z3BaseMultiModalField._field_factory.<locals>.factory)r	   rZ   rJ   )r?   r[   r\   re   r0   rc   r1   _field_factory  s   z"BaseMultiModalField._field_factoryr]   r9   c                 C      t )a
  
        Construct
        [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]
        instances to represent the provided data.

        This is the inverse of
        [`reduce_data`][vllm.multimodal.inputs.BaseMultiModalField.reduce_data].
        NotImplementedErrorr?   r[   r\   r]   r0   r0   r1   build_elems  s   zBaseMultiModalField.build_elemsbatch
pin_memoryc                C   rg   r:   rh   r?   rl   rm   r0   r0   r1   _reduce_data$  s   z BaseMultiModalField._reduce_dataFrm   elemsc                C   sH   dd |D }t t|dkrtd|dd |D }| j||dS )z
        Merge the data from multiple instances of
        [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem].

        This is the inverse of
        [`build_elems`][vllm.multimodal.inputs.BaseMultiModalField.build_elems].
        c                 S   s   g | ]}t |jqS r0   )r`   r_   rO   r=   r0   r0   r1   
<listcomp>:      z3BaseMultiModalField.reduce_data.<locals>.<listcomp>r   z#Cannot merge different field_types=c                 S   s   g | ]}|j qS r0   rb   rO   elemr0   r0   r1   rs   >      rp   )lenset
ValueErrorro   )r?   rq   rm   Zfield_typesrl   r0   r0   r1   reduce_data-  s
   zBaseMultiModalField.reduce_dataN)r+   r,   r-   r.   ra   rf   r   rJ   r   rZ   rk   rV   rI   ro   r{   r0   r0   r0   r1   r^      s:    r^   c                	   @   sH   e Zd ZdZdedededee fddZde	e d	e
defd
dZdS )MultiModalBatchedFieldzo
    Info:
        [`MultiModalFieldConfig.batched`][vllm.multimodal.inputs.MultiModalFieldConfig.batched]
    r[   r\   r]   r9   c                    s    | j ||d  fdd|D S )Nr[   r\   c                    s   g | ]} |qS r0   r0   rr   field_factoryr0   r1   rs   P      z6MultiModalBatchedField.build_elems.<locals>.<listcomp>)rf   rj   r0   r~   r1   rk   I  s   z"MultiModalBatchedField.build_elemsrl   rm   c                   s   t |dkrLt|tjddrLt |dkr|d d S |d j t fdd|D rLtjt |g|d jR |d j	|d j
|d}tj||dS |S )	Nr   rW   checkr   c                 3   s    | ]}|j  kV  qd S r:   shaperu   first_shaper0   r1   rR   _      z6MultiModalBatchedField._reduce_data.<locals>.<genexpr>dtypedevicerm   out)rx   r   r   rT   	unsqueeze
contiguousr   rW   emptyr   r   stack)r?   rl   rm   r   r0   r   r1   ro   R  s   
z#MultiModalBatchedField._reduce_dataN)r+   r,   r-   r.   ra   rJ   r   rZ   rk   rV   rI   ro   r0   r0   r0   r1   r|   B  s$    
	r|   c                	   @   sr   e Zd ZU dZeee eee  f ed< dZe	ed< de
de
dedee fd	d
Zdee dedefddZdS )MultiModalFlatFieldz
    Info:
        [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
        [`MultiModalFieldConfig.flat_from_sizes`][vllm.multimodal.inputs.MultiModalFieldConfig.flat_from_sizes]
    slicesr   dimr[   r\   r]   r9   c                    sH   | j ||dt| jtddst tjsJ d fdd| jD S )Nr}   rW   r   z,torch.Tensor is required for multiple slicesc                    s   g | ]} t t| qS r0   )r   slice)rO   sr]   r   r0   r1   rs   }  s    z3MultiModalFlatField.build_elems.<locals>.<listcomp>)rf   r   r   r   rC   r   rT   rj   r0   r   r1   rk   s  s   zMultiModalFlatField.build_elemsrl   rm   c                   s  t |dkrst|tjddrst |dkr|d  S | j| jdk t |d j  dtjffdd  |d t fdd	|D rs\}}tfd
d	|D }tj	g |||R |d j
|d j|d}tj|| j|dS | jdks|J ddd |D S )Nr   rW   r   r   tensorc                    s    | j d   | j  d d  fS Nr   r   )r   r   r0   r1   _shape_before_after  s    z=MultiModalFlatField._reduce_data.<locals>._shape_before_afterc                 3   s    | ]	} |kV  qd S r:   r0   ru   )r   r   r0   r1   rR     s    z3MultiModalFlatField._reduce_data.<locals>.<genexpr>c                 3   s    | ]}|j   V  qd S r:   r   rr   r   r0   r1   rR     r   r   )r   r   z$dim == 0 is required for nested listc                 S   s   g | ]	}|D ]}|qqS r0   r0   )rO   rv   er0   r0   r1   rs         z4MultiModalFlatField._reduce_data.<locals>.<listcomp>)rx   r   r   rT   r   r   r   rW   r<   r   r   r   concat)r?   rl   rm   Zshape_beforeZshape_afterZshape_concatr   r0   )r   r   r   r1   ro     s"   z MultiModalFlatField._reduce_dataN)r+   r,   r-   r.   r   r   r   r/   r   r;   ra   rJ   rZ   rk   rV   rI   ro   r0   r0   r0   r1   r   i  s(   
 
r   c                	   @   sR   e Zd ZU dZeed< dedededee	 fddZ
d	ee d
edefddZdS )MultiModalSharedFieldzm
    Info:
        [`MultiModalFieldConfig.shared`][vllm.multimodal.inputs.MultiModalFieldConfig.shared]
    
batch_sizer[   r\   r]   r9   c                 C   s   | j ||d}||g| j S )Nr}   )rf   r   )r?   r[   r\   r]   r   r0   r0   r1   rk     s   z!MultiModalSharedField.build_elemsrl   rm   c                C   s   |d S )Nr   r0   rn   r0   r0   r1   ro     s   z"MultiModalSharedField._reduce_dataN)r+   r,   r-   r.   r;   r/   ra   rJ   r   rZ   rk   rV   rI   ro   r0   r0   r0   r1   r     s&   
 
	r   c                
       s   e Zd ZedefddZe	ddedeee eee  f de	fddZ
e	dded	d
de	fddZedede	fddZdededdf fddZdededee fddZ  ZS )MultiModalFieldConfigr[   c                 C   s   t t | dS )a  
        Defines a field where an element in the batch is obtained by
        indexing into the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.

        Example:

        ```
        Input:
            Data: [[AAAA]
                [BBBB]
                [CCCC]]

        Output:
            Element 1: [AAAA]
            Element 2: [BBBB]
            Element 3: [CCCC]
        ```
        r_   r[   )r   r|   r[   r0   r0   r1   batched  s   zMultiModalFieldConfig.batchedr   r   r   c                 C   s   t t||d| dS )a  
        Defines a field where an element in the batch is obtained by
        slicing along the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            slices: For each multi-modal item, a slice (dim=0) or a tuple of
                slices (dim>0) that is used to extract the data corresponding
                to it.
            dim: The dimension to extract data, default to 0.

        Example:

        ```
        Given:
            slices: [slice(0, 3), slice(3, 7), slice(7, 9)]

        Input:
            Data: [AAABBBBCC]

        Output:
            Element 1: [AAA]
            Element 2: [BBBB]
            Element 3: [CC]
        ```

        ```
        Given:
            slices: [
                (slice(None), slice(0, 3)),
                (slice(None), slice(3, 7)),
                (slice(None), slice(7, 9))]
            dim: 1

        Input:
            Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]

        Output:
            Element 1: [[A],[A],[A]]
            Element 2: [[B],[B],[B],[B]]
            Element 3: [[C],[C]]
        ```
        )r   r   r   )r   r   )r[   r   r   r0   r0   r1   flat  s   0
zMultiModalFieldConfig.flatsize_per_itemr!   c                    sT   |j dkrtd|j dgt| fddtt|D }tj| | dS )aP  
        Defines a field where an element in the batch is obtained by
        slicing along the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            slices: For each multi-modal item, the size of the slice that
                is used to extract the data corresponding to it.
            dim: The dimension to slice, default to 0.

        Example:

        ```
        Given:
            size_per_item: [3, 4, 2]

        Input:
            Data: [AAABBBBCC]

        Output:
            Element 1: [AAA]
            Element 2: [BBBB]
            Element 3: [CC]
        ```

        ```
        Given:
            slices: [3, 4, 2]
            dim: 1

        Input:
            Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]

        Output:
            Element 1: [[A],[A],[A]]
            Element 2: [[B],[B],[B],[B]]
            Element 3: [[C],[C]]
        ```

        Info:
            [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
        r   z7size_per_item should be a 1-D tensor, but found shape: r   c                    s6   g | ]}t d d d f  t | |d  f qS r   )r   )rO   ir   Z
slice_idxsr0   r1   rs   C  s
    z9MultiModalFieldConfig.flat_from_sizes.<locals>.<listcomp>r   )ndimrz   r   r
   rangerx   r   r   )r[   r   r   r   r0   r   r1   flat_from_sizes  s   
0
z%MultiModalFieldConfig.flat_from_sizesr   c                 C   s   t t|| dS )a  
        Defines a field where an element in the batch is obtained by
        taking the entirety of the underlying data.

        This means that the data is the same for each element in the batch.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            batch_size: The number of multi-modal items which share this data.

        Example:

        ```
        Given:
            batch_size: 4

        Input:
            Data: [XYZ]

        Output:
            Element 1: [XYZ]
            Element 2: [XYZ]
            Element 3: [XYZ]
            Element 4: [XYZ]
        ```
        r   )r   r   )r[   r   r0   r0   r1   sharedI  s   zMultiModalFieldConfig.sharedr_   r9   Nc                    s   t    || _|| _d S r:   )super__init__r_   r[   )r?   r_   r[   rD   r0   r1   r   k  s   

zMultiModalFieldConfig.__init__r\   rl   c                 C   s   | j | j||S r:   )r_   rk   r[   )r?   r\   rl   r0   r0   r1   rk   q  s   z!MultiModalFieldConfig.build_elems)r   )r+   r,   r-   staticmethodra   r   r   r   r   r;   r   r   r   r^   r   rJ   rZ   rk   __classcell__r0   r0   r   r1   r     s6    4:!r   c                       s   e Zd ZdZedefddZedee fddZ	i fde
eef d	d
f fddZed	efddZd	e
eef fddZ  ZS )MultiModalKwargsItemz
    A collection of
    [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]
    corresponding to a data item in
    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
    r[   c                 C   s&   t | dtdtdd}t|gS )zConvenience class for testing.dummyr   )r[   r\   r]   r_   )rZ   r   r   r   r   
from_elems)r[   Zmm_elemr0   r0   r1   r     s   zMultiModalKwargsItem.dummyrq   c                 C   s   t dd | D S )Nc                 S   s   i | ]}|j |qS r0   r\   ru   r0   r0   r1   
<dictcomp>  r   z3MultiModalKwargsItem.from_elems.<locals>.<dictcomp>)r   )rq   r0   r0   r1   r     s   zMultiModalKwargsItem.from_elemsr]   r9   Nc                    sL   t  | dd | j D }t|dksJ d| tt|| _d S )Nc                 S   s   h | ]}|j qS r0   r   ru   r0   r0   r1   	<setcomp>  rw   z0MultiModalKwargsItem.__init__.<locals>.<setcomp>r   zFound different modalities=)r   r   r]   valuesrx   nextiter	_modality)r?   r]   
modalitiesr   r0   r1   r     s   zMultiModalKwargsItem.__init__c                 C      | j S r:   )r   r>   r0   r0   r1   r[     s   zMultiModalKwargsItem.modalityc                 C   s   dd |   D S )Nc                 S   s   i | ]\}}||j qS r0   rb   )rO   r\   rv   r0   r0   r1   r     s    z1MultiModalKwargsItem.get_data.<locals>.<dictcomp>)itemsr>   r0   r0   r1   get_data  s   zMultiModalKwargsItem.get_data)r+   r,   r-   r.   r   ra   r   r   rZ   r   r   r   propertyr[   rJ   r   r   r0   r0   r   r1   r   y  s    
"r   c                       s  e Zd ZdZedddeeef fddZd;de	e
 d	d
f fddZedd Ze	d<deded	efddZe	d<ded  ded	efddZededejjd	efddZdd Zdd Zdd  Zd=d!efd"d#Zd!efd$d%Zd&d' Zd!efd(d)Zd*ed	efd+d,Z d-ed.ed	d
fd/d0Z!d.ed	e"fd1d2Z#d.ed3e"d	e
fd4d5Z$d.ed	e	e
 fd6d7Z%dd8ded	eeef fd9d:Z&  Z'S )>MultiModalKwargsa  
    A dictionary that represents the keyword arguments to
    [`torch.nn.Module.forward`][].

    The metadata `items` enables us to obtain the keyword arguments
    corresponding to each data item in
    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems], via
    [`get_item`][vllm.multimodal.inputs.MultiModalKwargs.get_item] and
    [`get_items`][vllm.multimodal.inputs.MultiModalKwargs.get_items].
    	hf_inputsr   config_by_keyc                    s.  t ttt f   tttt f t}| D ]%\}}| |}|d ur<|||}t	|dkr<| |< ||j
 | qtt  }| D ]L\}}	 fdd|	D }
dd |
 D }t	t| dkrptd|d|tt| }t|D ]fdd	|
 D }|t| q|qFt|S )
Nr   c                    s   i | ]}| | qS r0   r0   )rO   k)elems_by_keyr0   r1   r     rt   z3MultiModalKwargs.from_hf_inputs.<locals>.<dictcomp>c                 S   s   i | ]	\}}|t |qS r0   )rx   )rO   r   vr0   r0   r1   r     r   r   z0Cannot merge different batch sizes for modality=z! Found: batch_sizes=c                    s   g | ]}|  qS r0   r0   )rO   r   )item_idxr0   r1   rs     r   z3MultiModalKwargs.from_hf_inputs.<locals>.<listcomp>)dictra   r   rZ   r   ry   r   getrk   rx   r[   addrV   r   r   rz   r   r   r   appendr   r   )r   r   Zkeys_by_modalityr\   configrl   rq   r   r[   keysZelems_in_modalityZbatch_sizesr   r0   )r   r   r1   from_hf_inputs  s4   

zMultiModalKwargs.from_hf_inputsr0   r   r9   Nc                    s.   t    t|dd d}t|| _d | _d S )Nc                 S   r   r:   r   xr0   r0   r1   <lambda>  s    z+MultiModalKwargs.__init__.<locals>.<lambda>r   )r   r   r   r   _items_by_modality_data)r?   r   Zitems_by_modalityr   r0   r1   r     s   


zMultiModalKwargs.__init__c                 C   s
   | j  S r:   )r   r   r>   r0   r0   r1   r     s   
zMultiModalKwargs.modalitiesFnested_tensorsrm   c                    s   t | tjr| S t | tjrt| S t | ttfrt| S  fdd| D }t	|tjdds2|S t
ttj |tdkrId d S tfddD rVS tjtgd jR d jd j d	}tj|d
S )z
        Stack the inner dimensions that have the same shape in
        a nested list of tensors.

        Thus, a dimension represented by a list means that the inner
        dimensions are different for each element along that dimension.
        c                    s   g | ]}t | qS r0   r   
_try_stackrO   trp   r0   r1   rs     s    z/MultiModalKwargs._try_stack.<locals>.<listcomp>rW   r   r   r   c                 3   s     | ]}|j  d  j kV  qdS )r   Nr   r   )tensors_r0   r1   rR     s    z.MultiModalKwargs._try_stack.<locals>.<genexpr>r   r   )rC   r   rT   npndarrayZ
from_numpyr;   floatr   r   r   rV   rx   r   r   anyr   r   r   r   r   )r   rm   ZstackedZoutputsr0   )rm   r   r1   r     s0   



zMultiModalKwargs._try_stackinputs_listc                    sd   t | dkri S tttt f t}| D ]}| D ]\}}|| | qq fdd| D S )a  
        Batch multiple inputs together into a dictionary.

        The resulting dictionary has the same keys as the inputs.
        If the corresponding value from each input is a tensor and they all
        share the same shape, the output value is a single batched tensor;
        otherwise, the output value is a list containing the original value
        from each input.
        r   c                    s   i | ]\}}|t | qS r0   r   )rO   r   Z	item_listrp   r0   r1   r     s    z*MultiModalKwargs.batch.<locals>.<dictcomp>)rx   r   ra   rV   rJ   r   r   )r   rm   Z
item_listsZinputsr   r   r0   rp   r1   rl     s   
zMultiModalKwargs.batchbatched_inputsr   c                   s,   t ttj | }t fdd|}t t|S )Nc                    s   | j  ddS )NT)r   Znon_blocking)tor   r   r0   r1   r   *  s    z,MultiModalKwargs.as_kwargs.<locals>.<lambda>)r   r   r   rT   r   rY   )r   r   Zjson_inputsZjson_mappedr0   r   r1   	as_kwargs!  s   

zMultiModalKwargs.as_kwargsc                 C      |    S r:   )r   r   r>   r0   r0   r1   r   0     zMultiModalKwargs.keysc                 C   r   r:   )r   r   r>   r0   r0   r1   r   3  r   zMultiModalKwargs.valuesc                 C   r   r:   )r   r   r>   r0   r0   r1   r   6  r   zMultiModalKwargs.itemsr\   c                C   s   |   ||S r:   )r   r   )r?   r\   defaultr0   r0   r1   r   9  s   zMultiModalKwargs.getc                 O   s`   t |  }|j|g|R i |}| j D ]}|D ]}|j|g|R i | qqd | _|S r:   )r   r   popr   r   r   )r?   r\   argskwargsr]   resr   r=   r0   r0   r1   r   <  s   zMultiModalKwargs.popc                 C   s   t |  S r:   )r   r   r>   r0   r0   r1   __iter__H  r   zMultiModalKwargs.__iter__c                 C   s   |   | S r:   )r   )r?   r\   r0   r0   r1   __getitem__K  r   zMultiModalKwargs.__getitem__rA   c                 C   s   t || jsdS | j|jkS rB   )rC   rD   r   rF   r0   r0   r1   rG   N  s   zMultiModalKwargs.__eq__method_namer[   c                 C   sF   | j std| d|| j vr!t| j  }td|d| d S )N`zH` is not supported when MultiModalKwargs is not initialized with `items`z	Modality z" not found. Available modalities: )r   RuntimeErrorry   r   KeyError)r?   r   r[   Zavailable_modalitiesr0   r0   r1   _validate_modalityT  s   


z#MultiModalKwargs._validate_modalityc                 C   s   |  d| t| j| S )z0Get the number of items belonging to a modality.get_item_count)r   rx   r   r?   r[   r0   r0   r1   r   _  s   zMultiModalKwargs.get_item_count
item_indexc                 C   s   |  d| | j| | S )zr
        Get the keyword arguments corresponding to an item identified by
        its modality and index.
        get_itemr   r   )r?   r[   r   r0   r0   r1   r   d  s   zMultiModalKwargs.get_itemc                 C   s   |  d| | j| S )zg
        Get the keyword arguments corresponding to each item belonging to
        a modality.
        	get_itemsr   r   r0   r0   r1   r   l  s   
zMultiModalKwargs.get_itemsrp   c                   s~   | j d ur| j S tttt f t}| j D ]}|D ]}| D ]\}}|| | q!qq fdd| D }|| _ |S )Nc                    s2   i | ]\}}t |d kr||d  jj| dqS )r   rp   )rx   r_   r{   )rO   r\   rq   rp   r0   r1   r     s    z-MultiModalKwargs.get_data.<locals>.<dictcomp>)	r   r   ra   rV   rZ   r   r   r   r   )r?   rm   r   r   r=   r\   rv   r]   r0   rp   r1   r   t  s   

zMultiModalKwargs.get_data)r0   )Fr:   )(r+   r,   r-   r.   r   r   ra   r   r   r   r   r   r   r   rJ   rI   r   rV   rY   rl   r   typesZDevicer   r   r   r   r   r   r   r   rH   rG   r   r;   r   r   r   r   r   r0   r0   r   r1   r     sf    
!
+

r   MultiModalPlaceholderDictc                   @   sx   e Zd ZU dZed ed< 	 eed< 	 ee ed< 	 e	ee  ed< 	 e
ed< 	 ed ed	< 	 d
ed< 	 e	e ed< dS )MultiModalInputsz
    Represents the outputs of
    [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor],
    ready to be passed to vLLM internals.
    Z
multimodalr`   promptZprompt_token_idsZtoken_type_idsZ	mm_kwargsr   Z	mm_hashesr   Zmm_placeholdersZ
cache_saltN)r+   r,   r-   r.   r   r/   ra   rV   r;   r   r   r   r0   r0   r0   r1   r     s$   
 r   c                   @   s:   e Zd ZU dZeed< 	 ee ed< 	 eee  ed< dS )MultiModalEncDecInputsz
    Represents the outputs of
    [`EncDecMultiModalProcessor`][vllm.multimodal.processing.EncDecMultiModalProcessor]
    ready to be passed to vLLM internals.
    Zencoder_promptZencoder_prompt_token_idsZencoder_token_type_idsN)	r+   r,   r-   r.   ra   r/   rV   r;   r   r0   r0   r0   r1   r     s   
 r   )Oabcr   r   collectionsr   r   collections.abcr   r   dataclassesr   	functoolsr	   	itertoolsr
   typingr   r   r   r   r   r   r   r   r   numpyr   Ztyping_extensionsr   r   Z
vllm.utilsr   r   r   Zvllm.utils.jsontreer   r   r   Ztorch.typesZ	PIL.Imager   Z%transformers.feature_extraction_utilsr   Zhasherr   globalsr    r   r"   r/   rV   r#   r   r$   r%   tupler   ra   r&   r'   r(   r)   r3   r5   rJ   rI   rE   rY   rZ   r^   r|   r   r   r   r   r   r   r   r   r0   r0   r0   r1   <module>   s   ,	
03D&6 @& j%