o
    )iI+                     @   s(  d dl mZmZ d dlmZ d dlmZmZ d dlm	Z	m
Z
mZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ dd	lmZmZmZmZmZ dd
lm Z m!Z!m"Z" ee#Z$eG dd dZ%G dd de
Z&G dd de
Z'ede!dZ(G dd dee	e( Z)G dd de	e( Z*dS )    )ABCabstractmethod)Mapping)	dataclassfield)Generic
NamedTupleOptionalTypeVarUnioncastN)Image)init_logger   )MultiModalDataDictMultiModalEncDecInputsMultiModalInputsMultiModalKwargsMultiModalPlaceholderDict)BaseMultiModalProcessorBaseProcessingInfoEncDecMultiModalProcessorc                   @   sb   e Zd ZU dZeeee f ed< e	ed< e
edZeeef ed< e
edZeeef ed< dS )ProcessorInputszq
    Represents the keyword arguments to
    [`vllm.multimodal.processing.BaseMultiModalProcessor.apply`][].
    promptmm_data)default_factoryhf_processor_mm_kwargstokenization_kwargsN)__name__
__module____qualname____doc__r   strlistint__annotations__r   r   dictr   r   objectr    r(   r(   e/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/multimodal/profiling.pyr      s   
 r   c                   @   s   e Zd ZU dZee ed< dS )DummyEncoderDataDummy data used for profiling.prompt_token_idsN)r   r   r    r!   r#   r$   r%   r(   r(   r(   r)   r*   $   s   
 r*   c                   @   s.   e Zd ZU dZee ed< eed< eed< dS )DummyDecoderDatar+   r,   multi_modal_datamulti_modal_placeholdersN)	r   r   r    r!   r#   r$   r%   r   r   r(   r(   r(   r)   r-   *   s
   
 r-   _I)boundc                       s   e Zd ZdZdeddf fddZedeee	f defdd	Z
ed
e	deee	f defddZd
e	deee	f defddZde	de	deej fddZde	de	de	deej fddZde	de	de	de	deej f
ddZ  ZS )BaseDummyInputsBuilderz_
    Abstract base class that constructs the dummy data to profile
    multi-modal models.
    inforeturnNc                       t    || _d S N)super__init__r3   )selfr3   	__class__r(   r)   r8   ;   s   

zBaseDummyInputsBuilder.__init__	mm_countsc                 C      t )zD
        Build the text input corresponding to `mm_counts`.
        NotImplementedError)r9   r<   r(   r(   r)   get_dummy_text@   s   z%BaseDummyInputsBuilder.get_dummy_textseq_lenc                 C   r=   )z
        Build the multimodal input which, after processing, results in
        the maximum possible number of placeholder tokens.
        r>   r9   rA   r<   r(   r(   r)   get_dummy_mm_dataG   s   
z(BaseDummyInputsBuilder.get_dummy_mm_datac                 C   s,   |  |}| ||}ddi}t|||dS )z
        Build the input which, after processing, results in
        the maximum possible number of placeholder tokens.
        Z
truncationF)r   r   r   )r@   rC   r   )r9   rA   r<   Z
dummy_textZdummy_mm_datar   r(   r(   r)   get_dummy_processor_inputsS   s   
	z1BaseDummyInputsBuilder.get_dummy_processor_inputslength
num_audiosc                C   s"   |dkrg S t |f}|g| S )Nr   )npZzeros)r9   rE   rF   Zaudior(   r(   r)   _get_dummy_audiosd   s   
z(BaseDummyInputsBuilder._get_dummy_audioswidthheight
num_imagesc                C   s*   |dkrg S t jd||fdd}|g| S )Nr   RGB   )color)r   new)r9   rI   rJ   rK   imager(   r(   r)   _get_dummy_imageso   s   
z(BaseDummyInputsBuilder._get_dummy_images
num_frames
num_videosc                C   s*   |dkrg S t |||dfd}|g| S )Nr      rM   )rG   full)r9   rI   rJ   rR   rS   Zvideor(   r(   r)   _get_dummy_videos{   s   
z(BaseDummyInputsBuilder._get_dummy_videos)r   r   r    r!   r0   r8   r   r   r"   r$   r@   r   rC   r   rD   r#   nptZNDArrayrH   r   rQ   rV   __classcell__r(   r(   r:   r)   r2   5   s^    




r2   c                       sV  e Zd ZdZdee ddf fddZedefddZ	ede
e fd	d
Zdeeef fddZ	ddedeeeef  defddZ	ddededeeef fddZ	ddedeeeef  defddZ	ddedeeeef  defddZ		d dedeeeef  dedeeef fddZ	ddedeeeef  fddZ  ZS )!MultiModalProfilerzL
    Contains code for running memory profiling for multi-modal models.
    	processorr4   Nc                    r5   r6   )r7   r8   rZ   )r9   rZ   r:   r(   r)   r8      s   

zMultiModalProfiler.__init__c                 C      | j jS r6   )rZ   r3   r9   r(   r(   r)   processing_info      z"MultiModalProfiler.processing_infoc                 C   r[   r6   )rZ   dummy_inputsr\   r(   r(   r)   r_      r^   zMultiModalProfiler.dummy_inputsc                 C   r[   r6   )rZ   Zallowed_mm_limitsr\   r(   r(   r)   get_mm_limits   s   z MultiModalProfiler.get_mm_limitsrA   r<   c                 C   s>   |d u r|   }| j}|||}| jj|j|j|j|jdS )N)r   r   r   r   )	r`   r_   rD   rZ   applyr   r   r   r   )r9   rA   r<   factoryZprocessor_inputsr(   r(   r)   _get_dummy_mm_inputs   s   z'MultiModalProfiler._get_dummy_mm_inputsT	mm_inputsmm_embeddings_onlyc                    s   |d } fdd|  D S )Nmm_placeholdersc                    s(   i | ]\}}|t  fd d|D qS )c                 3   s"    | ]} r
|  n|jV  qd S r6   )Zget_num_embedsrE   ).0itemre   r(   r)   	<genexpr>   s    zCMultiModalProfiler._get_mm_num_tokens.<locals>.<dictcomp>.<genexpr>)sum)rg   ZmodalityZplaceholdersri   r(   r)   
<dictcomp>   s    z9MultiModalProfiler._get_mm_num_tokens.<locals>.<dictcomp>)items)r9   rd   re   Zplaceholders_by_modalityr(   ri   r)   _get_mm_num_tokens   s   
z%MultiModalProfiler._get_mm_num_tokensc              	   C   s   |  ||}tt|}|d }t|}tt| j}|jr/t||| }|dg|  t|S ||krCt	j
sCtd||t| | t|S )Nencoder_prompt_token_idsr   a  The encoder sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). This may cause certain multi-modal inputs to fail during inference, even when the input text is short. To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.)rc   r   r   lenr   rZ   Zpad_dummy_encoder_promptmaxextendenvsVLLM_USE_V1loggerwarning_oncer"   rn   r*   )r9   rA   r<   rd   ro   	total_lenrZ   Znum_tokens_to_padr(   r(   r)   get_encoder_dummy_data   s"   

z)MultiModalProfiler.get_encoder_dummy_datac              	   C   sv   |  ||}|d }t|}||kr"tjs"td||t| | ||k r0|dg||   t	||d |d dS )Nr,   a  The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). This may cause certain multi-modal inputs to fail during inference, even when the input text is short. To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.r   Z	mm_kwargsrf   )r,   r.   r/   )
rc   rp   rs   rt   ru   rv   r"   rn   rr   r-   )r9   rA   r<   rd   r,   rw   r(   r(   r)   get_decoder_dummy_data   s"   
z)MultiModalProfiler.get_decoder_dummy_datac                    s   d u r|   | jj|d  d ur>d u rt  }nt fdd   @ D }||kr<td||  S | |}| j	||dS )N)rA   r<   c                 3   s     | ]} | |  V  qd S r6   r(   )rg   kZmax_tokens_per_itemr<   r(   r)   rj     s    z8MultiModalProfiler._get_mm_max_tokens.<locals>.<genexpr>zThe sequence length (%d) is smaller than the pre-defined worst-case total number of multimodal tokens (%d). This may cause certain multi-modal inputs to fail during inference. To avoid this, you should increase `max_model_len` or reduce `mm_counts`.ri   )
r`   r]   Zget_mm_max_tokens_per_itemrk   valueskeysru   rv   rc   rn   )r9   rA   r<   re   Ztotal_mm_tokensrd   r(   r{   r)   _get_mm_max_tokens  s2   	z%MultiModalProfiler._get_mm_max_tokensc                 C   s   | j ||ddS )a  
        Returns the maximum length of the multimodal (image placeholders+text)
        tokens, including any break/text tokens in-between image embeddings.

        <im_start> [IMG] [IMG] [IMG] <row_break> [IMG] [IMG] [IMG] <im_end>
        Returns 9, even when the number of image embeddings is 6.
        
        This is important to take into account when profiling and
        initializing the encoder cache size.
        Fri   )r~   rB   r(   r(   r)   get_mm_max_contiguous_tokens'  s   z/MultiModalProfiler.get_mm_max_contiguous_tokensr6   )T)NT)r   r   r    r!   r   r0   r8   propertyr   r]   r2   r_   r   r"   r$   r`   r	   r   rc   boolrn   r*   rx   r-   ry   r~   r   rX   r(   r(   r:   r)   rY      sz    



$
#

'rY   )+abcr   r   collections.abcr   dataclassesr   r   typingr   r   r	   r
   r   r   numpyrG   Znumpy.typingrW   ZPILr   Z	vllm.envsrs   Zvllm.loggerr   Zinputsr   r   r   r   r   
processingr   r   r   r   ru   r   r*   r-   r0   r2   rY   r(   r(   r(   r)   <module>   s&    T