o
    )i-B                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZmZmZmZmZ d dlmZmZ d dlmZ d dlZd dlm
Z d dlZd d	lmZmZ d d
lmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 edZ1erddl2m3Z3m4Z4m5Z5m6Z6 neZ3eZ4eZ5eZ6ee j7dZ8e9e8j: G dd dZ;dej<de=de>fddZ?ddddejd e>d!e>de>fd"d#Z@d$ejAde>fd%d&ZBd'e6deCeDe>eEf  fd(d)ZFed*d+eCe4 deCeCe4  fd,d-ZGdd.d/d0eCe5 d1ejHjId2eJdeeDe>eEe3f  fd3d4ZKd5ejLd6ejMjNdejLfd7d8ZO	dEd9e>d:eePe>ef  deDej<eeEe=f f fd;d<ZQ	dEd=e>d>eePe>ef  dejfd?d@ZR	dEdAe>dBeePe>ef  deDejAePe>ef f fdCdDZSdS )F    N)Iterable)ThreadPoolExecutor)groupby)Path)TYPE_CHECKINGAnyOptionalTypeVarUnion)ParseResulturlparse)url2pathname)ImageUnidentifiedImageError)
deprecated)HTTPConnectionglobal_http_connection)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather   )AudioMediaIO)MediaIO)ImageEmbeddingMediaIOImageMediaIO)VideoMediaIO_M)BatchedTensorInputsMultiModalKwargsMultiModalKwargsItemMultiModalPlaceholderDict)max_workersc                       s  e Zd Zdefdddeeeeeef f  dededdf fdd	Z	d
e
dee defddZd
e
dee defddZdddedee dee defddZdddedee dee defddZdedeejeeef f fddZdedeejeeef f fddZdddededejfd d!Zdddededejfd"d#Zddd$ededeejeeef f fd%d&Zddd$ededeejeeef f fd'd(Zd)ede j!fd*d+Z"  Z#S ),MediaConnectorN )allowed_local_media_pathmedia_io_kwargs
connectionr$   returnc                   sj   t    |r	|ni | _|| _|r.t|}| s!td| d| s-td| dnd}|| _dS )a  
        Args:
            media_io_kwargs: Additional args passed to process media 
                             inputs, keyed by modalities. For example, 
                             to set num_frames for video, set 
                             `--media-io-kwargs '{"video":{"num_frames":40}}'`
            connection: HTTP connection client to download media contents.
            allowed_local_media_path: A local directory to load media files
                                      from.
        z/Invalid `--allowed-local-media-path`: The path z does not exist.z must be a directory.N)	super__init__r%   r&   r   exists
ValueErroris_dirr$   )selfr%   r&   r$   Zallowed_local_media_path_	__class__ a/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/multimodal/utils.pyr)   1   s*   


zMediaConnector.__init__url_specmedia_ioc                 C   sB   |j dd\}}|dd\}}|dkrd}t||||S )N,r   ;base64z,Only base64 data URLs are supported for now.)pathsplitNotImplementedErrorload_base64)r-   r2   r3   Z	data_specdata
media_typeZ	data_typemsgr0   r0   r1   _load_data_urlX   s   zMediaConnector._load_data_urlc                 C   sR   | j }|d u rtdtt|j}|| jvr$td| d| d||S )Nz=Cannot load local files without `--allowed-local-media-path`.zThe file path z3 must be a subpath of `--allowed-local-media-path` .)	r$   RuntimeErrorr   r   r7   resolveparentsr+   Z	load_file)r-   r2   r3   r$   filepathr0   r0   r1   _load_file_urlf   s   
zMediaConnector._load_file_urlfetch_timeouturlrF   c                C   sj   t |}|jdr| j}|j||d}||S |jdkr$| ||S |jdkr/| ||S d}t|Nhttp)timeoutr;   filez0The URL must be either a HTTP, data or file URL.)	r   scheme
startswithr&   	get_bytes
load_bytesr>   rD   r+   )r-   rG   r3   rF   r2   r&   r;   r=   r0   r0   r1   load_from_urlx   s   


zMediaConnector.load_from_urlc          
         s   t |}t }|jdr)| j}|j||dI d H }|t|j	|}|I d H S |jdkr<|t| j
||}|I d H S |jdkrO|t| j||}|I d H S d}	t|	rH   )r   asyncioget_running_looprL   rM   r&   Zasync_get_bytesrun_in_executorglobal_thread_poolrO   r>   rD   r+   )
r-   rG   r3   rF   r2   loopr&   r;   futurer=   r0   r0   r1   load_from_url_async   s0   




z"MediaConnector.load_from_url_async	audio_urlc                 C   s*   t di | jdi }| j||tjdS )z(
        Load audio from a URL.
        audiorE   Nr0   )r   r%   getrP   envsVLLM_AUDIO_FETCH_TIMEOUTr-   rX   audio_ior0   r0   r1   fetch_audio   s   zMediaConnector.fetch_audioc                    s2   t di | jdi }| j||tjdI dH S )z8
        Asynchronously fetch audio from a URL.
        rY   rE   Nr0   )r   r%   rZ   rW   r[   r\   r]   r0   r0   r1   fetch_audio_async   s   z MediaConnector.fetch_audio_asyncRGB
image_mode	image_urlrc   c             
   C   sX   t dd|i| jdi }z
| j||tjdW S  ty+ } ztt||d}~ww )z
        Load a PIL image from a HTTP or base64 data URL.

        By default, the image is converted into RGB format.
        rc   imagerE   Nr0   )	r   r%   rZ   rP   r[   VLLM_IMAGE_FETCH_TIMEOUTr   r+   strr-   rd   rc   image_ioer0   r0   r1   fetch_image   s   
zMediaConnector.fetch_imagec             
      s`   t dd|i| jdi }z| j||tjdI dH W S  ty/ } ztt||d}~ww )z
        Asynchronously load a PIL image from a HTTP or base64 data URL.

        By default, the image is converted into RGB format.
        rc   re   rE   Nr0   )	r   r%   rZ   rW   r[   rf   r   r+   rg   rh   r0   r0   r1   fetch_image_async   s   
z MediaConnector.fetch_image_async	video_urlc                C   sH   t dd|i| jdi }t|fi | jdi }| j||tjdS )z<
        Load video from a HTTP or base64 data URL.
        rc   re   videorE   Nr0   )r   r%   rZ   r   rP   r[   VLLM_VIDEO_FETCH_TIMEOUTr-   rm   rc   ri   video_ior0   r0   r1   fetch_video   s   
	zMediaConnector.fetch_videoc                   sP   t dd|i| jdi }t|fi | jdi }| j||tjdI dH S )z
        Asynchronously load video from a HTTP or base64 data URL.

        By default, the image is converted into RGB format.
        rc   re   rn   rE   Nr0   )r   r%   rZ   r   rW   r[   ro   rp   r0   r0   r1   fetch_video_async  s   
z MediaConnector.fetch_video_asyncr;   c                 C   s   t  }|d|S )z2
        Load image embedding from a URL.
        r#   )r   r:   )r-   r;   Zimage_embedding_ior0   r0   r1   fetch_image_embedding'  s   z$MediaConnector.fetch_image_embedding)$__name__
__module____qualname__r   r   dictrg   r   r   r)   r   r   r   r>   rD   intrP   rW   tuplenpndarrayr
   floatr_   r`   r   rk   rl   nptNDArrayrr   rs   torchTensorrt   __classcell__r0   r0   r.   r1   r"   /   s    '









r"   rY   sampling_rater'   c                 C   s   t  }|| |fS )zEncode audio as base64.)r   encode_base64)rY   r   r^   r0   r0   r1   encode_audio_base643  s   r   ra   ZJPEG)rc   formatre   rc   r   c                C   s   t |d}|j| |dS )z
    Encode a pillow image to base64 format.

    By default, the image is converted into RGB format before being encoded.
    rb   )Zimage_format)r   r   )re   rc   r   ri   r0   r0   r1   encode_image_base64<  s   
r   framesc                 C   s   t  }t|}|| S N)r   r   r   )r   ri   rq   r0   r0   r1   encode_video_base64K  s   
r   mm_positionsc                 C   s0   dd |   D }t|dd d}dd |D S )a/  
    Given a `MultiModalPlaceholderDict`, output a sequence of keys to
    sort the dictionary by `offset` (starting index in the input sequence)
    in ascending order.

    Returns:
        A list of `(modality, idx)`, which can be used to access an item
        by `mm_positions[modality][idx]`.
    c                 s   s0    | ]\}}t |D ]
\}}|||fV  q
qd S r   )	enumerate).0modalityitemsidxitemr0   r0   r1   	<genexpr>\  s    z'argsort_mm_positions.<locals>.<genexpr>c                 S   s
   | d j S )N   )offset)xr0   r0   r1   <lambda>`  s   
 z&argsort_mm_positions.<locals>.<lambda>keyc                 S   s   g | ]	\}}}||fqS r0   r0   )r   r   r   _r0   r0   r1   
<listcomp>b  s    z(argsort_mm_positions.<locals>.<listcomp>)r   sorted)r   Z
flat_itemsZsorted_flat_itemsr0   r0   r1   argsort_mm_positionsQ  s
   r   z`group_mm_inputs_by_modality` is superseded by `group_mm_kwargs_by_modality` and will be removed in v0.13. Please use `group_mm_kwargs_by_modality` instead.	mm_inputsc                 C   s8   | sg S dt dtttf fdd}dd t| |dD S )Nmm_inputr'   c                 S   s6   t | jdkrt| S t | jdkrt| jd S dS )Nr   r   r#   )lenZ
modalitiesidlist)r   r0   r0   r1   modality_group_funcn  s
   z8group_mm_inputs_by_modality.<locals>.modality_group_funcc                 S   s   g | ]\}}t |qS r0   )r   )r   r   groupr0   r0   r1   r   |  s    z/group_mm_inputs_by_modality.<locals>.<listcomp>r   )r   r
   rg   ry   r   )r   r   r0   r0   r1   group_mm_inputs_by_modalityf  s   
r   F)device
pin_memory	mm_kwargsr   r   c                #   sj    ddl m  t| dd dD ]#\}}t|} j j fdd|D |d|d	}|t||fV  qd
S )a  Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same
    modality together into the same `MultiModalKwargs` instance.

    Args:
        mm_inputs: List of `MultiModalKwargsItem`.

    Yields:
        A tuple `(modality, num_items, grouped_kwargs)`.
    r   r   c                 S   s   | j S r   )r   )r   r0   r0   r1   r     s    z-group_mm_kwargs_by_modality.<locals>.<lambda>r   c                    s   g | ]} |gqS r0   r0   )r   r   r   r0   r1   r     s    z/group_mm_kwargs_by_modality.<locals>.<listcomp>)r   )r   N)Zvllm.multimodal.inputsr   r   r   Z	as_kwargsbatchr   )r   r   r   r   r   Z	items_lstZmm_kwargs_groupr0   r   r1   group_mm_kwargs_by_modality  s   r   image_inputvision_modelc                 C   s   | j d }t }|| d | }|| | }dd|  d   d|f }tjj| |}t }||| |d | df }	||	}
t|
dd}
|
d|df }
|
S )aY  Run a vision model with data parallelism (DP) sharding. The function 
    will shard the input image tensor on the first dimension and run the vision
    model

    Args:
        image_input (torch.Tensor): Image input tensor.
        vision_model (torch.nn.Module): Vision model.

    Returns:
        torch.Tensor: Output image embeddings
    r   r   )r   r   .)dimN)	shaper   r   r   nnZ
functionalpadr   r   )r   r   Z
num_chunksZmp_world_sizeZnum_chunks_per_rankZnum_padded_chunksr   Zimage_input_paddedZrankZimage_input_per_rankZvision_embeddingsr0   r0   r1   run_dp_sharded_vision_model  s,   
r   rX   audio_io_kwargsc                 C   $   |sdnd|i}t |d}|| S )z
    Args:
        audio_url: URL of the audio file to fetch.
        audio_io_kwargs: Additional kwargs passed to handle audio IO.
    NrY   r%   )r"   r_   )rX   r   r%   media_connectorr0   r0   r1   r_     
   	

r_   rd   image_io_kwargsc                 C   r   )z
    Args:
        image_url: URL of the image file to fetch.
        image_io_kwargs: Additional kwargs passed to handle image IO.
    Nre   r   )r"   rk   )rd   r   r%   r   r0   r0   r1   rk     r   rk   rm   video_io_kwargsc                 C   r   )z
    Args:
        video_url: URL of the video file to fetch.
        video_io_kwargs: Additional kwargs passed to handle video IO.
    Nrn   r   )r"   rr   )rm   r   r%   r   r0   r0   r1   rr     r   rr   r   )TrQ   atexitcollections.abcr   concurrent.futuresr   	itertoolsr   pathlibr   typingr   r   r   r	   r
   urllib.parser   r   urllib.requestr   numpyr{   Znumpy.typingr~   r   ZPILr   r   Ztyping_extensionsr   Z	vllm.envsr[   Zvllm.connectionsr   r   Zvllm.distributedr   r   r   rY   r   baser   re   r   r   rn   r   r   Zinputsr   r   r   r    ZVLLM_MEDIA_LOADING_THREAD_COUNTrT   registershutdownr"   r|   r}   rg   r   r   r   r   r   rz   ry   r   r   typesZDeviceboolr   r   r   Moduler   rx   r_   rk   rr   r0   r0   r0   r1   <module>   s     




,
"

