o
    W+ iw                     @   st  d dl mZ d dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
l m!Z! d dl"m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8T ddl9m:Z: ddl;m<Z< g dZ=e7j>e/j?e!j@dG dd de5ZAe7j>e/j?e!jBdG dd de5ZCdd ZDe7j>e/j?e!jEdG dd de5ZFe7j>e/j?e!jGdG d d! d!e5ZHe7j>e/j?e!jIdG d"d# d#e5ZJe7j>e/j?e!jKdG d$d% d%e5ZLe7j>e/j?e!jMdG d&d' d'e5ZNe7j>e/j?e!jOdG d(d) d)e5ZPdS )*    N)BytesIO)AnyDictListTupleUnion)Image)create_transform
transforms)ImageFolder)Compose	NormalizeResizeToTensor)snapshot_download)Preprocessors)Input)VCenterCropVCompose
VNormalizeVRescale	VToTensor)
load_image)Config)FieldsInvokeModeKeys	ModelFileTasks   )Preprocessor)PREPROCESSORS)*)
collate_fn)OFA_TASK_KEY_MAPPING)$DiffusionImageGenerationPreprocessorOfaPreprocessorMPlugPreprocessorHiTeAPreprocessorMplugOwlPreprocessor)module_namec                       s6   e Zd ZdZ fddZdeeef fddZ  Z	S )r&   a    Preprocessor the data with the combination of image and text.
        Args:
            data: process the value as an image for keys ending with 'FILE'
                or existing in preprocessor_image_keys and pass-through the values of other keys.

    c              	      s   t  j|i | |dd| _|ddg| _|ddg| _t|dg | _|dd| _t	
t	j| jt	jjd	| jrDt	| jnt	| jt	 t	| j| jg| _d S )
N
resolutioni   meang      ?stdZ
image_keyscenter_cropTinterpolation)super__init__popZpreprocessor_resolutionZpreprocessor_meanZpreprocessor_stdsetpreprocessor_image_keysr/   r   r   r   ZInterpolationModeZBILINEARZ
CenterCropZ
RandomCropr   r   transform_input)selfargskwargs	__class__ p/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/modelscope/preprocessors/multi_modal.pyr3   2   s*   z-DiffusionImageGenerationPreprocessor.__init__returnc                 C   sj   i }|  D ],\}}|ds|| jv r(t|}| |}|||dd < q|r,|nd|| < q|S )Nz:FILE )itemsendswithr6   r   r7   replacelower)r8   dataresultskeyvalueimageimgr=   r=   r>   __call__F   s   
z-DiffusionImageGenerationPreprocessor.__call__)
__name__
__module____qualname____doc__r3   r   strr   rK   __classcell__r=   r=   r;   r>   r&   '   s    r&   c                       s   e Zd Zejfdef fddZdeee	e f de
eef fddZdd	 Zdeeee
eef f de
eef fd
dZ  ZS )r'   	model_dirc                    s   t  j|i | tjttjttjttj	t
tjttjttjttjttjttjttjttjti}t|r6|n	t|tjtj id}t!"t#|t$j%| _&|| j&j' | j&||d| _(t)| j&j' | _*| j(j+| _+|,ddrod| _-dS d| _-dS )preprocess the data

        Args:
            model_dir (str): model path
            mode: preprocessor mode (model mode)
        
user_agent)cfgrR   mode
no_collateNTF).r2   r3   r   Zocr_recognitionZOfaOcrRecognitionPreprocessorimage_captioningZOfaImageCaptioningPreprocessorZvisual_groundingZOfaVisualGroundingPreprocessorZvisual_question_answeringZ&OfaVisualQuestionAnsweringPreprocessorZvisual_entailmentZOfaVisualEntailmentPreprocessorZimage_classificationZ"OfaImageClassificationPreprocessorZtext_classificationZ!OfaTextClassificationPreprocessorZtext_summarizationZOfaSummarizationPreprocessorZtext_to_image_synthesisZ#OfaTextToImageSynthesisPreprocessorZauto_speech_recognitionZOfaASRPreprocessorZsudokuZOfaSudokuPreprocessorZtext2sqlZOfaTextToSqlPreprocessorospexistsr   r   KEYPREPROCESSORr   	from_filejoinr   CONFIGURATIONrV   task
preprocessr%   keys	tokenizergetrX   )r8   rR   rW   r9   r:   Zpreprocess_mappingr;   r=   r>   r3   V   s:   



zOfaPreprocessor.__init__inputr?   c                 C   sB   t  }t|tst|ts|f}t| j|D ]\}}|||< q|S N)dict
isinstancetuplelistziprc   )r8   rf   rE   rG   itemr=   r=   r>   _build_dict   s   
zOfaPreprocessor._build_dictc                 C   sz   d|v r;| j jdd dkr;t|d trt|d }n|d }|jdkr*|d}t }|j	|dd t
||d< |S )NrI   typeofaRGBZJPEG)format)rV   modelre   ri   rP   r   rW   convertr   saver   open)r8   rE   rI   Z
img_bufferr=   r=   r>   #_ofa_input_compatibility_conversion   s   

z3OfaPreprocessor._ofa_input_compatibility_conversionc           	      O   sr   t |tr|}n| |}| |}t }| D ]
\}}t|||< q||d< | jr-|S t|g| jj	| jj
dS )Nsample)Zpad_idxZeos_idx)ri   rh   rn   rb   rA   rP   rX   r$   rd   Zpad_token_idZeos_token_id)	r8   rf   r9   r:   rE   rx   Zstr_datakvr=   r=   r>   rK      s   


zOfaPreprocessor.__call__)rL   rM   rN   r   	INFERENCErP   r3   r   r   r   r   r   rn   rw   rj   rK   rQ   r=   r=   r;   r>   r'   R   s    &)
r'   c                 C   s
   |  dS )Nrq   )rt   )rI   r=   r=   r>   _convert_to_rgb   s   
r|   c                       s   e Zd Zejfdef fddZdd Z	ddeee	e f de
d	ejfd
dZdefddZdefddZdeeeeeef f d	eeef fddZ  ZS )CLIPPreprocessorrR   c                    s   t  j|i | t|r|n	t|tjtjid}|| _ddl	m
} d|v r4t|d |r4|d | _n| dtj }||d| _d|v rSt|d trS|d | _nttd|d	d
d | _|  | _ddd| _dS )rS   rT   r   )FullTokenizerrd   /)
vocab_filer,   z{}/vision_model_config.jsonutf-8encodingimage_resolutionrJ   text)rJ   r   N)r2   r3   rZ   r[   r   r   r\   r]   rW   Z1modelscope.models.multi_modal.clip.bert_tokenizerr~   ri   rd   r   Z
VOCAB_FILEintr   jsonloadrv   rr   _build_image_transformimg_preprocess
input_keys)r8   rR   rW   r9   r:   r~   r   r;   r=   r>   r3      s0   
zCLIPPreprocessor.__init__c              
   C   s~   | j tjkr(t| jddd ddddd}t|jd d tg |jdd   }|S tt| j| jft	j
d	tt tddg}|S )
N)g?g      ?ToriginalZbicubicg3<4'?gwgM?gy{ ?gB91?gwt.?g	U?)Z
input_sizescaleZis_trainingZcolor_jitterZauto_augmentr1   r-   r.   r0   )rW   r   ZTRAINr	   r   r   r   r|   r   r   BICUBICr   r   )r8   Z	transformr=   r=   r>   r      s4   
z'CLIPPreprocessor._build_image_transform4   textscontext_lengthr?   c              	   C   s   t |tr|g}g }|D ]#}|| jjd g| j| j|d|d   | jjd g  qtjt	||tj
d}t|D ]\}}t	||ksKJ t|||dt	|f< q?|S )a  
        Returns the tokenized representation of given input string(s)
        Parameters
        ----------
        texts : Union[str, List[str]]
            An input string or a list of input strings to tokenize
        context_length : int
            The context length to use; all baseline models use 24 as the context length
        Returns
        -------
        A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
        z[CLS]N   z[SEP])Zdtype)ri   rP   appendrd   ZvocabZconvert_tokens_to_idstokenizetorchZzeroslenlong	enumerateZtensor)r8   r   r   Z
all_tokensr   resultitokensr=   r=   r>   r      s(   


zCLIPPreprocessor.tokenizenew_keyc                 C      || j d< d S )NrJ   r   r8   r   r=   r=   r>   set_input_img_key     z"CLIPPreprocessor.set_input_img_keyc                 C   r   )Nr   r   r   r=   r=   r>   set_input_text_key  r   z#CLIPPreprocessor.set_input_text_keyrf   c                    s^  i } j d }||v r^|| d ur^|| }t|tjr$ |d}n6t|trQtdd |D rAtj fdd|D dd}ndd |D d }t	d| t	d	t
| ||d<  j d
 }	|	|v r||	 d ur||	 }
t|
tr| |
}n-t|
trtdd |
D r |
}ndd |
D d }t	d| t	dt
|
 ||d
< |S )NrJ   r   c                 S   s   g | ]}t |tjqS r=   )ri   r   .0elemr=   r=   r>   
<listcomp>"  s    z-CLIPPreprocessor.__call__.<locals>.<listcomp>c                    s   g | ]}  |qS r=   )r   r   r8   r=   r>   r   %  s    dimc                 S   s    g | ]}t |tjst|qS r=   )ri   r   ro   r   r=   r=   r>   r   )  s    
zfimg should be PIL.Image or List[PIL.Image],                             but got a List containing one z4img should be PIL.Image or List[PIL.Image], but got r   c                 S   s   g | ]}t |tqS r=   )ri   rP   r   r=   r=   r>   r   B  s    c                 S   s   g | ]}t |tst|qS r=   )ri   rP   ro   r   r=   r=   r>   r   E  s    z?text should be str or List[str], but got a List containing one z)text should be str or List[str], but got )r   ri   r   r   Z	unsqueezerk   allr   stack	TypeErrorro   rP   r   )r8   rf   r9   r:   outputZinput_img_keyZimage_inputZimage_tensorZunsupported_elem_typeZinput_text_keyZ
text_inputZtext_tensorr=   r   r>   rK     sf   





zCLIPPreprocessor.__call__)r   )rL   rM   rN   r   r{   rP   r3   r   r   r   r   r   
LongTensorr   r   r   rj   r   r   rK   rQ   r=   r=   r;   r>   r}      s&    $
"
r}   c                       s   e Zd Zejdfdededef fddZedd Z	ed	d
 Z
dedeejef fddZdeejeeeef f deeef fddZ  ZS )r(      rR   rW   tokenizer_max_lengthc                    s:   t  j|i | || _|| _|| _d | _d | _i | _d S rg   )r2   r3   rR   rW   r   
_tokenizer_patch_resize_transform
_image_mapr8   rR   rW   r   r9   r:   r;   r=   r>   r3   Z  s   
zMPlugPreprocessor.__init__c                 C   *   ddl m} | jd u r|| j| _| jS Nr   )BertTokenizerZtransformersr   r   from_pretrainedrR   r8   r   r=   r=   r>   rd   i     
zMPlugPreprocessor.tokenizerc                 C   |   | j d u r;ddlm} ddlm}m} |t| j	|}d}d}|
|j|j|jftjd| |j||dg| _ | j S )Nr   r
   )CONFIG_NAMEMPlugConfigr   r   r0   r-   r.   )r   torchvisionr   #modelscope.models.multi_modal.mplugr   r   from_yaml_filerZ   r_   rR   r   r   	image_resr   r   r   r   )r8   r   r   r   configr-   r.   r=   r=   r>   patch_resize_transformq      
z(MPlugPreprocessor.patch_resize_transformpathr?   c                 C   0   || j vrt| j }t||f| j |< | j | S rg   r   r   r   r8   r   indexr=   r=   r>   
image_open     


zMPlugPreprocessor.image_openrE   c                 C   sN  t t| jtj| _t|t	j	t
fr|}nt|tr!|d }n|d }d}t|t
r3| |\}}|d}| |}| jjtjkrFdn|t|trNdnd|v rTdnd }| j| dd	| jd
d}| jtjkrwtj|gdd}||dS |d }| j|dd	| jd
d}||j |j |j |j d}| jjtjkr||d< |S )Nr   rI   rq   r@   r    r   question
max_lengthTptpaddingZ
truncationr   Zreturn_tensorsr   )rI   r   answer)rI   question_input_idsquestion_attention_maskanswer_input_idsanswer_attention_maskr   )r   r^   rZ   r_   rR   r   r`   rV   ri   r   rP   rj   r   rt   r   ra   r   rY   rd   rD   r   rW   r   r{   r   r   	input_idssqueezeattention_maskZimage_text_retrieval)r8   rE   rI   r   r   r   r   r=   r=   r>   rK     sZ   





zMPlugPreprocessor.__call__)rL   rM   rN   r   r{   rP   r   r3   propertyrd   r   r   r   r   r   rj   r   r   rK   rQ   r=   r=   r;   r>   r(   V  s,    



r(   c                       sN   e Zd Zejfdedef fddZdeeef deeef fddZ	  Z
S )	VLDocPreprocessorrR   rW   c                    s   t  j|i | || _|| _t|d}t|ddd}t|}W d   n1 s,w   Y  ddl	m
} t|tj}	||	| _ddlm}
m} |d	d	|d
 d |d
 d dd	dd| _|
|d |d | j| j|d
 d |d
 d d| _dS )zPreprocess data for the model `VLDocForDocVLEmbedding`.

        Args:
            model_dir (str): model path in model hub.
            mode (str): model mode, in ('train', 'eval', 'inference').
        zconfig.jsonrr   r   Nr   )VLDocXLMTokenizer)	ProcessorImageProcessorT
image_sizer    )heightwidthF)Zdo_preprocessZ	do_resizer   Zdo_normalizeZ	apply_ocrmax_seq_lengthmax_block_num)r   r   Zimg_processorrd   r   r   )r2   r3   rR   rW   rZ   r_   rv   r   r   Z0modelscope.models.multi_modal.vldoc.tokenizationr   r   ZTOKENIZER_FOLDERr   rd   Z.modelscope.models.multi_modal.vldoc.processingr   r   Zimg_procproc)r8   rR   rW   r9   r:   Zmodel_cfg_pathfZ	model_cfgr   Ztokenizer_pathr   r   r;   r=   r>   r3     s8   

	

zVLDocPreprocessor.__init__rf   r?   c           
   	   O   sz   g }|d D ]%}t |d}t|}|d }|| W d   n1 s&w   Y  q|d |d}| jdi |}	|	S )z
        Args:
            input: {
                'images': ['img_path1', 'img_path2', ...],
                'ocr_info_paths': ['json_path1', 'json_path2', ...]
            }
        Return:
            encodings: Dict[str, Tensor]
        Zocr_info_pathsr   formNimages)r   	ocr_infosr=   )rv   r   r   r   r   )
r8   rf   r9   r:   r   Zone_ocr_info_pathr   Zocr_infoZ
proc_input	encodingsr=   r=   r>   rK     s   
zVLDocPreprocessor.__call__)rL   rM   rN   r   r{   rP   r3   r   r   rK   rQ   r=   r=   r;   r>   r     s    +
r   c                       s   e Zd Zejdfdededef fddZedd Z	ed	d
 Z
edd Zdedeejef fddZdededee fddZdeejeeeef f deeef fddZ  ZS )r)   r   rR   rW   r   c                    s@   t  j|i | || _|| _|| _d | _d | _d | _i | _d S rg   )	r2   r3   rR   rW   r   r   r   _num_frames
_video_mapr   r;   r=   r>   r3   	  s   
zHiTeAPreprocessor.__init__c                 C   r   r   r   r   r=   r=   r>   rd     r   zHiTeAPreprocessor.tokenizerc                 C   r   )Nr   r
   r   HiTeAConfigr   r   r0   r   )r   r   r   r   r   r   r   rZ   r_   rR   r   r   r   r   r   r   r   )r8   r   r   r   r   r-   r.   r=   r=   r>   r   !  r   z(HiTeAPreprocessor.patch_resize_transformc                 C   sH   | j d u r!ddlm} ddlm}m} |t| j	|}|j
| _ | j S )Nr   r
   r   )r   r   r   r   r   r   r   rZ   r_   rR   
num_frames)r8   r   r   r   r   r=   r=   r>   r   5  s   
zHiTeAPreprocessor.num_framesr   r?   c                 C   s@   || j vrt| j }tj|tdd}||f| j |< | j | S )Nr   )ctx)r   r   decordVideoReadercpu)r8   r   r   Zvrr=   r=   r>   
video_openA  s
   


zHiTeAPreprocessor.video_openr   vlenc           
      C   s   t ||}tjd||d dt}g }t|d d D ]\}}||||d  d f qdd |D }t||k rL|d g| }	||	d t|< |	}|S )Nr   r    )startstopnumc                 S   s    g | ]}|d  |d  d qS )r   r    r   r=   r   xr=   r=   r>   r   Q  s     z3HiTeAPreprocessor.sample_frames.<locals>.<listcomp>)minnpZlinspaceZastyper   r   r   r   )
r8   r   r   Zacc_samplesZ	intervalsrangesidxZintervframe_indicesZpadded_frame_indicesr=   r=   r>   sample_framesH  s   

zHiTeAPreprocessor.sample_framesrE   c                    sx  t t jtj _t|t	j
tfr|}nt|tr!|d }n|d }d}t|tr3 |\}}  jt|}|d t|| } fdd| D }tj|dd} jjtjkrfdn|t|trndnd|v rtdnd	 } j| d
d jdd} jtj krtj|gdd}||dS |d } j|d
d jdd}||j!" |j#" |j!" |j#" d}|S )Nr   videoc                    s   g | ]
}  t|qS r=   )r   r   Z	fromarray)r   r   r   r=   r>   r   k  s    z.HiTeAPreprocessor.__call__.<locals>.<listcomp>r   r@   r    r   r   r   Tr   r   )r  r   r   )r  r   r   r   r   )$r   r^   rZ   r_   rR   r   r`   rV   ri   r   r   rP   rj   r   r  r   r   seekr   Z
from_numpyZ	get_batchZasnumpynumpyr   ra   r   Zvideo_captioningrd   rD   r   rW   r   r{   r   r   r   )r8   rE   r  r   r  r   r   r   r=   r   r>   rK   Y  s`   





zHiTeAPreprocessor.__call__)rL   rM   rN   r   r{   rP   r   r3   r   rd   r   r   r   r   r   r   r   r  r   rj   r   r   rK   rQ   r=   r=   r;   r>   r)     s2    




r)   c                       s   e Zd Zejfdedef fddZedd Zedd Z	d	ed
e
ejef fddZded
ee fddZdeeee f d
efddZdeeef d
eeef fddZ  ZS )r*   rR   rW   c                    s>   t  j|i | || _|| _d | _d | _ddi| _i | _d S )N	<|image|>A   )r2   r3   rR   rW   r   r   media_tokenr   )r8   rR   rW   r9   r:   r;   r=   r>   r3     s   

zMplugOwlPreprocessor.__init__c                 C   r   )Nr   )LlamaTokenizer)Zmodelscope.models.nlp.llamar  r   r   rR   )r8   r  r=   r=   r>   rd     r   zMplugOwlPreprocessor.tokenizerc                 C   sP   | j d u r%ddlm} d}d}||jdtjd| |j||dg| _ | j S )Nr   r
   r   r   )   r  r0   r   )	r   r   r   r   r   r   r   r   r   )r8   r   r-   r.   r=   r=   r>   r     s   
z+MplugOwlPreprocessor.patch_resize_transformr   r?   c                 C   r   rg   r   r   r=   r=   r>   r     r   zMplugOwlPreprocessor.image_openr   c                    s   dd t | j D }| j }| jjg} fdd| D }t|r2|| j ddd  }|S |}dtt	j
t| }t	d	| d
 }dd |D }t |D ]!\}	}
|
|v rj|||
 g||
  7 }qV| j|
ddd }||7 }qV|S )Nc                 S   s    i | ]\}}|t |d   qS )r    )r   )r   r   ry   r=   r=   r>   
<dictcomp>  s    z6MplugOwlPreprocessor.tokenize_text.<locals>.<dictcomp>c                    s   g | ]}| vqS r=   r=   )r   r  r   r=   r>   r     s    z6MplugOwlPreprocessor.tokenize_text.<locals>.<listcomp>F)Zadd_special_tokensr   |()c                 S   s   g | ]
}t |d kr|qS )r   )r   r   r=   r=   r>   r     s    )r   r  rc   copyrd   Zbos_token_idr   r_   mapreescaperk   split)r8   r   Zmedia_tokensZmedia_lengthsZprompt_chunk	conditionZ	enc_chunkpatternZ
chunk_strsr  Z	chunk_strZ	tmp_chunkr=   r  r>   tokenize_text  s:   




z"MplugOwlPreprocessor.tokenize_textmessagesc                 C   s   g }g }|d }|D ]O}|d dkrd}n|d dkrd}nd}t |d tr5| |d  }|| q
|d D ]}t |trG| | }n| d	}||d
  || q9q
d|}|d7 }||fS )Nr  rolesystemr@   userzHuman: zAI: contentr	  rI   
z
AI: )ri   rP   r   r_   )r8   r  r   rI   Zturnr  r   tr=   r=   r>   rt     s,   


zMplugOwlPreprocessor.convertc           	      K   s   i }|  |\}}t|dkr+g }|D ]}|| | |d  tj|dd}qnd}| |}t|g}||d|}|S )a  
        Args:
            messages: {[
                {'role': 'system', 'content': 'message1'},
                {'role': 'user', 'content': 'message2'},
                {'role': 'user', 'content': ['message2', {"image": 'image_path'}, 'message3', ...]},
            ]}
            The 'role' should be choose from ['system', 'user', 'assistant'].
            The 'content' can be either str or List[Union[str, Dict]]
        Return:
            output: Dict[str, Tensor]
        r   r   N)pixel_valuesr   )	rt   r   r   r   r   r   r   r  r   )	r8   r  Zforward_paramsr   r   r   r"  rI   r   r=   r=   r>   rK     s&   
zMplugOwlPreprocessor.__call__)rL   rM   rN   r   r{   rP   r3   r   rd   r   r   r   r   r   r   r  r   rt   r   rK   rQ   r=   r=   r;   r>   r*     s"    

!
r*   c                       s2   e Zd Z fddZdeeef fddZ  ZS )+ImageCaptioningClipInterrogatorPreprocessorc                    s   t  jdi | d S )Nr=   )r2   r3   )r8   r:   r;   r=   r>   r3   $  s   z4ImageCaptioningClipInterrogatorPreprocessor.__init__r?   c                 C   s    t |}t|ddd}|S )Nr   r   r    )r   r  arrayZ	transpose)r8   rE   rI   r=   r=   r>   rK   '  s   z4ImageCaptioningClipInterrogatorPreprocessor.__call__)	rL   rM   rN   r3   r   rP   r   rK   rQ   r=   r=   r;   r>   r#    s    r#  )QZos.pathr   rZ   r  ior   typingr   r   r   r   r   r   r   r  r  r   ZPILr   Z	timm.datar	   r   r   Ztorchvision.datasetsr   Ztorchvision.transformsr   r   r   r   Z modelscope.hub.snapshot_downloadr   Zmodelscope.metainfor   Zmodelscope.pipelines.baser   Z7modelscope.pipelines.cv.cmdssl_video_embedding_pipeliner   r   r   r   r   Zmodelscope.preprocessorsr   Zmodelscope.utils.configr   Zmodelscope.utils.constantr   r   r   r   r   baser!   Zbuilderr"   rp   Zofa.utils.collater$   Zofa.utils.constantr%   __all__Zregister_moduleZmulti_modalZ'diffusion_image_generation_preprocessorr&   Zofa_tasks_preprocessorr'   r|   Zclip_preprocessorr}   Zmplug_tasks_preprocessorr(   Zvldoc_preprocessorr   Zhitea_tasks_preprocessorr)   Zmplug_owl_preprocessorr*   Z/image_captioning_clip_interrogator_preprocessorr#  r=   r=   r=   r>   <module>   s   (S *eF 	 