o
    | i}N                     @   s   d dl Z d dlmZmZ d dlZd dlmZ d dlm	Z	m
Z
 d dlmZmZmZ d dlmZ d dlmZmZmZmZmZmZmZ dd Zeed	d
fddZeed	d
fddZG dd dZG dd deZede dS )    N)ListTuple)ImageImageOps)AutoProcessorBatchFeatureLlamaTokenizerFast)ProcessorMixin)
IMAGE_SIZE	BASE_SIZE	CROP_MODE	MIN_CROPS	MAX_CROPSPROMPT	TOKENIZERc                 C   s|   t d}d}|| }|D ]/}|d |d  }	t| |	 }
|
|k r%|
}|}q|
|kr;|d| | |d  |d  kr;|}q|S )Ninf   r   r   r         ?)floatabs)aspect_ratiotarget_ratioswidthheight
image_sizeZbest_ratio_diff
best_ratioarearatiotarget_aspect_ratioZ
ratio_diff r    U/home/app/DeepSeek-OCR/DeepSeek-OCR-master/DeepSeek-OCR-vllm/process/image_process.pyfind_closest_aspect_ratio   s    r"     Fc           	         sN   | | }t  fddt d D }t|dd d}t||| ||}|S )Nc                 3   X    | ]'}t d |d  D ]}t d |d  D ]}||  kr|| kr||fV  qqqdS r   Nrange.0nijmax_nummin_numr    r!   	<genexpr>        
zcount_tiles.<locals>.<genexpr>r   c                 S      | d | d  S Nr   r   r    xr    r    r!   <lambda>$       zcount_tiles.<locals>.<lambda>key)setr'   sortedr"   )	
orig_widthorig_heightr/   r.   r   use_thumbnailr   r   r   r    r-   r!   count_tiles   s   
r?   c                    s4  | j \}}|| }t fddt d D }t|dd d}t|||||}	||	d  }
||	d  }|	d |	d  }| |
|f}g }t|D ].}||
|  | ||
|  | ||
|  d | ||
|  d | f}||}|| qKt||ksJ |rt|dkr| ||f}|| ||	fS )Nc                 3   r$   r%   r&   r(   r-   r    r!   r0   2   r1   z%dynamic_preprocess.<locals>.<genexpr>r   c                 S   r2   r3   r    r4   r    r    r!   r6   6   r7   z$dynamic_preprocess.<locals>.<lambda>r8   r   )	sizer:   r'   r;   r"   resizecropappendlen)imager/   r.   r   r>   r<   r=   r   r   r   Ztarget_widthZtarget_heightblocksZresized_imgZprocessed_imagesr+   boxZ	split_imgZthumbnail_imgr    r-   r!   dynamic_preprocess-   s6   



rH   c                   @   sN   e Zd Z			ddeeeef deeeef defddZdejfd	d
ZdS )ImageTransformr   r   r   Tmeanstd	normalizec                 C   sB   || _ || _|| _t g}|r|t|| t|| _d S N)	rK   rL   rM   TToTensorrC   	NormalizeCompose	transform)selfrK   rL   rM   Ztransform_pipelinesr    r    r!   __init__[   s   
zImageTransform.__init__pil_imgc                 C   s   |  |}|S rN   )rS   )rT   rV   r5   r    r    r!   __call__j   s   
zImageTransform.__call__N)rJ   rJ   T)	__name__
__module____qualname__r   r   boolrU   r   rW   r    r    r    r!   rI   Y   s    
rI   c                       sN  e Zd ZdZdgZeddggddddddd	d
dddfdedeeeef  dededee	e	e	f dee	e	e	f de
dedede
dede
def fddZedd Zedd Zedd  Zd5d!ed"e
d#e
fd$d%Zd&ee d'efd(d)Z	d6d*ed+ed,e
fd-d.Zdd/d*ed+ed,e
fd0d1Z			d7d+eej d"e
d#e
d2e
fd3d4Z  ZS )8DeepseekOCRProcessor)LlamaTokenizerr   	tokenizeri         rJ   Tz<image>u   <｜▁pad▁｜>Fdeepseekicandidate_resolutions
patch_sizedownsample_ratio
image_mean	image_stdrM   image_token	pad_tokenadd_special_token
sft_formatmask_prompt	ignore_idc                    s   t | _t| _d| _|| _|| _|| _d| _t	|||d| _
|| _d| j_| jjd u r2| jd|	i | jj|| _|| _|	| _|
| _|| _|| _|| _t j|fi | d S )Nr_   r`   )rK   rL   rM   leftrh   )r
   r   r   	base_sizerc   re   rf   rM   rd   rI   image_transformr^   padding_siderh   add_special_tokensvocabgetimage_token_idrg   ri   rj   rk   rl   superrU   )rT   r^   rb   rc   rd   re   rf   rM   rg   rh   ri   rj   rk   rl   kwargs	__class__r    r!   rU   s   s0   
zDeepseekOCRProcessor.__init__c                 C      | j jS rN   )r^   bos_token_idrT   r    r    r!   bos_id      zDeepseekOCRProcessor.bos_idc                 C   ry   rN   )r^   eos_token_idr{   r    r    r!   eos_id   r}   zDeepseekOCRProcessor.eos_idc                 C   ry   rN   )r^   pad_token_idr{   r    r    r!   pad_id   r}   zDeepseekOCRProcessor.pad_idtextboseosc                 C   s4   | j j|dd}|r| jg| }|r|| jg }|S )NF)rq   )r^   encoder|   r   )rT   r   r   r   tr    r    r!   r      s   zDeepseekOCRProcessor.encoder   returnc                 K   s   | j j|fi |S rN   )r^   decode)rT   r   rv   r    r    r!   r      s   zDeepseekOCRProcessor.decodepromptimagesinference_modec                 K   sD   |dur|dusJ d|}|d \}}}}	}
}}||||	|
|dS )a  

        Args:
            prompt (str): the formatted prompt;
            conversations (List[Dict]): conversations with a list of messages;
            images (List[ImageType]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            system_prompt (str): the system prompt;
            **kwargs:

        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
                - input_ids (torch.LongTensor): [N + image tokens]
                - target_ids (torch.LongTensor): [N + image tokens]
                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
                - image_id (int): the id of the image token
                - num_image_tokens (List[int]): the number of image tokens
        Nz0prompt and images must be used at the same time.r   )	input_idspixel_valuesimages_cropimages_seq_maskimages_spatial_cropnum_image_tokensr    )rT   r   r   r   rv   rj   r   r   r   r   r   r   _r    r    r!   process_one   s   z DeepseekOCRProcessor.process_one)r   c                K   s   | j |||d}|S )aJ  

        Args:
            prompt (str): the formatted prompt;
            images (List[ImageType]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            **kwargs:

        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
                - input_ids (torch.LongTensor): [N + image tokens]
                - images (torch.FloatTensor): [n_images, 3, H, W]
                - image_id (int): the id of the image token
                - num_image_tokens (List[int]): the number of image tokens
        )r   r   r   )r   )rT   r   r   r   rv   preparer    r    r!   rW   *  s   zDeepseekOCRProcessor.__call__croppingc           !      C   s  t }|| jt|ksJ || j}g g g g f\}}}	}
g }g }g }t||D ]\}}	 | j|ddd}||7 }|	dgt| 7 }		 ||j |jd dkr]|jd dkr]ddg}n|rht	|t
d\}}nddg}	 | jdkr}|s}|| j| jf}tj|| j| jftdd | jjD d	}|| | 	 |\}}|
||g |dks|dkr	 tt|D ]}|| ||  q	 t| j| j | j }t| j| j | j }| jg| | jg | }|| jg7 }|dks|dkr|| jg||  | jg ||  7 }||7 }|	d
gt| 7 }	|t| q)	 | j|d ddd}||7 }|	dgt| 7 }		 |rD| jg| }dg|	 }	|rR|| jg }|	dg }	t|t|	ksiJ dt| dt|	 g }|D ]}|| jkr||| qm|| j qmt|t|	  krt|ksn J dt| dt| dt|	 dt|}t|}tj|	tjd}	| j||dk || jkB < | j ||dk < d
}|r|d | jksJ |dd }|dd }|	dd }	t|dkrt!dd| j| jf}tj!dtj"d}
t!dd| j| jf#d} n+tj$|dd}tj|
tj"d}
|r7tj$|dd#d} nt!dd| j| jf#d} |#d}||| |	|
||ggS )z Tokenize text with <image> tags.F)r   r   r   r#   r   )r   c                 s   s    | ]	}t |d  V  qdS )   N)int)r)   r5   r    r    r!   r0     s    z<DeepseekOCRProcessor.tokenize_with_images.<locals>.<genexpr>)colorTz2tokenize_with_images func: tokenized_str's length z) is not equal to imags_seq_mask's length ztokenized_str's length z, input_ids' length z, imags_seq_mask's length z, are not equal)dtypeN   r   )dim)%r   countrg   rD   splitzipr   rC   r@   rH   r
   r   rA   r   padrn   tuplero   rK   r'   mathceilrc   rd   rt   r|   r   rl   torch
LongTensortensorr[   r   zeroslong	unsqueezestack)!rT   r   r   r   r   conversationZtext_splitsZimages_listZimages_crop_listr   r   image_shapesr   Ztokenized_strZtext_seprE   Ztokenized_sep
crop_ratioZimages_crop_rawZglobal_viewnum_width_tilesnum_height_tilesr+   num_queriesZnum_queries_baseZtokenized_imageZmasked_tokenized_strtoken_indexr   Z
target_idsr   r   r   r    r    r!   tokenize_with_imagesJ  s   



(


z)DeepseekOCRProcessor.tokenize_with_images)TF)T)TTT)rX   rY   rZ   tokenizer_class
attributesr   r   r   r   r   r[   strrU   propertyr|   r   r   r   r   r   r   rW   r   r   __classcell__r    r    rw   r!   r\   o   s    	
e




>
$r\   ZDeepseekVLV2Processor)r   typingr   r   r   Ztorchvision.transforms
transformsrO   PILr   r   transformersr   r   r   Ztransformers.processing_utilsr	   configr
   r   r   r   r   r   r   r"   r?   rH   rI   r\   registerr    r    r    r!   <module>   s"    $,   
