o
    i-V                     @   s8  d Z ddlZddlmZmZmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlZddlmZ ddlm  mZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9 ddl:m;Z;m<Z< ddl=m>Z> ddl?m@Z@mAZAmBZB ddlCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZK ddlLmMZM ddlNmOZO ddlPmQZQ ddlRmSZSmTZTmUZUmVZVmWZW dZXG dd de/ZYG dd de3eY ZZG d d! d!e.eY Z[e"j\e[eYeZd"G d#d$ d$ej]eAeBZ^dS )%zFInference-only Deepseek-OCR model compatible with HuggingFace weights.    N)IterableMappingSequence)ListLiteralOptionalSetTuple	TypedDictUnion)	rearrangerepeat)BatchFeature)
VllmConfig)SamplingMetadata)QuantizationConfig)set_default_torch_dtype)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsNestedTensors)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)BaseDummyInputsBuilder)IntermediateTensors)DeepseekVLV2ConfigMlpProjectorConfigVisionEncoderConfig)DeepseekOCRProcessorcount_tiles)cached_tokenizer_from_config)MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapper
flatten_bninit_vllm_registered_modelmaybe_prefixmerge_multimodal_embeddings)build_sam_vit_b)build_clip_l)MlpProjector)Dict)
IMAGE_SIZE	BASE_SIZE	CROP_MODEPRINT_NUM_VIS_TOKENSPROMPT<image>c                	   @   sj   e Zd Zdd ZdefddZdeeee	 f fddZ
d	d
de	de	dede	fddZdefddZdS )DeepseekOCRProcessingInfoc                 C   s   | j tS N)ctxget_hf_configr"   self rA   L/home/app/DeepSeek-OCR/DeepSeek-OCR-master/DeepSeek-OCR-vllm/deepseek_ocr.pyr>   4   s   z'DeepseekOCRProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S r<   )r=   get_hf_processorr%   )r@   rC   rA   rA   rB   rD   7   s   z*DeepseekOCRProcessingInfo.get_hf_processorreturnc                 C   s   dd iS )NimagerA   r?   rA   rA   rB   get_supported_mm_limits:   s   z1DeepseekOCRProcessingInfo.get_supported_mm_limitsT)croppingimage_widthimage_heightrH   c                C   s   |   }t}t}d}d}tr'|dkr|dkrddg}	nt||td}	|	\}
}nd }
}t|| |  }}t|| |  }}||d  }|
dksO|dkrZ|| |
| d  }nd}|| d S )N      i     )
image_sizer   )rD   r5   r6   r7   r&   mathceil)r@   rI   rJ   rH   hf_processorrN   Z	base_size
patch_sizedownsample_ratioZ
crop_ratioZnum_width_tilesZnum_height_tileshwh2w2Zglobal_views_tokensZlocal_views_tokensrA   rA   rB   get_num_image_tokens=   s$   

z.DeepseekOCRProcessingInfo.get_num_image_tokensc                 C   s(   t dkrtdkrtdddS tdddS )Ni         )widthheight)r5   r6   r   r?   rA   rA   rB   !get_image_size_with_most_featuresl   s   z;DeepseekOCRProcessingInfo.get_image_size_with_most_featuresN)__name__
__module____qualname__r>   objectrD   r   strr   intrG   boolrX   r   r]   rA   rA   rA   rB   r;   2   s    
/r;   c                   @   sD   e Zd Zdeeef defddZdedeeef defddZdS )	DeepseekOCRDummyInputsBuilder	mm_countsrE   c                 C   s$   | dd}| j }|j}|| S )NrF   r   )getinforD   image_token)r@   rf   
num_images	processorri   rA   rA   rB   get_dummy_textv   s   
z,DeepseekOCRDummyInputsBuilder.get_dummy_textseq_lenc                 C   sN   | dd}| j }dtv r#dt j| j|j|j|dddt	diS dg iS )NrF   r   r:   )r[   r\   rj   T)imagesboseosrH   )
rg   rh   r]   r9   r%   tokenize_with_images_get_dummy_imagesr[   r\   r7   )r@   rm   rf   rj   Zmax_image_sizerA   rA   rB   get_dummy_mm_data~   s   
z/DeepseekOCRDummyInputsBuilder.get_dummy_mm_dataN)	r^   r_   r`   r   rb   rc   rl   r   rs   rA   rA   rA   rB   re   s   s    
re   c                       s   e Zd Zdedeeef deeef defddZdedeeef deeef fd	d
Z	de
deeef dedee fddZdeeee f de
deeef deee eef f fddZ  ZS )DeepseekOCRMultiModalProcessorpromptmm_data	mm_kwargsrE   c                 C   sR   |r| j j| j jdi |tdd|i||}|S | j  }||ddd}|S )Nru   Tpt)add_special_tokensreturn_tensorsrA   )rh   r=   call_hf_processorrD   dictget_tokenizer)r@   ru   rv   rw   Zprocessed_outputs	tokenizerrA   rA   rB   _call_hf_processor   s   	
z1DeepseekOCRMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s    t tdtdtddS )NrF   )pixel_valuesimages_spatial_cropimages_crop)r|   r   batched)r@   r   r   rA   rA   rB   _get_mm_fields_config   s
   z4DeepseekOCRMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    sN   j jdi |}|j t tsJ dtf fdd}td g|dgS )Nitem_idxc                    sj    dttf}t|tr|| }n|d d d d }|d d d d }jj||td} g| S )NrF   r   rM   )rI   rJ   rH   )	get_itemsr   r   
isinstanceget_feature_sizerh   rX   r7   )r   rn   Znum_image_tokensr[   r\   image_token_idr   r@   rA   rB   get_replacement_deepseek_vl2   s   

zXDeepseekOCRMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_deepseek_vl2rF   )modalitytargetreplacementrA   )rh   rD   r   r   rc   r   )r@   r   r   r   rQ   r   rA   r   rB   _get_prompt_updates   s   z2DeepseekOCRMultiModalProcessor._get_prompt_updatesmm_data_itemsc                    s6   |j ddddkr| j|||ddS t j|||dS )NrF   F)strict   T)ru   r   r   enable_hf_prompt_update)ru   r   r   )	get_count_apply_hf_processor_mainsuper_cached_apply_hf_processor)r@   ru   r   r   	__class__rA   rB   r      s   
z9DeepseekOCRMultiModalProcessor._cached_apply_hf_processor)r^   r_   r`   rb   r   ra   r   r   r   r   r   r   r   r   r   r   listrc   tuplerd   r   __classcell__rA   rA   r   rB   rt      sF    







)
rt   )rh   dummy_inputsc                       s6  e Zd ZeddidZdddedef fdd	Zd
efddZ	de
jde
jde
jdefddZde
jfddZde
jjfddZd
edee fddZ	d*de
jdee de
jfddZ		d+de
jde
jdee d ee
j d
ef
d!d"Zd#e
jd$edee
j fd%d&Zd'eeee
jf  dee fd(d)Z  ZS ),DeepseekOCRForCausalLM	language.zlanguage_model.)Zorig_to_new_prefix )prefixvllm_configr   c                   sJ  t    |jj}|j}|jj}|| _|| _|j| _|j| _|j	| _	|j}t
|}|jt | _t | _t | _d}ttdd|d| _|j| _|j| _dttj|tjd }	| jdkrttt||	 | _tt||	 | _nt d| j | j	j!d	krd
g}
n| j	j"sdg}
ndg}
t#|| j	t$|d|
d| _%| j%j&| _&d S )NrY   linearrZ   )projector_type	input_dimn_embedrM   dtype2Dz.Only 2D tile_tag is supported currently, got: Znoaux_tcDeepseekV3ForCausalLMDeepseekForCausalLMDeepseekV2ForCausalLMlanguage)r   	hf_configr   architectures)'r   __init__model_configr   quant_configmultimodal_configconfigvision_configprojector_configtext_configr'   vocab_IMAGE_TOKENr   r1   	sam_modelr2   vision_modelr3   r4   	projectortile_tagglobal_view_postorchsqrttensorfloat32nn	Parameterrandnimage_newlineview_seperator
ValueErrortopk_methoduse_mlar.   r/   language_modelmake_empty_intermediate_tensors)r@   r   r   r   r   r   r   r~   r   Z	embed_stdr   r   rA   rB   r     sL   



zDeepseekOCRForCausalLM.__init__rC   c                 K   s   | dd }| dd }| dd }|d u st| dkr!d S |d ur]t|tjtfs6tdt| t|tjtfsGtdt| t|tjtfsXtdt| |||gS t	d)	Nr   r   r   r   z*Incorrect type of pixel values. Got type: z)Incorrect type of image sizes. Got type: z(Incorrect type of image crop. Got type: z This line should be unreachable.)
popr   sumitemr   Tensorr   r   typeAssertionError)r@   rC   r   r   r   rA   rA   rB   _parse_and_validate_image_inputK  s(   
z6DeepseekOCRForCausalLM._parse_and_validate_image_inputr   r   r   rE   c              	   C   s  g }t   t|dD ]}|| d t j}|| }|| d }t | dkr5|jt jkr<|t j	}| 
|}	| ||	}
t j|
d d dd f |	ddddfdd}| |}| 
|}| ||}t j|d d dd f |ddddfdd}| |}trtd td|j td|j td |j\}}}t|d	  }}|j\}}}t|d	  }}|d |d }}||||}t j|| jd d d d f |d|gdd}|d|}||||||dddd
d|| || |}t j|| jd d d d f || d|gdd}|d|}t j||| jd d d f gdd}n| 
|}| ||}t j|d d dd f |ddddfdd}| |}trstd td|j td td |j\}}}t|d	  }}||||}t j|| jd d d d f |d|gdd}|d|}t j|| jd d d f gdd}|| qW d    |S 1 sw   Y  |S )Nr   rM   r   r   )dimz=====================zBASE: z	PATCHES: g      ?   rL   z
NO PATCHES)r   no_gradrangesizetobfloat16r   r   r   float16r   r   catflattenpermuter   r8   printshaperc   viewr   expandreshaper   append)r@   r   r   r   Zimages_in_this_batchZjdxpatchesZ	image_oriZ
crop_shapeZlocal_features_1Zlocal_features_2Zlocal_featuresZglobal_features_1Zglobal_features_2Zglobal_features_hwn_dimrT   rU   _2Zhw2Zn_dim2rV   rW   Zwidth_crop_numZheight_crop_numZglobal_local_featuresrA   rA   rB   _pixel_values_to_embeddingj  sz   
2

2
"2&$
2
" 
UUz1DeepseekOCRForCausalLM._pixel_values_to_embeddingc                 C   sV   |d  tj}|d d ur|d  tjnd }|d j tjd}| j|||d}|S )Nr   rM   r   r   )r   r   r   )r   r   r   longr   )r@   image_inputr   r   r   Zvision_featuresrA   rA   rB   _process_image_input  s    z+DeepseekOCRForCausalLM._process_image_inputc                 C   s   | j S r<   )r   r?   rA   rA   rB   get_language_model  s   z)DeepseekOCRForCausalLM.get_language_modelc                 K   s*   | j di |}|d u rd S | |}|S )NrA   )r   r   )r@   rC   r   vision_embeddingsrA   rA   rB   get_multimodal_embeddings  s
   
z0DeepseekOCRForCausalLM.get_multimodal_embeddingsN	input_idsmultimodal_embeddingsc                 C   s(   | j |}|d urt|||| j}|S r<   )r   get_input_embeddingsr0   r   )r@   r   r   inputs_embedsrA   rA   rB   r     s   z+DeepseekOCRForCausalLM.get_input_embeddings	positionsintermediate_tensorsr   c                 K   sL   |d urd }n|d u r| j di |}| ||}d }| j||||d}|S )N)r   rA   )r   r   r   )r@   r   r   r   r   rC   r   hidden_statesrA   rA   rB   forward  s   zDeepseekOCRForCausalLM.forwardr   sampling_metadatac                 C   s   | j ||S r<   )r   compute_logits)r@   r   r   rA   rA   rB   r   ,  s   z%DeepseekOCRForCausalLM.compute_logitsweightsc                 C   s|   g }|D ]+\}}d|v sd|v sd|v sd|v sd|v r$| ddd}nd	| }|||f qt| }|j|| jd
}|S )Nr   r   r   r   r   zmodel.r   rM   r   )mapper)replacer   r+   load_weightshf_to_vllm_mapper)r@   r   Zprocessed_weightsnamer   new_nameloaderZautoloaded_weightsrA   rA   rB   r  5  s   (z#DeepseekOCRForCausalLM.load_weightsr<   )NN)r^   r_   r`   r,   r  r   rb   r   ra   r   r   r   r   r   r   r   Moduler   r   r(   r   r   r!   r   r   r   r   r	   r   r  r   rA   rA   r   rB   r     sn    @

k




,	r   )___doc__rO   collections.abcr   r   r   typingr   r   r   r   r	   r
   r   r   torch.nnr   Ztorch.nn.functional
functionalFeinopsr   r   transformersr   vllm.configr   vllm.model_executorr   'vllm.model_executor.layers.quantizationr   Z&vllm.model_executor.model_loader.utilsr   vllm.multimodalr   Zvllm.multimodal.inputsr   r   r   r   Zvllm.multimodal.parser   r   r   r   Zvllm.multimodal.processingr   r   r   r   Zvllm.multimodal.profilingr    vllm.sequencer!   ,vllm.transformers_utils.configs.deepseek_vl2r"   r#   r$   process.image_processr%   r&   !vllm.transformers_utils.tokenizerr'   Z%vllm.model_executor.models.interfacesr(   r)   r*    vllm.model_executor.models.utilsr+   r,   r-   r.   r/   r0   Zdeepencoder.sam_vary_sdpar1   Zdeepencoder.clip_sdpar2   Zdeepencoder.build_linearr3   Zaddictr4   r   r5   r6   r7   r8   r9   r   r;   re   rt   register_processorr  r   rA   rA   rA   rB   <module>   sV   $ 
A
#k