o
    | i?                  	   @   s  d dl mZ d dlZd dlmZmZ d dlmZ d dl	Z	d dl
mZ d dl	mZ d dlmZmZ G dd	 d	e	jjZd
d Ze	jjdd ZG dd dejZG dd dejZG dd de	jjZG dd dejZG dd dejZG dd dejZed=i ddddddd dd!d"d#d$d%d$d&d'd(d)d*d+d,d+d-d'd.d/d0d/d1d2d3d4d5g Zd6d7 Ze d8krQd d9l!m"Z" ed=i ddddd dd!d"d#d$d%d$d&d'd(d)d*d+d,d+d-d'd.d/d0d/d1d2d3d4d5g Ze" Z#eed'd'd:Z$e	%d)d;ddZ&e	' 7 e#e&Z(e)e(j* e$e&e(Z+e)e+j* e	,e+ddd<df e(-d).d d)d<Z/e)e/j* W d   dS 1 sJw   Y  dS dS )>    )nullcontextN)OptionalTuple)EasyDict)
functional)nn)flash_attn_qkvpacked_funcflash_attn_funcc                       s(   e Zd ZdZdejf fddZ  ZS )LayerNormfp32z*Subclass torch's LayerNorm to handle fp16.xc                    s$   |j }t |tj}||S N)dtypesuperforwardtypetorchfloat32)selfr   	orig_typeret	__class__ U/home/app/DeepSeek-OCR/DeepSeek-OCR-master/DeepSeek-OCR-vllm/deepencoder/clip_sdpa.pyr   9   s   
zLayerNormfp32.forward)__name__
__module____qualname____doc__r   Tensorr   __classcell__r   r   r   r   r
   6   s    r
   c           
      C   s   |  d}| d}|d d |dd  }}tt|jd d }tt|}| j}||kr||d|||dddd	 }|
tj}tj|||fdddd	
|}|dddd}||| |}tj||gdd
}	|	d|| d |}	|	S | S )Nr            bicubicTF)sizemode	antialiasalign_cornersdim)r%   squeezeintmathsqrtshaper   viewpermute
contiguoustor   r   Finterpolatecat)
abs_postgt_sizer*   Zabs_pos_new	cls_tokenold_pos_embedsrc_sizer   new_pos_embedZvision_pos_embedr   r   r   get_abs_pos?   s8   

r=   c                 C   s   | t d|   S )NgZd;?)r   sigmoid)r   r   r   r   
quick_gelue   s   r?   c                       s&   e Zd Zd	 fdd	Zdd Z  ZS )
CLIPVisionEmbeddings         r"   c                    s   t    || _|| _|| _tjt| j| _	tjj
|| j| j| jdd| _| j| j d | _| jd | _tj| j| j| _| dt| jd d S )NF)in_channelsout_channelskernel_sizestridebiasr#   r!   position_ids)r!   r    )r   __init__	embed_dim
image_size
patch_sizer   r   	Parameterrandnclass_embeddingConv2dpatch_embeddingZnum_patchesZnum_positions	Embeddingposition_embeddingregister_bufferarangeexpand)r   hidden_sizerL   rM   num_channelsr   r   r   rJ   l   s$   
zCLIPVisionEmbeddings.__init__c                 C   sv   |j d }|d ur|}n| |}|ddd}| j|dd}tj||gdd}|t| 	| j
|d }|S )Nr   r#   r!   r    r)   )r/   rR   flatten	transposerP   rW   r   r6   r=   rT   rI   r%   )r   pixel_valuespatch_embeds
batch_sizeZclass_embeds
embeddingsr   r   r   r      s   

zCLIPVisionEmbeddings.forward)rA   rB   rC   r"   r   r   r   rJ   r   r   r   r   r   r   r@   k   s    r@   c                       s.   e Zd Zdedef fddZdd Z  ZS )NoTPFeedForwardr*   
hidden_dimc                    s6   t    tjj||dd| _tjj||dd| _d S )NTrH   )r   rJ   r   r   Linearfc1fc2)r   cfgr*   rb   r   r   r   rJ      s   
zNoTPFeedForward.__init__c                 C   s   |  t| |}|S r   )rf   r?   re   )r   r   outputr   r   r   r      s   zNoTPFeedForward.forward)r   r   r   r,   rJ   r   r   r   r   r   r   ra      s    ra   c                       s,   e Zd Z fddZdejfddZ  ZS )NoTPAttentionc                    sx   t    |j| _|j| _|j|j | _|j| _|j	| _
tjj|j|jd dd| _tjj|j|jdd| _|j| _d S )Nr"   Trc   )r   rJ   num_attention_heads	num_headsZn_local_headsrX   head_dim
seq_lengthmax_seq_lenuse_flash_attnuse_flash_attentionr   r   rd   qkv_projout_projattention_dropoutZ	attn_drop)r   rg   r   r   r   rJ      s   
zNoTPAttention.__init__r   c           
      C   s   |j \}}}| |}|||d| j| j}| jr%t|}|||d}nJtj|ddd\}}}	|	d}|	d}|		d}	|
dddd}|
dddd}|	
dddd}	tjjj|||	d d}|
dddd||d}| |}|S )Nr"   r    r!   r#   r)   r   )	attn_mask)r/   rq   r0   rk   rl   rp   r   r   splitr+   r1   r   r   scaled_dot_product_attentionreshaperr   )
r   r   bszseqlen_Zxqkvrh   xqxkxvr   r   r   r      s"   




zNoTPAttention.forward)r   r   r   rJ   r   r   r   r   r   r   r   r   ri      s
    ri   c                       s4   e Zd Zddef fddZdejfddZ  ZS )	NoTPTransformerBlock   layer_idc                    s|   t    |j| _|j| _|j|j | _t|| _t	||j|j
d| _|| _tjj|j|jd| _tjj|j|jd| _d S )N)r*   rb   eps)r   rJ   rj   n_headsrX   r*   rl   ri   	self_attnra   ffn_hidden_sizemlpr   r   r   	LayerNormlayernorm_epsilonlayer_norm1layer_norm2)r   rg   r   multiple_ofr   r   r   rJ     s   


zNoTPTransformerBlock.__init__r   c                 C   s4   | j | |}|| }|| j| | }|S r   )r   r   r   r   r   )r   r   residualhoutr   r   r   r   1  s   zNoTPTransformerBlock.forward)r   )	r   r   r   r,   rJ   r   r   r   r   r   r   r   r   r~     s    r~   c                       s$   e Zd Z fddZdd Z  ZS )NoTPTransformerc                    sN   t    || _|j| _tj | _t| jD ]}| j	t
||d  qd S )Nr!   )r   rJ   rg   
num_layersr   r   
ModuleListlayersrangeappendr~   )r   rg   r   r   r   r   rJ   9  s   
zNoTPTransformer.__init__c                 C   s    t | jD ]\}}||}q|S r   )	enumerater   )r   hidden_stateslidlayerr   r   r   r   I  s   
zNoTPTransformer.forwardr`   r   r   r   r   r   8  s    r   c                       sD   e Zd Z		d	d fddZdd Zdefdd	Zd
d Z  ZS )VitModelFreturnNc                    s   t    t|j|j|jd| _|r| j D ]\}}d|_qt	|d| _
|ddr=td t|j|ddd| _ntjj|j|ddd| _|rZ| j D ]\}}d|_qR|  D ]}d	|_q^d S )
N)rX   rL   rM   F)rg   Zfp32normzLoad fp32 layernorm for ViT.pre_layernorm_epsilonh㈵>r   T)r   rJ   r@   rX   rL   rM   r_   named_parametersrequires_gradr   transformergetloggerinfor
   pre_layrnormr   r   r   
parametersZmicro_dp)r   rg   freeze_embedfreeze_pre_normnameparampr   r   r   rJ   h  s,   




zVitModel.__init__c                 C   s$   t |ts|g}| j|d  d S )Nr   )
isinstancelistr   set_input_tensor)r   input_tensorr   r   r   r     s   
zVitModel.set_input_tensorc                 C   s   dS )N	open_clipr   )r   r   r   r   __str__  s   zVitModel.__str__c                 C   s$   |  ||}| |}| |}|S r   )r_   r   r   )r   r   r]   r   rh   r   r   r   r     s   

zVitModel.forward)FF)r   N)	r   r   r   rJ   r   strr   r   r   r   r   r   r   r   g  s    +r   r      rX   rA   rk      rj   r   i   rm   r   max_position_embeddingsro   FZunderstand_projector_strider#   hidden_dropoutg        rs   Zno_persist_layer_normr   r   r   rL   rB   rM   rC   Zrecompute_listc                   C   s   t tdddS )NFrg   r   r   )r   vit_model_cfgr   r   r   r   build_clip_l  s
   r   __main__)build_sam_vit_br   r"   r!   r   )0
contextlibr   r-   typingr   r   Zeasydictr   adictr   torch.nnr   r4   r   
flash_attnr   r	   r   r
   r=   jitscriptr?   Moduler@   ra   ri   r~   r   r   r   r   r   Z mmgpt.model.vision_encoder.sam_br   	sam_modelvision_modelzerosr   no_gradpatch_embedprintr/   yaddrZ   r1   Zimage_featurer   r   r   r   <module>   s    -	&
4D;/D	

	




,$