o
    | iK                     @   s  d dl Z d dlmZ d dlm  mZ d dlmZmZm	Z	 d dl
mZ d dlmZ dd ZG dd dejZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZde jdedee jeeef f fddZde jdedeeef deeef de jf
ddZdedede jde jfddZd e jd!e jd"e jdeeef deeef de jfd#d$ZG d%d& d&ejZd+d'd(Z	d+d)d*ZdS ),    N)OptionalTupleType)partial)flash_attn_qkvpacked_funcc                 C   sj   | j }| d}||kr3| dddd}|tj}tj|||fdddd|}|dddd}|S | S )	N   r         bicubicTF)sizemode	antialiasalign_corners)dtyper   permutetotorchfloat32Finterpolate)Zabs_posZtgt_sizer   src_sizeZold_pos_embedZnew_pos_embed r   Y/home/app/DeepSeek-OCR/DeepSeek-OCR-master/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.pyget_abs_pos   s"   
r   c                	       sP   e Zd Zejfdededeej ddf fddZde	j
de	j
fd	d
Z  ZS )MLPBlockembedding_dimmlp_dimactreturnNc                    s2   t    t||| _t||| _| | _d S N)super__init__nnLinearlin1lin2r   )selfr   r   r   	__class__r   r   r!   ,   s   
zMLPBlock.__init__xc                 C   s   |  | | |S r   )r%   r   r$   r&   r)   r   r   r   forward7   s   zMLPBlock.forward)__name__
__module____qualname__r"   GELUintr   Moduler!   r   Tensorr+   __classcell__r   r   r'   r   r   +   s    r   c                       sB   e Zd Zddededdf fddZdejdejfd	d
Z  Z	S )LayerNorm2dư>num_channelsepsr   Nc                    s8   t    tt|| _tt|| _|| _	d S r   )
r    r!   r"   	Parameterr   onesweightzerosbiasr7   )r&   r6   r7   r'   r   r   r!   >   s   

zLayerNorm2d.__init__r)   c                 C   sn   |j ddd}|| dj ddd}|| t|| j  }| jd d d d f | | jd d d d f  }|S )Nr   T)keepdimr	   )meanpowr   sqrtr7   r:   r<   )r&   r)   usr   r   r   r+   D   s
   ,zLayerNorm2d.forward)r5   )
r,   r-   r.   r0   floatr!   r   r2   r+   r3   r   r   r'   r   r4   =   s    r4   c                #       s   e Zd Zdddddddddejejdd	dd
dfdededededededededede	ej
 de	ej
 dededededeedf ddf" fdd Zd!ejdejfd"d#Z  ZS )$ImageEncoderViT      r               @   TFr   r   img_size
patch_sizein_chans	embed_dimdepth	num_heads	mlp_ratio	out_chansqkv_bias
norm_layer	act_layeruse_abs_posuse_rel_posrel_pos_zero_initwindow_sizeglobal_attn_indexes.r   Nc                    s  t    || _t||f||f||d| _d| _|r*tt	d|| || || _t
 | _t|D ]"}t||||	|
|||||vrD|nd|| || fd
}| j| q3ttj||dddt|tj||dddd	t|| _tjd
dddddd| _tjddddddd| _dS )a  
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_attn_indexes (list): Indexes for blocks using global attention.
        )kernel_sizestriderM   rN   Nr   r   )
dimrP   rQ   rS   rT   rU   rW   rX   rY   
input_sizeF)r[   r<   r   )r[   paddingr<   rJ   i   r	   )r[   r\   r_   r<   rE   )r    r!   rK   
PatchEmbedpatch_embed	pos_embedr"   r8   r   r;   
ModuleListblocksrangeBlockappend
SequentialConv2dr4   necknet_2net_3)r&   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   iblockr'   r   r   r!   N   s^   
%
zImageEncoderViT.__init__r)   c                 C   sl   |  |}| jd ur|t| j|d }| jD ]}||}q| |dddd}| |}| |}|S )Nr   r   r   r	   )	ra   rb   r   r   rd   rj   r   rk   rl   )r&   r)   blkZneck_outputZconv2_outputZconv3_outputr   r   r   r+      s   





zImageEncoderViT.forward)r,   r-   r.   r"   	LayerNormr/   r0   rC   boolr   r1   r   r!   r   r2   r+   r3   r   r   r'   r   rD   M   sj    	

[rD   c                       s   e Zd ZdZddejejddddfdeded	ed
e	de
ej de
ej de	de	dedeeeef  ddf fddZdejdejfddZ  ZS )rf   zSTransformer blocks with support of window attention and residual propagation blocksrI   TFr   Nr]   rP   rQ   rS   rT   rU   rW   rX   rY   r^   r   c                    sf   t    ||| _t||||||	dkr|
n|	|	fd| _||| _t|t|| |d| _|	| _	dS )ai  
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then
                use global attention.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        r   )rP   rS   rW   rX   r^   )r   r   r   N)
r    r!   norm1	Attentionattnnorm2r   r0   mlprY   )r&   r]   rP   rQ   rS   rT   rU   rW   rX   rY   r^   r'   r   r   r!      s   


	
zBlock.__init__r)   c                 C   s   |}|  |}| jdkr|jd |jd }}t|| j\}}| |}| jdkr3t|| j|||f}|| }|| | | }|S )Nr   r   r	   )rr   rY   shapewindow_partitionrt   window_unpartitionrv   ru   )r&   r)   shortcutHWpad_hwr   r   r   r+      s   



zBlock.forward)r,   r-   r.   __doc__r"   rp   r/   r0   rC   rq   r   r1   r   r   r!   r   r2   r+   r3   r   r   r'   r   rf      sD    	
,rf   c                       sl   e Zd ZdZ					ddededed	ed
edeeeef  ddf fddZde	j
de	j
fddZ  ZS )rs   z=Multi-head Attention block with relative position embeddings.   TFNr]   rP   rS   rW   rX   r^   r   c                    s   t    || _|| }|d | _tj||d |d| _t||| _|| _| jrS|dus1J dt	t
d|d  d || _t	t
d|d  d || _dS dS )	a  
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool):  If True, add a learnable bias to query, key, value.
            rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        g      r   )r<   NzBInput size must be provided if using relative positional encoding.r	   r   r   )r    r!   rP   scaler"   r#   qkvprojrW   r8   r   r;   	rel_pos_h	rel_pos_w)r&   r]   rP   rS   rW   rX   r^   head_dimr'   r   r   r!      s   


 $zAttention.__init__r)   c              	   C   s  |j \}}}}| |||| d| jdddddd}|d|| j || dd\}}}	d\}
}| jrGt|| j| j	||f||f\}
}|
|| j|| d}|
|| j|| d}|	
|| j|| d}	| jr|

|| j|
d|
d|
d}
|
|| j|d|d|d}|
| 
|| j|
d|
d|d }tjjj|||	|d}n	tjj|||	}|
|| j||dddddd|||d}| |}|S )	Nr   r	   r   r      )NN)	attn_mask)rw   r   reshaperP   r   unbindrW   add_decomposed_rel_posr   r   viewr   r   r"   
functionalscaled_dot_product_attentionr   )r&   r)   Br{   r|   _r   qkvrel_hrel_w	attn_biasr   r   r   r+   #  s$   ,& &&,.
zAttention.forward)r   TFTN)r,   r-   r.   r~   r0   rq   r   r   r!   r   r2   r+   r3   r   r   r'   r   rs      s.    $rs   r)   rY   r   c              	   C   s   | j \}}}}|||  | }|||  | }|dks|dkr+t| ddd|d|f} || || }}	| ||| ||	| ||} | dddddd d|||}
|
||	ffS )aU  
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    r   r   r   r	   r      r   )rw   r   padr   r   
contiguous)r)   rY   r   r{   r|   Cpad_hpad_wHpWpwindowsr   r   r   rx   F  s   $rx   r   r}   hwc           
      C   s   |\}}|\}}| j d || | |  }| ||| || ||d}	|	dddddd |||d}	||ks=||krO|	ddd|d|ddf  }	|	S )	a  
    Window unpartition into original sequences and removing padding.
    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    r   r   r   r   r	   r   r   N)rw   r   r   r   )
r   rY   r}   r   r   r   r{   r|   r   r)   r   r   r   ry   ^  s   $$ry   q_sizek_sizerel_posc           	      C   s   t dt| | d }|jd |kr>|j}|tj}tj|	d|jd d
ddd|dd|}|	d|
dd}n|}tj| |jddddf t||  d	 }tj||jddddf t| | d	 }|| |d t| | d	  }||  S )
a\  
    Get relative positional embeddings according to the relative positions of
        query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
    r	   r   r   r   linear)r   r   )deviceNg      ?)r0   maxrw   r   r   r   r   r   r   r   r   aranger   long)	r   r   r   Zmax_rel_distr   Zrel_pos_resizedZq_coordsZk_coordsrelative_coordsr   r   r   get_rel_posw  s"   **r   r   r   r   c                 C   s   |\}}|\}}t |||}	t |||}
| j\}}}| ||||}td||	}td||
}|d}|d}|||| |d}|||| d|}||fS )a  
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950
    Args:
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    zbhwc,hkc->bhwkzbhwc,wkc->bhwkr   r   )r   rw   r   r   einsum	unsqueeze)r   r   r   r   r   q_hq_wk_hk_wRhRwr   r   r]   r_qr   r   r   r   r   r     s   

r   c                       st   e Zd ZdZ					ddeeef deeef deeef d	ed
eddf fddZdejdejfddZ	  Z
S )r`   z#
    Image to Patch Embedding.
    rF   rF   r   r   r   rG   r[   r\   r_   rM   rN   r   Nc                    s$   t    tj|||||d| _dS )aP  
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        )r[   r\   r_   N)r    r!   r"   ri   r   )r&   r[   r\   r_   rM   rN   r'   r   r   r!     s   

zPatchEmbed.__init__r)   c                 C   s   |  |}|dddd}|S )Nr   r	   r   r   )r   r   r*   r   r   r   r+     s   
zPatchEmbed.forward)r   r   r   r   rG   )r,   r-   r.   r~   r   r0   r!   r   r2   r+   r3   r   r   r'   r   r`     s*    


r`   c                 C   s   t dddg d| dS )NrG   rH   )r	   r   r      )encoder_embed_dimencoder_depthencoder_num_headsencoder_global_attn_indexes
checkpoint)
_build_sam)r   r   r   r   build_sam_vit_b  s   r   c                 C   sz   d}d}d}|| }t || |dttjjdd||dd|d|d	}	|d ur;t|}
|	jd
d |
 D dd t| |	S )NrJ   rE   rF   r   r5   )r7   T   )rO   rN   rK   rQ   rT   rP   rL   rS   rW   rZ   rY   rR   c                 S   s&   i | ]\}}d |v r|dd |qS )Zvision_tower_high   Nr   ).0r   r   r   r   r   
<dictcomp>  s   & z_build_sam.<locals>.<dictcomp>)strict)	rD   r   r   r"   rp   loadload_state_dictitemsprint)r   r   r   r   r   Zprompt_embed_dim
image_sizeZvit_patch_sizeZimage_embedding_sizeZimage_encoder
state_dictr   r   r   r     s.   
r   r   )r   torch.nnr"   torch.nn.functionalr   r   typingr   r   r   	functoolsr   
flash_attnr   r   r1   r   r4   rD   rf   rs   r2   r0   rx   ry   r   r   r`   r   r   r   r   r   r   <module>   sT   mB*J


#


%
"