B
    5dm^                 @   s  d dl mZ d dl mZ d dl mZ d dlZd dlmZ d dlmZ d dlm  m	Z
 d dlmZmZ d dlZd dlmZ G dd	 d	ejZG d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdd ZG d d! d!ejZG d"d# d#ejZ d$d% Z!e"d&krd dl#Z#d'Z$d(Z%e!e$e% e  Z&e'e%Z(e&)e( e&*  e+d)d*d+d,gZ,e-e,Z,d)gZ.e-e.Z.e&e,e.d-d.d/Z/e0e/d  j1 e0e/d   dS )0    )absolute_import)division)print_functionN)	ParamAttr)NormalXavierNormal)ResNet45c                   s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	PositionalEncoding   c                s&   t t|   | d| || d S )N	pos_table)superr	   __init__register_buffer_get_sinusoid_encoding_table)selfd_hid
n_position)	__class__ 7/tmp/pip-unpacked-wheel-ndi_cy3p/paddleocr/visionlan.pyr      s    zPositionalEncoding.__init__c                s    fddt fddt|D }t |dddddf |dddddf< t |dddddf |dddddf< tj|d	d
}tj|dd}|S )z" Sinusoid position encoding table c                s    fddt D S )Nc          	      s(   g | ] }t d d|d     qS )i'     )nppower).0hid_j)r   positionr   r   
<listcomp>$   s    zcPositionalEncoding._get_sinusoid_encoding_table.<locals>.get_position_angle_vec.<locals>.<listcomp>)range)r   )r   )r   r   get_position_angle_vec#   s    zOPositionalEncoding._get_sinusoid_encoding_table.<locals>.get_position_angle_vecc                s   g | ]} |qS r   r   )r   Zpos_i)r   r   r   r   %   s    zCPositionalEncoding._get_sinusoid_encoding_table.<locals>.<listcomp>Nr   r      float32)dtype)axis)r   arrayr   sincospaddle	to_tensor	unsqueeze)r   r   r   sinusoid_tabler   )r   r   r   r   !   s    ..z/PositionalEncoding._get_sinusoid_encoding_tablec             C   s(   || j d d d |jd f    S )Nr   )r   shapeclonedetach)r   xr   r   r   forward,   s    zPositionalEncoding.forward)r
   )__name__
__module____qualname__r   r   r.   __classcell__r   r   )r   r   r	      s   r	   c                   s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	PositionalEncoding_bkr
   c                s&   t t|   | d| || d S )Nr   )r   r	   r   r   r   )r   r   r   )r   r   r   r   1   s    zPositionalEncoding_bk.__init__c                s   t  fddt D }t |ddg}t |}t j|ddd}|| }t |ddddd	f |ddddd	f< t |ddddd	f |ddddd	f< t j|dd}|S )
z3
        Sinusoid position encoding table.
        c          	      s(   g | ] }d t dd|d     qS )g      ?i'  r   )r   r   )r   r   )r   r   r   r   <   s    zFPositionalEncoding_bk._get_sinusoid_encoding_table.<locals>.<listcomp>r   )r"   r    Nr   r   )	r&   r'   r   reshapearanger(   Zastyper$   r%   )r   r   r   denominatorZ
pos_tensorr)   r   )r   r   r   7   s    
..z2PositionalEncoding_bk._get_sinusoid_encoding_tablec             C   s(   || j d d d |jd f    S )Nr   )r   r*   r+   r,   )r   r-   r   r   r   r.   G   s    zPositionalEncoding_bk.forward)r
   )r/   r0   r1   r   r   r.   r2   r   r   )r   r   r3   0   s   r3   c                   s,   e Zd ZdZd fdd	Zd	ddZ  ZS )
ScaledDotProductAttentionzScaled Dot-Product Attention皙?c                s2   t t|   || _t|| _tjdd| _d S )Nr   )r"   )	r   r8   r   temperaturennDropoutdropoutSoftmaxsoftmax)r   r:   Zattn_dropout)r   r   r   r   M   s    z"ScaledDotProductAttention.__init__Nc             C   s   t j|dddgd}t ||}|| j }|d k	r||d}| dkrZt j|dd}n(| dkrt j|dd}t j|dd}|jd |jd  |jd |jd  g}t |d|d |d dg}d||dk< | 	|}| 
|}t ||}|S )Nr   r   r   )permg    e   )r"   )r&   	transposebmmr:   Zmasked_filldimr(   r*   tiler?   r=   )r   qkvmaskZattnrepeat_timesoutputr   r   r   r.   S   s"    
(

z!ScaledDotProductAttention.forward)r9   )N)r/   r0   r1   __doc__r   r.   r2   r   r   )r   r   r8   K   s   r8   c                   s,   e Zd ZdZd fdd	Zd	ddZ  ZS )
MultiHeadAttentionz Multi-Head Attention module皙?c                s  t t|   || _|| _|| _tj||| tt	dt
d||  ddd| _tj||| tt	dt
d||  ddd| _tj||| tt	dt
d||  ddd| _tt
|dd| _t|| _tj|| |tt dd| _t|| _d S )Nr   g       @)ZmeanZstd)Zinitializer)Zweight_attrg      ?)r:   )r   rM   r   n_headd_kd_vr;   Linearr   r   r   sqrtw_qsw_ksw_vsr8   r   	attention	LayerNorm
layer_normr   fcr<   r=   )r   rO   d_modelrP   rQ   r=   )r   r   r   r   i   s    &&&zMultiHeadAttention.__init__Nc             C   s  | j | j| j  }}}|j\}}	}
|j\}}}
|j\}}}
|}| |}tj|||	||gd}| |}tj|||||gd}| |}tj|||||gd}tj	|ddddgd}tj|d|	|gd}tj	|ddddgd}tj|d||gd}tj	|ddddgd}tj|d||gd}|d k	r8t
||ddgnd }| j||||d}tj||||	|gd}tj	|ddddgd}tj|||	dgd}| | |}| || }|S )	N)r*   r   r   r   rA   )r@   r4   )rI   )rP   rQ   rO   r*   rT   r&   r5   rU   rV   rB   rE   rW   r=   rZ   rY   )r   rF   rG   rH   rI   rP   rQ   rO   Zsz_blen_q_len_kZlen_vresidualrK   r   r   r   r.   {   s2    


 zMultiHeadAttention.forward)rN   )N)r/   r0   r1   rL   r   r.   r2   r   r   )r   r   rM   g   s   rM   c                   s,   e Zd ZdZd fdd		ZdddZ  ZS )MultiHeadAttention_bk       @   皙?Fr   c                s   t    || _|| _|| _|| _|d | _|| | _|| | _t	j
| j| j|d| _t	j
| j| j|d| _t	j
| j| j|d| _t	j
| j||d| _t	|| _t	|| _d S )Ng      )Z	bias_attr)r   r   
mask_valuerO   rP   rQ   scaleZdim_kdim_vr;   rR   linear_qlinear_klinear_vrZ   r<   	attn_drop	proj_drop)r   rO   r[   rP   rQ   r=   Zqkv_biasrf   )r   r   r   r      s    	



zMultiHeadAttention_bk.__init__Nc             C   s  |j \}}}|j \}}}| |}t|||| j| jg}tj|ddddgd}| |}t|||| j| jg}tj|ddddgd}| |}t|||| j| j	g}tj|ddddgd}t
||| j }	|d k	rx| dkrtj|dd}n*| dkr"tj|dd}tj|dd}|	j d |j d  |	j d |j d  g}
t|d|
d |
d dg}td|	|| jk< tj|	dd}| |}t
||}tj|ddddgd}t|||| jg}| |}| |}|S )	Nr   r   r   rA   )r@   )r"   z-infr4   )r*   ri   r&   r5   rO   rP   rB   rj   rk   rQ   matmulrg   rD   r(   rE   floatrf   Fr?   rl   rh   rZ   rm   )r   rF   rG   rH   rI   Z
batch_sizer\   r]   r^   ZlogitsrJ   weightsZattn_outr   r   r   r.      s:    



(


zMultiHeadAttention_bk.forward)rb   rc   rd   rd   re   Fr   )N)r/   r0   r1   rL   r   r.   r2   r   r   )r   r   r`      s         r`   c                   s&   e Zd Zd fdd	Zdd Z  ZS )PositionwiseFeedForward皙?c                sJ   t t|   t||d| _t||d| _t|| _t	|| _
d S )Nr   )r   rr   r   r;   ZConv1Dw_1w_2rX   rY   r<   r=   )r   Zd_inr   r=   )r   r   r   r      s
    z PositionwiseFeedForward.__init__c             C   s^   |}t j|dddgd}| t| |}t j|dddgd}| |}| || }|S )Nr   r   r   )r@   )r&   rB   ru   rp   Zrelurt   r=   rY   )r   r-   r_   r   r   r   r.      s    
zPositionwiseFeedForward.forward)rs   )r/   r0   r1   r   r.   r2   r   r   )r   r   rr      s   rr   c                   s,   e Zd ZdZd fdd	Zd	ddZ  ZS )
EncoderLayerz Compose with two layers 皙?c                s6   t t|   t|||||d| _t|||d| _d S )N)r=   )r   rv   r   rM   slf_attnrr   pos_ffn)r   r[   d_innerrO   rP   rQ   r=   )r   r   r   r      s    zEncoderLayer.__init__Nc             C   s    | j ||||d}| |}|S )N)rI   )rx   ry   )r   Z	enc_inputslf_attn_mask
enc_outputr   r   r   r.      s    
zEncoderLayer.forward)rw   )N)r/   r0   r1   rL   r   r.   r2   r   r   )r   r   rv      s   rv   c            	       s(   e Zd Zd fdd		ZdddZ  ZS )Transformer_Encoderr   rb      rd      皙?   c
       
         sf   t t|   t||	d| _tjd| _t fddt	|D | _
tjdd| _d S )N)r   )pc          
      s    g | ]}t  d qS ))r=   )rv   )r   r]   )rz   rP   r[   rQ   r=   rO   r   r   r     s   z0Transformer_Encoder.__init__.<locals>.<listcomp>gư>)epsilon)r   r}   r   r	   position_encr;   r<   r=   Z	LayerListr   layer_stackrX   rY   )
r   n_layersrO   Z
d_word_vecrP   rQ   r[   rz   r=   r   )r   )rz   rP   r[   rQ   r=   rO   r   r      s    zTransformer_Encoder.__init__Fc             C   s:   |  | |}x| jD ]}|||d}qW | |}|S )N)r{   )r=   r   r   rY   )r   r|   src_maskZreturn_attnsZ	enc_layerr   r   r   r.     s
    
zTransformer_Encoder.forward)	r   rb   r~   rd   rd   r~   r   r   r   )F)r/   r0   r1   r   r.   r2   r   r   )r   r   r}      s           	r}   c                   s&   e Zd Zd fdd	Zdd Z  ZS )	PP_layer      r   c                sh   t t|   || _t||| _t||| _t||| _	t||| _
t | _tjdd| _d S )Nr   )r"   )r   r   r   character_lenr;   	Embeddingf0_embeddingrR   w0wvweTanhactiver>   r?   )r   n_dimN_max_characterr   )r   r   r   r     s    
zPP_layer.__init__c             C   s   t j| jdd}|d|jd dg}| |}t j|dddgd}| |}| 	t j|dddgd| 
| }| |}| t j|dddgd}t ||}|S )Nint64)r!   r   r4   r   r   )r@   )r&   r6   r   r(   expandr*   r   rB   r   r   r   r   r?   rC   )r   r|   Zreading_ordertg_outputr   r   r   r.   #  s    

$
zPP_layer.forward)r   r   r   )r/   r0   r1   r   r.   r2   r   r   )r   r   r     s   r   c                   s(   e Zd Zd fdd	Zdd	d
Z  ZS )
Prediction   %   r   r   c                sT   t t|   t|||d| _t|||d| _t||| _t||| _	|| _
d S )N)r   r   r   )r   r   r   r   pppp_sharer;   rR   w_vrmw_sharenclass)r   r   n_classr   r   )r   r   r   r   5  s    zPrediction.__init__FTc             C   s   |rt|s.|  |}| |}d}d}|||fS |  |}| |}| |}| |}| |}| |}|||fS |  |}| |}|S d S )Nr   )r   r   r   r   )r   Zcnn_featuref_resf_sub
train_modeuse_mlmr   r   r   r   r.   =  s"    











zPrediction.forward)r   r   r   r   )FT)r/   r0   r1   r   r.   r2   r   r   )r   r   r   4  s   r   c                   s*   e Zd ZdZd fdd	Zdd Z  ZS )MLMzArchitecture of MLM   c                sz   t t|   tddd| _tddd| _tdd| _t	dd| _
t	||| _t | _t	|d| _t | _d S )Nr   r   )r   r   r   r   i   )r   r   r   r}   MLM_SequenceModeling_maskMLM_SequenceModeling_WCLr;   r   pos_embeddingrR   	w0_linearr   r   r   r   ZSigmoidsigmoid)r   r   )r   r   r   r   T  s    
zMLM.__init__c             C   s   | j |d d}tj|dd}| |}| tj|dd}tj|dddgd}| || | }| 	|}tj|dddgd}| 
|}tj|dddgd}|d|  }|| }| j|d d}| j|d d}|||fS )	N)r   r   )r!   r   )r"   r   r   )r@   )r   r&   r'   r   r   r(   rB   r   r   r   r   r   )r   r-   	label_posZfeature_v_seqZpos_embZatt_map_subr   r   r   r   r   r.   _  s    


zMLM.forward)r   )r/   r0   r1   rL   r   r.   r2   r   r   )r   r   r   R  s   r   c             C   sN   | j \}}}tj| dddgd} t| ||ddg} tj| ddddgd} | S )Nr   r   r   )r@       rb   rA   )r*   r&   rB   r5   )r-   bZw_hcr   r   r   trans_1d_2dw  s
    r   c                   s*   e Zd ZdZ fddZdddZ  ZS )MLM_VRMa	  
    MLM+VRM, MLM is only used in training.
    ratio controls the occluded number in a batch.
    The pipeline of VisionLAN in testing is very concise with only a backbone + sequence modeling(transformer unit) + prediction layer(pp layer).
    x: input image
    label_pos: character index
    training_step: LF or LA process
    output
    text_pre: prediction of VRM
    test_rem: prediction of remaining string in MLM
    text_mas: prediction of occluded character in MLM
    mask_c_show: visualization of Mask_c
    c                s@   t t|   t | _tddd| _tddddd| _d| _d S )NrA   r   )r   r   i      r   )r   r   r   r   )r   r   r   r   r}   SequenceModelingr   r   )r   )r   r   r   r     s
    zMLM_VRM.__init__Fc             C   sZ  |j \}}}}d}	tj|ddddgd}t|||dg}tj|dddgd}|r|dkrd}
d}| j|d d	}| j||
|d
dd\}}}||||fS |dkr| ||\}
}}| j|d d	}| j||
|d
d\}}}t|}||||fS |dkr| ||\}
}}t|}|d }|dkrV|d|d d d d f |d|d d d d f< n|}|d|  }| j|d d	}| j||
|d
d\}}}t|}||||fS t	nd}
d}| j|d d	}| j||
|ddd}tj|dddgd}|	}|	}tj
||| jg|jd}tj
|g|jd}d}xd|kr||k r||d d d d f }|||< |dd jdd}x:t|D ].}|| dkrp|| dkrp|d ||< qpW |d7 }qW x.td|D ] }t|| dkr|||< qW d}tj
t| | jg|jd}xJtd|D ]<}t|| }|d||d d f |||| < ||7 }qW ||fS d S )Nr   r   r   rA   r   )r@   r4   ZLF_1)r   TF)r   r   ZLF_2)r   LA)r*   r!   )r"   )r*   r&   rB   r5   r   r   r   r   Z
zeros_likeNotImplementedErrorzerosr   r!   ZtopkZsqueezer   intsum)r   r-   r   training_stepr   r   r   hwZnTr   r   text_pretest_remtext_masZmask_cZmask_c_showZcharacter_maskZratioZcontextual_featureZlenTextZnstepsZout_res
out_lengthZnow_stepZ
tmp_resultjstartrK   iZ
cur_lengthr   r   r   r.     sv    


2"zMLM_VRM.forward)F)r/   r0   r1   rL   r   r.   r2   r   r   )r   r   r     s   r   c                   s,   e Zd ZdZd fdd	Zd	ddZ  ZS )
	VisionLANz
    Architecture of VisionLAN
    input
    x: input image
    label_pos: character index
    output
    text_pre: word-level prediction from VRM
    test_rem: remaining string prediction from MLM
    text_mas: occluded character prediction from MLM
    Fc                s&   t t|   t|d| _t | _d S )N)compress_layer)r   r   r   r   backboner   )r   r   )r   r   r   r     s    zVisionLAN.__init__Tc             C   sV   |  |}|r4| j||||d\}}}}	||||	fS | j||||d\}
}|
|fS d S )N)r   )r   r   )r   r-   r   r   r   featuresr   r   r   Zmask_maprK   r   r   r   r   r.     s    
zVisionLAN.forward)F)T)r/   r0   r1   rL   r   r.   r2   r   r   )r   r   r     s   
r   c             C   s  t  }| }tj| dd}| }| }td d}d}x\|D ]T}	|	dd}	|	dd}	|	d	d
}	d|	krtqB|	|krt|	 |d7 }qB|d7 }qBW td| td| td d}d}xZ|D ]R}
d|
 }
|
dd}
|
d
d	}
d|	krq|
|krt|
 |d7 }q|d7 }qW td| td| dddddddddddddddg}x|D ]}||   }x4|D ],}||krpd|krptd | |	 }qpW d|kr|dd}d	|kr|d	d
}d|krڐqV|dd}|||< qVW t
|| td! d S )"Ncpu)Zmap_locationtorchr   zmodule.ra   Zrunning_meanZ_meanZrunning_varZ	_varianceZnum_batches_trackedr   zright : zwrong : r&   zMLM.w0_linearzMLM.wvzMLM.werZ   zslf_attn.w_qszslf_attn.w_kszslf_attn.w_vszpp.w0zpp.wvzpp.wezpp_share.w0zpp_share.wvzpp_share.wer   r   Zbiaszkey: ok)r   
state_dictr   loadkeysprintreplacer   numpyrB   r&   save)input_fp	output_fpZpaddle_modelZpaddle_dictZ
torch_dictZpaddle_dict_keysZtorch_dict_keysrightZwrongZ	torch_keyZpadddle_keyZfc_nameskeyZweightrZ   r   r   r   trans_visionlan  sr    














r   __main__z./pretrained_model/final.pthz!./pretrained_model/final.pdparamsr   rA   rd   r   r   F)r   r   )2
__future__r   r   r   r&   r   Z	paddle.nnr;   Zpaddle.nn.functionalZ
functionalrp   Zpaddle.nn.initializerr   r   r   r   Z)ppocr.modeling.backbones.rec_resnet_asterr   ZLayerr	   r3   r8   rM   r`   rr   rv   r}   r   r   r   r   r   r   r   r/   r   r   r   modelr   r   Zset_state_dictevalZonesr-   r'   r   outr   r*   r   r   r   r   <module>   sP   3A%_E





