o
    rqi                     @   s   d dl Z d dlZd dlm  mZ dd ZG dd dejjZG dd dejjZ	G dd	 d	ejjZ
G d
d dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZdS )    Nc                 C   s4   | dt dt |    t d| t |    S )z%Mindspore's fast gelu implementation.   gZd;gZd;?)torchexpabs)x r   n/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/modelscope/models/nlp/codegeex/codegeex.py	fast_gelu   s   r	   c                       s(   e Zd ZdZ fddZdd Z  ZS )MLPzMLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension. At the end, dropout is also
    applied.
    c                    sN   t t|   || _tj| jd| j | _t| _	tjd| j | j| _
d S )N   )superr
   __init__hidden_sizer   nnLineardense_h_to_4hr	   activation_funcdense_4h_to_h)selfr   	__class__r   r   r      s   
zMLP.__init__c                 C   s"   |  |}| |}| |}|S N)r   r   r   )r   hidden_statesZintermediate_paralleloutputr   r   r   forward+   s   


zMLP.forward__name__
__module____qualname____doc__r   r   __classcell__r   r   r   r   r
      s    r
   c                       8   e Zd ZdZ		d	 fdd	Z				d
ddZ  ZS )SelfAttentionzself-attention layer abstract class.

    Self-attention layer takes input with size [b, s, h]
    and returns output of the same size.
    Tc                       t t|   || _|| _|| _|| _td|| _| j| j dks#J t	| j| j | _
tj| j| j| _tj| j| j| _tj| j| j| _t| j
| _tjjdd| _tj| j| j| _d S Nr   r   dim)r   r"   r   r   num_attention_headsfp16attention_softmax_in_fp32maxlayer_numberinthidden_size_per_attention_headr   r   r   querykeyvaluemathsqrtnorm_factorSoftmaxsoftmaxdenser   r   r(   r,   r)   r*   r   r   r   r   <       zSelfAttention.__init__NFc                 C   s<  |  |}| |}| |}	| d d | j| jf }
|j|
 }| d d | j| jf }
|j|
 }|	 d d | j| jf }
|	j|
 }	|d urh|\}}tj|	||fdd}tj|	|	|	fdd}	|rn||	f}|d|d|d|df}|
 |d |d |d  d}|
 |d |d |d  d}t|dd|dddd| j }|j| }|rt 2 |d ur|d|dd d |df d}n|dd |dd |df }W d    n1 sw   Y  |d urt|}d|d d d d |d d d f< ||d	  }| jr*| |  }n| |}|	d|	d|d|	df}|	|	d|d |d  d}	||d |d  |d d}t||	dddd}|j| }|dddd
 }| d d
 | jf }|j| }| |}|r||g}|S Nr%   r   r&   r         .Tg     @r/   r0   r1   sizer(   r.   viewr   catZtype_as
contiguousmatmul	transposer4   Zno_gradZ	unsqueezecloner*   r6   floathalfZbmmZsqueezeZpermuter   r7   )r   r   attention_mask
layer_pastget_key_valueprompt_lengthcontext_lengthquery_layer	key_layervalue_layernew_query_layer_shapepast_key
past_valuepresentoutput_sizematmul_resultattention_scoresattention_probscontext_layernew_context_layer_shaper   r   r   r   r   X   s   










 




zSelfAttention.forwardTTNFNNr   r   r   r   r   r"   5   s     r"   c                       r!   )TopQuerySelfAttentionzTop query self-attention layer abstract class.

    Self-attention layer takes input with size [b, s, h]
    and returns output of the same size.
    Tc                    r#   r$   )r   r\   r   r   r(   r)   r*   r+   r,   r-   r.   r   r   r   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r   r   r   r      r9   zTopQuerySelfAttention.__init__NFc                 C   s<  |  |}| |}	| |}
| d d | j| jf }|j| }|	 d d | j| jf }|	j| }	|
 d d | j| jf }|
j| }
|d urh|\}}tj|	|	|	fdd}	tj|	|
|
fdd}
|rn|	|
f}|d|d|d|	df}|
 |d |d |d  d}|	
 |d |d |d  d}	t|dd|	dddd| j }|j| }|rt 2 |d ur|d|dd d |df d}n|dd |dd |df }W d    n1 sw   Y  |d urt|}d|d d d d |d d d f< ||d	  }| jr*| |  }n| |}|
d|
d|d|
df}|
|
d|d |d  d}
||d |d  |d d}t||
dddd}|j| }|dddd
 }| d d
 | jf }|j| }| |}|r||g}|S r:   r>   )r   r   query_hidden_staterH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   r   r   r   r   r   	  s   










 




zTopQuerySelfAttention.forwardrZ   r[   r   r   r   r   r   r\      s    !r\   c                       s:   e Zd ZdZ			d
 fdd	Z				ddd	Z  ZS )TransformerLayerzA single transformer layer.

    Transformore layer takes input with size [b, s, h] and returns an
    output of the same size.
    h㈵>Tc                    sl   t t|   || _|| _|| _tjj|| jd| _	t
|||||| _tjj| j| jd| _t| j| _d S NZeps)r   r^   r   r   layernorm_epsilonr,   r   r   	LayerNorminput_layernormr"   	attentionpost_attention_layernormr
   mlp)r   r   r(   r,   rb   r)   r*   r   r   r   r     s   	zTransformerLayer.__init__NFc                 C   sd   |  |}| j||||||d}|r|\}}	|}
||
 }| |}| |}|| }|r0||	g}|S NrI   rJ   rK   rL   rd   re   rf   rg   )r   r   rH   rI   rJ   rK   rL   layernorm_outputattention_outputpresentsresiduallayernorm_input
mlp_outputr   r   r   r   r     s&   


zTransformerLayer.forward)r_   TTr[   r   r   r   r   r   r^     s     r^   c                       s6   e Zd ZdZ	d	 fdd	Z				d
ddZ  ZS )TopQueryLayerzA single top query layer.

    Top query layer takes input with size [b, s, h] and returns an
    output of the same size.
    r_   c                    sv   t t|   || _|| _|| _|| _tjj	| j| jd| _
t| j| j| j| _tjj	| j| jd| _t| j| _d S r`   )r   rq   r   r   r(   rb   r,   r   r   rc   rd   r\   re   rf   r
   rg   )r   r   r(   r,   rb   r   r   r   r     s    zTopQueryLayer.__init__NFc              	   C   sv   |d ksJ |  |}| j|||||||d}	|r|	\}	}
|}|	| }| |}| |}|}|| }|r9||
g}|S rh   rj   )r   r   r]   rH   rI   rJ   rK   rL   rk   rl   rm   rn   ro   rp   r   r   r   r   r     s,   
	

zTopQueryLayer.forwardr_   r[   r   r   r   r   r   rq     s    !rq   c                       sV   e Zd ZdZ	d fdd	Zdd Zdd Z			
				dddZ				
dddZ  Z	S )TransformerzTransformer class.r_   c                    s   t t  |_|_|_|_d _jd u sJ jd u r&j_jj dks2J dfdd tj	
 fddtjD _tjjj_tj	jjjd_d S )Nr   z?number of layers should be divisible by number of unique layersc                    s   t  j j| S r   )r^   r   r(   )r,   )r   r   r   build_layerT  s   
z)Transformer.__init__.<locals>.build_layerc                    s   g | ]} |d  qS )r   r   ).0i)rt   r   r   
<listcomp>Y  s    z(Transformer.__init__.<locals>.<listcomp>ra   )r   rs   r   r   r(   rb   
num_layersnum_unique_layersr   r   Z
ModuleListrangelayersrq   topQueryLayerrc   final_layernorm)r   r   r(   rx   rb   r   )rt   r   r   r   ;  s.   
zTransformer.__init__c                 C   s
   || j  S r   )ry   r   r,   r   r   r   _get_layer_indexb  s   
zTransformer._get_layer_indexc                 C   s   | j | | S r   )r{   r   r~   r   r   r   
_get_layere  s   zTransformer._get_layerNFc              	   C   s   | dd }| dd }|rg }t| jD ]&}	| |	}
d }|d ur*||	 }|
||||||d}|r?|\}}|| q| |}d }|d urP|| j }| j|||||||d}|rg|\}}|| | dd }|ru||g}|S )Nr   r   ri   )rD   rB   rz   rx   r   appendr}   r|   )r   r   r]   rH   rI   rJ   rK   rL   rm   indexlayerZpastrS   Zhidden_states_r   r   r   r   r   h  sR   



	
zTransformer.forward c                 C   s   |  |||S r   )
state_dict)r   destinationprefix	keep_varsr   r   r   state_dict_for_save_checkpoint  s   z*Transformer.state_dict_for_save_checkpointrr   r[   Nr   F)
r   r   r   r   r   r   r   r   r   r    r   r   r   r   rs   8  s    '
@rs   c                       B   e Zd ZdZ fddZdd Z			dd	d
ZdddZ  ZS )	EmbeddingLanguage model embeddings.

    Arguments:
        hidden_size: hidden size
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
    c                    sd   t t|   || _|| _|| _tj| j| j| _d| _	tj| j| j| _
| j
 | _
d| _d S )Nword_embeddingsposition_embeddings)r   r   r   r   
vocab_sizemax_sequence_lengthr   r   r   _word_embeddings_keyr   rG   _position_embeddings_keyr   r   r   r   r   r   r   r     s   


zEmbedding.__init__c                 C   s    |  |}| |}|| }|S r   )r   r   )r   	input_idsposition_idsZwords_embeddingsr   
embeddingsr   r   r   r     s   

zEmbedding.forwardNr   Fc                 C   s4   i }| j ||||| j< | j||||| j< |S zFor easy load.)r   r   r   r   r   r   r   r   r   state_dict_r   r   r   r     s   z(Embedding.state_dict_for_save_checkpointTc                 C   s   | j |v r|| j  }ni }| D ]}d|v r"|| ||dd < q|d d| j |d< | jj||d | j|v rA|| j }ni }| D ]}d|v rX|| ||dd < qG| jj||d dS )	Customized load.r   zword_embeddings.r   weightNstrictr   zposition_embeddings.)r   keyssplitr   r   load_state_dictr   r   r   r   r   r   r0   r   r   r   r     s&   

zEmbedding.load_state_dictr   T	r   r   r   r   r   r   r   r   r    r   r   r   r   r     s    		
r   c                       r   )QueryEmbeddingr   c                    sJ   t t|   || _|| _|| _tj| j| j| _	| j	
 | _	d| _d S )Ntop_query_embeddings)r   r   r   r   r   r   r   r   r   r   rG   _top_query_embeddings_keyr   r   r   r   r     s   
zQueryEmbedding.__init__c                 C   s   |  |}|S r   )r   )r   r   r   r   r   r   r     s   
zQueryEmbedding.forwardNr   Fc                 C      i }| j ||||| j< |S r   )r   r   r   r   r   r   r   r   $  s   z-QueryEmbedding.state_dict_for_save_checkpointTc                 C   sZ   | j |v r|| j  }ni }| D ]}d|v r"|| ||dd < q| jj||d dS )r   r   ztop_query_embeddings.r   r   N)r   r   r   r   r   r   r   r   r   r   1  s   
zQueryEmbedding.load_state_dictr   r   r   r   r   r   r   r     s    	
r   c                       L   e Zd ZdZ fddZ				dddZ			dd	d
ZdddZ  ZS )TransformerLanguageModela  Transformer language model.

    Arguments:
        transformer_hparams: transformer hyperparameters
        attention_mask_func: a function that takes `unmaksed-attention-scores`
            with size [b, np, s, s] and an `attention-mask` and will apply
            the masking. The function should return a masked score of the
            same size [b, np, s, s].
          masked-attention-scores = attention_mask_func(
                                     unmaksed-attention-scores, attention-mask)
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
    c                    s~   t t|   || _|| _|| _|| _|| _t| j| j| j| _	d| _
t| j| j| j| _d| _t| j| j| j| _d| _d S )N	embeddingtopQueryEmbeddingtransformer)r   r   r   r   rx   r(   padded_vocab_sizemax_position_embeddingsr   r   _embedding_keyr   r   _topQueryEmbedding_keyrs   r   _transformer_keyr   r   rx   r(   r   r   r   r   r   r   Q  s(   

z!TransformerLanguageModel.__init__NFc              	   C   s6   |  ||}|}	| |	}
| j||
|||||d}|S rh   )r   r   r   )r   r   r   rH   rI   rJ   rK   rL   Zembedding_outputZquery_position_idsZqueryEmbedding_outZtransformer_outputr   r   r   r   q  s   
	z TransformerLanguageModel.forwardr   c                 C   sJ   i }| j ||||| j< | j||||| j< | j||||| j< |S r   )r   r   r   r   r   r   r   r   r   r   r   r     s   z7TransformerLanguageModel.state_dict_for_save_checkpointTc                 C   s   | j |v r|| j  }ni }| D ]}d|v r|| ||< q| jj||d | j|v r1|| j }ni }| D ]}d|v rC|| ||< q7| jj||d | j|v rW|| j }ni }| D ]}d|v rn|| ||dd < q]| jj||d dS )r   Z_embeddingsr   ztransformer.r   N)	r   r   r   r   r   r   r   r   r   r   r   r   r   r     s0   


z(TransformerLanguageModel.load_state_dictr[   r   r   r   r   r   r   r   r   A  s    %

r   c                       r   )CodeGeeXModelz/CodeGeeX: A Multilingual Code Generation Model.c                    s*   t t|   t|||||| _d| _d S )Nlanguage_model)r   r   r   r   r   _language_model_keyr   r   r   r   r     s   
zCodeGeeXModel.__init__NFc              	   C   sL   | j |||||||d}|r|\}}	t|| j jjj }
|r$|
|	g}
|
S rh   )r   FZlinearr   r   r   rG   )r   r   r   rH   rI   rJ   rK   rL   Z	lm_outputrm   r   r   r   r   r     s$   	zCodeGeeXModel.forwardr   c                 C   r   r   )r   r   r   r   r   r   r   r     s   z,CodeGeeXModel.state_dict_for_save_checkpointTc                 C   s(   | j |v r
|| j  }| jj||d dS )r   r   N)r   r   r   )r   r   r   r   r   r   r     s   

zCodeGeeXModel.load_state_dictr[   r   r   r   r   r   r   r   r     s    
!
r   )r2   r   Ztorch.nn.functionalr   Z
functionalr   r	   Moduler
   r"   r\   r^   rq   rs   r   r   r   r   r   r   r   r   <module>   s"   ' 2 0MVvT? 