o
    W+ i>                     @   s   d dl Z d dlZd dlmZmZ d dlZd dlZd dlmZ d dlm	Z
 d dlmZ d dlmZ d dlmZ dd	lmZ dd
lmZ G dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd deZdS )    N)OptionalUnion)nn)
functional)PreTrainedModel)TokenGeneratorOutput)	ModelFile   )
GPT3Config)samplec                       s>   e Zd ZdZ fddZdd Z	dddZdd	d
Z  ZS )GPT3SelfAttentionzParallel self-attention layer abstract class.

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    c                    s~   t    |j| _|j| _| j| j | _t| jd| j | _tjdd| _	t
|j| _t| j| j| _t
|j| _d S )N   dim)super__init__hidden_sizenum_attention_headshidden_size_per_attention_headr   Linearquery_key_valueZSoftmaxsoftmaxDropoutZattention_probs_dropout_probattention_dropoutdensehidden_dropout_proboutput_dropoutselfconfig	__class__ o/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/modelscope/models/nlp/gpt3/backbone.pyr   '   s   

zGPT3SelfAttention.__init__c                 C   s6   |  dd | j| jf }|j| }|ddddS )z_Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        Nr   r      r	   r   )sizer   r   viewpermute)r   tensorZnew_tensor_shaper#   r#   r$   _transpose_for_scores:   s
   
z'GPT3SelfAttention._transpose_for_scoresFc                 C   sF   |  d }| | | }tj|||d}|r!tdd |D S |S )Nr	   r   c                 s   s    | ]}|  V  qd S N)
contiguous).0chunkr#   r#   r$   	<genexpr>N   s    zAGPT3SelfAttention._split_tensor_along_last_dim.<locals>.<genexpr>)r   r&   torchsplittuple)r   r)   Znum_partitionsZcontiguous_split_chunksZlast_dimZlast_dim_sizeZtensor_listr#   r#   r$   _split_tensor_along_last_dimC   s   z.GPT3SelfAttention._split_tensor_along_last_dimc                 C   sP  | d}t|dd||g}| |}| |d\}}}| |}	| |}
| |}| }t|	|
dd}|t	
| j }|r`|
 d}ttjd||f|jddd|||}dd|  }t||| |}| |}| |}t||}|d	ddd }|  d d | jf }|j| }| |}| |}|S )
Nr	   r   r   r%   deviceg     @      ?r   )r&   r0   Zreshaper   r3   r*   typematmulZ	transposemathsqrtr   trilonesr6   r'   mulr   r   r(   r,   r   r   r   )r   hidden_states	ltor_maskZis_inferZtgt_lenZmixed_x_layerZmixed_query_layerZmixed_key_layerZmixed_value_layerZquery_layerZ	key_layerZvalue_layerZprevious_typeZattention_scoresZsrc_lenZconverted_maskZattention_probsZcontext_layerZnew_context_layer_shapeoutputr#   r#   r$   forwardR   sX   













zGPT3SelfAttention.forward)F)	__name__
__module____qualname____doc__r   r*   r3   rB   __classcell__r#   r#   r!   r$   r       s    
r   c                       (   e Zd ZdZ fddZdd Z  ZS )GPT3MLPzMLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    c                    sN   t    |j}t|d| | _tj| _td| || _	t
|j| _d S )N   )r   r   r   r   r   dense_h_to_4hFZgeluactivation_funcdense_4h_to_hr   r   dropout)r   r    r   r!   r#   r$   r      s   
zGPT3MLP.__init__c                 C   s,   |  |}| |}| |}| |}|S r+   )rK   rM   rN   rO   )r   r?   Zintermediate_parallelrA   r#   r#   r$   rB      s
   



zGPT3MLP.forwardrC   rD   rE   rF   r   rB   rG   r#   r#   r!   r$   rI      s    rI   c                       rH   )GPT3TransformerLayerzA single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    c                    sJ   t    tj|j|jd| _t|| _tj|j|jd| _	t
|| _d S )NZeps)r   r   r   	LayerNormr   layernorm_epsiloninput_layernormr   	attentionpost_attention_layernormrI   mlpr   r!   r#   r$   r      s   

zGPT3TransformerLayer.__init__c                 C   s>   |  |}| ||}|| }| |}| |}|| }|S r+   )rU   rV   rW   rX   )r   r?   r@   Zlayernorm_outputZattention_outputZlayernorm_inputZ
mlp_outputrA   r#   r#   r$   rB      s   


zGPT3TransformerLayer.forwardrP   r#   r#   r!   r$   rQ      s    rQ   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )GPT3TransformerzTransformer class.c                    sR   t    d | _ j| _tj fddt| jD | _	tj
 j jd| _d S )Nc                    s   g | ]}t  qS r#   )rQ   )r-   _r    r#   r$   
<listcomp>   s    z,GPT3Transformer.__init__.<locals>.<listcomp>rR   )r   r   Zinput_tensorZnum_hidden_layers
num_layersr0   r   Z
ModuleListrangelayersrS   r   rT   final_layernormr   r!   r[   r$   r      s   
zGPT3Transformer.__init__c                 C   s
   | j | S r+   )r_   )r   Zlayer_numberr#   r#   r$   
_get_layer   s   
zGPT3Transformer._get_layerc                 C   s2   t | jD ]}| |}|||}q| |}|S r+   )r^   r]   ra   r`   )r   r?   attention_maskindexlayerr#   r#   r$   rB      s
   

zGPT3Transformer.forward)rC   rD   rE   rF   r   ra   rB   rG   r#   r#   r!   r$   rY      s
    rY   c                       rH   )GPT3TransformerLanguageModela  Transformer language model.

    Arguments:
        transformer_hparams: transformer hyperparameters
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
        embedding_dropout_prob: dropout probability for embeddings
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    c                    sJ   t    t|j|j| _t|j|j| _t	|j
| _t|| _d S r+   )r   r   r   	Embedding
vocab_sizer   word_embeddingsmax_position_embeddingsposition_embeddingsr   r   embedding_dropoutrY   transformerr   r!   r#   r$   r     s   
z%GPT3TransformerLanguageModel.__init__c           
      C   sF   |  |}| |}|| }| |}| ||}t|| j j}	|	S r+   )rh   rj   rk   rl   rL   Zlinearweight)
r   	input_idsrb   position_idsZwords_embeddingsrj   Z
embeddingsZtransformer_inputZtransformer_outputlogitsr#   r#   r$   rB     s   


z$GPT3TransformerLanguageModel.forwardrP   r#   r#   r!   r$   re      s    re   c                       sl   e Zd ZeZdd Z fddZ			dddZede	e
eejf  fd	d
ZdddZdddZ  ZS )	GPT3Modelc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS dS )zInitialize the weightsg        )meanZstdNr7   )
isinstancer   r   rm   dataZnormal_r    Zinitializer_rangeZbiasZzero_rf   Zpadding_idxrS   Zfill_)r   moduler#   r#   r$   _init_weights#  s$   

zGPT3Model._init_weightsc                    s   t  | t|| _d S r+   )r   r   re   language_modelr   r!   r#   r$   r   5  s   zGPT3Model.__init__Nc           
      K   s   | d}ttjdd||ftj|jd}|d u r,tj|tj|jd}|d|}| 	|||}d }|d urKt
 }	|	|d| jj|d}tj||dS )Nr	   Zdtyper6   r   r   )lossrp   )r&   r0   r<   r=   longr6   ZarangeZ	unsqueezeZ	expand_asrw   r   ZCrossEntropyLossr'   r    rg   addictDict)
r   rn   rb   ro   labelskwargsZ
seq_lengthrp   ry   Zloss_fctr#   r#   r$   rB   9  s(   

zGPT3Model.forwardpretrained_model_name_or_pathc                 C   s^   | j |}| |}tj|tj}t|}d|v r|d }dd |	 D }|
| |S )N
state_dictc                 S   s   i | ]\}}| d d|qS )zmodel.language_modelrw   )replace)r-   kvr#   r#   r$   
<dictcomp>]  s    z-GPT3Model.from_pretrained.<locals>.<dictcomp>)config_classfrom_pretrainedospathjoinr   ZTORCH_MODEL_BIN_FILEr0   loaditemsZload_state_dict)clsr   r    modelZstate_dict_filer   r#   r#   r$   r   Q  s   

zGPT3Model.from_pretrainedr7   c              	   k   s   | d| jj}| d| jj}| d|dd }|d}| dtj|dg|jd}| 	 }	t|| jj
}
|	|
krFtd	|
|d }|dkretj|||jd }tj||fd
d}| jj}tj|tj|jd}t q t|	|
D ]Z}|d d d |f }| |j}|d d d
d d f }t||||| jjd}||k}|| |||f< t|d d d |d f dV  ||k | @ }||B }t|}|r n	q}W d    d S W d    d S 1 sw   Y  d S )Ntop_ktop_p
max_lengthr	   d   r   Zprompt_lengthr5   zcontext length too larger   r   rx   )r   r   temperaturerg   )	sequences)popr    r   r   r&   r0   r)   r6   minitemri   
ValueErrorZzerosrz   catZeod_idZuint8Zno_gradr^   rp   r   rg   r   byteall)r   tokensr   r~   r   r   r   Z
batch_sizelengthsZmin_prompt_lengthZmax_sequence_lengthZ
pad_lengthZpadsZtermination_idZis_generation_doneZcontext_lengthZ
tokens2userp   Zlast_token_logitsZ
new_samplestartedZ
done_tokendoner#   r#   r$   streaming_generated  st   



	

#"zGPT3Model.streaming_generatec                 K   s&   d }| j ||fi |D ]}|}q|S r+   )r   )r   r   r   r~   Zlast_outputrA   r#   r#   r$   generate  s   zGPT3Model.generate)NNN)r7   )rC   rD   rE   r
   r   rv   r   rB   classmethodr   r   strr   PathLiker   r   r   rG   r#   r#   r!   r$   rq     s     

Erq   )r:   r   typingr   r   r{   r0   r   Ztorch.nnr   rL   Ztransformers.modeling_utilsr   Zmodelscope.outputsr   Zmodelscope.utils.constantr   configurationr
   Zdistributed_gpt3r   Moduler   rI   rQ   rY   re   rq   r#   r#   r#   r$   <module>   s$   k,"'