o
    81 i5                     @   s  d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7m8Z8m9Z9 d dl:m;Z; d dl<m=Z= zd dl>m?Z? W n e@y   dZ?Y nw zd dlAmBZB W n e@y   dZBY nw z
d dlCmDZDmEZE W n e@y   d\ZDZEY nw e FeGZHd5ddZId5ddZJd5ddZKG d d! d!ejLZM	$d6d%d&ZNG d'd( d(eMZOG d)d* d*eMe;ZPd+d, ZQd-eeeRejSf  d.efd/d0ZTd1d2 ZUd3d4 ZVdS )7    N)OrderedDict
namedtuple)Sequence)partial)DictList	rearrange)
GPT2Config)remap_state_dict_hf_bigcode)remap_state_dict_hf_falcon)remap_state_dict_hf_gpt_neox)remap_state_dict_hf_gptj)remap_state_dict_hf_llama)remap_state_dict_hf_opt)BlockParallelBlock)GPT2EmbeddingsParallelGPT2Embeddings)MHAParallelMHA)FusedMLPGatedMlpMlpParallelFusedMLPParallelGatedMlpParallelMLP)
sqrelu_fwd)
all_gatherall_gather_rawget_dim_for_local_ranksync_shared_params)GenerationMixin)state_dict_from_pretrained)ColumnParallelLinear)FusedDenseSqreluDense)layer_norm_fnRMSNormNNc                 C   s  ||d}t | d| j| j }t | ddsdnd}| jsdn||  }|t | dd9 }| jr<|d us4J |t|d  }t | d	d}	|	rL|d u sLJ d
t | dd}
t | dd}tt | dd| }t | dd}t | dd }t | dd}t | dd}t | dd}t | dd}t | dd}|s|d u sJ d|d u rtnt}|d u r||	dni }|d ur|t | dddni }t | dd }t	|f| j||
|| j
|d||||||||d|||}|S )Ndevicedtypehead_dimZmup_scale_qk_dot_by_dFg      ?      ?Zmup_attn_multiplier   Zattn_dwconvz.TensorParallel MHA does not support dwconv yetqkv_proj_biasTout_proj_biasrotary_emb_fraction        rotary_emb_baseg     @rotary_emb_scale_baserotary_emb_interleaved	use_alibiwindow_size)r8   use_flash_attnfused_bias_fcz)TensorParallel MHA requires fused_bias_fc)r:   dwconvsequence_parallelprocess_groupr<   	n_head_kv)Z	num_headsnum_heads_kvr/   r0   Zdropoutsoftmax_scaleZcausal	layer_idxrotary_emb_dimr3   r4   r5   r6   r7   r9   )getattrhidden_sizenum_attention_headsZscale_attn_weightsZscale_attn_by_inverse_layer_idxfloatintr   r   r   Z
attn_pdrop)configrB   r>   r*   r+   factory_kwargsr,   Zattn_scale_powerrA   r;   r/   r0   rC   r3   r4   r5   r6   r7   r9   r:   Zmha_clsZserial_kwargsparallel_kwargsr@   	mixer_cls rM   a/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/flash_attn/models/gpt.pycreate_mixer_cls>   sv   

rO   c                 C   s  ||d}t | dd}t | dd}t | dd}|r | jdv s J t | dd}	|	r1| jd	ks1J d
|	r7|r7J |s|	s| jdv sBJ | jdv r| jdkrOtjn
| jdkrWtjntj}
|d u r`tnt}|d uro|t | dddni }t | dd}t|f| j	|
|||d||}|S | jdkrttj
dd}
n| jd	krt}
n| jdv rdnd}ttj|d}
|d u rtnt}|d ur|t | dddni }t|f| j	|
||d||}|S t | dd}t|tr|d usJ || }|r.td u rtd| jdv rdn| j}
|d u r	tnt}|d ur|t | dddni }t|f| j	|
|||d||}|S |	rR|d ur=|s=J d td usDJ ttf| j	|d!|}|S td")#Nr)   mlp_fc1_biasTmlp_fc2_bias	fused_mlpF)gelu_new	gelu_fastgelu_approxgelu_pytorch_tanhrelusqrelufused_dense_sqrelu_denserX   zMfused_dense_sqrelu_dense only supports approximate activation_function sqrelu
gelurS   rT   rU   rV   rW   rX   gluswiglugeglur\   r]   r^   r\   r]   r<   r=   mlp_multiple_of   )hidden_features
activationbias1bias2multiple_ofrW   )Zinplace)rS   rT   rU   rV   tanhnone)approximate)rb   rc   rd   re   mlp_checkpoint_lvlr   zfused_dense is not installedrU   )rb   rc   checkpoint_lvlrd   re   z<Tensor Parallel is not implemented for FusedDenseSqreluDense)rb   rk   zMLP type not supported)rD   activation_functionFZsigmoidZsilur[   r   r   r   n_innerrW   r   r   r   
isinstancer   r   ImportErrorr   r%   RuntimeError)rI   rB   r>   r*   r+   rJ   rP   rQ   rR   rY   rc   mlp_clsrK   r`   ri   rj   rM   rM   rN   create_mlp_cls{   s   



W


7



rs   c                 C   s:  ||d}t | dd}t| |fd|i|}t| |fd|i|}t | dd}	t|	s-tjntfd| ji|}
t | dd}|d u sE|d	krH| jn| j	}t | d
d}t | dd}|sut
| j|||
||| jt | dd||on|d u|d ud}n#|syJ t| j|||
|| jt | ddt | dd||o|d u|d ud}||_|S )Nr)   r<   Tr>   rms_normFepsresidual_in_fp32r   prenormparallel_blockfused_dropout_add_ln)norm_clsrw   resid_dropout1resid_dropout2ry   rv   r<   mark_shared_paramsZparallel_block_tied_norm)rz   r{   r|   Z	tied_normry   rv   r<   r}   )rD   rO   rs   r   nn	LayerNormr'   layer_norm_epsilonresid_pdropZ
embd_pdropr   rE   r   rB   )rI   rB   r>   r*   r+   rJ   r<   rL   rr   use_rms_normrz   rv   r{   rw   rx   blockrM   rM   rN   create_block  s\   





r   c                       s:   e Zd ZdZ fddZeddddddd	d
Z  ZS )GPTPreTrainedModelzAn abstract class to handle weights initialization and
    a simple interface for dowloading and loading pretrained models.
    c                    s6   t    t|tstd| jj| jj|| _d S )NzParameter config in `{}(config)` should be an instance of class `GPT2Config`. To create a model from a Google pretrained model use `model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`)	super__init__ro   r
   
ValueErrorformat	__class____name__rI   )selfrI   Zinputskwargsr   rM   rN   r   <  s   


zGPTPreTrainedModel.__init__TNr.   r   )strictr*   r+   
world_sizerankc                O   s.  | |g|R ||d|	}
t |d|d}|dr t||}n^|dr+t||}nS|ds5|dr;t||}nC|dsJ|dsJ|d	rPt||}n.|d
r[t||}n#|drft||}n|dsp|drvt||}nt	d| d|dkrt
||||}|
j||d}t| |
S )z
        Instantiate a GPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
        Download and cache the pre-trained model file if needed.
        r)   cpuZgpt2zfacebook/optzEleutherAI/gpt-j-ztogethercomputer/GPT-JT-zEleutherAI/gpt-neox-zEleutherAI/pythia-z"togethercomputer/RedPajama-INCITE-ztiiuae/falcon-zmeta-llama/Llama-zbigcode/z	WizardLM/zModel z not supportedr.   r   )r#   
startswithremap_state_dict_hf_gpt2r   r   r   r   r   r   NotImplementedErrorshard_state_dict_tpload_state_dictloggerinfo)clsZ
model_namerI   r   r*   r+   r   r   argsr   model
state_dictZload_returnrM   rM   rN   from_pretrainedH  s:   




z"GPTPreTrainedModel.from_pretrained)r   
__module____qualname____doc__r   classmethodr   __classcell__rM   rM   r   rN   r   7  s    r   {Gz?r-   Tc           	   	   C   s   t |}t| tjr8tjj| j|| d t| jdi }|	d|i t
| jd| | jd ur7tj| j nt| tjrGtjj| j|d |rh|  D ]\}}|dv rgtjj|d|| t d|  d qMd S d S )N)stdZ_optimZlr_multiplier)zout_proj.weightz
fc2.weightr2      )meanr   )mathsqrtro   r~   LinearinitZnormal_weightrD   updatesetattrbiasZzeros_Z	EmbeddingZnamed_parameters)	modulen_layerinitializer_rangemup_width_scaleZrescale_prenorm_residualZmup_init_scaleZ	optim_cfgnameprM   rM   rN   _init_weights|  s(   

r   c                       s@   e Zd Zddef fddZdd ZdddZdd	d
Z  ZS )GPTModelNrI   c              
      s  t    ||d| _t dd| _ jdv sJ t dd}t j| | }t dd| _	t d	d
| _
t dd| _t dd
}t dd }t dd
| _d u rht j| jfd|i| _nt j| jf| jd| _t fddt jD | _t dd}	|	dkr| jdd  D ]}
| jd jj|
j_qt dd
| _| jrtd u rtd| jrt j| _|stjnt }| jfd j!i| _"d ur| j"# D ]}d|_$| jrd|_%q| &t't( j j)t ddd | *  d S )Nr)   r<   TrZ   pad_vocab_size_multipler.   Zmup_embeddings_multiplierr-   rv   Frw   rt   word_embed_proj_dimrx   r=   c                    s"   g | ]}t  f|d qS ))rB   r>   )r   ).0irI   rJ   r>   rM   rN   
<listcomp>  s    z%GPTModel.__init__.<locals>.<listcomp>r1   r2   r   ry   zTriton is not installedru   r   r   r   r   )+r   r   r>   rD   r<   rl   r   ceil
vocab_sizeembeddings_multiplierrv   rw   rx   r   rE   Zmax_position_embeddings
embeddingsr   r~   Z
ModuleListrangenum_hidden_layerslayersZmixerZ
rotary_embry   r&   rp   ZDropoutr   drop_fr   r'   r   ln_f
parametersZ_shared_paramsZ_sequence_parallelapplyr   r   r   tie_weights)r   rI   r>   r*   r+   r   r   r   r   r1   layerrz   r   r   r   rN   r     s   


zGPTModel.__init__c                 C   s   | j d urt| | j  d S d S N)r>   r!   r   rM   rM   rN   r     s   
zGPTModel.tie_weightsc                    s    fddt | jD S )Nc                    s*   i | ]\}}||j  fd iqS r+   )allocate_inference_cache)r   r   r   
batch_sizer+   r   
max_seqlenrM   rN   
<dictcomp>  s    z5GPTModel.allocate_inference_cache.<locals>.<dictcomp>)	enumerater   r   r   r   r+   r   rM   r   rN   r     s   z!GPTModel.allocate_inference_cachec                 C   s  | j d ur| jrddini }| j|fd|i|}| jdkr#|| j }| jr(d }d }| j d ur9| jr9d|jd ini }|d urC||d< | jD ]$}	| jrd| jsX|	|||d\}}qF|	||||d\}}}qF|	||d}qF| jr| js| 	|}
| js|d ur|
| n|
}n| 	|}|d ur||
 | n|
| }| 
|j| j
jjd	}|S t|| j
j| j
j|| jsd n|| j
j| jr| j	jnd
dt| j
td	}|S )NZcombine_batch_seqlen_dimTposition_idsr-   Zseqlenr.   inference_params)mixer_kwargsr   r2   F)residualx1ru   Z	dropout_prw   Zis_rms_norm)r>   r<   r   r   rx   shaper   rw   ry   r   r   tor   r+   r&   r   ru   Ztrainingr   ro   r'   )r   	input_idsr   r   Zembedding_kwargshidden_statesZhidden_states2r   r   r   ZdroppedZdropped2rM   rM   rN   forward  sd   





zGPTModel.forwardNNNr   r(   )	r   r   r   r
   r   r   r   r   r   rM   rM   r   rN   r     s
    ^
r   c                       sN   e Zd Zddef fddZdd ZdddZdd
dZd fdd	Z  Z	S )GPTLMHeadModelNrI   c                    sd  ||d}t  | || _t|fd|i|| _t|dd| _t|dd}t|dd}t|j	| | }t|d	d }	|	d u rC|j
n|	}
|	d urXtj|j
|
fd
di|| _nd | _t|dd}t|dd}|| | _|d u r~tj|
|fd
|i|| _ntd u rtdt|
||f|t|ddd|| _t|dd| _| tt|j|j|d |   d S )Nr)   r>   tie_word_embeddingsTlm_head_biasFr   r.   r   r   r   r-   mup_output_multiplierz fused_dense_lib is not installedr<   )r   r<   	norm_headr   )r   r   r>   r   transformerrD   r   r   r   r   Zn_embdr~   r   project_outoutput_scalelm_headr$   rp   r   r   r   r   r   r   r   )r   rI   r>   r*   r+   rJ   r   r   r   r   	embed_dimr   r   r   rM   rN   r   B  sR   


zGPTLMHeadModel.__init__c                 C   s4   | j r| jjjj| j_| jd urt| | j d S d S r   )r   r   r   word_embeddingsr   r   r>   r!   r   rM   rM   rN   r   p  s
   
zGPTLMHeadModel.tie_weightsc                 K   s   | j j||fd|i|S )Nr+   )r   r   r   rM   rM   rN   r   v  s   z'GPTLMHeadModel.allocate_inference_cacher   c                 C   s4  |j dksJ d|j |j\}}| j|||d}|dur'|j dks'J d|dkr6|dd| df }| jdur@| |}| jdkrJ|| j }| jsS| |}n"t| jj	}	t
| jtrk| jjrkt|| jj}tj||	| jjd	}t
| jtr|durt|| jj\}}
t|d
|d}tddg}||dS )ac  
        input_ids: (batch, seqlen) int tensor
        inference_params: for generation. Adapted from Megatron-LM (and Apex)
        https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
        num_last_tokens: if > 0, only return the logits for the last n tokens
        r   z<Expected `input_ids` to have shape [b, slen], but got shape )r   r   N   z5sequence_parallel is not supported in generation moder   r-   )r   z(n b) ... d -> b ... (n d))bCausalLMOutputlogits)r   )ndimr   r   r   r   r   r   rm   	normalizer   ro   r$   r<   r   r>   Zlinearr   r   r	   r   )r   r   r   r   Znum_last_tokensr   slenr   Z	lm_logitsZlm_head_weight_r   rM   rM   rN   r   {  s4   






zGPTLMHeadModel.forwardTc                    s.  d|v rt | jj}|d|d  d}|d|d  d}||d< ||d< tt|D ]N}|d| d}|d| d	}||d| d< ||d| d< |d
kr||d|d  d}|d|d  d}||d| d< ||d| d	< q.|d}|d}||d< ||d< t j||dS )Nztransformer.ln_0.weighttransformer.layers.r.   z.norm2.weightz.norm2.biasztransformer.ln_f.weightztransformer.ln_f.biasz.norm1.weightz.norm1.biasr   ztransformer.ln_0.biasz!transformer.layers.0.norm1.weightztransformer.layers.0.norm1.biasr   )lenr   r   popreversedr   r   r   )r   r   r   Zn_layersZ	ln_weightZln_biaslr   rM   rN   r     s,   

zGPTLMHeadModel.load_state_dictr   r   )NNr   )T)
r   r   r   r
   r   r   r   r   r   r   rM   rM   r   rN   r   A  s    .

$r   c                    s  t |dd}t|j| | }| dksJ |j dks!J |jdur)|jnd|j }| dks6J |jt |d|j}|  fdd}dfd	d
	}	fdd}
 fdd}|| d d| v rt|| d d| v r}|	| d t|jD ]n}|| d| d || d| d |	| d| d d dkr| 	d| dd |j
dv r|
| d| d |
| d| d n|| d| d || d| d |	| d| d dkr| 	d| dd q| S )zConvert the state_dict of a standard GPT model to the state_dict of a GPT model
    with tensor parallel.

    This function modifies state_dict in place.
    r   r.   r   N   r?   c                    sB   || v r| | }|j d  }| |  d |  | |< d S d S )Nr   r.   )r   r   keyxdimr   r   rM   rN   shard_first_dim  s
    z,shard_state_dict_tp.<locals>.shard_first_dimc                    sh   || v r2| | fddt D  t fddd fD \}}d||f | |< d S d S )Nc                    s    g | ]}t d | qS r8   )r    sizer   Z
local_rank)rf   r   r   rM   rN   r     s    z?shard_state_dict_tp.<locals>.shard_last_dim.<locals>.<listcomp>c                 3   s     | ]}t  d | V  qd S r   )sum)r   pos)dim_each_rankrM   rN   	<genexpr>      z>shard_state_dict_tp.<locals>.shard_last_dim.<locals>.<genexpr>r.   .)r   tuple)r   r   rf   begendr   )r  rf   r   rN   shard_last_dim  s   "z+shard_state_dict_tp.<locals>.shard_last_dimc                    s^   || v r-| | }|j d  d }tt|dddd d  |  d | f d| |< d S d S )Nr   r   z(two o) ... -> two o ...twor.   ztwo o ... -> (two o) ...)r   r	   r   r   rM   rN   shard_gatedmlp_fc1_dim  s   (z3shard_state_dict_tp.<locals>.shard_gatedmlp_fc1_dimc           	         s4  || v rfddt D }fddt D }t|d  }t|d d  }t|d  }t|d d  }krbt| | ddd}t|d d |  |  f d| |< d S t| | d	d
  d}ttj||| || |  | |  |  gddd| |< d S d S )Nc                       g | ]}t  |qS rM   r    r  n_headr   rM   rN   r         
zBshard_state_dict_tp.<locals>.shard_qkv_headdim.<locals>.<listcomp>c                    r  rM   r  r  r?   r   rM   rN   r     r  r.   (three d) ... -> three d ...r   threethree d ... -> (three d) ....(nheadqkv headdim) ... -> nheadqkv headdim ...r   )nheadqkvr   r   .nheadqkv headdim ... -> (nheadqkv headdim) ...)r   r  r	   torchcat)	r   r   n_head_each_rankn_head_kv_each_rankZ
beg_n_headZ
end_n_headZbeg_n_head_kvZend_n_head_kvr   r,   r  r?   r   r   rM   rN   shard_qkv_headdim  sZ   

z.shard_state_dict_tp.<locals>.shard_qkv_headdim-transformer.embeddings.word_embeddings.weightlm_head.weight1transformer.embeddings.position_embeddings.weightr   .mixer.Wqkv.weight.mixer.Wqkv.bias.mixer.out_proj.weight)rf   z.mixer.out_proj.biasr_   .mlp.fc1.weight.mlp.fc1.bias.mlp.fc2.weightz.mlp.fc2.bias)r.   )rD   r   r   r   rE   rn   r  r   r   r   rl   )r   rI   r   r   r   r   	inner_dimr   r   r  r  r#  r   rM   r"  rN   r     sJ   
	
/


r   state_dictsrI   c                    s  t | | d  }t dd}t j| |  dks"J  j dks+J  jdur3 jnd j }| dks@J  j j dksJJ  j j fdd}dd	d
} fdd}dd }| d 	 }	|| |	d d|	v r~|| |	d d|	v r|| |	dd  j
dv r|nt|dd}
t jD ]A}|| |	d| d || |	d| d || |	d| dd |
| |	d| d || |	d| dd || |	d| dd q|	S )a-  Convert the list of sharded state_dict of a GPT model with tensor parallel to
    the state_dict of a standard GPT model.

    This function is meant to be the "reverse" of shard_state_dict_tp.

    Precondition:
        - state_dicts should be ordered in the same way as the shards were created.
    r   r   r.   Nr   c                    sF   | d   j d  krdnd}tj fdd| D |d| < d S )Nr   r.   c                       g | ]}|  qS rM   rM   r   sr   rM   rN   r   F      zKcombine_state_dicts_tp.<locals>.combine_word_embeddings.<locals>.<listcomp>r  )r   r  r  r.  r   r   r   )r   r   r2  rN   combine_word_embeddingsD  s   "$z7combine_state_dicts_tp.<locals>.combine_word_embeddingsr8   c                    s0    |v rt j fdd| D |d| < d S d S )Nc                    r/  rM   rM   r0  r2  rM   rN   r   J  r3  z?combine_state_dicts_tp.<locals>.combine_dim.<locals>.<listcomp>r  )r  r  r4  rM   r2  rN   combine_dimH  s   $z+combine_state_dicts_tp.<locals>.combine_dimc                    s*  j td |v rkr( fdd| D }ttj|ddd| < d S fddtD fddtD  fd	dt| D }tjfd
dt|D dd}tjfddt|D dd}tjfddt|D dd}tj|||gdd}t|d| < d S d S )Nr?   c                       g | ]}t |  d ddqS )r  r   r  r   r0  r2  rM   rN   r   Q  s    zGcombine_state_dicts_tp.<locals>.combine_qkv_headdim.<locals>.<listcomp>r.   r  r  c                    r  rM   r  r  r  rM   rN   r   V  r  c                    r  rM   r  r  r  rM   rN   r   Z  r  c                    s.   g | ]\}}}t | d |d|   dqS )r  r   )r  headdimr   )r   r1  Zrank_n_headZrank_n_head_kv)r8  r   rM   rN   r   ^  s    
c                    s    g | ]\}}|d  |  qS r   rM   r   r   r   )r   rM   rN   r   i  s     r   c                    s,   g | ]\}}| |  | |   qS rM   rM   r9  r   r!  rM   rN   r   k  s    c                    s(   g | ]\}}| | |  d  qS r   rM   r9  r:  rM   rN   r   u  s    r  )r  rD   r	   r  r  r   zipr   )r.  r   r   xsZwqZwkZwvZwqkv)rI   r8  r   )r   r  r   r?   r!  rN   combine_qkv_headdimL  sR   
 
z3combine_state_dicts_tp.<locals>.combine_qkv_headdimc                    s:    |v r fdd| D }t tj|ddd| < d S d S )Nc                    r7  )z(two d) ... -> two d ...r   r  r   r0  r2  rM   rN   r     s    zEcombine_state_dicts_tp.<locals>.combine_gated_mlp.<locals>.<listcomp>r.   r  ztwo d ... -> (two d) ...)r	   r  r  )r.  r   r   r<  rM   r2  rN   combine_gated_mlp  s   z1combine_state_dicts_tp.<locals>.combine_gated_mlpr$  r%  r&  r_   r  r   r'  r(  r)  r*  r+  r,  r  )r   keysrD   r   r   r   rE   rn   r  copyrl   r   r   r   )r.  rI   r?  r   r-  r5  r6  r=  r>  r   Zmlp_combine_fnr   rM   )rI   r8  r   r   rN   combine_state_dicts_tp.  sH   	
8

rA  c           
   	      s  dd t fdd|  D } | d}t|dd}t|j| | }t|ddd||j	d  f| d	< | d	 | d
< dd t fdd|  D } t
|jD ](}| d| d}| | d| d< | d| d}| | d| d< qSdd t fdd|  D } t
|jD ]2}| d| dd  | d| d}| | d| d< | d| d}	|	 | d| d< qdd  t  fdd|  D } | S )Nc                 S      t dd| S Nz^wpe.z+transformer.embeddings.position_embeddings.resubr2  rM   rM   rN   key_mapping_pos_emb     z5remap_state_dict_hf_gpt2.<locals>.key_mapping_pos_embc                 3        | ]\}} ||fV  qd S r   rM   r   kvrG  rM   rN   r    r  z+remap_state_dict_hf_gpt2.<locals>.<genexpr>z
wte.weightr   r.   r   r$  r%  c                 S       t dd| } t dd| } | S )Nz^ln_f.(weight|bias)transformer.ln_f.\1z^h.(\d+).ln_(1|2).(weight|bias)ztransformer.layers.\1.norm\2.\3rD  r2  rM   rM   rN   key_mapping_ln     z0remap_state_dict_hf_gpt2.<locals>.key_mapping_lnc                 3   rI  r   rM   rJ  rP  rM   rN   r    r  zh.z.mlp.c_fc.weightr   r*  z.mlp.c_proj.weightr,  c                 S   rN  )Nz^h.(\d+).mlp.c_fc.biasz"transformer.layers.\1.mlp.fc1.biasz^h.(\d+).mlp.c_proj.biasz"transformer.layers.\1.mlp.fc2.biasrD  r2  rM   rM   rN   key_mapping_mlp  rQ  z1remap_state_dict_hf_gpt2.<locals>.key_mapping_mlpc                 3   rI  r   rM   rJ  rS  rM   rN   r    r  z
.attn.biasz.attn.c_attn.weightr'  z.attn.c_proj.weightr)  c                 S   rN  )Nz^h.(\d+).attn.c_attn.biasz%transformer.layers.\1.mixer.Wqkv.biasz^h.(\d+).attn.c_proj.biasz)transformer.layers.\1.mixer.out_proj.biasrD  r2  rM   rM   rN   key_mapping_attn  s
   z2remap_state_dict_hf_gpt2.<locals>.key_mapping_attnc                 3   rI  r   rM   rJ  rU  rM   rN   r    r  )r   itemsr   rD   r   r   r   rm   padr   r   r   t)
r   rI   r   r   r   dZW1ZW2WqkvZWoutrM   )rU  rP  rS  rG  rN   r     s6   
r   c           	   	      s  dd t fdd|  D } dd t fdd|  D } | d}t|d	d
}t|jd | | }t|ddd||jd  f| d< | d | d< dd t fdd|  D } dd t fdd|  D } dd  t  fdd|  D } |j	|j
 }t|jD ]0}| d| d}t|dd|d| d| d< | d| d}t|dd|d| d| d< q| S )Nc                 S   s    t dd| } t dd| } | S )Nz^language_model.encoder.ztransformer.z^language_model.rD  r2  rM   rM   rN   key_mapping_transformer  rQ  z:remap_state_dict_megatron.<locals>.key_mapping_transformerc                 3   rI  r   rM   rJ  )r\  rM   rN   r    r  z,remap_state_dict_megatron.<locals>.<genexpr>c                 S   rB  rC  rD  r2  rM   rM   rN   rG    rH  z6remap_state_dict_megatron.<locals>.key_mapping_pos_embc                 3   rI  r   rM   rJ  rM  rM   rN   r    r  z,transformer.embedding.word_embeddings.weightr   r.   r   r$  r%  c                 S   .   t dd| } t dd| } t dd| } | S )Nz*^transformer.final_layernorm.(weight|bias)rO  z7^transformer.layers.(\d+).input_layernorm.(weight|bias)ztransformer.layers.\1.norm1.\2z@^transformer.layers.(\d+).post_attention_layernorm.(weight|bias)ztransformer.layers.\1.norm2.\2rD  r2  rM   rM   rN   rP    s   z1remap_state_dict_megatron.<locals>.key_mapping_lnc                 3   rI  r   rM   rJ  rR  rM   rN   r    r  c                 S   rN  )Nz9^transformer.layers.(\d+).mlp.dense_h_to_4h.(weight|bias)z transformer.layers.\1.mlp.fc1.\2z9^transformer.layers.(\d+).mlp.dense_4h_to_h.(weight|bias)z transformer.layers.\1.mlp.fc2.\2rD  r2  rM   rM   rN   rS    s   z2remap_state_dict_megatron.<locals>.key_mapping_mlpc                 3   rI  r   rM   rJ  rT  rM   rN   r    r  c                 S   r]  )Nz<^transformer.layers.(\d+).self_attention.rotary_emb.inv_freqz/transformer.layers.\1.mixer.rotary_emb.inv_freqzF^transformer.layers.(\d+).self_attention.query_key_value.(weight|bias)z#transformer.layers.\1.mixer.Wqkv.\2z<^transformer.layers.(\d+).self_attention.dense.(weight|bias)z'transformer.layers.\1.mixer.out_proj.\2rD  r2  rM   rM   rN   rU    s    z3remap_state_dict_megatron.<locals>.key_mapping_attnc                 3   rI  r   rM   rJ  rV  rM   rN   r  '  r  r   r'  z8(nheads three headdim) ... -> (three nheads headdim) ...r   )r  r8  r(  z0(nheads three headdim) -> (three nheads headdim))r   rW  r   rD   r   r   r   rm   rX  rE   rF   r   r   r	   )	r   rI   r   r   r   r8  rZ  r[  ZbqkvrM   )rU  rP  rS  rG  r\  rN   remap_state_dict_megatron  s@   
r^  )NNNN)r   r-   T)Wloggingr   rE  collectionsr   r   collections.abcr   	functoolsr   typingr   r   r  Ztorch.nnr~   Ztorch.nn.functionalZ
functionalrm   Zeinopsr	   Ztransformersr
   Zflash_attn.models.bigcoder   Zflash_attn.models.falconr   Zflash_attn.models.gpt_neoxr   Zflash_attn.models.gptjr   Zflash_attn.models.llamar   Zflash_attn.models.optr   Zflash_attn.modules.blockr   r   Zflash_attn.modules.embeddingr   r   Zflash_attn.modules.mhar   r   Zflash_attn.modules.mlpr   r   r   r   r   r   Zflash_attn.ops.activationsr   Zflash_attn.utils.distributedr   r   r    r!   Zflash_attn.utils.generationr"   Zflash_attn.utils.pretrainedr#   Zflash_attn.ops.fused_denser$   rp   Zflash_attn.ops.triton.mlpr%   Z flash_attn.ops.triton.layer_normr&   r'   	getLoggerr   r   rO   rs   r   Moduler   r   r   r   r   strZTensorrA  r   r^  rM   rM   rM   rN   <module>   sp    


= 
1F
 )y tt9