o
    81 i'                     @   sp   d dl Z d dlZd dlmZ d dlZd dlm  mZ d dl	m
Z
 d dlmZmZ dd Zdedefd	d
ZdS )    N)OrderedDict)	rearrange)
GPT2ConfigGPTNeoXConfigc           
   	      s  dd t fdd|  D } dd t fdd|  D } | d}t|d	d
}t|j| | }t|ddd||j	d  f| d< t|ddrU| d | d< n| d}t|ddd||j	d  f| d< dd t fdd|  D } dd t fdd|  D } t
|jD ]R}| d| d | d| d | d| dd  |j|j }| d| d}t|dd|d| d| d< | d| d}	t|	d d|d| d| d!< qd"d#  t  fd$d|  D } | S )%Nc                 S      t dd| S )Nz
^gpt_neox.ztransformer.resubkey r   f/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/flash_attn/models/gpt_neox.pykey_mapping_layers      z8remap_state_dict_hf_gpt_neox.<locals>.key_mapping_layersc                 3        | ]\}} ||fV  qd S Nr   .0kv)r   r   r   	<genexpr>       z/remap_state_dict_hf_gpt_neox.<locals>.<genexpr>c                 S   r   )Nz^transformer.embed_in.z'transformer.embeddings.word_embeddings.r   r
   r   r   r   key_mapping_emb   r   z5remap_state_dict_hf_gpt_neox.<locals>.key_mapping_embc                 3   r   r   r   r   )r   r   r   r      r   z-transformer.embeddings.word_embeddings.weightpad_vocab_size_multiple   r   tie_word_embeddingsFzlm_head.weightzembed_out.weightc                 S   s.   t dd| } t dd| } t dd| } | S )Nz^transformer.final_layer_norm.ztransformer.ln_f.z*^transformer.layers.(\d+).input_layernorm.ztransformer.layers.\1.norm1.z3^transformer.layers.(\d+).post_attention_layernorm.ztransformer.layers.\1.norm2.r   r
   r   r   r   key_mapping_ln(   s   z4remap_state_dict_hf_gpt_neox.<locals>.key_mapping_lnc                 3   r   r   r   r   )r   r   r   r   4   r   c                 S   s    t dd| } t dd| } | S )Nz,^transformer.layers.(\d+).mlp.dense_h_to_4h.ztransformer.layers.\1.mlp.fc1.z,^transformer.layers.(\d+).mlp.dense_4h_to_h.ztransformer.layers.\1.mlp.fc2.r   r
   r   r   r   key_mapping_mlp7   s   z5remap_state_dict_hf_gpt_neox.<locals>.key_mapping_mlpc                 3   r   r   r   r   )r   r   r   r   @   r   ztransformer.layers.z.attention.biasz.attention.masked_biasz.attention.rotary_emb.inv_freqz!.attention.query_key_value.weightz8(nheads three headdim) ... -> (three nheads headdim) ...   )threeheaddimz.mixer.Wqkv.weightz.attention.query_key_value.biasz0(nheads three headdim) -> (three nheads headdim)z.mixer.Wqkv.biasc                 S   s   t dd| } | S )Nz*^transformer.layers.(\d+).attention.dense.z%transformer.layers.\1.mixer.out_proj.r   r
   r   r   r   key_mapping_attnX   s   z6remap_state_dict_hf_gpt_neox.<locals>.key_mapping_attnc                 3   r   r   r   r   )r!   r   r   r   `   r   )r   itemspopgetattrmathceil
vocab_sizeFpadshaperangen_layerhidden_sizenum_attention_headsr   )
Z
state_dictconfigZword_embeddingsr   r'   Zoutput_embeddingslr    ZWqkvZbqkvr   )r!   r   r   r   r   r   remap_state_dict_hf_gpt_neox   sN   

	r1   gpt_neox_configreturnc                 C   s   | j dksJ tdi d| jddd| jd| jd| jd| jd	| jd
dddddd| jd| j	d| j
d| jddd| jddd| jd| jS )Ni'  r'   Zn_positionsr   Zn_embdr,   Zn_headZn_innerZactivation_functionZresid_pdropg        Z
embd_pdropZ
attn_pdropZlayer_norm_epsiloninitializer_rangebos_token_ideos_token_idZprenormTZparallel_blockZparallel_block_tied_normFZrotary_emb_fractionr   r   )Zrotary_emb_baser   r'   r-   Znum_hidden_layersr.   Zintermediate_sizeZ
hidden_actZlayer_norm_epsr4   r5   r6   Zuse_parallel_residualZ
rotary_pctr   )r2   r   r   r   gpt_neox_config_to_gpt2_confige   sP   	
r7   )r%   r   collectionsr   ZtorchZtorch.nn.functionalnnZ
functionalr(   Zeinopsr   Ztransformersr   r   r1   r7   r   r   r   r   <module>   s   X