o
    81 iЁ                     @   sD  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 d dlZd dlmZ d dlm  mZ d dlmZ d dlmZmZ d dlmZmZ d d	lmZmZmZmZ d d
lmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( zd dl)m*Z* W n e+y   dZ*Y nw zd dl,m-Z- W n e+y   dZ-Y nw zd dl.m/Z/ W n e+y   dZ/Y nw e 0e1Z2d1ddZ3d2ddZ4d3ddZ5d4ddZ6G dd dej7Z8G dd dej7Z9G d d! d!ej7Z:G d"d# d#ej7Z;G d$d% d%ej7Z<G d&d' d'ej7Z=G d(d) d)e=Z>G d*d+ d+e=Z?d,efd-d.Z@d,efd/d0ZAdS )5    N)OrderedDict)Sequence)partial)AnyMapping)	rearrange)
BertConfigPretrainedConfig),BaseModelOutputWithPoolingAndCrossAttentionsBertForPreTrainingOutput)index_first_axisindex_first_axis_residual	pad_inputunpad_input)Block)BertEmbeddings)MHA)FusedMLPMlp)state_dict_from_pretrained)
FusedDense)layer_norm_fn)CrossEntropyLossFc              
   C   s   t | dd}t | dd}i }| jdkr4t | d| j|d< t | dd|d< t | dd |d< t | d	d|d	< ttf| j|| jd|||d
|}|S )Nuse_flash_attnFfused_bias_fcZrotaryZrotary_emb_dimZrotary_emb_baseg     @Zrotary_emb_scale_baseZrotary_emb_interleaved)Z	num_heads
cross_attnZdropoutZcausalr   r   return_residual)getattrZposition_embedding_typehidden_sizer   r   Znum_attention_headsZattention_probs_dropout_prob)configr   r   r   r   Zrotary_kwargs	mixer_cls r!   b/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/flash_attn/models/bert.pycreate_mixer_cls9   s,   
	r#   c                 C   s   | j }t| dd}|r| jdv sJ d|s.| jdv rdnd}tt|ttj|d|d}|S td u r6td	t| d
d}t	|t
rK|d usGJ || }tt|||d}|S )N	fused_mlpFgelu_new	gelu_fastgelu_pytorch_tanhz(fused_mlp only supports approximate gelutanhnoneapproximate)hidden_features
activationr   fused_dense is not installedmlp_checkpoint_lvlr   )r-   Zcheckpoint_lvlr   )Zintermediate_sizer   
hidden_actr   r   Fgelur   ImportError
isinstancer   )r   	layer_idxr   Z	inner_dimr$   r,   mlp_clsr0   r!   r!   r"   create_mlp_clsP   s>   

r8   c           	      C   s|   t | dd}|o|| jd k}| }t| ||d}t| ||d}ttj| jd}t| j	|||d| j
| j
t | dd|d	}|S )Nlast_layer_subsetF   )r   epsfused_dropout_add_ln)norm_clsZprenormZresid_dropout1Zresid_dropout2r=   r   )r   num_hidden_layersr#   r8   r   nn	LayerNormlayer_norm_epsr   r   hidden_dropout_prob)	r   r6   r9   r   r   r    r7   r>   blockr!   r!   r"   create_blockt   s$   
rE   {Gz?c                 C   s   t | tjrtjj| j|d | jd urtj| j d S d S t | tjr?tjj| j|d | j	d urAtj| j| j	  d S d S d S )N)Zstd)
r5   r@   LinearinitZnormal_weightbiasZzeros_Z	Embeddingpadding_idx)moduleinitializer_ranger!   r!   r"   _init_weights   s   

rN   c                       s,   e Zd Zdef fddZdddZ  ZS )BertEncoderr   c                    s<   t    t dd| _t fddt jD | _d S )Nr   Fc                    s   g | ]}t  |d qS ))r6   )rE   ).0ir   r!   r"   
<listcomp>   s    z(BertEncoder.__init__.<locals>.<listcomp>)	super__init__r   r   r@   Z
ModuleListranger?   layersselfr   	__class__rR   r"   rU      s
   

zBertEncoder.__init__Nc                 C   s  |du s| j s'|durd|ind}| jD ]}|||d}q|dur%|| }|S |jdd \}}t||\}}}	}
}|	|
d}|du rX| jD ]}|||d}qFt||||}|S | jdd D ]}|||d}q_|durtj|| dd }||@ jdtj	d	}t
tj|d
tj	d	d}ntj|dd }|jdtj	d	}t
tj|d
tj	d	d}t||\}}|||
|	|
d}| jd ||d}|S )zIf subset_mask is not None, we only want output for the subset of the sequence.
        This means that we only compute the last layer output for these tokens.
        subset_mask: (batch, seqlen), dtype=torch.bool
        Nkey_padding_mask)mixer_kwargs   )
cu_seqlens
max_seqlenFas_tuple)dimdtyper   )r:   r   )Zx_kvr_   r`   Zcu_seqlens_kZmax_seqlen_k)r   rW   shaper   r   torchnonzeroflattensumZint32r2   padZcumsumr   )rY   hidden_statesr\   subset_maskr]   layerbatchseqlenindicesr_   Zmax_seqlen_in_batch_
subset_idxZsubset_seqlensZsubset_cu_seqlensZhidden_states_subsetr!   r!   r"   forward   s\   
(

zBertEncoder.forward)NN__name__
__module____qualname__r   rU   rt   __classcell__r!   r!   rZ   r"   rO      s    rO   c                       s&   e Zd Z fddZdddZ  ZS )
BertPoolerc                    sV   t    t|dd}|rtd u rtd|stjnt}||j|j| _t	 | _
d S )Nr   Fr/   )rT   rU   r   r   r4   r@   rG   r   denseZTanhr.   rY   r   r   
linear_clsrZ   r!   r"   rU      s   
zBertPooler.__init__Tc                 C   s0   |r
|d d df n|}|  |}| |}|S )Nr   )r{   r.   )rY   rl   poolZfirst_token_tensorpooled_outputr!   r!   r"   rt      s   

zBertPooler.forwardTrv   rw   rx   rU   rt   ry   r!   r!   rZ   r"   rz      s    	rz   c                       s2   e Zd Z fddZdejdejfddZ  ZS )BertPredictionHeadTransformc                    s   t    t|dd}|rtd u rtdt|dd| _| jr'td u r'td|s,tjnt}||j	|j	| _
|jdv r=dnd}tj|d	| _tj|j	|jd
| _d S )Nr   Fr/   r=   Triton is not installedr%   r)   r*   r+   r;   )rT   rU   r   r   r4   r=   r   r@   rG   r   r{   r1   ZGELUtransform_act_fnrA   rB   
layer_norm)rY   r   r   r}   r,   rZ   r!   r"   rU      s   

z$BertPredictionHeadTransform.__init__rl   returnc                 C   sH   |  |}| |}| js| |}|S t|| jj| jj| jjd}|S )Nr;   )r{   r   r=   r   r   rI   rJ   r<   rY   rl   r!   r!   r"   rt      s   


z#BertPredictionHeadTransform.forward)rv   rw   rx   rU   rg   ZTensorrt   ry   r!   r!   rZ   r"   r      s    r   c                       $   e Zd Z fddZdd Z  ZS )BertLMPredictionHeadc                    sZ   t    t|dd}|rtd u rtd|stjnt}t|| _||j	|j
dd| _d S )Nr   Fr/   T)rJ   )rT   rU   r   r   r4   r@   rG   r   	transformr   
vocab_sizedecoderr|   rZ   r!   r"   rU   
  s   

zBertLMPredictionHead.__init__c                 C   s   |  |}| |}|S N)r   r   r   r!   r!   r"   rt     s   

zBertLMPredictionHead.forwardr   r!   r!   rZ   r"   r   	  s    r   c                       r   )BertPreTrainingHeadsc                    s(   t    t|| _t|jd| _d S )Nr^   )rT   rU   r   predictionsr@   rG   r   seq_relationshiprX   rZ   r!   r"   rU     s   

zBertPreTrainingHeads.__init__c                 C   s   |  |}| |}||fS r   )r   r   )rY   sequence_outputr   prediction_scoresseq_relationship_scorer!   r!   r"   rt   #  s   

zBertPreTrainingHeads.forwardr   r!   r!   rZ   r"   r     s    r   c                       s,   e Zd ZdZ fddZedd Z  ZS )BertPreTrainedModelzAn abstract class to handle weights initialization and
    a simple interface for dowloading and loading pretrained models.
    c                    s6   t    t|tstd| jj| jj|| _d S )NzParameter config in `{}(config)` should be an instance of class `BertConfig`. To create a model from a Google pretrained model use `model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`)	rT   rU   r5   r   
ValueErrorformatr[   rv   r   )rY   r   inputskwargsrZ   r!   r"   rU   .  s   


zBertPreTrainedModel.__init__c                 O   s<   | |g|R i |}|j tt||dd}t| |S )a@  
        Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
        Download and cache the pre-trained model file if needed.

        Params:
            pretrained_model_name_or_path: either:
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `pytorch_model.bin` a PyTorch dump of a BertForPretraining instance
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `model.chkpt` a TensorFlow checkpoint
            *inputs, **kwargs: additional input for the specific Bert class
                (ex: num_labels for BertForSequenceClassification)
        F)strict)Zload_state_dictremap_state_dictr   loggerinfo)clsZ
model_namer   r   r   modelZload_returnr!   r!   r"   from_pretrained:  s   
z#BertPreTrainedModel.from_pretrained)rv   rw   rx   __doc__rU   classmethodr   ry   r!   r!   rZ   r"   r   )  s
    r   c                       s6   e Zd Zddef fddZ				d	ddZ  ZS )
	BertModelTr   c                    s   t  | t|dd| _|j| j dkr#| j| j|j| j  7  _t|dd| _| jr5td u r5td|jdv s<J t	|j
|j|j|j|jd| _t|j| _tj|j
|jd	| _t|| _|rgt|nd | _| tt|jd
 d S )Npad_vocab_size_multipler:   r   r=   Fr   )r3   r&   r'   r(   )rK   r;   rM   )rT   rU   r   r   r   r=   r   r4   r1   r   r   Zmax_position_embeddingsZtype_vocab_sizeZpad_token_id
embeddingsr@   ZDropoutrC   emb_droprA   rB   emb_lnrO   encoderrz   poolerapplyr   rN   rM   )rY   r   Zadd_pooling_layerrZ   r!   r"   rU   U  s,   


zBertModel.__init__Nc                 C   s:  | j |||d}| js| |}nt|| jj| jj| jjd}| |}|durI|jdd \}}t	j
||t	j|jd}	d|	dddf< ||	B }
nd}
| j|||
d}|du rd| jdura| |nd}n3|dur}|
| }||	| |  }||| |  }n||	|
  }|||
  }| jdur| j|d	d
nd}t||dS )a'  If masked_tokens_mask is not None (i.e. last_layer_subset == True in BertForPreTraining),
        we only want the output for the masked tokens. This means that we only compute the last
        layer output for these tokens.
        masked_tokens_mask: (batch, seqlen), dtype=torch.bool
        )position_idstoken_type_idsr;   Nr^   )re   deviceTr   )r\   rm   F)r~   )last_hidden_statepooler_output)r   r=   r   r   rI   rJ   r<   r   rf   rg   Zzerosboolr   r   r   r
   )rY   	input_idsr   r   attention_maskmasked_tokens_maskrl   Z
batch_sizerp   Zfirst_col_maskrm   r   r   rs   Z
pool_inputr!   r!   r"   rt   o  sB   

zBertModel.forwardr   )NNNNru   r!   r!   rZ   r"   r   T  s    r   c                       s>   e Zd Zdef fddZdd Z					d	ddZ  ZS )
BertForPreTrainingr   c                    s   t  | t|dd| _t|dd| _| jr| jsJ dt|dd}|r.td u r.td|s3tjnttdd}t	|| _
t|| _|d	d
| _|dd
| _| tt|jd |   d S )Ndense_seq_outputFr9   z+last_layer_subset requires dense_seq_outputuse_xentropyzxentropy_cuda is not installedT)Zinplace_backwardr   )Zignore_indexra   r   )rT   rU   r   r   r9   r   r4   r@   r   r   bertr   r   mlm_lossnsp_lossr   rN   rM   tie_weights)rY   r   r   Zloss_clsrZ   r!   r"   rU     s$   


zBertForPreTraining.__init__c                 C   s   | j jjj| jjj_d S r   )r   r   word_embeddingsrI   r   r   r   )rY   r!   r!   r"   r     s   zBertForPreTraining.tie_weightsNc                 C   s   | j r|dur|dknd}| j||||dur| nd|d}|j|j}	}
| jrE|durEtj| dkdd }| j sEt	t
|	d|}	| |	|
\}}d}|dur|dur| jri|duri| || | }n| t
|dt
|d}| t
|d	t
|d}| |  }t|||d
S )a  
        If labels are provided, they must be 0 for masked out tokens (as specified in the attention
        mask).
        Outputs:
            if `labels` and `next_sentence_label` are not `None`:
                Outputs the total_loss which is the sum of the masked language modeling loss and the next
                sentence classification loss.
            if `labels` or `next_sentence_label` is `None`:
                Outputs a tuple comprising
                - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
                - the next sentence classification logits of shape [batch_size, 2].

        Nr   )r   r   r   r   Frb   zb s d -> (b s) dz... v -> (...) vz... -> (...)z... t -> (...) t)ZlossZprediction_logitsZseq_relationship_logits)r9   r   r   r   r   r   rg   rh   ri   r   r   r   r   r   floatr   )rY   r   r   r   r   labelsZnext_sentence_labelr   Zoutputsr   r   Zmasked_token_idxr   r   Z
total_lossZmasked_lm_lossZnext_sentence_lossr!   r!   r"   rt     sL   
zBertForPreTraining.forward)NNNNN)rv   rw   rx   r   rU   r   rt   ry   r!   r!   rZ   r"   r     s    r   r   c              	      s  dd t fdd|  D } dd t fdd|  D } dd	 t fd
d|  D } dd t fdd|  D } t|dd}t|jD ]}| d| d}| d| d}| d| d}| d| d}| d| d}| d| d}	|r||jd kstj|||gdd| d| d< tj|||	gdd| d| d< qO|| d| d< tj||gdd| d| d< || d| d< tj||	gdd| d| d< qOd d!  t  fd"d|  D } d#d$ t fd%d|  D } t|d&d}
|
dkrO| d' }t	|ddd|j
|jd  f| d'< | d( }t	|ddd|j
|jd  f| d(< | d) }tj	|d|j
|jd  fd*d+| d)< | S ),zU
    Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
    c                 S       t dd| } t dd| } | S )NzLayerNorm.gamma$zLayerNorm.weightzLayerNorm.beta$zLayerNorm.biasresubkeyr!   r!   r"   key_mapping_ln_gamma_beta     z3remap_state_dict.<locals>.key_mapping_ln_gamma_betac                 3        | ]\}} ||fV  qd S r   r!   rP   kv)r   r!   r"   	<genexpr>      z#remap_state_dict.<locals>.<genexpr>c                 S      t dd| S )Nz^bert.encoder.layer.bert.encoder.layers.r   r   r!   r!   r"   key_mapping_layers     z,remap_state_dict.<locals>.key_mapping_layersc                 3   r   r   r!   r   )r   r!   r"   r     r   c                 S   <   t dd| } t dd| } t dd| } t dd| } | S )	Nz^bert.embeddings.LayerNorm.bert.emb_ln.zC^bert.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)zbert.encoder.layers.\1.norm1.\2z9^bert.encoder.layers.(\d+).output.LayerNorm.(weight|bias)zbert.encoder.layers.\1.norm2.\2z2^cls.predictions.transform.LayerNorm.(weight|bias)z'cls.predictions.transform.layer_norm.\1r   r   r!   r!   r"   key_mapping_ln   "   z(remap_state_dict.<locals>.key_mapping_lnc                 3   r   r   r!   r   )r   r!   r"   r   3  r   c                 S   r   )Nz;^bert.encoder.layers.(\d+).intermediate.dense.(weight|bias)z!bert.encoder.layers.\1.mlp.fc1.\2z5^bert.encoder.layers.(\d+).output.dense.(weight|bias)z!bert.encoder.layers.\1.mlp.fc2.\2r   r   r!   r!   r"   key_mapping_mlp6     z)remap_state_dict.<locals>.key_mapping_mlpc                 3   r   r   r!   r   )r   r!   r"   r   C  r   r9   Fr   .attention.self.query.weight.attention.self.key.weight.attention.self.value.weight.attention.self.query.bias.attention.self.key.bias.attention.self.value.biasr:   r   )rd   .mixer.Wqkv.weight.mixer.Wqkv.bias.mixer.Wq.weight.mixer.Wkv.weight.mixer.Wq.bias.mixer.Wkv.biasc                 S   r   )Nz?^bert.encoder.layers.(\d+).attention.output.dense.(weight|bias)z(bert.encoder.layers.\1.mixer.out_proj.\2r   r   r!   r!   r"   key_mapping_attnY  
   z*remap_state_dict.<locals>.key_mapping_attnc                 3   r   r   r!   r   )r   r!   r"   r   `  r   c                 S   r   )Nz^cls.predictions.biascls.predictions.decoder.biasr   r   r!   r!   r"   key_mapping_decoder_biasb  r   z2remap_state_dict.<locals>.key_mapping_decoder_biasc                 3   r   r   r!   r   )r   r!   r"   r   e  r   r   &bert.embeddings.word_embeddings.weightcls.predictions.decoder.weightr   g      Y)value)r   itemsr   rV   r?   poprg   catr2   rk   r   rf   )
state_dictr   r9   dZWqZWkZWvbqZbkZbvr   r   decoder_weightdecoder_biasr!   )r   r   r   r   r   r   r"   r     sX   
" 

r   c                    s  t |dd}|dkr9| d }| d }| d }|d|jddf | d< |d|jddf | d< |d|j | d< t|jD ](}t |dd}|rP||jd kr| d	| d
}| d	| d}	|d|jd d ddf | d	| d< ||jd d d|jd  d ddf | d	| d< |d|jd  d dddf | d	| d< |	d|	jd d  | d	| d< |	|	jd d d|	jd  d  | d	| d< |	d|	jd  d d | d	| d< q>| d	| d}
| d	| d}| d	| d}| d	| d}|
| d	| d< |d|jd d ddf | d	| d< ||jd d dddf | d	| d< || d	| d< |d|jd d  | d	| d< ||jd d d | d	| d< q>dd dd dd dd  d!d"  d#d$ tfd%d&|  D } tfd'd&|  D } tfd(d&|  D } tfd)d&|  D } t fd*d&|  D } tfd+d&|  D } | S ),z
    Map the state_dict of a flash_attn model to be Huggingface BERT compatible.

    This function is meant to be the inverse of remap_state_dict.
    r   r:   r   r   r   Nr9   Fr   r   r   r      r   r^   r   r   r   r   r   r   r   r   r   c                 S   r   )	Nr   zbert.embeddings.LayerNorm.z-bert.encoder.layers.(\d+).norm1.(weight|bias)z4bert.encoder.layers.\1.attention.output.LayerNorm.\2z-bert.encoder.layers.(\d+).norm2.(weight|bias)z*bert.encoder.layers.\1.output.LayerNorm.\2z2cls.predictions.transform.layer_norm.(weight|bias)z&cls.predictions.transform.LayerNorm.\1r   r   r!   r!   r"   inv_key_mapping_ln  r   z0inv_remap_state_dict.<locals>.inv_key_mapping_lnc                 S   r   )NzLayerNorm.weight$zLayerNorm.gammazLayerNorm.bias$zLayerNorm.betar   r   r!   r!   r"   inv_key_mapping_ln_gamma_beta  r   z;inv_remap_state_dict.<locals>.inv_key_mapping_ln_gamma_betac                 S   r   )Nr   zbert.encoder.layer.r   r   r!   r!   r"   inv_key_mapping_layers  r   z4inv_remap_state_dict.<locals>.inv_key_mapping_layersc                 S   r   )Nz.bert.encoder.layer.(\d+).mlp.fc1.(weight|bias)z+bert.encoder.layer.\1.intermediate.dense.\2z.bert.encoder.layer.(\d+).mlp.fc2.(weight|bias)z%bert.encoder.layer.\1.output.dense.\2r   r   r!   r!   r"   inv_key_mapping_mlp  r   z1inv_remap_state_dict.<locals>.inv_key_mapping_mlpc                 S   r   )Nz5bert.encoder.layer.(\d+).mixer.out_proj.(weight|bias)z/bert.encoder.layer.\1.attention.output.dense.\2r   r   r!   r!   r"   inv_key_mapping_attn  r   z2inv_remap_state_dict.<locals>.inv_key_mapping_attnc                 S   r   )Nr   zcls.predictions.biasr   r   r!   r!   r"   inv_key_mapping_decoder_bias  r   z:inv_remap_state_dict.<locals>.inv_key_mapping_decoder_biasc                 3   r   r   r!   rP   r   r   )r   r!   r"   r     r   z'inv_remap_state_dict.<locals>.<genexpr>c                 3   r   r   r!   r   )r   r!   r"   r         
c                 3   r   r   r!   r   )r   r!   r"   r     r   c                 3   r   r   r!   r   )r   r!   r"   r     r   c                 3   r   r   r!   r   )r   r!   r"   r     r   c                 3   r   r   r!   r   )r   r!   r"   r     r   )r   Zorig_vocab_sizerV   r?   r   rf   r   r   )r   r   r   r   r   r   r   r9   ZWqkv_weightsZWqkv_biasesZ	Wq_weightZWkv_weightsZWq_biasZ
Wkv_biasesr!   )r   r   r   r   r   r   r"   inv_remap_state_dict}  s   &r   )FF)NFr   )rF   )Bloggingr   collectionsr   collections.abcr   	functoolsr   typingr   r   rg   Ztorch.nnr@   Ztorch.nn.functionalZ
functionalr2   Zeinopsr   Ztransformersr   r	   Z&transformers.models.bert.modeling_bertr
   r   Zflash_attn.bert_paddingr   r   r   r   Zflash_attn.modules.blockr   Zflash_attn.modules.embeddingr   Zflash_attn.modules.mhar   Zflash_attn.modules.mlpr   r   Zflash_attn.utils.pretrainedr   Zflash_attn.ops.fused_denser   r4   Z flash_attn.ops.triton.layer_normr   Zflash_attn.losses.cross_entropyr   	getLoggerrv   r   r#   r8   rE   rN   ModulerO   rz   r   r   r   r   r   r   r   r   r!   r!   r!   r"   <module>   s`   



$
?+Waq