o
    81 i|                     @   sd  d dl Z d dlmZ d dlZd dlmZ d dlmZmZ d dl	m
Z
 zd dlmZmZmZmZmZ W n eyE   d\ZZd\ZZdZY nw z
d dlmZmZ W n ey]   d\ZZY nw zd dlmZ W n eyq   dZY nw d	d
 ZG dd dejZG dd dejZG dd dejZG dd dejZdd ZG dd dejZG dd dejZdS )    N)partial)	rearrangerepeat)get_dim_for_local_rank)flash_attn_kvpacked_funcflash_attn_qkvpacked_funcflash_attn_varlen_kvpacked_func flash_attn_varlen_qkvpacked_funcflash_attn_with_kvcacheNN)ColumnParallelLinearRowParallelLinear)RotaryEmbeddingc                 C   s\   dd }t |  r|| S dt t |  }||td| dd d d | |   S )Nc                    s6   ddt | d       fddt| D S )N      c                    s   g | ]} |  qS  r   ).0iratiostartr   b/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/flash_attn/modules/mha.py
<listcomp>)   s    zCget_alibi_slopes.<locals>.get_slopes_power_of_2.<locals>.<listcomp>)mathlog2range)nheadsr   r   r   get_slopes_power_of_2&   s   z/get_alibi_slopes.<locals>.get_slopes_power_of_2r   r   )r   r   
is_integerfloorget_alibi_slopes)r   r   Zclosest_power_of_2r   r   r   r    %   s    r    c                       s8   e Zd ZdZ						d
 fdd	Zddd	Z  ZS )FlashSelfAttention|  Implement the scaled dot product attention with softmax.
    Arguments
    ---------
        softmax_scale: The temperature to use for the softmax attention.
                      (default: 1/sqrt(d_keys) where d_keys is computed at
                      runtime)
        attention_dropout: The dropout rate to apply to the attention
                           (default: 0.0)
    FN        r%   c                    sb   t    td usJ dtd usJ d|| _|| _t|| _| j	d|dd || _
|| _d S NzFlashAttention is not installedalibi_slopesF)
persistent)super__init__r	   r   causalsoftmax_scalennDropoutdropregister_bufferwindow_sizedeterministic)selfr+   r,   attention_dropoutr1   r'   r2   	__class__r   r   r*   @      
	
zFlashSelfAttention.__init__c                 C   s   |j tjtjfv sJ |jsJ |du r| jn|}|du}| jdur*| jtj| _|rY|j tj	ks4J |dus:J t
|tsAJ t|||| jrL| jjnd| j|| j| j| jd	S t|| jrb| jjnd| j|| j| j| jdS )ao  Implements the multihead softmax attention.
        Arguments
        ---------
            qkv: The tensor containing the query, key, and value.
                If cu_seqlens is None and max_seqlen is None, then qkv has shape (B, S, 3, H, D).
                If cu_seqlens is not None and max_seqlen is not None, then qkv has shape
                (total, 3, H, D), where total is the sum of the sequence lengths in the batch.
            causal: if passed, will override self.causal
            cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into qkv.
            max_seqlen: int. Maximum sequence length in the batch.
        Returns:
        --------
            out: (total, H, D) if cu_seqlens is not None and max_seqlen is not None,
                else (B, S, H, D).
        Nr#   r,   r+   r'   r1   r2   )dtypetorchfloat16bfloat16is_cudar+   r'   tofloat32int32
isinstanceintr	   trainingr/   pr,   r1   r2   r   )r3   qkvr+   
cu_seqlens
max_seqlenunpaddedr   r   r   forwardS   s<   

zFlashSelfAttention.forward)FNr#   r$   NF)NNN__name__
__module____qualname____doc__r*   rI   __classcell__r   r   r5   r   r!   5   s    r!   c                       sB   e Zd ZdZ						d
 fdd	Z					ddd	Z  ZS )FlashCrossAttentionr"   FNr#   r$   c                    sb   t    td usJ dtd usJ d|| _|| _t|| _| j	d|dd || _
|| _d S r&   )r)   r*   r   r   r+   r,   r-   r.   r/   r0   r1   r2   )r3   r+   r,   r4   r'   r1   r2   r5   r   r   r*      r7   zFlashCrossAttention.__init__c                 C   sh  |j tjtjfv sJ |jr|jsJ |du r| jn|}|du}| jdur-| jtj| _|rz|j tj	ks7J |dus=J t
|tsDJ |dusJJ |j tj	ksRJ |dusXJ t
|ts_J t||||||| jrm| jjnd| j|| j| j| jdS |jd |jd }	}
|jd }|jd |	kr|jd |jd ksJ t||| jr| jjnd|| j| j| j| jdS )	a  Implements the multihead softmax attention.
        Arguments
        ---------
            q: The tensor containing the query. (B, Sq, H, D)
            kv: The tensor containing the key and value. (B, Sk, 2, H_k, D)
            causal: if passed, will override self.causal
            cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into q.
            max_seqlen: int. Maximum sequence length in the batch of q.
            cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into kv.
            max_seqlen_k: int. Maximum sequence length in the batch of k and v.
        Nr#   r8   r         r   )r+   r,   r'   r1   r2   )r9   r:   r;   r<   r=   r+   r'   r>   r?   r@   rA   rB   r   rC   r/   rD   r,   r1   r2   shaper   )r3   qkvr+   rF   rG   Zcu_seqlens_kZmax_seqlen_krH   
batch_sizeseqlen_qseqlen_kr   r   r   rI      sR   

&zFlashCrossAttention.forward)FNr#   Nr$   F)NNNNNrJ   r   r   r5   r   rP      s    rP   c                       ,   e Zd ZdZd	 fdd	Zd
ddZ  ZS )SelfAttentionr"   FNr#   c                    &   t    || _|| _t|| _d S Nr)   r*   r+   r,   r-   r.   r/   r3   r+   r,   r4   r5   r   r   r*         
zSelfAttention.__init__c                 C   s  |j d |j d }}|du r| jn|}|jdd\}}}| jp)dt|j d  }	td|||	 }
|durQtj||fd	|
j	|
j
d
}||d |
t|d }
|rkttj||fd	|
j
dd}|
|j|
j	d }
tj|
d|j	d}| |}td||}|S )au  Implements the multihead softmax attention.
        Arguments
        ---------
            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D)
            causal: if passed, will override self.causal
            key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
                False means to mask out. (B, S)
        r   rQ   Nr   dim      ?r%   bthd,bshd->bhts     r9   devicer#   b s -> b 1 1 srf   )r9   ra   r9   bhts,bshd->bthd)rS   r+   unbindr,   r   sqrtr:   einsumfullr9   rf   masked_fill_r   Ztriur>   softmaxr/   )r3   rE   r+   key_padding_maskrV   seqlenrT   kvr,   scorespadding_maskcausal_mask	attentionattention_dropoutputr   r   r   rI      s(   	
zSelfAttention.forwardFNr#   r   rJ   r   r   r5   r   rZ          
rZ   c                       rY   )CrossAttentionr"   FNr#   c                    r[   r\   r]   r^   r5   r   r   r*   %  r_   zCrossAttention.__init__c                 C   s  |j d |j d }}|du r| jn|}|j d }|j d |kr*|j d |j d ks,J |j d |j d krEt|d|j d |j d  d}|jdd	\}}	| jpYd
t|j d  }
td|||
 }|durtj	||fd|j
|jd}||d |t|d }|rttj||jtjdd}tj||jtjd}|du r|nt|dd}||| | k}||d}tj|d|	j
d}| |}td||	}|S )a  Implements the multihead softmax attention.
        Arguments
        ---------
            q: The tensor containing the query. (B, Sq, H, D)
            kv: The tensor containing the key and value. (B, Sk, 2, H_k, D)
            causal: if passed, will override self.causal
            key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
                False means to mask out. (B, Sk)
        r   rQ   NrR   r   r   z... hkv d -> ... (hkv g) d)gr`   rb   r%   rc   rd   re   r#   rg   rf   r9   zs -> s 1zb -> b 1 1 1ri   rj   )rS   r+   r   rk   r,   r   rl   r:   rm   rn   r9   rf   ro   r   ZarangelongsumZmasked_fillrp   r/   )r3   rT   rU   r+   rq   rV   rW   rX   rs   rt   r,   ru   rv   Zrow_idxZcol_idxskrw   rx   ry   rz   r   r   r   rI   +  s<   

&
zCrossAttention.forwardr{   r   rJ   r   r   r5   r   r}     r|   r}   c           
   	   C   s   | j dd \}}||jvr$tj|j|jd||| j| jd}||j|< n|j| }|j}|| j d  }|j	}|| j d  }	||j d ksFJ |	|j d ksOJ |dusUJ | |||||	df< |||d|	df S )Ukv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)Nr   re   r   rQ   .)
rS   key_value_memory_dictr:   emptyZmax_batch_sizerG   r9   rf   Zbatch_size_offsetseqlen_offset)
rU   inference_params	layer_idx	num_headshead_dimkv_cacheZbatch_startZ	batch_endZsequence_startZsequence_endr   r   r   _update_kv_cacheX  s,   
	
r   c                       s   e Zd ZdZ																					d	d fd
dZdddZdd Zdd Zdd Z						dddZ	  Z
S )MHA-Multi-head self-attention and cross-attentionNFTr#   r        @r$   returnc                    sR  ||d}t    || _|| _|	| _|
| _|| _|| _|| _|| _	|| _
|r7|s-J dtjt||d}nd}|dkrC|sCJ d|| _|durL|n|| _| j| j dks[J d| j| dksfJ d	| j| | _| j| jd
| j   }d
| j | j }| jdkr|rJ dtdusJ dt| j||||d| _|rtt||dnt}|rtt||dnt}| jstj||fd|i|| _ntj||fd|i|| _tj||fd|i|| _| jr
| j| jkrtj||dd
|d| _ntj||dd
|d| _tj||dd
|d| _ ||	||d| _!||	||d| _"tj||fd|i|| _#dS )aX  
        num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads.
        return_residual: whether to return the input x along with the output. This is for
            performance reason: for post-norm architecture, returning the input allows us
            to fuse the backward of nn.Linear with the residual connection.
        r   #ALiBi code path requires flash_attnrh   Nr$   >Local (sliding window) attention code path requires flash_attnr   +num_heads must be divisible by num_heads_kv(embed_dim must be divisible by num_headsr   z>MHA with rotary embedding does not support cross-attention yetrotary_emb is not installedbaseZ
scale_baseinterleavedrf   r'   r1   biasr   )Zkernel_sizepaddinggroupsr+   r,   r4   )$r)   r*   	embed_dim
cross_attnr+   r   dwconvrotary_emb_dimuse_flash_attnreturn_residualcheckpointingr:   tensorr    r   num_heads_kvr   r   
rotary_embr   r!   rZ   rP   r}   r-   ZLinearWqkvWqWkvZConv1d
dwconv_qkvdwconv_q	dwconv_kv
inner_attninner_cross_attnout_proj)r3   r   r   r   r   qkv_proj_biasout_proj_biasdropoutr,   r+   r   r   r   rotary_emb_baserotary_emb_scale_baserotary_emb_interleaved	use_alibir1   Zfused_bias_fcr   r   r   rf   r9   factory_kwargsr'   qkv_dimZkv_diminner_attn_clsinner_cross_attn_clsr5   r   r   r*   x  s   
 





zMHA.__init__c              	   C   <   |d u r	| j jjn|}| j jj}tj||d| j| j||dS Nr   re   )r   weightr9   rf   r:   r   r   r   r3   rV   rG   r9   rf   r   r   r   allocate_inference_cache     
zMHA.allocate_inference_cachec                 C   s.   | j rJ d| jdusJ dt||| jS )r   z&Generation does not support dwconv yetN0Generation requires layer_idx in the constructor)r   r   r   r3   rU   r   r   r   r   r     s   zMHA._update_kv_cachec                 C   B  |dur	|j dksJ | jsJ | jdkr5| jjdu sJ d| jj|j|j|jd | jj	| jj
}}nd\}}|jd }|j| j d| }|jdurT|jd| n|j }t| jdd}	t||dddddf |dddddf |dddddf |dddddf |||| jj| jj| jdkr| jjnd|	d	}
|
S 
z
        Fast path that combine 3 steps: apply rotary to Q and K, update kv cache, and apply attention.
        q: (batch_size, seqlen_q, nheads, head_dim)
        kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim)
        Nr   z$This code path does not support xPosr   r   r'   rQ   F)
rotary_cos
rotary_sincache_seqlensr,   r+   Zrotary_interleavedr'   r   r   r   r   scaleZ_update_cos_sin_cacherG   rf   r9   Z_cos_cachedZ_sin_cachedrS   r   r   lengths_per_samplegetattrr   r
   r,   r+   r   r3   rT   rU   r   r   r   batchr   r   r'   contextr   r   r   &_apply_rotary_update_kvcache_attention  >   



z*MHA._apply_rotary_update_kvcache_attentionc                 C   s   |j dkstdu s| js| ||}| ||S |jd }|j| j d| }|jdur3|jd| n|j }t	| jdd}t||dddddf |dddddf |dddddf |dddddf || jj
| jj|d	S z/Write kv to inference_params, then do attentionr   Nr'   rQ   )r   r,   r+   r'   )r   r
   r   r   r   rS   r   r   r   r   r,   r+   )r3   rT   rU   r   r   r   r   r'   r   r   r   _update_kvcache_attention  s0   


zMHA._update_kvcache_attentionc                 K   sd  |dur!|dus
J |du sJ | j sJ | jrJ | jdks!J |dur6|du s+J |du s1J | j r6J |durO|du s@J |du rH|du sJJ | jrOJ | j rY||d|nd|i|}|du redn
|jdurm|jn|j}	|durw|jnd}
|jdd \}}| js<| j| j	kr<|du r|du sJ | 
|}| jrt| t|ddddf d	 }t|d
d| jd}|du s|jdks| jdks| jd dks| j s | jdkr| j||	|
d}|du r| js| j|fi |}n,tjjj| j|fi |}n| |dddddf |ddddddf |}n | |dddddf |ddddddf |}n| jr_| |du rI|n|dd|f }| |dur[|n|}n&| j| j	kshJ | 
|}|dd| j	| j f }|d| j	| j df }t|d| jd}t|dd| jd}| jrt| t|ddddf d	 }t| t|ddddf d	 }|du s|jdks| jdks| jd dks| j s| jdkr| j|||	|
d\}}|du r| js| j||fi |}ntjjj| j||fi |}n| |||}n| |||}| t|d}| js.|S ||fS )a  
        Arguments:
            x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if
                cu_seqlens is None and max_seqlen is None, else (total, hidden_dim) where total
                is the is the sum of the sequence lengths in the batch.
            x_kv: (batch, seqlen, hidden_dim), only applicable for cross-attention. If None, use x.
            cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into x. Only applicable when using
                FlashAttention.
            max_seqlen: int. Maximum sequence length in the batch.
            key_padding_mask: boolean mask, True means to keep, False means to mask out.
                (batch, seqlen). Only applicable when not using FlashAttention.
            mixer_subset: for cross-attention only. If not None, will take a subset of x
                before applying the query projection. Useful for e.g., ViT where we only care
                about the CLS token in the last layer.
            inference_params: for generation. Adapted from Megatron-LM (and Apex)
            https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
        Nr   )rF   rG   rq   r   zb s d -> b d s.r   zb d s -> b s dz ... (three h d) -> ... three h dr   threed   r   rG   rQ   ... (h d) -> ... h dr    ... (two hkv d) -> ... two hkv dtwor   z... h d -> ... (h d))r   r   r   r   r   rG   rS   r   r   r   r   r   r   
contiguousr   r   r   r   r:   utils
checkpointr   r   r   r   r   r   r   r   r   )r3   xZx_kvrq   rF   rG   Zmixer_subsetr   kwargsr   rotary_max_seqlenr   rr   rE   r   rT   rU   outr   r   r   rI   =  s   





	



..$



zMHA.forward)NFTTr#   NFNFr   r   NFFr$   FFFFNNr   Nr\   )NNNNNNrK   rL   rM   rN   r*   r   r   r   r   rI   rO   r   r   r5   r   r   u  sJ    
k("r   c                       sv   e Zd ZdZ																		d	d fd
dZdddZdd Zdd Zdd ZdddZ	  Z
S )ParallelMHAr   NTr#   Fr   r   r$   r   c                    s6  ||d}t    || _|	| _|
| _|| _|| _|| _|| _|	 | _
tj|| _|| _| j| j dks:J d|d ur@|n|| _| j| j dksOJ dt| j| j
| j| _t| j| j
| j| _| j| | _| j| jd| j   }|r|s|J dt| j| j
 }tjt|| j| | jd |  |d}nd }|d	kr|sJ d
| jdkrtd usJ dt| j||||d| _td u std u rtdt|||f||| j| j| j d  d|| _|rtt ||dnt!}|rtt"||dnt#}||	||d| _$||	||d| _%t|||f||| jd|| _&d S )Nr   r   r   r   r   r   rQ   rh   r$   r   r   r   zfused_dense is not installed)r   sequence_parallelZmultiple_ofr   r   )'r)   r*   r   r+   r   r   r   r   process_groupsizeZ
world_sizer:   distributedZget_rankZ
local_rankr   r   r   num_heads_per_ranknum_heads_kv_per_rankr   r   ceilr   r    r   r   r   r   ImportErrorr   r   r!   rZ   rP   r}   r   r   r   )r3   r   r   r   r   r   r   r   r,   r+   r   r   r   r   r   r   r1   r   r   r   rf   r9   r   r   Znum_heads_localr'   r   r   r5   r   r   r*     s   



zParallelMHA.__init__c              	   C   r   r   )r   r   r9   rf   r:   r   r   r   r   r   r   r   r   8  r   z$ParallelMHA.allocate_inference_cachec                 C   s    | j dus	J dt||| j S )r   Nr   )r   r   r   r   r   r   r   E  s   zParallelMHA._update_kv_cachec                 C   r   r   r   r   r   r   r   r   J  r   z2ParallelMHA._apply_rotary_update_kvcache_attentionc           	      C   s   |j dks| js| ||}| ||S |jd }|j| j d| }|jdur/|jd| n|j }t| jdd}t	||dddddf |dddddf |dddddf |dddddf || jj
| jj|d	}|S r   )r   r   r   r   rS   r   r   r   r   r
   r,   r+   )	r3   rT   rU   r   r   r   r   r'   r   r   r   r   r   r  s,   

z%ParallelMHA._update_kvcache_attentionc                 K   s  |  |}|durt|d|d}|du rdn
|jdur|jn|j}|dur(|jnd}| j| jkrt|dd| jd}|du sQ|jdksQ| jdksQ| jd dksQ| j	s| jdkr^| j
|||d	}|du r}| jso| j|fi |}ntjjj| j|fi |}n| |dddddf |ddddd
df |}n| |dddddf |ddddd
df |}nt|dd| j| j f d| jd}	t|d| j| j df dd| jd}
|du s|jdks| jdks| jd dks| j	s/| jdkr| j
|	|
||d	\}	}
|du r'| js| j|	|
fi |}ntjjj| j|	|
fi |}n| |	|
|}n| |	|
|}t|d}|durEt|d}| |}|S )ae  
        Arguments:
            x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if seqlen=None.
                If seqlen is not None, x is (batch * seqlen, hidden_dim). This is so that when we
                split x during sequence parallel, we split the batch * seqlen dimension
                (in case batch is small).
        Nz(b s) ... -> b s ...)sr   z b s (three h d) -> b s three h dr   r   r   r   rQ   .r   r   r   r   r   zb s h d -> b s (h d)zb s d -> (b s) d)r   r   r   r   rG   r   r   r   r   r   r   r   r   r:   r   r   r   r   r   r   r   )r3   r   rr   r   r   rE   r   r   r   rT   rU   r   r   r   r   rI     s   

	

..






zParallelMHA.forward)NTTr#   NFNr   r   NFFr$   FFTNNr   r\   r   r   r   r   r5   r   r     s8    
r(r   ) r   	functoolsr   r:   Ztorch.nnr-   Zeinopsr   r   Zflash_attn.utils.distributedr   Z
flash_attnr   r   r   r	   r
   r   Zflash_attn.ops.fused_denser   r   Zflash_attn.layers.rotaryr   r    Moduler!   rP   rZ   r}   r   r   r   r   r   r   r   <module>   sB    Pa4>  P