o
    81 i                  2   @   sB  d dl mZmZmZmZ d dlZd dlmZ d dlZe	dddkZ
e
r+ddlmZ nd dlZdd	 Zd
d Zdd ZejdkrIejjZejjZndkdddddZdkdddddZeZeZedddddejdejdejdedededed ed!ed"eej d#ed$eejejejejf fd%d&Zeddejdejdejdedededed ed!ed"eej d#ed$eejejejejf fd'd(Zejdkrej j!jZ"neZ"ed)ddd	*	*	+		,				,dldejdejdejd-ejd.ejd/ed0ededededed ed!ed"eej d#ed1eej d2eej d3eej d4ed$eejejejejf f(d5d6Z#ed)	*	*	+		,				,dldejdejdejd-ejd.ejd/ed0ededededed ed!ed"eej d#ed1eej d2eej d3eej d4ed$eejejejejf f(d7d8Z$ejdkrej j!j#Z%ne#Z%ed9d:dd	dkd;ejdejdejdejd<ejd=ejd>eej d?eej d@eej dedededed ed!ed"eej dAedBeej d$ejf&dCdDZ&ed9	dkd;ejdejdejdejd<ejd=ejd>eej d?eej d@eej dedededed ed!ed"eej dAedBeej d$ejf&dEdFZ'ejdkr+ej j!j&Z(ne&Z(edGd:dd		,dmd;ejdejdejdejd<ejd=ejd>eej d?eej d@eej d-ejd.ejd/ed0ededededed ed!ed"eej dAedBeej d4ed$ejf0dHdIZ)edG		,dmd;ejdejdejdejd<ejd=ejd>eej d?eej d@eej d-ejd.ejd/ed0ededededed ed!ed"eej dAedBeej d4ed$ejf0dJdKZ*ejdkrej j!j)Z+ne)Z+G dLdM dMej,j-Z.G dNdO dOej,j-Z/G dPdQ dQej,j-Z0G dRdS dSej,j-Z1G dTdU dUej,j-Z2G dVdW dWej,j-Z3	+		,	X	+		,	,dndYdZZ4	+		,	X	+		,	,dnd[d\Z5	+		,	X	+		,	,dnd]d^Z6	+		,	X	+		,	,dnd_d`Z7	+		,	X	+		,	,dndadbZ8	+		,	X	+		,	,	dodcddZ9										,	X	+	e		 	,dpdfeeeejf  dgeej dheej d1eej fdidjZ:dS )q    )OptionalSequenceTupleUnionNZ!FLASH_ATTENTION_TRITON_AMD_ENABLEFALSETRUE   )interface_fac                 C   s"   | d ur|  ddkr|  S | S )Nr   )stride
contiguous)x r   k/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/flash_attn/flash_attn_interface.pymaybe_contiguous   s   "r   c           	      C   s   |dksJ t j| \}}|dko|dk}|dko|dk}|dko%|dk}|dkr,dS |dkr6|s4dS dS |dkr<dS |dkrP|rJ|sH|rHdS dS |sNdS dS |d	krVdS |d
kr\dS |dkrbdS d S )N      r   	          @   `         )torchcudaZget_device_capability)	deviceZhead_dimZ
is_dropoutZ	is_causalmajorminorZis_sm8xZis_sm80Zis_sm90r   r   r   _get_block_size_n   s,   r   c                 C   s   | | d | | S )Nr   r   )r   mr   r   r   round_multiple1   s   r!   z2.4.0)device_typesschemac               C   s   dd }|d u r
|S |S )Nc                 S      | S Nr   funcr   r   r   wrap=      z$noop_custom_op_wrapper.<locals>.wrapr   )namefnmutates_argsr"   r#   r(   r   r   r   noop_custom_op_wrapper<      r-   )lib_stacklevelc               C   s   dd }|d u r
|S |S )Nc                 S   r$   r%   r   r&   r   r   r   r(   C   r)   z(noop_register_fake_wrapper.<locals>.wrapr   )opr+   r/   r0   r(   r   r   r   noop_register_fake_wrapperB   r.   r2   zflash_attn::_flash_attn_forwardr   r   )r,   r"   qkv	dropout_psoftmax_scalecausalwindow_size_leftwindow_size_rightsoftcapalibi_slopesreturn_softmaxreturnc                 C   sP   dd | ||fD \} }}t | ||d |	|||||||
d \}}}}||||fS )Nc                 S      g | ]}t |qS r   r   .0r   r   r   r   
<listcomp>Z       z'_flash_attn_forward.<locals>.<listcomp>)flash_attn_gpuZfwd)r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   outsoftmax_lseS_dmask	rng_stater   r   r   _flash_attn_forwardL   s"   rJ   c                 C   s   dd | ||fD \} }}| j \}}}}|j d }t| }tj|||ftj| j| jd}tjd| j| j| jd}|
rQtj||t|dt|df| j| j| jd}tjdtj	| jd}||||fS )	Nc                 S   r?   r   r@   rA   r   r   r   rC   {   rD   z,_flash_attn_forward_fake.<locals>.<listcomp>r   dtyper   layoutr   r      rL   r   )
shaper   
empty_likeemptyfloat32r   rM   rL   r!   int64)r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   
batch_sizeseqlen_q	num_headsZ	head_sizeZseqlen_krF   rG   prI   r   r   r   _flash_attn_forward_fakem   s   

,r[   z&flash_attn::_flash_attn_varlen_forwardr
           Fcu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kblock_table	leftpad_k	seqused_kzero_tensorsc                 C   s`   dd | ||fD \} }}t | ||d ||||||||||||	|
|||d \}}}}||||fS )Nc                 S   r?   r   r@   rA   r   r   r   rC      rD   z._flash_attn_varlen_forward.<locals>.<listcomp>)rE   Z
varlen_fwd)r3   r4   r5   r]   r^   r_   r`   r6   r7   r8   r9   r:   r;   r<   r=   ra   rb   rc   rd   rF   rG   rH   rI   r   r   r   _flash_attn_varlen_forward   s2   re   c                 C   s   dd | ||fD \} }}|d u}|  d }| j\}}}t| }tj||ftj| j| jd}tjd| j| j| jd}t	|d}t	|d}|rXtj||||f| j| j| jd}tjdtj
| jd}||||fS )	Nc                 S   r?   r   r@   rA   r   r   r   rC      rD   z3_flash_attn_varlen_forward_fake.<locals>.<listcomp>r   rK   rN   r   rO   rQ   )numelrR   r   rS   rT   rU   r   rM   rL   r!   rV   )r3   r4   r5   r]   r^   r_   r`   r6   r7   r8   r9   r:   r;   r<   r=   ra   rb   rc   rd   Zpaged_kvrW   total_qrY   _rF   rG   rZ   Zseqlen_q_roundedZseqlen_k_roundedrI   r   r   r   _flash_attn_varlen_forward_fake   s   


 ri   z flash_attn::_flash_attn_backward)dqdkdvdoutrF   rG   rj   rk   rl   deterministicrI   c                 C   s\   dd | ||||fD \} }}}}t | ||||||||||	|
|||||d |\}}}}|S )Nc                 S   r?   r   r@   rA   r   r   r   rC     rD   z(_flash_attn_backward.<locals>.<listcomp>)rE   Zbwd)rm   r3   r4   r5   rF   rG   rj   rk   rl   r6   r7   r8   r9   r:   r;   r<   rn   rI   	softmax_dr   r   r   _flash_attn_backward   s8   "rp   c                 C   s   dd | ||||fD \} }}}}|d u rt |}|d u r#t |}|d u r,t |}|j\}}}}t j||t|df|jt jd}|S )Nc                 S   r?   r   r@   rA   r   r   r   rC   6  rD   z-_flash_attn_backward_fake.<locals>.<listcomp>r   r   rL   )r   rS   rR   rT   r!   r   rU   )rm   r3   r4   r5   rF   rG   rj   rk   rl   r6   r7   r8   r9   r:   r;   r<   rn   rI   rW   rX   rY   rh   ro   r   r   r   _flash_attn_backward_fake!  s   "


 rr   z'flash_attn::_flash_attn_varlen_backwardc                 C   sf   dd | ||||fD \} }}}}t | |||||||||	|
|||||||||||d |\}}}}|S )Nc                 S   r?   r   r@   rA   r   r   r   rC   d  rD   z/_flash_attn_varlen_backward.<locals>.<listcomp>)rE   Z
varlen_bwd)rm   r3   r4   r5   rF   rG   rj   rk   rl   r]   r^   r_   r`   r6   r7   r8   r9   r:   r;   r<   rn   rI   rd   ro   r   r   r   _flash_attn_varlen_backwardI  sB   "rs   c                 C   s   dd | ||||fD \} }}}}|	  d }|j\}}}|d u r&t|}|d u r/t|}|d u r8t|}tj||d|  f|jtjd}|S )Nc                 S   r?   r   r@   rA   r   r   r   rC     rD   z4_flash_attn_varlen_backward_fake.<locals>.<listcomp>r   r   rq   )rf   rR   r   rS   rT   r   rU   )rm   r3   r4   r5   rF   rG   rj   rk   rl   r]   r^   r_   r`   r6   r7   r8   r9   r:   r;   r<   rn   rI   rd   rW   rg   rY   rh   ro   r   r   r    _flash_attn_varlen_backward_fake  s   "


 rt   c                   @   $   e Zd Zedd Zedd ZdS )FlashAttnQKVPackedFuncc                 C   sv  |
o|j }|d u r|jd d }|d d d d df  |d d d d df  |d d d d df  }}}|d}|d dkrntjj|dd|d  g}tjj|dd|d  g}tjj|dd|d  g}t|||||||d |d |||	o|dkd\}}}}|r| 	|||||| || _
|| _|| _|| _|| _|| _|| _|d	d |f }|	s|S |||fS )
Nr
         r   r   rP      r   r8   r9   r:   r;   r<   r=   .)requires_gradrR   detachsizer   nn
functionalpad_wrapped_flash_attn_forwardsave_for_backwardr6   r7   r8   window_sizer;   r<   rn   )ctxqkvr6   r7   r8   r   r;   r<   rn   r=   is_grad_enabledis_gradr3   r4   r5   head_size_og
out_paddedrG   rH   rI   rF   r   r   r   forward  sB   
R

zFlashAttnQKVPackedFunc.forwardc                 G   s(  | j \}}}}}}|jd d dg|jdd  R  }	tj|	|j|jd}
|d}|}|d dkr@tjj	|dd|d  g}t
|||||||
d d d d df |
d d d d df |
d d d d df | j| j| j| jd | jd | j| j| j|d |
d	d |jd
 f }
|
d d d d d d d d d f
S )Nrx   rQ   r   r   r   rP   rI   .r
   )saved_tensorsrR   r   rT   rL   r   r|   r}   r~   r   _wrapped_flash_attn_backwardr6   r7   r8   r   r;   r<   rn   )r   rm   argsr3   r4   r5   rF   rG   rI   	qkv_shapedqkvr   dout_paddedr   r   r   backward  s:   $
zFlashAttnQKVPackedFunc.backwardN__name__
__module____qualname__staticmethodr   r   r   r   r   r   rv     s
    
/rv   c                   @   ru   )FlashAttnVarlenQKVPackedFuncc                 C   sv  |o|j }|d u r|jd d }|d d df  |d d df  |d d df  }}}|d}|d dkretjj|dd|d  g}tjj|dd|d  g}tjj|dd|d  g}t|||||||||||d |d ||	|o}|dkd d\}}}}|r| 	||||||| || _
|| _|| _|| _|| _|| _|	| _|
| _|dd |f }|s|S |||fS )	Nr
   rw   r   r   rP   r   r8   r9   r:   r;   r<   r=   ra   .)rz   rR   r{   r|   r   r}   r~   r   "_wrapped_flash_attn_varlen_forwardr   r6   
max_seqlenr7   r8   r   r;   r<   rn   )r   r   
cu_seqlensr   r6   r7   r8   r   r;   r<   rn   r=   r   r   r3   r4   r5   r   r   rG   rH   rI   rF   r   r   r   r     sN   
@

z$FlashAttnVarlenQKVPackedFunc.forwardc                 G   s(  | j \}}}}}}}	|jd d dg|jdd  R  }
tj|
|j|jd}|d}|}|d dkrAtjj	|dd|d  g}t
|||||||d d df |d d df |d d df ||| j| j| j| j| j| jd | jd | j| j| j|	d |d	d |jd
 f }|d d d d d d d d d d d fS )Nr   rx   rQ   rP   r   r   r   r   .r
   )r   rR   r   rT   rL   r   r|   r}   r~   r   #_wrapped_flash_attn_varlen_backwardr   r6   r7   r8   r   r;   r<   rn   )r   rm   r   r3   r4   r5   rF   rG   r   rI   r   r   r   r   r   r   r   r   C  sB   $
z%FlashAttnVarlenQKVPackedFunc.backwardNr   r   r   r   r   r   
  s
    
7r   c                   @   ru   )FlashAttnKVPackedFuncc                 C   sj  |ot dd ||fD }|d u r|jd d }|d d d d df  |d d d d df  }}|d}|d dkrhtjj|dd|d  g}tjj|dd|d  g}tjj|dd|d  g}t|||||||d |d |||
o||dkd	\}}}}|r| 	|||||| || _
|| _|| _|| _|| _|| _|	| _|d
d |f }|
s|S |||fS )Nc                 s       | ]}|j V  qd S r%   rz   rA   r   r   r   	<genexpr>x      
z0FlashAttnKVPackedFunc.forward.<locals>.<genexpr>r
   rw   r   r   rx   r   ry   .)anyrR   r{   r|   r   r}   r~   r   r   r   r6   r7   r8   r   r;   r<   rn   )r   r3   kvr6   r7   r8   r   r;   r<   rn   r=   r   r   r4   r5   r   r   rG   rH   rI   rF   r   r   r   r   i  sF   6

zFlashAttnKVPackedFunc.forwardc                 G   s8  | j \}}}}}}t|}	|jd d dg|jdd  R  }
tj|
|j|jd}|d}|}|d dkrEtjj	
|dd|d  g}t|||||||	|d d d d df |d d d d df | j| j| j| jd | jd | j| j| j|d |	d	d |jd
 f }	|d	d |jd
 f }|	|d d d d d d d d d fS )Nr   rP   rQ   rx   r   r   r   r   .r
   )r   r   rS   rR   rT   rL   r   r|   r}   r~   r   r   r6   r7   r8   r   r;   r<   rn   )r   rm   r   r3   r4   r5   rF   rG   rI   rj   kv_shapedkvr   r   r   r   r   r     s>   
$
zFlashAttnKVPackedFunc.backwardNr   r   r   r   r   r   h  
    
2r   c                   @   ru   )FlashAttnVarlenKVPackedFuncc                 C   sx  |ot dd ||fD }|d u r|jd d }|d d df  |d d df  }}|d}|d dkrbtjj|dd|d  g}tjj|dd|d  g}tjj|dd|d  g}t||||||||||	|
d |
d |||oz|dkd d	\}}}}|r| 	|||||||| || _
|| _|| _|| _|	| _|
| _|| _|| _|| _|d
d |f }|s|S |||fS )Nc                 s   r   r%   r   rA   r   r   r   r     r   z6FlashAttnVarlenKVPackedFunc.forward.<locals>.<genexpr>r
   rw   r   r   rP   r   r   .)r   rR   r{   r|   r   r}   r~   r   r   r   r6   r_   r`   r7   r8   r   r;   r<   rn   )r   r3   r   r]   r^   r_   r`   r6   r7   r8   r   r;   r<   rn   r=   r   r   r4   r5   r   r   rG   rH   rI   rF   r   r   r   r     sX   *

z#FlashAttnVarlenKVPackedFunc.forwardc                 G   sD  | j \}}}}}}}	}
t|}|jd d dg|jdd  R  }tj||j|jd}|d}|}|d dkrGtjj	
|dd|d  g}t||||||||d d df |d d df ||	| j| j| j| j| j| jd | jd | j| j| j|
d |dd |jd	 f }|dd |jd	 f }||d d d d d d d d d d d d d fS )
Nr   rP   rQ   r   r   r   r   .r
   )r   r   rS   rR   rT   rL   r   r|   r}   r~   r   r   r_   r`   r6   r7   r8   r   r;   r<   rn   )r   rm   r   r3   r4   r5   rF   rG   r]   r^   rI   rj   r   r   r   r   r   r   r   r      sF   
$
"z$FlashAttnVarlenKVPackedFunc.backwardNr   r   r   r   r   r     s
    
?r   c                   @   ru   )FlashAttnFuncc                 C   s6  |ot dd |||fD }|d u r|jd d }|d}|d dkrNtjj|dd|d  g}tjj|dd|d  g}tjj|dd|d  g}t|||||||d |d ||	|ob|dkd	\}}}}|r| |||||| || _	|| _
|| _|| _|| _|	| _|
| _|d
d |f }|s|S |||fS )Nc                 s   r   r%   r   rA   r   r   r   r   8  r   z(FlashAttnFunc.forward.<locals>.<genexpr>r
   rw   rx   r   r   r   ry   .)r   rR   r|   r   r}   r~   r   r   r   r6   r7   r8   r   r;   r<   rn   )r   r3   r4   r5   r6   r7   r8   r   r;   r<   rn   r=   r   r   r   r   rG   rH   rI   rF   r   r   r   r   (  sD   

zFlashAttnFunc.forwardc                 G   s  | j \}}}}}}t|t|t|}	}
}|d}|}|d dkr5tjj|dd|d  g}t|||||||	|
|| j| j	| j
| jd | jd | j| j| j|d |	dd |jd f }	|
dd |jd f }
|dd |jd f }|	|
|d d d d d d d d d fS )Nrx   r   r   r   r   .r
   )r   r   rS   r|   r}   r~   r   r   r6   r7   r8   r   r;   r<   rn   rR   )r   rm   r   r3   r4   r5   rF   rG   rI   rj   rk   rl   r   r   r   r   r   r   [  s<   "
zFlashAttnFunc.backwardNr   r   r   r   r   r   '  r   r   c                   @   ru   )FlashAttnVarlenFuncc                 C   sP  |ot dd |||fD }|	d u r|jd d }	|d}|d dkrNtjj|dd|d  g}tjj|dd|d  g}tjj|dd|d  g}t|||||||||	|
|d |d |||of|dk|d	\}}}}|r| |||||||| || _	|| _
|| _|	| _|
| _|| _|| _|| _|| _|d
d |f }|s|S |||fS )Nc                 s   r   r%   r   rA   r   r   r   r     r   z.FlashAttnVarlenFunc.forward.<locals>.<genexpr>r
   rw   rP   r   r   r   r   .)r   rR   r|   r   r}   r~   r   r   r   r6   r_   r`   r7   r8   r   r;   r<   rn   )r   r3   r4   r5   r]   r^   r_   r`   r6   r7   r8   r   r;   r<   rn   r=   ra   r   r   r   r   rG   rH   rI   rF   r   r   r   r   ~  sV   

zFlashAttnVarlenFunc.forwardc                 G   s&  | j \}}}}}}}	}
t|t|t|}}}|d}|}|d dkr7tjj|dd|d  g}t|||||||||||	| j| j	| j
| j| j| jd | jd | j| j| j|
d |dd |jd f }|dd |jd f }|dd |jd f }|||d d d d d d d d d d d d d d fS )NrP   r   r   r   r   .r
   )r   r   rS   r|   r}   r~   r   r   r_   r`   r6   r7   r8   r   r;   r<   rn   rR   )r   rm   r   r3   r4   r5   rF   rG   r]   r^   rI   rj   rk   rl   r   r   r   r   r   r     sD   "
&zFlashAttnVarlenFunc.backwardNr   r   r   r   r   r   }  s
    
Ar   r
   r
   c	           	      C   s    t | ||||||||t 
S )a  dropout_p should be set to 0.0 during evaluation
    If Q, K, V are already stacked into 1 tensor, this function will be faster than
    calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
    of the gradients of Q, K, V.
    For multi-query and grouped-query attention (MQA/GQA), please see
    flash_attn_kvpacked_func and flash_attn_func.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between [i - window_size[0], i + window_size[1]] inclusive.

    Arguments:
        qkv: (batch_size, seqlen, 3, nheads, headdim)
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of (-alibi_slope * |i - j|) is added to
            the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    )rv   applyr   r   )	r   r6   r7   r8   r   r;   r<   rn   return_attn_probsr   r   r   flash_attn_qkvpacked_func  s   -r   c
           
      C   s"   t | |||||||||	t S )a  dropout_p should be set to 0.0 during evaluation
    If K, V are already stacked into 1 tensor, this function will be faster than
    calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
    of the gradients of K, V.
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
    0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        kv: (batch_size, seqlen, 2, nheads_k, headdim)
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    )r   r   r   r   )
r3   r   r6   r7   r8   r   r;   r<   rn   r   r   r   r   flash_attn_kvpacked_func!  s   ?r   c                 C   $   t | |||||||||	|
t S )a
  dropout_p should be set to 0.0 during evaluation
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
    0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        k: (batch_size, seqlen, nheads_k, headdim)
        v: (batch_size, seqlen, nheads_k, headdim)
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    )r   r   r   r   )r3   r4   r5   r6   r7   r8   r   r;   r<   rn   r   r   r   r   flash_attn_funco  s   =r   c                 C   r   )a	  dropout_p should be set to 0.0 during evaluation
    If Q, K, V are already stacked into 1 tensor, this function will be faster than
    calling flash_attn_varlen_func on Q, K, V since the backward pass avoids explicit concatenation
    of the gradients of Q, K, V.
    For multi-query and grouped-query attention (MQA/GQA), please see
    flash_attn_varlen_kvpacked_func and flash_attn_varlen_func.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between [i - window_size[0], i + window_size[1]] inclusive.

    Arguments:
        qkv: (total, 3, nheads, headdim), where total = total number of tokens in the batch.
        cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into qkv.
        max_seqlen: int. Maximum sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of (-alibi_slope * |i - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    )r   r   r   r   )r   r   r   r6   r7   r8   r   r;   r<   rn   r   r   r   r    flash_attn_varlen_qkvpacked_func  s   2r   c                 C   s*   t | |||||||||	|
|||t S )a  dropout_p should be set to 0.0 during evaluation
    If K, V are already stacked into 1 tensor, this function will be faster than
    calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
    of the gradients of K, V.
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
    0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        kv: (total_k, 2, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    )r   r   r   r   )r3   r   r]   r^   r_   r`   r6   r7   r8   r   r;   r<   rn   r   r   r   r   flash_attn_varlen_kvpacked_func  s"   Ir   c                 C   s.   t | |||||||||	|
|||||t S )aq  dropout_p should be set to 0.0 during evaluation
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
    0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    )r   r   r   r   )r3   r4   r5   r]   r^   r_   r`   r6   r7   r8   r   r;   r<   rn   r   ra   r   r   r   flash_attn_varlen_funcZ  s&   Ir   Tcache_seqlenscache_batch_idxcache_leftpadc                 C   s   | ddksJ d| ddksJ ddd | ||fD \} }}|du r.| jd d }|durJt|trJtj|jd	 f|tj|jd
}t|}t|}t|
}
t	
| |||||||||	|
|d|||d	 |d |||\}}|rv||fS |S )a<  
    If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from
    k and v. This is useful for incremental decoding: you can pass in the cached keys/values from
    the previous step, and update them with the new keys/values from the current step, and do
    attention with the updated cache, all in 1 kernel.

    If you pass in k / v, you must make sure that the cache is large enough to hold the new values.
    For example, the KV cache could be pre-allocated with the max sequence length, and you can use
    cache_seqlens to keep track of the current sequence lengths of each sequence in the batch.

    Also apply rotary embedding if rotary_cos and rotary_sin are passed in. The key @k will be
    rotated by rotary_cos and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
    If causal or local (i.e., window_size != (-1, -1)), the query @q will be rotated by rotary_cos
    and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
    If not causal and not local, the query @q will be rotated by rotary_cos and rotary_sin at
    indices cache_seqlens only (i.e. we consider all tokens in @q to be at position cache_seqlens).

    See tests/test_flash_attn.py::test_flash_attn_kvcache for examples of how to use this function.

    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
    0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Note: Does not support backward pass.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table,
            or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache)
            page_block_size must be a multiple of 256.
        v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table,
            or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache)
        k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
            k with k_cache, starting at the indices specified by cache_seqlens.
        v [optional]: (batch_size, seqlen_new, nheads_k, headdim). Similar to k.
        rotary_cos [optional]: (seqlen_ro, rotary_dim / 2). If not None, we apply rotary embedding
            to k and q. Only applicable if k and v are passed in. rotary_dim must be divisible by 16.
        rotary_sin [optional]: (seqlen_ro, rotary_dim / 2). Similar to rotary_cos.
        cache_seqlens: int, or (batch_size,), dtype torch.int32. The sequence lengths of the
            KV cache.
        cache_batch_idx: (batch_size,), dtype torch.int32. The indices used to index into the KV cache.
            If None, we assume that the batch indices are [0, 1, 2, ..., batch_size - 1].
            If the indices are not distinct, and k and v are provided, the values updated in the cache
                 might come from any of the duplicate indices.
        cache_leftpad: (batch_size,), dtype torch.int32. The index that the KV cache starts. If None, assume 0.
        block_table [optional]: (batch_size, max_num_blocks_per_seq), dtype torch.int32.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        rotary_interleaved: bool. Only applicable if rotary_cos and rotary_sin are passed in.
            If True, rotary embedding will combine dimensions 0 & 1, 2 & 3, etc. If False,
            rotary embedding will combine dimensions 0 & rotary_dim / 2, 1 & rotary_dim / 2 + 1
            (i.e. GPT-NeoX style).
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        num_splits: int. If > 1, split the key/value into this many chunks along the sequence.
           If num_splits == 1, we don't split the key/value. If num_splits == 0, we use a heuristic
           to automatically determine the number of splits.
           Don't change this unless you know what you are doing.
        return_softmax_lse: bool. Whether to return the logsumexp of the attention scores.

    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
    r
   r   z+k_cache must have contiguous last dimensionz+v_cache must have contiguous last dimensionc                 S   r?   r   r@   rA   r   r   r   rC   &  rD   z+flash_attn_with_kvcache.<locals>.<listcomp>Nrw   r   rQ   )r   rR   
isinstanceintr   fullZint32r   r   rE   Zfwd_kvcache)r3   Zk_cacheZv_cacher4   r5   Z
rotary_cosZ
rotary_sinr   r   r   ra   r7   r8   r   r;   Zrotary_interleavedr<   Z
num_splitsZreturn_softmax_lserF   rG   r   r   r   flash_attn_with_kvcache  sF   lr   r%   )	r
   r
   r\   NFNNNF)NF)r\   NFr   r\   NFF)	r\   NFr   r\   NFFN)NNNNNNNNNFr   r\   TNr   F);typingr   r   r   r   r   Ztorch.nnr}   osgetenvZUSE_TRITON_ROCMZflash_attn_triton_amdr	   rE   Zflash_attn_2_cudar   r   r!   __version__ZlibraryZ	custom_opZ_torch_custom_op_wrapperZregister_fakeZ_torch_register_fake_wrapperr-   r2   ZTensorfloatboolr   rJ   r[   opsZ
flash_attnr   re   ri   r   rp   rr   r   rs   rt   r   ZautogradFunctionrv   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s  

	
 	

	
2	
%	
3	
!	
?	
(R^WhVk
>
R
Q
I
d
b	
