o
    81 i                     @   s(  d Z ddlZddlZddlZddlmZ edd dd dd dejdej	d	ej	d
ej	dej	dej	dej	dej	dej	fddZ
ejdej	d
ej	fddZejdej	dej	dej	fddZejdej	dej	d	ej	d
ej	dej	dej	dej	dej	dej	fddZdd Zejejdddddded d!ejddd"ddded d!gg d#d$ed%d d&d d'd dejdej	d	ej	d
ej	d(ej	dej	dej	dej	dej	dej	fd)d*Zd5d+d,Z	d5d-d.ZG d/d0 d0ejjZejZG d1d2 d2ejjZejZG d3d4 d4ejjZejZdS )6a  
*Experimental* implementation of FlashAttention in Triton.
Tested with triton==2.0.0.dev20221202.
Triton 2.0 has a new backend (MLIR) but seems like it doesn't yet work for head dimensions
other than 64:
https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
We'll update this implementation with the new Triton backend once this is fixed.

We use the FlashAttention implementation from Phil Tillet a starting point.
https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py

Changes:
- Implement both causal and non-causal attention.
- Implement both self-attention and cross-attention.
- Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
- Support attention bias.
- Speed up the forward pass a bit, and only store the LSE instead of m and l.
- Make the backward for d=128 much faster by reducing register spilling.
- Optionally parallelize the backward pass across seqlen_k, to deal with the case of
small batch size * nheads.

Caution:
- This is an *experimental* implementation. The forward pass should be quite robust but
I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
- This implementation has only been tested on A100.
- If you plan to use headdim other than 64 and 128, you should test for race conditions
(due to the Triton compiler), as done in tests/test_flash_attn.py
"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
that there are none left for other head dimensions.

Differences between this Triton version and the CUDA version:
- Triton version doesn't support dropout.
- Triton forward is generally faster than CUDA forward, while Triton backward is
generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
than CUDA forward + backward.
- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
- Triton version supports attention bias, while CUDA version doesn't.
    Nc                 C      | d | d  dkS Nseqlen_qBLOCK_Mr    argsr   r   h/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/flash_attn/flash_attn_triton.py<lambda><       r
   c                 C   r   Nseqlen_kBLOCK_Nr   r   r   r   r   r	   r
   =   r   c                 C      | d | d kS NheaddimBLOCK_HEADDIMr   r   r   r   r	   r
   >       EVEN_MEVEN_NEVEN_HEADDIM	BIAS_TYPE	IS_CAUSALr   r   r   r   r   r   c&           D   	   C   s  t d}&t d}'|'| }(|'| })|&|$ t d|$ }*t d|%}+t d| },| |(|  |)|	  |*d d d f |
 |,d d d f   }-||(|  |)|  |+d d d f | |,d d d f   }.||(|  |)|  |+d d d f | |,d d d f   }/|dkr||(|  |)|  |+ }0n |dkr||(|  |)|  |*d d d f | |+d d d f   }0||'|  |* }1t j|$gt jdtd }2t j|$gt jdtd }3t j|$| gt jd}4|!|"@ r|#rt |-}5n?t j|-|,d d d f |k dd}5n.|#r
t j|-|*d d d f |k dd}5nt j|-|*d d d f |k |,d d d f |k @ dd}5|s)|n	t |&d |$ |}6td|6|%D ]}7t 	|7|%}7|"|!@ ri|#rTt |.|7|  }8nOt j|.|7|  |,d d d f |k dd}8n:|#rt j|.|7|  |7|+ d d d f |k dd}8n t j|.|7|  |7|+ d d d f |k |,d d d f |k @ dd}8t j|$|%gt jd}9|9t j
|5|8d	d
7 }9|"s|9t |7|+ d d d f |k dtd7 }9|r|9t |*d d d f |7|+ d d d f kdtd7 }9|dkrw|dkr!|"rt |0|7 t j}:nt j|0|7 |7|+ |k ddt j}:|:d d d f }:n8|dkrY|!|"@ r7t |0|7 t j}:n"t j|0|7 |*d d d f |k |7|+ d d d f |k @ ddt j}:|9| |: }9t t |9d|2};t |9|;d d d f  }<nt t |9d| |2};t |9| |;d d d f  }<t |<d}=t |3|; }>t |1|> t |1}>|4|>d d d f  }4|"|!@ r|#rt |/|7|  }?nOt j|/|7|  |,d d d f |k dd}?n:|#rt j|/|7|  |7|+ d d d f |k dd}?n t j|/|7|  |7|+ d d d f |k |,d d d f |k @ dd}?|<|?j}<|4t 
|<|?7 }4|;}3t |2|; |= }@|;t |@ }2q9t |3|2 }At |1|A t |1}A|4|Ad d d f  }4t d}&|&|$ t d|$ }*||'|  |* }Bt |B|2 t d| },||(|  |)|  |*d d d f | |,d d d f   }C|!r|#rt |C|4 d S t j|C|4|,d d d f |k d d S |#rt j|C|4|*d d d f |k d d S t j|C|4|*d d d f |k |,d d d f |k @ d d S )Nr      vectormatrixdtypeinf        maskotherTZtrans_b-infnoner"   )tl
program_idarangezerosfloat32floatloadminimumrangemultiple_ofdotwheretomaximummaxexpsumstorer   log)DQKVBiasOutZLseTMPsoftmax_scale	stride_qb	stride_qh	stride_qm	stride_kb	stride_kh	stride_kn	stride_vb	stride_vh	stride_vn	stride_bb	stride_bh	stride_bm	stride_ob	stride_oh	stride_omnheadsr   r   seqlen_q_roundedr   CACHE_KEY_SEQLEN_QCACHE_KEY_SEQLEN_Kr   r   r   r   r   r   r   r   start_moff_hboff_boff_hoffs_moffs_noffs_dq_ptrsk_ptrsv_ptrsb_ptrsZt_ptrslse_iZm_iZacc_oqZend_nstart_nkqkbiasZm_ijpZl_ijZacc_o_scalevZl_i_newZo_scaleZlse_ptrsZout_ptrsr   r   r	   _fwd_kernel:   s  
0
666"""*
*

**6



	

*

*

"$$*
rh   c                 C   sR  t d}t d}||	 }||	 }|| t d| }t d|}t j| ||  ||  |d d d f |  |d d d f  |d d d f |
k |d d d f |k @ ddt j}t j|||  ||  |d d d f |  |d d d f  |d d d f |
k |d d d f |k @ ddt j}t j|| dd}t |||  | | d S )Nr   r   r    r!   )Zaxis)r(   r)   r*   r.   r4   r,   r8   r9   )r?   DODeltarN   rO   rP   
stride_dob
stride_doh
stride_domrQ   r   rR   r   r   r   rU   rV   rW   rX   rY   r[   ododeltar   r   r	   _bwd_preprocess_do_o_dot  s<   

6&&	rq   c                 C   s   |	|@ r6|
rt || t | | d S t j|||d d d f |k d t j| ||d d d f |k d d S |
rZt j|||d d d f |k d t j| ||d d d f |k d d S t j|||d d d f |k |d d d f |k @ d t j| ||d d d f |k |d d d f |k @ d d S )Nr'   )r(   r9   )dk_ptrsdv_ptrsdkdvrZ   r[   r   r   r   r   r   r   r   r	   _bwd_store_dk_dvL  s    $ $48rv   
ATOMIC_ADDc            >      C   s  |sdn| | | | } | t d| }!| | t d| }"t d|}#t d|}$||!d d d f | |$d d d f   }%||"d d d f | |$d d d f   }&||"d d d f | |$d d d f   }'||!d d d f | |$d d d f   }(||!d d d f | |$d d d f   })|dkr||" }*n|dkr||!d d d f | |"d d d f   }*t j||gt jd}+t j||gt jd},| |kr||"d d d f | |$d d d f   }-||"d d d f | |$d d d f   }.t|.|-|,|+|"|$|||||d d S ||@ r5|rt |&}/t |'}0nyt j|&|$d d d f |k dd}/t j|'|$d d d f |k dd}0nX|rYt j|&|"d d d f |k dd}/t j|'|"d d d f |k dd}0n4t j|&|"d d d f |k |$d d d f |k @ dd}/t j|'|"d d d f |k |$d d d f |k @ dd}0t ||}1t| |1| |D ]y}2t |2|}2|2|# }3||@ rt |%}4n.|rt j|%|3d d d f |k dd}4nt j|%|3d d d f |k |$d d d f |k @ dd}4t j	|4|/dd	}5|st 
|"d d d f |k |5td
}5|rt 
|3d d d f |"d d d f k|5td
}5|dkrt   |dkrI|r2t |*t j}6nt j|*|"|k ddt j}6|6d d d f }6n2|dkr{||@ r]t |*t j}6nt j|*|3d d d f |k |"d d d f |k @ ddt j}6|5| |6 }5||@ st   t |	|3 }7|dkrt |5| |7d d d f  }8nt |5|7d d d f  }8||@ rt |(}9nt j|(|3d d d f |k |$d d d f |k @ dd}9|+t j	|8|9j|9dd7 }+||@ st   t j	|9|0dd	}:|st   t |
|3 };|8|:|;d d d f   | |4j}<|,t j	|<|4dd7 },||@ s*t   |s||@ rJt j|)dd}=|=t 	|<|/7 }=t j|)|=dd n|rxt j|)|3d d d f |k ddd}=|=t 	|<|/7 }=t j|)|=|3d d d f |k dd nt j|)|3d d d f |k |$d d d f |k @ ddd}=|=t 	|<|/7 }=t j|)|=|3d d d f |k |$d d d f |k @ dd n@t 	|<|/}=||@ rt |)|= n.|rt j|)|=|3d d d f |k d nt j|)|=|3d d d f |k |$d d d f |k @ d |)|| 7 })|%|| 7 }%|(|| 7 }(|dkr|*|| 7 }*q||"d d d f | |$d d d f   }-||"d d d f | |$d d d f   }.t|.|-|,|+|"|$|||||d d S )Nr   r   r   r   r   r    r!   Tr$   r%   r&   )Ztrans_aZ
evict_last)eviction_policy)r"   r#   rx   )r"   rx   r'   )r(   r*   r+   r,   rv   r.   cdivr0   r1   r2   r3   r-   Zdebug_barrierr4   r7   r   r9   Z
atomic_add)>rb   r;   r<   r=   r>   ri   DQDKDVLSEDrA   rD   rG   rJ   rM   rm   
stride_dqm
stride_dkn
stride_dvnr   r   r   rw   r   r   r   r   r   r   r   r   Zbegin_mZoffs_qmrZ   rY   r[   r\   r]   r^   Zdo_ptrsZdq_ptrsr_   ru   rt   rs   rr   rc   rg   Znum_block_mrU   Zoffs_m_currra   rd   re   r`   rf   ro   ZdpZDiZdsdqr   r   r	   _bwd_kernel_one_col_blockl  sX  $(((((
(
((

 " "**
"&".



&

 
&
$
&&
"&
((
r   c                    s    fddS )Nc                    s   |     S )N)Zzero_)nargsnamer   r	   r
   z  s    zinit_to_zero.<locals>.<lambda>r   r   r   r   r	   init_to_zeroy  s   r      F)r   r   SEQUENCE_PARALLEL   r   rz   )	num_warps
num_stagesZpre_hookT)rS   rT   r   r   r   )Zconfigskeyc                 C   r   r   r   r   r   r   r	   r
     r   c                 C   r   r   r   r   r   r   r	   r
     r   c                 C   r   r   r   r   r   r   r	   r
     r   r   c3           8      C   s  t d}3|3|# }4|3|# }5| |4| |5|  7 } ||4| |5|  7 }||4| |5|  7 }||4| |5|  7 }||4| |5|  7 }||4| |5|  7 }||4|  |5|!  7 }|*dkra||4| |5|  7 }|	|3|& 7 }	||3|& 7 }|-st |%|2}6td|6D ](}7t|7| |||||||||	|
||||||||"|$|%|'fd|*|+|,|.|/|0|1|2d	 qzd S t d}7t|7| |||||||||	|
||||||||"|$|%|'fd|*|+|,|.|/|0|1|2d	 d S )Nr   r&   r   F)	rw   r   r   r   r   r   r   r   r   T)r(   r)   ry   r0   r   )8r;   r<   r=   r>   ri   rz   r{   r|   r}   r~   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rk   rl   rm   Z
stride_dqbZ
stride_dqhr   Z
stride_dkbZ
stride_dkhr   Z
stride_dvbZ
stride_dvhr   rQ   r   r   rR   r   rS   rT   r   r   r   r   r   r   r   r   r   rV   rW   rX   Znum_block_nrb   r   r   r	   _bwd_kernel}  s   
T

$r   c                    s  | j \ }|j \}}}}|j  ||fksJ |j  ||fks$J |dks,J d| j|j  kr<|jksAJ d J d| jtjtjfv sNJ d| jrW|jrW|jsYJ |padt| }|d u}	d}
|	r|j| jtjfv suJ |jszJ |	 dksJ |
dd	kr| }|j d
d  d	|fkrd}
n|j d
d  |fkrd}
ntd| |}|	r|
d|
d	|
d
fnd}td d }tj |f| jtjd}tj |f| jtjd}t| }tt|d}d}|dkrdnd} fdd}t| g | |||||||| 
d| 
d
| 
d	|
d|
d
|
d	|
d|
d
|
d	||
d|
d
|
d	|||d |d |
||R |||d	d |||fS )Nr   z5FlashAttention only support head dimensions up to 128z#All tensors must have the same typezOnly support fp16 and bf16      ?r&      r      r   r   GLast 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)r   r   r   r   )devicer      @   r   c                       t | d   fS Nr   tritonry   ZMETAbatchrQ   r   r   r	   r
   S      z%_flash_attn_forward.<locals>.<lambda>    )r   r   r   r   )shaper   torchZfloat16Zbfloat16is_cudamathsqrtr-   dimstride
contiguousRuntimeErrorexpandceilemptyr   r,   
empty_liker6   r   next_power_of_2rh   )ra   rc   rg   re   causalrA   d_r   has_bias	bias_typebias_stridesrR   lsetmprn   r   ZBLOCKr   gridr   r   r	   _flash_attn_forward,  s   *
$
	
 !"
'r   c                    s  |  ddkr|  } |j\ }|j\}}}|dksJ td d }|j |fks2J | d| d  krS| d  krS| d  krSdksVJ  J | d| d  kro| d  krodksrJ  J |pzdt| }tj|tjd}t|}t	t
|d} fdd}t| || || d	| d
| d|  d	|  d
|  d||d|d |	d u}d}|r|	j|jtjfv sJ |	jsJ |	 dksJ |	 ddksJ |	jd
d  dfkrd}n|	jd
d  fkrd}ntd|	 }	|r#|	 d	|	 d|	 d
fnd} fdd}t| g ||||	| ||||||| d	| d
| d| d	| d
| d| d	| d
| d||  d	|  d
|  d| d	| d
| d| d	| d
| d| d	| d
| d||d d ||
|R   || d S )Nr   r   r   r   r   r   c                    r   r   r   r   r   r   r	   r
     r   z&_flash_attn_backward.<locals>.<lambda>r   r   )r   r   r&   r   r   r   r   r   c                    s$   | d rt | d nd  fS )Nr   r   r   r   r   )r   rQ   r   r   r	   r
     s   r   )r   r   r   r   r   r   r   r   r,   r6   r   r   rq   r   r-   r   r   r   r   r   Zcopy_)ro   ra   rc   rg   rn   r   r   rt   ru   re   r   rA   r   r   rR   Zdq_accumrp   r   r   r   r   r   r   )r   rQ   r   r   r	   _flash_attn_backward~  s  H8

&	
 !"#$%&'(+,-3r   c                   @   &   e Zd ZedddZedd ZdS )FlashAttnQKVPackedFuncNFc                 C   s   | ddkr| }t|dddddf |dddddf |dddddf |||d\}}| _| |||| || _|S )a5  
        qkv: (batch, seqlen, 3, nheads, headdim)
        bias: optional, shape broadcastible to (batch, nheads, seqlen, seqlen).
            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
            ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
        r   r   Nr   r   re   r   rA   )r   r   r   rA   save_for_backwardr   )ctxqkvre   r   rA   rn   r   r   r   r	   forward  s   	zFlashAttnQKVPackedFunc.forwardc                 C   s   | j \}}}}| jd rJ dt U t|}t||d d d d df |d d d d df |d d d d df |||d d d d df |d d d d df |d d d d df || j| jd W d    n1 slw   Y  |d d d fS )Nr   1FlashAttention does not support bias gradient yetr   r   r   saved_tensorsneeds_input_gradr   inference_moder   r   r   rA   )r   ro   r   rn   r   re   Zdqkvr   r   r	   backward  s(   

zFlashAttnQKVPackedFunc.backwardNFN__name__
__module____qualname__staticmethodr   r   r   r   r   r	   r     s
    r   c                   @   r   )FlashAttnKVPackedFuncNFc                 C   sr   dd ||fD \}}t ||dddddf |dddddf |||d\}}| _| ||||| || _|S )an  
        q: (batch, seqlen_q, nheads, headdim)
        kv: (batch, seqlen_k, 2, nheads, headdim)
        bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
            ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
        c                 S   &   g | ]}| d dkr|n| qS r   r   r   r   .0xr   r   r	   
<listcomp>4     & z1FlashAttnKVPackedFunc.forward.<locals>.<listcomp>Nr   r   r   r   rA   r   r   )r   ra   kvre   r   rA   rn   r   r   r   r	   r   *  s   
0zFlashAttnKVPackedFunc.forwardc           	      C   s   | j \}}}}}t| jdkr| jd rJ dt H t|}t|}t|||d d d d df |d d d d df ||||d d d d df |d d d d df || j| jd W d    n1 sgw   Y  ||d d d fS )N   r   r   r   r   r   )	r   lenr   r   r   r   r   r   rA   )	r   ro   ra   r   rn   r   re   r   Zdkvr   r   r	   r   <  s,   


zFlashAttnKVPackedFunc.backwardr   r   r   r   r   r	   r   )  
    r   c                   @   r   )FlashAttnFuncNFc           	      C   sT   dd |||fD \}}}t ||||||d\}}| _| |||||| || _|S )aw  
        q: (batch_size, seqlen_q, nheads, headdim)
        k, v: (batch_size, seqlen_k, nheads, headdim)
        bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
            ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
        c                 S   r   r   r   r   r   r   r	   r   e  r   z)FlashAttnFunc.forward.<locals>.<listcomp>r   r   )	r   ra   rc   rg   re   r   rA   rn   r   r   r   r	   r   [  s   
zFlashAttnFunc.forwardc                 C   s   | j \}}}}}}| jd rJ dt ) t|}t|}	t|}
t||||||||	|
|| j| jd W d    n1 sBw   Y  ||	|
d d d fS )Nr   r   r   r   )r   ro   ra   rc   rg   rn   r   re   r   rt   ru   r   r   r	   r   m  s,   



zFlashAttnFunc.backwardr   r   r   r   r   r	   r   Z  r   r   r   )__doc__r   r   r   Ztriton.languagelanguager(   Z
heuristicsZjitZ	constexprrh   rq   rv   r   r   ZautotuneZConfigr   r   r   ZautogradFunctionr   applyZflash_attn_qkvpacked_funcr   Zflash_attn_kvpacked_funcr   Zflash_attn_funcr   r   r   r	   <module>   s    ) !"#$%& ^,	
   

+,-./0123 
S
w1.
.