o
    )i~                  ,   @   sN  d dl Zd dlZd dlZd dlmZmZmZmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZmZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* dd	lm+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 dd
l2m3Z3m4Z4m5Z5m6Z6m7Z7 dZ8e9e:Z;edZ<de<de<fddZ=dee> fddZ?dZ@dZAdZBejCjDdeEdrddlFmGZG ddlHmIZI eGdureGjJKdZ8ejLjHZBn*ejCDdrejCDdrd dlMZHe? ZNeNdu rejLjHZBdZ8dZ@dZAne;OdeN  de0d ePdePfd!d"ZQd#eRd$eRd%eRd&eRdeRf
d'd(ZSd%eRd&eRfd)d*ZTeBdurejUjVd+d,d-gd.						/	/dad0ejWd1ejWd2ejWd3eejW d4eejW d5eejW d6eejW d7eRd8eRd9eXd:eXd;ePd<eejW d=eejW d>eejW d?eejW d@ePd%eRd&eRdeejWejWf f(dAdBZYejUZd+						/	/dad0ejWd1ejWd2ejWd3eejW d4eejW d5eejW d6eejW d7eRd8eRd9eXd:eXd;ePd<eejW d=eejW d>eejW d?eejW d@ePd%eRd&eRdeejWejWf f(dCdDZ[eejLj\j]ddE						/	/dad0ejWd1ejWd2ejWd3eejW d4eejW d5eejW d6eejW d7eRd8eRd9eXd:eXd;ePd<eejW d=eejW d>eejW d?eejW d@ePd%eRd&eRf&dFdGZ^dHePdeejWejWejWf fdIdJZ_ejUjVdKd,d-gd.dHePdLejWd0ejWd1ejWd2ejWdMejWdNejWd3ejWd4ejWd7eRd8eRd:eXd;ePd%eRd&eRdeejWejWejWf f dOdPZ`ejUZdKdHePdLejWd0ejWd1ejWd2ejWdMejWdNejWd3ejWd4ejWd7eRd8eRd:eXd;ePd%eRd&eRdeejWejWejWf f dQdRZaeejLj\jbddEdHePdLejWd0ejWd1ejWd2ejWdMejWdNejWd3ejWd4ejWd7eRd8eRd:eXd;ePd%eRd&eRfdSdTZcdUe0dVee> ddfdWdXZddeejW fdYdZZeeG d[d\ d\e,ZfeG d]d^ d^e+ZgeG d_d` d`efZhdS )b    N)AnyIterableListOptionalSequenceSetTupleTypeVar)parse_schema)%_unpack_flash_attention_nested_shapesregister_flop_formula   )get_operatorregister_operator   )&BlockDiagonalCausalFromBottomRightMask4BlockDiagonalCausalLocalAttentionFromBottomRightMask%BlockDiagonalCausalLocalAttentionMask/BlockDiagonalCausalLocalAttentionPaddedKeysMaskBlockDiagonalCausalMask*BlockDiagonalCausalWithOffsetGappyKeysMask+BlockDiagonalCausalWithOffsetPaddedKeysMaskBlockDiagonalGappyKeysMask)BlockDiagonalLocalAttentionPaddedKeysMaskBlockDiagonalMaskBlockDiagonalPaddedKeysMask!LocalAttentionFromBottomRightMask0LowerTriangularFromBottomRightLocalAttentionMask"LowerTriangularFromBottomRightMaskLowerTriangularMask/PagedBlockDiagonalCausalWithOffsetGappyKeysMask0PagedBlockDiagonalCausalWithOffsetPaddedKeysMaskPagedBlockDiagonalGappyKeysMask PagedBlockDiagonalPaddedKeysMaskVARLEN_BIASES)AttentionBwOpBaseAttentionFwOpBasecheck_lastdim_alignment_stride1Context	GradientsInputsScaledTensor)_check_needs_no_topleft_convert_input_format
_is_causal_post_process_lse_window_sizez0.0.0Txreturnc                 C   s"   | d ur|  ddkr|  S | S )Nr   )stride
contiguousr2    r8   d/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/xformers/ops/fmha/flash3.pymaybe_contiguousA   s   "r:   c                   C   s\   t tjjdrt tjjdsdS tjjjjjtdsdS tjjj	jjtds,dS d S )NfwdbwdzNPyTorch has no `flash_attn_3` - is your Flash-Attention version recent enough?a[  flash_attn_3::fwd(Tensor q, Tensor k, Tensor v, Tensor(k_new!)? k_new=None, Tensor(v_new!)? v_new=None, Tensor? q_v=None, Tensor(out!)? out=None, Tensor? cu_seqlens_q=None, Tensor? cu_seqlens_k=None, Tensor? cu_seqlens_k_new=None, Tensor? seqused_q=None, Tensor? seqused_k=None, int? max_seqlen_q=None, int? max_seqlen_k=None, Tensor? page_table=None, Tensor? kv_batch_idx=None, Tensor? leftpad_k=None, Tensor? rotary_cos=None, Tensor? rotary_sin=None, Tensor? seqlens_rotary=None, Tensor? q_descale=None, Tensor? k_descale=None, Tensor? v_descale=None, float? softmax_scale=None, bool is_causal=False, int window_size_left=-1, int window_size_right=-1, int attention_chunk=0, float softcap=0., bool is_rotary_interleaved=False, Tensor? scheduler_metadata=None, int num_splits=0, bool? pack_gqa=None, int sm_margin=0) -> (Tensor(out!), Tensor, Tensor, Tensor)z,flash_attn_3::fwd operator is not compatiblea(  flash_attn_3::bwd(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor(dq!)? dq=None, Tensor(dk!)? dk=None, Tensor(dv!)? dv=None, Tensor? cu_seqlens_q=None, Tensor? cu_seqlens_k=None, Tensor? seqused_q=None, Tensor? seqused_k=None, int? max_seqlen_q=None, int? max_seqlen_k=None, float? softmax_scale=None, bool is_causal=False, int window_size_left=-1, int window_size_right=-1, float softcap=0., bool deterministic=False, int sm_margin=0) -> (Tensor(dq!), Tensor(dk!), Tensor(dv!), Tensor, Tensor, Tensor, Tensor, Tensor)z,flash_attn_3::bwd operator is not compatible)
hasattrtorchopsflash_attn_3r;   defaultZ_schemaZis_backward_compatible_withr
   r<   r8   r8   r8   r9   %_flash_attention3_incompatible_reasonE   s"   
rB   TFz...flash_attn_3._C)package   )_build_metadata)_Cvr@   zflash_attn_3._CZpip_pkgz)Flash-Attention 3 package can't be used: inpenable_kvsplit_attnc                 C   s2   | j }|jj|jjkrdS |jj|jjkrdS |S )NF)	attn_biasZ	q_seqinfoZ
min_seqlenZ
max_seqlenZ	k_seqinfo)rH   rI   Z
atten_biasr8   r8   r9   _heuristic_kvsplit   s   rK   s_qs_kwindow_leftwindow_rightc                 C   s   |dk r|dk r| | S |dk r%|dkr%| | d  d | t d||    S |dk r+|}|dk r1|}t| |}t|| }||d | d 7 }|| | | 7 }t|d | }||d | d 7 }|| | | 7 }|S )Nr   r   r   )maxmin)rL   rM   rN   rO   mask_nzZlastq_utZ	firstq_btr8   r8   r9   mask_non_zeros   s    "

rS   c                 C   s   | \}}}}|\}	}
}}|\}}}}||	  kr|ks J  J |
|ks&J ||ks,J ||ks2J ||ks8J ||
 dks@J t ||||}d| | | | }|d| | | | 7 }|S )z^
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    r   r   )rS   )query_shape	key_shapevalue_shaperN   rO   bZh_qrL   Zd_qZ_b2Zh_kvrM   Z_d2Z_b3Z_h2Z_s3Zd_vrR   Ztotal_flopsr8   r8   r9   sdpa_flop_count   s   rX   zxformers_flash3::flash_fwdr8   cuda)Zmutates_argsZdevice_typesr4   querykeyvaluecu_seqlens_qcu_seqlens_k	seqused_k	leftpad_kmax_seqlen_qmax_seqlen_kpsoftmax_scale	is_causal	descale_q	descale_k	descale_vblock_tableuse_kvsplitc                    s  dd |fD \}| ddkr| ddkr| n|}dd  ||fD \ }}t|} fdd}|d u}| }j}d }|rh|djd	 jd d  jd d
krhjd |jd krhd}td usnJ tjg ||d d d d  |s|nd d d ||||d |d d d ||||
|||dd| d |sdnd|dR  ^}}}j|krjd	 }|j}||d |d|d }|dddd|d}||fS )Nc                 S      g | ]}t |qS r8   r:   .0r2   r8   r8   r9   
<listcomp>   s    zmha_fwd.<locals>.<listcomp>r4   r   c                 S   rk   r8   rl   rm   r8   r8   r9   ro      s    c                      s     d ur j d d S j d S )Nr   r   )shaper8   r]   rZ   r8   r9   
_get_batch   s   
zmha_fwd.<locals>._get_batch@   r   Tr           rD   )	r5   r6   r:   rq   view_C_flashattention3r;   Zpermutereshape)rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rN   rO   rs   Zis_pagedbsZorig_query_shapeZpack_gqaoutsoftmax_lserestZnum_heads_qZorig_lse_shaper8   rr   r9   mha_fwd   s   "
	

 !"
%
r~   c                 C   s   | j }| jtjks| jtjkr| j|tjd}n| |}|d u r,|d |d |d fn|d |d f}| j|tjd}||fS )N)dtyper   r   r   )rq   r   r>   float8_e4m3fnZfloat8_e5m2Z	new_emptybfloat16float32)rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rN   rO   rT   r{   Z	lse_shapelser8   r8   r9   mha_fwd_fakeD  s   
r   )Zget_rawc              	      s@  d| j   krdksJ  J d|j   krdksJ  J d|j   kr*dks-J  J tjdddkrbd  } } }}| j dkrH| dn| } |j dkrT|dn|}|j dkr`|dn|}t| j dkrn| ddn| |j dkrz|ddn||j dkr|ddn|||||d	}|rdt fd
d|D }|S )NrD      Z XFORMERS_FLOP_FORMULA_WORST_CASE01r   rt   rp   )rZ   r[   r\   Z	cum_seq_qZ	cum_seq_kZmax_qZmax_kc                 3   s*    | ]\}}}}t ||| d V  qdS )rN   rO   N)rX   )rn   rT   rU   rV   _r   r8   r9   	<genexpr>  s    

z mha_fwd_flops.<locals>.<genexpr>)ndimosenvirongetZ	unsqueezer   Z	transposesum)rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rN   rO   argskwargssizesresr8   r   r9   mha_fwd_flopsi  s.   	
r   grads_share_storagec                 C   s~   | r1t jg |jdd d|jd |jd R |j|jd}|dd|dd|ddfS t |t |t |fS )	Nr   rt   rD   r4   )r   devicerp   r   r   )r>   emptyrq   r   r   select
empty_like)r   rZ   r[   r\   chunkr8   r8   r9   _create_dq_dk_dv  s   *"r   zxformers_flash3::flash_bwddoutr{   r|   c                 C   s   t | |||\}}}d}|d u r|d u sJ td usJ t|||||||||||d d |	|
||||d|d^}}}}}|||fS )NFrv   r   )r   rx   r<   )r   r   rZ   r[   r\   r{   r|   r]   r^   ra   rb   rd   re   rN   rO   dqdkdvZis_deterministicZ	softmax_dr}   r8   r8   r9   mha_bwd  s<   
r   c                 C   s(   t |}t |}t |}|||fS N)r>   r   )r   r   rZ   r[   r\   r{   r|   r]   r^   ra   rb   rd   re   rN   rO   r   r   r   r8   r8   r9   mha_bwd_fake  s   



r   c                 O   sx   dt |||fi d|d|dd dd d|	d|
dd	d
dd|dd dd dd dd ddd|d| d S )N   r]   r^   r_   r`   ra   rb   rc   rv   rd   g      ?re   rf   rg   rh   ri   rj   FrN   rO   r   )r   )r   r   rZ   r[   r\   r{   r|   r]   r^   ra   rb   rd   re   rN   rO   r   r   r8   r8   r9   mha_bwd_flops  sT   	
r   dreasonsc                 C   s   | j jjdkr:tjjd u r<| j jd | jjd kr>tj	| j}|dk r@|
d| j jd  d| jjd  d d S d S d S d S d S )NrY   r4   )	   r   Q/K head-dim () must be equal to V head-dim (z) for Ampere GPUs)rZ   r   typer>   versionZhiprq   r\   rY   Zget_device_capabilityappend)r   r   Zdevice_capabilityr8   r8   r9   %_check_different_value_headdim_ampere5  s   
r   c                 C   s   t | ttfr
| jS d S r   )
isinstancer"   r#   block_tables)Zinp_attn_biasr8   r8   r9   _get_blocktablesC  s   r   c                       s(  e Zd ZU dZeddZdhZee e	d< dZ
ejejher"ejhne B Zeej e	d< dZd	Zed
eeeeeeeeeee e!e"e#e$fe%rLe&e'e(e)fne*  Z+e,e- e	d< dZ.dZ/dZ0dZ1dZ2dZ3de4 Z5e4Z6e7de8de9e f fddZ:e7	dde8de;de;de<ej=e>e? f fddZ@  ZAS )FwOpzOperator that computes memory-efficient attention using         `Flash-Attention <https://github.com/HazyResearch/flash-attention>`_         implementation.
    xformers_flash3	flash_fwdrY   SUPPORTED_DEVICES)   r   SUPPORTED_DTYPES       NSUPPORTED_ATTN_BIAS_TYPESFTzfa3F@r   r3   c                    s   t t| |}t|d|jd t|d|jd t|d|jd t|| t|| t|j	d urQ|jj
d |jj
d krQ|d|jj
d  d|jj
d  d |S )	NrZ   r   r[   r\   r4   r   r   z) for paged attention)superr   not_supported_reasonsr'   rZ   r\   r,   r   r   rJ   rq   r   clsr   r   	__class__r8   r9   r     s   


zFwOp.not_supported_reasonsrH   needs_gradientrj   c                 C   sd  |j j}g |j jd d |jjd }dttjtf fdd}||j \|_ }||j\|_}||j\|_}	t|d|d\}}
}}}}|j }|j}|j}|j 	 dkr|j	 dkrt
|j\}}t|j}d }t|jtr|
d usvJ |d us|J t|
t|kr|d d }nt|
t| dksJ d	t|
d
t||}| j|||fi d|
d|d|d|d|d|d|jd|jdt|jd|d|d|	d|d|d|d|\}}||}n(tj|j j|j j|j jd}tj|j jd |j jd |j jd g|j jtjd}t||d}|s |d fS t|t||t|ddd}||fS )Nr4   r3   c                 S   s   t | tr	|  S | d fS r   )r   r+   unpackr7   r8   r8   r9   unpack_func  s   zFwOp.apply.<locals>.unpack_funcT)supports_mqarj   r   r   zlen(cu_seqlens_q)=z len(cu_seqlens_k)=r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rN   rO   )r   r   r   )r{   r   )Zvarlen_lse_packed)rZ   rq   r\   r   r>   Tensorr   r[   r-   numelr0   rJ   r   r   r"   lenOPERATORrc   scale_floatr.   ry   Zzerosr   r   r   r   r(   r/   tuple)r   rH   r   rj   Zoriginal_query_shapeZ	out_shaper   rf   rg   rh   r]   ra   r^   rb   r_   qkrG   win_left	win_rightr   r`   r{   r|   ctxr8   r8   r9   apply  s   

	

 z
FwOp.apply)F)B__name__
__module____qualname____doc__r   r   r   r   str__annotations__CUDA_MINIMUM_COMPUTE_CAPABILITYr>   Zhalfr   FLASH3_HAS_FLOAT8r   setr   r   SUPPORTED_MAX_KSUPPORTED_MIN_Kr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   FLASH3_HAS_PAGED_ATTENTIONr    r!   r"   r#   r   r   r   r   SUPPORTS_DROPOUTSUPPORTS_CUSTOM_SCALESUPPORTS_DIFFERENT_VALUE_EMBEDSUPPORTS_BMGHKZSUPPORTS_PARTIALZUNPADDED_LSEFLASH_VERSIONNAMEVERSIONclassmethodr*   r   r   boolr   r   r   r(   r   __classcell__r8   r8   r   r9   r   N  sv   
 

r   c                
       s   e Zd ZU ejZeddZejZejZej	Z	ej
Z
dZedeeeeeeeeef
ZejZejZejZdZdZddgZee ed< d	e  Z!e Z"e#d
e$de%e f fddZ&e#de'de$de(j)de*fddZ+  Z,S )BwOpr   	flash_bwdru   NF Zvarlen_flatSUPPORTS_LSE_FORMATSzfa3B@r   r3   c                    sX   t t| |}t|d|jd t|d|jd t|d|jd t|| t|| |S )NrZ   r   r[   r\   )r   r   r   r'   rZ   r\   r,   r   r   r   r8   r9   r     s   

zBwOp.not_supported_reasonsr   rH   gradc                 C   s  |j j|jj|jj}}}t|dd\}}}}	}
}|j}t|jtr1|jd dks,J |d }n|jd |ks:J |d d d d d |f 	 }g |j jd d |jjd }|j
| jv saJ |j  r|j rt|j\}}| j|j||	 |j |j|j|j||j||	||
|||jt|jd\}}}t|||}ntt|j t|jt|jd}|j||_|j||_|j||_|S )	NF)r   r   r   r   r4   )rN   rO   rd   re   )r   r   r   )rZ   rq   r[   r\   r-   r   r   rJ   r$   r6   r   r   r   r0   r   Zqkv_share_storagery   r{   r   r.   r)   r>   Z
zeros_liker   r   r   )r   r   rH   r   Zdq_shapeZdk_shapeZdv_shaper]   ra   r^   rb   r   Zctx_lseZkernel_out_shaper   r   r   r   r   Zgradsr8   r8   r9   r     sd   






z
BwOp.apply)-r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ZIS_DETERMINISTICr   r   r   r   r   r   r   r   r   r*   r   r   r(   r>   r   r)   r   r   r8   r8   r   r9   r     s>   
 

	&r   c                
       s   e Zd ZU dZde ZdZeed< e	de
eeeefer#eeeefne  Zee ed< ededed	eejee f f fd
dZ  Z S )FwOp_KVSplitzOperator that computes memory-efficient attention using         `Flash-Attention3 <https://github.com/Dao-AILab/flash-attention/tree/main/hopper>`_         implementation with heuristic rules to dispatch decoding shapes to KVSplit Attention     zfa3F_splitKV@TrI   Nr   rH   r   r3   c                    s   t || j}t |||S r   )rK   rI   r   r   )r   rH   r   rj   r   r8   r9   r   w  s   zFwOp_KVSplit.apply)!r   r   r   r   r   r   rI   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r   r   r   r   r   r*   r   r>   r   r   r(   r   r   r8   r8   r   r9   r   [  s8   
 
r   )NNNNFr4   r4   )iimportlib.util	importlibloggingr   typingr   r   r   r   r   r   r   r	   r>   Ztorch._Cr
   Ztorch.utils.flop_counterr   r   commonr   r   rJ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   Zflashr,   r-   r.   r/   r0   r   	getLoggerr   loggerr1   r:   r   rB   r   r   rx   util	find_spec__package__Z_cpp_librE   r@   rF   Zflash_versionlstripr?   Zflash_attn_3._CZincompat_reasonwarningr   rK   intrS   rX   ZlibraryZ	custom_opr   floatr~   Zregister_faker   r   r   r   r   r   r   r   r   r   r   r   r   r   r8   r8   r8   r9   <module>   sr  (X$	
#

*

	

e	
$	
A
	

1	
	
/ &f