o
    81 i                     @   sl  d dl Z d dlZd dlZd dlZd dlZd dlm  mZ d dl	m
Z
 d dlmZmZ zd dlmZ W n ey>   dZY nw d dlmZmZ d dlmZmZmZ d dlmZmZmZ d dlmZmZ e d	d
dkZe dd
dkZ e dd
dkZ!e dd
dkZ"e dd
dkZ#e dd
dkZ$e dd
dkZ%e dd
dkZ&e dd
dkpej'(dd  dk Z)e dd
dkZ*e dd
dkZ+e dd
dkZ,e dd
dkZ-e dd
dkZ.g e*sdgng  e+sdgng  e,sdgng  e-sdgng  e.sdgng  Z/ej01d ej2ge&sej3gng  e)sej4gng  ej01d!g d"ej01d#d$gej01d%d$gej01d&d'ge$s>d(gng  ej01d)d$ge#sMd*gng  ej01d+d$d*gej01d,d$gej01d-e/ej01d.g d/d0d1 Z5ej01d ej2ge&sej3gng  e)sej4gng  ej01d!g d"ej01d#d$gej01d%d$gej01d&d'ge$sd(gng  ej01d)d$ge#sd*gng  ej01d+d$d*gej01d2d$d*gej01d-e/ej01d.g d3d4d5 Z6ej01d ej2ge)sej4gng  ej01d!g d"ej01d6d$ge"sd*gng  ej01d7d8d9ge#s*d:gng  ej01d;e"s8d*d$gnd*gej01d<d$d*gej01d=e"sNd$d*gnd$gej01d>e"saedurag d?nd'gej01d@dge!sqg dAng  ej01dBd$d*gej01dCd$d*gej01dDd$d*gej01d-dgej01d.g dEdFdG Z7dHdI Z8ej01d ej2gej01d+d$d*gej01d-dgej01d.dJgdKdL Z9ej01d ej2gej01d+d$d*gej01d-g dMej01d.g dNdOdP Z:dQdR Z;ej01d ej<ej3ej2gej01d-g dSej01dTg dUej01dVg dWdXdY Z=d]d[d\Z>dS )^    N)parse_schema)	rearrangerepeat)apply_rotary_emb)	pad_inputunpad_input)attention_refgenerate_qkvgenerate_random_padding_mask)flash_attn_funcflash_attn_varlen_funcflash_attn_combine)flash_attn_with_kvcacheget_scheduler_metadataZ FLASH_ATTENTION_DISABLE_BACKWARDFALSETRUEZFLASH_ATTENTION_DISABLE_SPLITZFLASH_ATTENTION_DISABLE_PAGEDKVZ FLASH_ATTENTION_DISABLE_APPENDKVZFLASH_ATTENTION_DISABLE_LOCALZFLASH_ATTENTION_DISABLE_SOFTCAPZFLASH_ATTENTION_DISABLE_PACKGQAZFLASH_ATTENTION_DISABLE_FP16ZFLASH_ATTENTION_DISABLE_FP8cuda	   ZFLASH_ATTENTION_DISABLE_HDIM64ZFLASH_ATTENTION_DISABLE_HDIM96ZFLASH_ATTENTION_DISABLE_HDIM128ZFLASH_ATTENTION_DISABLE_HDIM192ZFLASH_ATTENTION_DISABLE_HDIM256@   `            dtypemha_type)mhamqagqahas_qvFdeterministicsoftcap        g      .@localTcausal
V_colmajordzseqlen_q,seqlen_k)   r'   r   r   )r   r   r   r      r'        q      )r0   r   r      r0      l   r   r      )  r     r   r9   r      r?     r?   r?   rA   )   rC   )  rD   c           5         sL  |r|d dkst jkrtd dt jd |dkr dnd d}|	d	kr*|n|	d
kr0dndt jkr:t jn}|dkrH|dkrHd|gn|dkrQdd|gn|g}t jkr\|g}tskt d|d d	 dgndg}t
||D ].\}}t j | |||d}|dkr|| d }|| }t j |||d| }t j |||d| }|rt j | |||d|}nd }|sdnt d|d }t jkr fddtdD \}}}nd\}}}fdd|||fD \}}}|r|  nd }|r*tt| d d }t|||d d ||||||||d\}}t|||d d ||||||||dd t jkrUnd d!\}} d|d" d" |   	  }!|dkrrdnd}"td#||   	   td$||   	   tsdd gndg}#tsddgndg}$t
|#|$D ]O\}%}&t||||||||||||%|&d%}'td&|'|   	   td'|'|   	   |'|   	 |"||   	  |! ksJ qtst jkr|s|s|dks|dkst |'}(|( |'  d( dd})t j!"|'|||f|(\}*}+}t j!"||||f|(\},}-}.t j!"||||f|(\}/}0}1td)|*|,   	   td*|+|-   	   td+||.   	   td,|*|,   	   td-|+|-   	   td.||.   	   td/|/|,   	   td0|0|-   	   td1|1|.   	   td2|/|,   	   td3|0|-   	   td4|1|.   	   d|,d" d" |,   	  |dkrdnd5 }2|*|,   	 |"|/|,   	  |2 ks;J d|-d" d" |-   	  |dkrQdnd5 }3|+|-   	 |"|0|-   	  |3 ksoJ d|.d" d" |.   	  |dkrdnd5 }4||.   	 |"|1|.   	  |4 ksJ qtd S )6N   r   zQV_colmajor requires seqlen_k to be a multiple of 16 and dtype to be float8_e4m3fnr      r         r   r   r'   r   r   r   r   r9   r'   devicer   r!      rN   rG   c                    $   g | ]}t j t jd d qS rJ   rG   torchrandfloat32.0_
batch_sizerK   	nheads_kv b/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/hopper/test_flash_attn.py
<listcomp>      $ z*test_flash_attn_output.<locals>.<listcomp>r.   NNNc                       g | ]}|    qS r\   detachtorequires_grad_rW   xr   r\   r]   r^          zb s h d -> b h d szb h d s -> b s h dr#   qv	q_descale	k_descale	v_descalewindow_sizeattention_chunkr    FTr#   rk   rl   rm   rn   ro   rp   r    upcastreorder_opsintermediate_dtype333333?Pytorch max diff: Pytorch mean diff: )
r#   rk   rl   rm   rn   ro   rp   r    pack_gqa
num_splitsOutput max diff: Output mean diff: rN   dQ max diff: dK max diff: dV max diff: dQ mean diff: dK mean diff: dV mean diff: dQ Pytorch max diff: dK Pytorch max diff: dV Pytorch max diff: dQ Pytorch mean diff: dK Pytorch mean diff: dV Pytorch mean diff: a2U0*3?)#rS   float8_e4m3fnpytestskiprandommanual_seedbfloat16DISABLE_LOCALrandintitem	itertoolsproductrandnrd   re   tolistrangerc   r   
contiguousr   absmaxprintmeanDISABLE_PACKGQADISABLE_SPLITr   DISABLE_BACKWARD
randn_likefloatsum	transposeautogradgrad)5seqlen_qseqlen_kr%   r#   r"   r    r$   r   r   r   r   nheads	dtype_refdv_valsattention_chunk_valsdvrp   q_refk_refv_refqv_refro   rl   rm   rn   qkvrk   out_refattn_refout_ptattn_ptfwd_atolrtolpack_gqa_valsnum_splits_valsrx   ry   outgdo_odqdkdq_refdk_refdv_refdq_ptdk_ptdv_ptdq_atoldk_atoldv_atolr\   rZ   rK   r   r[   r]   test_flash_attn_output5   s   7
0
$&&$
"


 :


262626 r   add_unused_qkv)r&   )r'   r.   )rG   r'   )i  r'   )r.   i  r(   r   r   r)   r/   r2   r4   r6   r8   )i3  r   r;   r=   r>   r@   rB   rF   rF   c           N         sh  dt j| | | t|d  t|  | dkrdnd d}|	dkr&|n|	dkr,dndt jkr6t jn}|d	krD|d
krDd	|gn|dkrMdd|gn|g}t jkrX|g}| |krktskt d|d d dgndg}t	
||D ]\}}t j | |||d}|dkr|| d   }|| }t j |||d| }t j |||d| }|rt j | |||d|}nd }|sdnt d|d}t jkr fddtdD \}}}nd\}}}dd |||fD \}}}|r| nd }t|  ddd}t| ddd}dd }||||  |j\}} |||| |j\}}!t||||||d| |!d 	\}"}#}$}%}&}'}(})}*}+}}}}},}-}.fd!d|"|#|$fD \}"}#}$t|||||||||||||d"\}/}0t|||||||||||||ddt jkrnd d#\}1}2td$|1|/      td%|1|/      | d urt| d&}3d|/d' d' |/     }4|dkrdnd}5tsddgndg}6tsddgndg}7t	
|6|7D ]b\}8}9t|"|#|$|&|'|*|+|(|)||%||||||d(}:|,|:};| d ur!|;|3d td)|;|/      td*|;|/      |;|/    |5|1|/     |4 ksZJ qts1t jkr1|s1|dks1|dks1t |:}<|< |:   d+!d+d,}=t j"#|:|"|#|$f|<\}>}?}@|-|>}A|.|?}B|.|@}|!d urt|!d&}C|B|Cd ||Cd | d ur|A|3d |,|<}Dt j"#|/|||f|D\}E}F}Gt j"#|1|||f|D\}H}I}Jtd-|A|E      td.|B|F      td/||G      td0|A|E      td1|B|F      td2||G      td3|H|E      td4|I|F      td5|J|G      td6|H|E      td7|I|F      td8|J|G      d|Ed' d' |E     |dkrdnd9 }K|A|E    |5|H|E     |K ksJ d|Fd' d' |F     |dkrdnd9 }L|B|F    |5|I|F     |L ksJ d|Gd' d' |G     |dkrdnd9 }M||G    |5|J|G     |M ks1J qtd S ):Nr   rG   rF   r   rH   r   r   r'   r   r   r   r   r9   rI   r   rJ   r!   rL   rM   rO   c                    rP   rQ   rR   rV   rY   r\   r]   r^   o  r_   z1test_flash_attn_varlen_output.<locals>.<listcomp>r.   r`   c                 S   s   g | ]}|   qS r\   )rc   re   rf   r\   r\   r]   r^   r  s    r   F)modeZzero_lengthsTc                 S   sH   |rt |||}t| |}tt| ||}||fS | }d }||fS N)r
   rS   logical_andlogical_xor
logical_or)Zpadding_maskZ
add_unusedZmax_seq_lenbsrK   Zanother_maskZ	attn_maskZunused_maskr\   r\   r]   _gen_unused_masks{  s   z8test_flash_attn_varlen_output.<locals>._gen_unused_masks)rk   Zkvpackedquery_unused_maskkey_unused_maskc                    ra   r\   rb   rf   rh   r\   r]   r^     ri   rj   rq   rv   rw   zb s -> b s 1 1ru   )
	seqused_q	seqused_kr#   rk   rl   rm   rn   ro   rp   r    rz   r{   rN   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   )$rS   r   r   intr   r   r   r   r   r   r   r   rc   re   rd   r   r
   rK   r	   r   r   r   r   r   r   r   r   r   Zmasked_fill_r   r   r   r   r   r   r   )Nr   r   r%   r   r#   r"   r    r   r   r   r   r   r   r   r   r   rp   r   r   r   r   ro   rl   rm   rn   r   r   r   rk   query_padding_maskkey_padding_maskr   r   r   q_unpadk_unpadv_unpadqv_unpadcu_seqlens_qZcu_seqlens_kr   r   max_seqlen_qZmax_seqlen_koutput_pad_fnZ	dq_pad_fnZ	dk_pad_fnr   r   r   r   Zq_zero_maskingr   r   r   r   rx   ry   Z	out_unpadr   Zg_unpadr   Zdq_unpadZdk_unpadZdv_unpadr   r   Zk_zero_maskingr   r   r   r   r   r   r   r   r   r   r\   r   r]   test_flash_attn_varlen_output  sL  5(0
,&&$
"






 
:





262626 r   new_kvzcausal,local)FF)TF)FTseqlen_new_eq_seqlen_qhas_rotary_seqlensrotary_interleavedrotary_fraction)r!   g      ?g      ?	page_size)r'   rL   r   has_leftpadhas_batch_idxvarlen_q))r'   r   )r'   iS  )r.   r?   )r   i   )r   r   r.   r-   )r   rF   )rE   i N  r   r8   )rF   i  c           U         s  |d ur|| dkrt   |kr|rt   |s"|dkr"t   |dkr,|	r,t   dtjd d |s: n d }d}tt|| d d }|dkrS|n|d	krYd
nd}|| dkscJ |tjkrktj	n|}|dkry|dkryd|gn|dkrdd|gn|g}|tjkr|g}|s|rt
std
|d d dgndg}t||D ]]\}}|dko|dk}tj |||d||}|rtj |||d||}nd }|rt dd}t||^}}}}  fdd}!|rt|d nd }"n
d }|}|}"d\}}|sdntd|d}#|
r"n
td
d
 d }$d }%d }&|rxtj |$|||d||}'tj |$|||d||}(|rrt|$ dd}&t|'|&^})}*}%} t|(|&^}+} n|'|(})}+nd\}'}(})}+|d u rtj|||||d||},tj|||||d||}-d }.nt||||||||	\},}-}.}/}0}1tj|rdnd
|r||s|r|d
krӈn|$ d
 n|d
  ftjd|rtfddt D }2nd }2|r	tj|tjdd   }3nd }3ttj|dd }4td!}5|s"|4|5k }6n|r,|&jd"d#d$n|$}7|4|5|7 k }6|rFt|6|4|2d"d"|k}6|	sKnd }8|dkrtj|d u r]|n|1| |d dd tj }9t |9j|d%||}:t!|9j|d%||};|s|rt"||:|;|8|d&}<ntt"t|d'|:|;|8|d&d(d)}<t"|'|:|;|8|d&}=n	d\}:};||'}<}=|s|,n|,|3 # }>|s|-n|-|3 # }?|r t|5|4k|4|5|7 k }@t|=d}At|(d}B|r|A|* }A|B|* }B|A|>|@< |B|?|@< t$|>d*|| d+}Ct$|?d*|| d+}Dt%|<|C|D||6|||#||2d,
\}E}Ft%|<|C|D||6|||#|d-d#|2|tjkr7|nd d.\}G}F||}|rJ||nd }|,|},|-|}-|d ur`|/|nd }/|d url|0|nd }0|'d urx|'|nd }'|(d ur|(|nd }(|)d ur|)|nd })|+d ur|+|nd }+|d ur||nd }|r|d ur|"|nd }"|:d ur|:|nd }:|;d ur|;|nd };|d u r|,# n|/# }H|d u r|-# n|0# }It&sd
dgnd
g}Jd-d#g}Kt|J|KD ]\}L}M|Mr$t' |r|n|||||j(|||%|2|$|||#||Ld/}Nnd }Nt|Ms,d
ndD ]}F|d u rB|,)|H |-)|I n
|/)|H |0)|I t*|sR|n||d u rZ|,n|/|d u rb|-n|0|ri|sk|'n|)|rr|st|(n|+fi d0|s}|n|"d1|:d2|;d3d4|3d5|2d6|.d7|d8|%d9|d:|8d;|d<|#d=|d>|d?|Nd@|LdAd#^}O}P} |r|!|O}Ot+dB|O|E , -    t+dC|O|E , .    t+dD|G|E , -    t+dE|G|E , .    |r|d u r!|s
|,|n|,||3 }Q|s|-|n|-||3 }Rn@t|/||s+|.n|.|3 /  dF dGd d d |f |}Qt|0||sK|.n|.|3 /  dF dGd d d |f |}R|>||}>|?||}?|tjurt0|R|?sJ ntj1|R|?dHdHdIsJ |dkrt0|Q|>sJ n|tjurtj1|Q|>dHdHdIsJ ntj1|Q|>dJdJdIsJ |tjkrdKnd}S|O|E , -  |S|G|E , -   dL ksJ |tjkrdndM}T|O|E , .  |T|G|E , .   ksJ q/qqd S )NNr   r!   r      rG   rH   rE   r   r   r'   r.   r   r   r   r   r9   rI   rJ   r   )r   c                    s   t |  S r   )r   )Zoutput_unpad)rZ   	indices_qr   r\   r]   <lambda>  s    z)test_flash_attn_kvcache.<locals>.<lambda>zb s ... -> (b s) ...)NNrM   rO   )NNNNr   rK   c              	      sL   g | ]"} |   d krtjd  |   dtjdntjdtjdqS )r   rI   r   r'   )r   rS   r   int32Zzeros)rW   i)cache_seqlensrK   r\   r]   r^     s    z+test_flash_attn_kvcache.<locals>.<listcomp>)rK   zs -> 1 szb -> b 1rN   T)Zkeepdimsrh   )Zseqlen_offsetsZinterleavedzb s h d -> b 1 (s h) dzb 1 (s h) d -> b s h d)szb s h d -> b s (h g) d)r   )r#   rk   ro   rp   key_leftpadF)r#   rk   ro   rp   rr   rs   r   rt   )
Z	headdim_vr   cu_seqlens_k_newcache_leftpadZmax_seqlen_k_newr   r#   ro   rp   ry   rk   Z
rotary_cosZ
rotary_sinr   cache_batch_idxr   
page_tabler   r   r   rotary_seqlensr#   ro   rp   r   scheduler_metadatary   Zreturn_softmax_lserz   r{   rv   rw   8(b nblocks) block_size ... -> b (nblocks block_size) ...bgMbP?)r   atolg?rL   h㈵>g      ?)2r   r   rS   r   r   mathfloorr   r   r   r   r   r   r   r   r   rd   r
   r   r   _generate_block_kvcacher   catr   randpermaranger   r   	unsqueezeexpandrT   picossinr   cloner   r   r   r   r   Zcopy_r   r   r   r   r   flattenequalallclose)Ur   r   r%   r   r   r   r   r   r   r   r   r#   r"   r   r   r   Zbatch_size_cacher   Z
rotary_dimnheads_kr   r   r   r   rp   r   r   rk   r   r   r   r   restr   r   ro   Z
seqlen_newr   Zkey_new_padding_maskr   r   r   Z	indices_kr   k_cachev_cacher   k_cache_pagedv_cache_paged
num_blocksr   r   r  Zcache_seqlens_expandedr   Zk_new_seqlensr   Zangler  r  Zq_roZk_roZk_cache_refZv_cache_refZupdate_maskZk_to_updateZv_to_updateZk_cache_repZv_cache_repr   rX   r   Zk_cache_savedZv_cache_savedr   Zprecompute_metadata_valsry   Zprecompute_metadatar   r   lseZk_cache_selectZv_cache_selectZmultZ	mult_meanr\   )rZ   r   rK   r   r   r]   test_flash_attn_kvcacheA  sN  E0
,"$ ""
""	*



	










	



	



66 r  c	                 C   s   t | | | d }	tj|	|||||d||}
tj|	|||||d||}ttj|	tj|dd|d}t|
|  d|dd d d | f }t||  d|dd d d | f }||||
||	fS )Nr.   rJ   r   z(b nblocks) -> b nblocksr   r   )	r  ceilrS   r   rd   r   r  r   r  )r   r   rZ   r  r%   r   rK   r   r   r  r  r  r   r  r  r\   r\   r]   r    s>   

r  )r   i    c                 C   s   d}t jd d}d}d}t j|| ||||d}	t j||||||d}
t j||||||d}tdD ]
}t|	|
||d q3d S )	Nr   r   rG   rE   rL   rJ   d   r#   )rS   r   r   r   r   r   )r   r   r%   r#   r   rK   rZ   r   r[   r   r   r   rX   r\   r\   r]   test_flash_attn_cluster  s   r  )    (   ;   r   P   r   o   r      r      r   ))r'   r+   r*   r   r,   )r?   r   )a   r&  r   )   r'  r)   )  r(  )r:   r:   )r9   r9   )   r)  r>   r   c              	   C   s  d}t jd t jdt j|d}d}d}t j|| ||||dd}	t j||||||dd}
t j||||||dd}t jd	 t|	|
||d
}t |}t j	||	|
|f|\}}}d|d d | 
    }tdD ]Y}t jd	 t|	|
||d
}t ||sJ t j	||	|
|f|\}}}t j|||d}|std| d|d|| 
     t ||sJ t ||sJ |sJ qnd S )Nr   r   l       F r   <   rL   T)rK   r   Zrequires_grad*   r  rG   ru   i  )r  zIter z, dq_atol = z, dQ max diff: )rS   r   r   emptyZuint8r   r   r   r   r   r   r   r   r   r  r  r   )r   r   r%   r#   r   rK   dummyrZ   r   r   r   r   Zout0r   Zdq0Zdk0Zdv0r   r   r   r   r   r   Zdq_equalr\   r\   r]   test_flash_attn_race_condition  s2   
 *
r.  c                 C   s\   t j|dd}t || }t t |t |B t ||}|d|  d}||fS )z|
    out_partial: (num_splits, batch_size, seqlen, nheads, d)
    lse_partial: (num_splits, batch_size, nheads, seqlen)
    r   )dimrN   )	rS   Z	logsumexpexpwhereisinfisnanZ
zeros_liker	  r   )out_partiallse_partialr  scaler   r\   r\   r]   attention_combine_ref3  s
   $r7  )r   r   r   r   r   r9   seqlen)
r'   rG   r.   r  r   r   r0   r7   r<   r?   ry   )	r'   rG   r.   r      r  7   r&     c              	   C   s  t rt  d}tjd d}d}tj| d |||||tjdddd |  }tj| ||d ||tjddd	d d d d d d d |f }t	d
 || d d d |d f< t
|||d\}	}
t||\}}||}td|
|      td|
|      td|	|      td|	|      td||      td||      tj|
|dddsJ d}|	|    |||     kstj|	|dddsJ d S d S )Nr   r'   r   rE   rG   rJ   r.   rN   r   inf)Z	out_dtypezLSE max diff: zLSE mean diff: rz   r{   rv   rw   r  )r  r   )r   r   r   rS   r   r   r   rU   r   r   r   r7  rd   r   r   r   r   r   r  )ry   r8  r%   r   rK   rZ   r   r4  r5  r   r  r   Zlse_refr   Zmultipler\   r\   r]   test_flash_attn_combine?  s*   .@"
Jr=  returnc                   C   st   t jjjjjtdsJ t jjjjjtdsJ t jjj	jjtds*J t jjj
jjtds8J d S )Na[  flash_attn_3::fwd(Tensor q, Tensor k, Tensor v, Tensor(k_new!)? k_new=None, Tensor(v_new!)? v_new=None, Tensor? q_v=None, Tensor(out!)? out=None, Tensor? cu_seqlens_q=None, Tensor? cu_seqlens_k=None, Tensor? cu_seqlens_k_new=None, Tensor? seqused_q=None, Tensor? seqused_k=None, int? max_seqlen_q=None, int? max_seqlen_k=None, Tensor? page_table=None, Tensor? kv_batch_idx=None, Tensor? leftpad_k=None, Tensor? rotary_cos=None, Tensor? rotary_sin=None, Tensor? seqlens_rotary=None, Tensor? q_descale=None, Tensor? k_descale=None, Tensor? v_descale=None, float? softmax_scale=None, bool is_causal=False, int window_size_left=-1, int window_size_right=-1, int attention_chunk=0, float softcap=0., bool is_rotary_interleaved=False, Tensor? scheduler_metadata=None, int num_splits=0, bool? pack_gqa=None, int sm_margin=0) -> (Tensor(out!), Tensor, Tensor, Tensor)a(  flash_attn_3::bwd(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor(dq!)? dq=None, Tensor(dk!)? dk=None, Tensor(dv!)? dv=None, Tensor? cu_seqlens_q=None, Tensor? cu_seqlens_k=None, Tensor? seqused_q=None, Tensor? seqused_k=None, int? max_seqlen_q=None, int? max_seqlen_k=None, float? softmax_scale=None, bool is_causal=False, int window_size_left=-1, int window_size_right=-1, float softcap=0., bool deterministic=False, int sm_margin=0) -> (Tensor(dq!), Tensor(dk!), Tensor(dv!), Tensor, Tensor, Tensor, Tensor, Tensor)zflash_attn_3::fwd_combine(Tensor out_partial, Tensor lse_partial, Tensor(out!)? out=None, ScalarType? out_dtype=None) -> (Tensor(out!), Tensor)a(  flash_attn_3::get_scheduler_metadata(int batch_size, int max_seqlen_q, int max_seqlen_k, int num_heads, int num_heads_k, int headdim, int headdim_v, ScalarType qkv_dtype, Tensor seqused_k, Tensor? cu_seqlens_q=None, Tensor? cu_seqlens_k=None, Tensor? cu_seqlens_k_new=None, Tensor? seqused_q=None, Tensor? leftpad_k=None, int? page_size=None, int max_seqlen_k_new=0, bool is_causal=False, int window_size_left=-1, int window_size_right=-1, int attention_chunk=0, bool has_softcap=False, int num_splits=0, bool? pack_gqa=None, int sm_margin=0) -> Tensor)rS   opsZflash_attn_3ZfwddefaultZ_schemaZis_backward_compatible_withr   ZbwdZfwd_combiner   r\   r\   r\   r]   test_flash3_bw_compatibilitym  s   


rA  )r>  N)?osr  r   r   rS   Ztorch.nn.functionalnnZ
functionalFZtorch._Cr   Zeinopsr   r   Zflash_attn.layers.rotaryr   ImportErrorpaddingr   r   Z	test_utilr   r	   r
   Zflash_attn_interfacer   r   r   r   r   getenvr   r   ZDISABLE_PAGEDKVZDISABLE_APPENDKVr   ZDISABLE_SOFTCAPr   ZDISABLE_FP16r   Zget_device_capabilityZDISABLE_FP8ZDISABLE_HDIM64ZDISABLE_HDIM96ZDISABLE_HDIM128ZDISABLE_HDIM192ZDISABLE_HDIM256ZCOMPILED_HDIMSmarkZparametrizer   Zfloat16r   r   r   r  r  r  r.  r7  rU   r=  rA  r\   r\   r\   r]   <module>   s    $4 44 "t & $  P"#