o
    )ir                  (   @   s  d dl Zd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlZddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddlm'Z'm(Z(m)Z)m*Z*m+Z+m,Z, dd	l-m.Z. d
Z/dZ0dZ1ej2j3du oej4j56 Z7dZ8ej9j:de;drddl<m=Z= ddl>m?Z? e?dure?j@AdZ/dZ0n~ej9:drd dlBZBd dlCZBeDeBjEdreBjEjFZ=neBjEjGZ=eBjHZ/dZIdZJeKdd e/Lddd D ZMeMeIk seMeJkrejNOdddkrePddQdd eID  d dQd!d eJD  d"e/ ddZ0ne7r#e.dd#du Z1ejRjST Z/e1 Z0dZ8e/d
krejUjVd$d%d&gd'd(ejWd)ejWd*ejWd+e	ejW d,e	ejW d-e	ejW d.eXd/eXd0eYd1eYd2eZd3eXd4eXd5eZd6e	ejW d7eejWejWejWf f d8d9Z[ejU\d$d:d; Z]ejUjVd<d%d&gd'd=eZd>ejWd(ejWd)ejWd*ejWd?ejWd@ejWd+ejWd,ejWd.eXd/eXd0eYd1eYd2eZd3eXd4eXdAejWd7eejWejWejWf f$dBdCZ^ejU\d<dDdE Z_d=eZd7eejWejWejWf fdFdGZ`	dedHe,dIeZdJeZd7ee,e	ejW eXe	ejW eXe	ejW f fdKdLZadMe	eejWef  d7eZfdNdOZbd7eZfdPdQZcdMe	eejWef  d7eeXeXf fdRdSZddTe,dUeee d7dfdVdWZfdXejWdYeedUeee d7dfdZd[Zge0fd@ejWdHe,d\eeXd]f d^eZd7ejWf
d_d`ZheG dadb dbe(ZieG dcdd dde'ZjdS )f    N)zip_longest)AnyIterableListOptionalSetTupleUnion   )get_operatorregister_operator   )AttentionBias&BlockDiagonalCausalFromBottomRightMask4BlockDiagonalCausalLocalAttentionFromBottomRightMask%BlockDiagonalCausalLocalAttentionMask/BlockDiagonalCausalLocalAttentionPaddedKeysMaskBlockDiagonalCausalMask*BlockDiagonalCausalWithOffsetGappyKeysMask+BlockDiagonalCausalWithOffsetPaddedKeysMaskBlockDiagonalGappyKeysMask)BlockDiagonalLocalAttentionPaddedKeysMaskBlockDiagonalMaskBlockDiagonalPaddedKeysMask!LocalAttentionFromBottomRightMask0LowerTriangularFromBottomRightLocalAttentionMask"LowerTriangularFromBottomRightMaskLowerTriangularMask/PagedBlockDiagonalCausalWithOffsetGappyKeysMask0PagedBlockDiagonalCausalWithOffsetPaddedKeysMaskPagedBlockDiagonalGappyKeysMask PagedBlockDiagonalPaddedKeysMaskVARLEN_BIASES)AttentionBwOpBaseAttentionFwOpBasecheck_lastdim_alignment_stride1Context	GradientsInputs)is_pt_flash_oldz0.0.0Fz..._C_flashattention)package   )_C_flashattention)_build_metadatavT
flash_attnflash_attn_cuda)r
      r   )r
      r   c                 c   s    | ]}t |V  qd S N)int).0s r7   c/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/xformers/ops/fmha/flash.py	<genexpr>M   s    r9   .Z#XFORMERS_IGNORE_FLASH_VERSION_CHECK01z#Requires Flash-Attention version >=c                 C      g | ]}t |qS r7   strr5   ir7   r7   r8   
<listcomp>R       rB   z,<=c                 C   r=   r7   r>   r@   r7   r7   r8   rB   S   rC   z	 but got )forcezxformers_flash::flash_fwdr7   cuda)Zmutates_argsZdevice_typesquerykeyvaluecu_seqlens_qcu_seqlens_k	seqused_kmax_seqlen_qmax_seqlen_kpsoftmax_scale	is_causalwindow_leftwindow_rightreturn_softmaxblock_tablesreturnc                 C   s   d}t r7tjjj| ||||||||
d|	|||d d}tr+|\}}}}}t||g}n|\}}}}}|||fS |d u r]|d u sAJ |d u sGJ t| ||d d ||	|
||||d \}}}}nt	| ||d |||d |d ||||	d|
||||d \}}}}|||fS )N        F)Zreturn_debug_maskscalewindow_size_leftwindow_size_rightrK   Zalibi_slopes)
_USE_PT_FLASH_ATTNtorchopsatenZ_flash_attention_forwardpt_flash_is_oldstackr,   ZfwdZ
varlen_fwd)rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   softcapret	attentionZ	logsumexpZphilox_seedZphilox_offset_	rng_stateoutsoftmax_lser7   r7   r8   
_flash_fwda   s   

rg   c                 C   s   t | }|d u r| j\}}}}|||g}n| j\}}}|jd d }tr*||g}n|||g}t j|| jt jd}t jdg| jt jd}|||fS )Nr   r   devicedtyper
   )r[   
empty_likeshapeVARLEN_LSE_PACKEDemptyri   float32Zint64)rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   re   BMHKZ	lse_shaperf   rd   r7   r7   r8   _flash_fwd_abstract   s   



rt   zxformers_flash::flash_bwdgrads_share_storagegradre   lserd   c                 C   s  d}t r9|dks
J |d urtr|d }|d }n| }}tjjj|||||||||	|
|||||||d\}}}nHt| |||\}}}|d u re|d u sMJ t|||||||||d ||||||dd | nt	|||||||||||d |	|
||d||||dd | |||fS )NrV   r   r   )rW   rX   rY   F)
rZ   r^   r[   r\   r]   Z_flash_attention_backward_create_dq_dk_dvr,   ZbwdZ
varlen_bwd)ru   rv   rF   rG   rH   re   rw   rI   rJ   rL   rM   rN   rO   rP   rQ   rR   rd   r`   Z
rng_state0Z
rng_state1dqdkdvr7   r7   r8   
_flash_bwd   s   

r|   c                 O   s   t | |||S r3   )rx   )ru   rv   rF   rG   rH   argskwargsr7   r7   r8   _flash_bwd_abstractM  s   
r   c                 C   s~   | r1t jg |jdd d|jd |jd R |j|jd}|dd|dd|ddfS t |t |t |fS )	Nr   r+   )rj   ri   r   r
   )r[   rn   rl   rj   ri   selectrk   )ru   rF   rG   rH   chunkr7   r7   r8   rx   Y  s   *"rx   inpsupports_mqause_kvsplitc              
   C   s  | j jdv sJ | j | j| j}}}|jd }|jd }|jd }|jd }	|jd }
| j}t|trR|jj	j
| j j
ks?J |jj	}|jj	}|jj}|jj}d }n;t|ttttfr{|jj	j
| j j
ksfJ |jj	}|jj	}|jj}|jj}|jj}nd }d }d }| j jd }| jjd }|jdkr|sJ dd }||}||}||}|r|jdkr|d	dkr|d	dkr|d d d d d df }|d d d d d df }|d ur(||| d|	g}||| d|	g}||| d|
g}t|ttfr(|jd |j }|j||jg|jdd  R  }|j||jg|jdd  R  }t||||| j| j| j| jd
}||||||fS )N)      r   r   r   r   c                 S   sN   |  ddkr| d d d d d d df S | | jd | jd d| jd gS )Nr+   r   r   r   r   )stridereshaperl   )xr7   r7   r8   fold  s   z#_convert_input_format.<locals>.foldr   r
   )rF   rG   rH   	attn_biasrN   rW   output_dtype
is_partial)rF   ndimrG   rH   rl   r   
isinstancer   	k_seqinfoZseqstartri   	q_seqinfo
max_seqlenr   r   r    r!   Zseqlenr   r   Z	page_sizeviewr(   rN   rW   r   r   )r   r   r   rF   rG   rH   batchZseqlen_qZ	seqlen_kvZ
head_dim_qZ
head_dim_vr   Zcu_seqlen_kZcu_seqlen_qrL   rM   rK   r   Z	num_pagesZnew_inpr7   r7   r8   _convert_input_formati  s   





	

*
  
r   r   c                 C   s"   t | ttttttttt	t
ttfS r3   )r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r7   r7   r8   
_is_causal  s    r   c                 C   s.   t | trt S t | ttfrtj  S dS )NT)
issubclassr!   rZ   r    r[   ZmtiaZis_available)Zattn_bias_typer7   r7   r8   _is_paged_attention_supported  s   
r   c                 C   sF   d}d}t | ttttfr| jd }t | ttfr| j}| j	}||fS )Nr   r   )
r   r   r   r   r   _window_sizer   r   rQ   rR   )r   win_left	win_rightr7   r7   r8   r     s(   
	r   dreasonsc                 C   s   t | jtr$t| jjj| jjjD ]\}}||kr!|d  d S qd S t | jtr=| j	j
d | jj
d kr?|d d S d S d S )NzIOnly support BlockDiagonalCausalMask if equal numbers of keys and queriesr   zCOnly support LowerTriangularMask if equal number ofkeys and queries)r   r   r   r   r   Zseqstart_pyr   appendr   rF   rl   rG   )r   r   Zk_startZq_startr7   r7   r8   _check_needs_no_topleft
  s$   	r   r   namec                 C   s   | j dkrG| d| d}}| jd dkrdS | jd dks$|dkr&dS ||| jd  krI|d| d	|   d
| dt| j  dS dS dS )zD
    We want to be able to collapse the G/H dimensions together
    r   r
   r+   r   Nr   r   zAGQA is only supported when the G/H dimensions are contiguous
    z
.stride:  z
    z
.shape :  )r   r   rl   r   list)r   r   r   Zstride_gZstride_hr7   r7   r8   _check_strides_for_bmghk   s(   
r   original_query_shape.varlen_lse_packedc                 C   s   t |jtst|dkr| d|dd S | S |r2t|dkr-| d|dd dS | dS |js7| S | dddjddd  }t|dkrT|d|dd S |S )Nr   r   r
   r   r   )Z	start_dim)	r   r   r"   lenZ	unflattenZ	unsqueezer   Zpermuteflatten)rw   r   r   r   Zlse_hkmr7   r7   r8   _post_process_lse2  s   
r   c                       s  e Zd ZU dZeddZdhZee e	d< dZ
ejejhZeej e	d< dZed	eeeeeeeeeeeeeee e!fZ"e#e$ e	d
< dd e"D Z"dZ%dZ&dZ'dZ(dZ)e*Z*e+r\de, dnde, Z-e,Z.e/de0de1e f fddZ2e/de0de3de4ej5e6e7 f fddZ8  Z9S )FwOpzOperator that computes memory-efficient attention using         `Flash-Attention <https://github.com/HazyResearch/flash-attention>`_         implementation.
    xformers_flashZ	flash_fwdrE   SUPPORTED_DEVICESr2   r   SUPPORTED_DTYPES   NSUPPORTED_ATTN_BIAS_TYPESc                 C   s   g | ]}t |r|qS r7   )r   )r5   br7   r7   r8   rB   s  s
    zFwOp.<listcomp>TFzfa2F@-ptr   rU   c                    s   t t| |}t|d|jd t|| t|jd| t|jd| t|jd| |j	rDt
sDt|jtrD|jj}|j|jkrD|d |S )NrF   r2   rG   rH   z,partial attention with heterogeneous queries)superr   not_supported_reasonsr%   rF   r   r   rG   rH   r   rm   r   r   r"   r   Z
min_seqlenr   r   )clsr   r   r   	__class__r7   r8   r     s    


zFwOp.not_supported_reasonsr   needs_gradientc                 C   s  d}|j j}g |j jd d |jjd }t|dd\}}}}}	}
|j  dkrh|j dkrht|j\}}t|jt	rA|jj
nd }| j|j |j|j|||
||	|j|jt|j||||d\}}}||}nBtj||j j|j jd}d }tjtrt|jtr|j jd |j jd |j jd	  gn|j jd |j jd |j jd	 g|j jtjd}|s|d fS t|t|||d
}|jdkrt|_||_||fS )NFr   Tr   r   )rQ   rR   rS   rT   rh   r
   r   )re   rw   rV   )rF   rl   rH   r   numelrG   r   r   r   r!   rT   OPERATORrN   scale_floatr   r   r[   Zzerosri   rj   rn   rm   r"   ro   r&   r   BwOpZop_bwrd   )r   r   r   rS   r   Z	out_shaperI   rL   rJ   rM   rK   r   r   rT   re   rf   rd   ctxr7   r7   r8   apply  sx   




& 	

z
FwOp.apply):__name__
__module____qualname____doc__r   r   r   r   r?   __annotations__CUDA_MINIMUM_COMPUTE_CAPABILITYr[   ZhalfZbfloat16r   rj   SUPPORTED_MAX_Ktyper   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r!   r   r   r   SUPPORTS_DROPOUTSUPPORTS_CUSTOM_SCALESUPPORTS_DIFFERENT_VALUE_EMBEDSUPPORTS_BMGHKZSUPPORTS_PARTIALrm   rZ   FLASH_VERSIONNAMEVERSIONclassmethodr(   r   r   boolr   Tensorr   r&   r   __classcell__r7   r7   r   r8   r   S  s\   
 
r   c                
       s   e Zd ZU ejZeddZejZejZej	Z	ej
Z
eeejeeeeeeehZee ed< ejZejZejZdZdZeZerGde  dnde  Z!e Z"dZ#e$de%d	e&e' f fd
dZ(e$de)de%de*j+d	e,fddZ-  Z.S )r   r   Z	flash_bwdr   Fzfa2B@r      r   rU   c                    s   t t| |}t|d|jd t|| |jjdkrGtj	
|j}|dv }t|jjd |jjd | jkrG|sG|jdkrG|d| j d |S )	NrF   r2   rE   )r   )	   r   r   rV   zdrequires a GPU with compute capability 8.0 (A100) or 9.0 (H100) for dropout when 'query.shape[-1] > ')r   r   r   r%   rF   r   ri   r   r[   rE   Zget_device_capabilitymaxrG   rl   MAX_HEADDIM_DROPOUT_SM8xrN   r   )r   r   r   Zdevice_capabilityZis_sm80_or_sm90r   r7   r8   r     s    
 
zBwOp.not_supported_reasonsr   r   rv   c                 C   s  |j j|jj|jj}}}t|dd\}}}}	}
}|d u s J |j}t|jtr9t	r9|jd dks4J |d }n|jd |ksBJ |d d d d d |f 
 }g |j jd d |jjd }|j| jv siJ |j  r|j rt|j\}}t| j|j||
 |j |j|j|j||||	||
|j|jt|j|||jdkr|jnd d }ntt|j t|jt|jd	}|j dkr|j  |j  |j dkr|j  |j||_|j||_|j||_|S )
NFr   r   r   r
   r   rV   )rQ   rR   rd   )ry   rz   r{   )rF   rl   rG   rH   r   rw   r   r   r"   rm   
contiguousrj   r   r   r   r'   r   Zqkv_share_storager   re   rN   r   r   rd   r[   Z
zeros_likery   rz   Zzero_r{   )r   r   r   rv   Zdq_shapeZdk_shapeZdv_shaperI   rL   rJ   rM   rK   Zctx_lseZkernel_out_shaper   r   Zgradsr7   r7   r8   r     sv   









z
BwOp.apply)/r   r   r   r   r   r   r   r   r   r   r   tuplesetr   
differencer   r   r   r   r   r   r!   r   r   r   r   r   r   ZIS_DETERMINISTICr   rm   rZ   r   r   r   r   r   r(   r   r?   r   r&   r[   r   r'   r   r   r7   r7   r   r8   r     s@   
 

&r   )F)kimportlib.util	importlibos	itertoolsr   typingr   r   r   r   r   r   r	   r[   commonr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   Ztorch_attention_compatr)   r   rm   r^   versionZhipbackendsrE   Zis_flash_attention_availableZ_TRY_PT_FLASH_ATTNrZ   util	find_spec__package__ r,   Z_cpp_libr-   Zflash_versionlstripr/   Zflash_attn.flash_attn_interfacehasattrZflash_attn_interfacer0   Zflash_attn_gpu__version__ZFLASH_VER_MINZFLASH_VER_LASTr   splitZflash_ver_parsedenvirongetImportErrorjoinnnrb   Z_get_flash_versionZlibraryZ	custom_opr   r4   floatr   rg   Zregister_fakert   r|   r   rx   r   r   r   r   r?   r   r   r   r   r   r7   r7   r7   r8   <module>   sh  $\ 
 
	

]
 	

d


 f

 

! 