o
    )iB                     @   s  d dl mZ d dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZddlmZmZ dd	lmZ dd
lmZmZmZmZmZmZmZmZmZmZmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' de(de)de)fddZ*de%de(fddZ+de%dee
ej, e
ej, e(e(f fddZ-de
eej,ef  de
ej, fddZ.de	e/ de
eej,ef  ddfddZ0G dd de(eZ1de
eej,ef  de(fd d!Z2eG d"d# d#e!Z3eG d$d% d%e Z4dS )&    )replace)Enum)partial)AnyIterableListOptionalSetTupleUnionN   )get_operatorregister_operator   	attn_bias)AttentionBiasAttentionBiasSubTensor4BlockDiagonalCausalLocalAttentionFromBottomRightMask%BlockDiagonalCausalLocalAttentionMaskBlockDiagonalCausalMask+BlockDiagonalCausalWithOffsetPaddedKeysMaskBlockDiagonalMask0LowerTriangularFromBottomRightLocalAttentionMask"LowerTriangularFromBottomRightMaskLowerTriangularMask!LowerTriangularMaskWithTensorBias)_attn_bias_applyAttentionBwOpBaseAttentionFwOpBasecheck_lastdim_alignment_stride1Context	GradientsInputs)is_pt_cutlass_compatiblesmis_halfreturnc                 C   s   | dkrdS | dkr|S dS )NP   TF   F )r%   r&   r*   r*   e/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/xformers/ops/fmha/cutlass.py_uses_tensorcores)   s
   r,   inpc                 C   s   | j jdkrdS tj| j }|d d |d  }tjdtjdtjdi| jj	 }t
||dk}d}|dkr7d}|r@t|d	| }|S )
Ncudar   r   
          r(         )devicetypetorchr.   get_device_capabilityfloathalfbfloat16querydtyper,   max)r-   capr%   Zbits_per_scalarZuses_tensorcoresmatmul_alignment_mnr*   r*   r+   _minimum_gemm_alignment1   s   r@   c                 C   sh   | j }t|ttfr&|jjj| jjksJ |jj}|jj}|jj	}|jj	}nd }d }d}d}||||fS )N)
r   
isinstancer   r   	k_seqinfoZseqstartr4   r;   Z	q_seqinfoZ
max_seqlen)r-   r   
seqstart_k
seqstart_qmax_seqlen_qmax_seqlen_kr*   r*   r+   _get_seqlen_infoB   s   
rH   r   c                 C   s2   t | trt | tr| jS d S t | tjr| S d S N)rB   r   r   Z
_subtensorr6   Tensorr   r*   r*   r+   _get_tensor_biasW   s   

rK   reasonsc                 C   s   t |}|d urQdt|jj }d}t|jd D ]}||| dkr5| d| d|  d d}q|r=| d	 |d
dkrS| d|  d d S d S d S )Nr3   Fr   r   zattn_bias.stride(-2) % z != 0 (attn_bias.stride() = )TzHINT: To use an `attn_bias` with a sequence length that is not a multiple of 8, you need to ensure memory is aligned by slicing a bigger tensor. Example: use `attn_bias = torch.zeros([1, 1, 5, 8])[:,:,:,:5]` instead of `torch.zeros([1, 1, 5, 5])`rA   z/attn_bias.stride(-1) > 1 (attn_bias.stride() = z/) - you should call `.contiguous()` on the bias)	rK   r6   Zfinfor<   bitsrangendimZstrideappend)rL   r   attn_bias_tensorZ	alignmentZshow_padding_hintdr*   r*   r+   _check_bias_alignmentb   s*   rT   c                   @   s   e Zd ZdZdZdZdZdS )_CustomMaskTypez*
    (Matches CustomMaskType in C++.)
    r   r   r   N)__name__
__module____qualname____doc__NoCustomMaskCausalFromTopLeftCausalFromBottomRightr*   r*   r*   r+   rU   ~   s
    rU   biasc                 C   sD   t | tttfrttjS t | ttt	j
ttfrttjS ttjS rI   )rB   r   r   r   intrU   r[   r   r   r   &BlockDiagonalCausalFromBottomRightMaskr   r   r\   rZ   )r]   r*   r*   r+   _custom_mask_type   s&   



r`   c                       s*  e Zd ZU dZe reddndZdhZee	 e
d< ejejejhZeej e
d< dZedejeeeeeeeejejefZe e! e
d	< d
Z"d
Z#d
Z$d
Z%dZ&dZ'g dZ(e)e* e
d< e+de,de-de.eje/e0 f fddZ1e+de,de-de.eje/e0 f fddZ2e+de,de)e	 f fddZ3  Z4S )FwOpzxFormers' MHA kernel based on CUTLASS.
    Supports a large number of settings (including without TensorCores, f32 ...)
    and GPUs as old as P100 (Sm60)
    atenZ_efficient_attention_forwardNr.   SUPPORTED_DEVICESSUPPORTED_DTYPESi   SUPPORTED_ATTN_BIAS_TYPESTFzcutlassF-ptr0   r3      _TEST_Kr-   needs_gradientr'   c                    sj  t  jtjvrtd jjdv r| j |dS  jjdks(J d jj d } jjdkr{ jjd dkr{t	t
jdd}t | j| j| jt jt	t
jd	dd
 | j |d\}}|d}|d urwt||jd|d}||fS  jjd	 }t
j }|g fddt|d D  }g }	t|D ]`\}
}|| t
j|I  jd d d d |
f } jd d d d |
f } jd d d d |
f }t jt	t
jd|
d}|	| jt ||||d
|d W d    n1 sw   Y  q|dd  D ]}|| qt
jdd |	D d	d}|r1t|t
jdd |	D dd|	d d jd}||fS )NUnsupported attn_bias type)   r2   )ri      zquery has shape rk   r   )dimr   )r;   keyvaluer   )lseoutc                    s   g | ]}t jj jjd qS ))r4   )r6   r.   ZStreamr;   r4   ).0_r-   r*   r+   
<listcomp>   s    zFwOp.apply.<locals>.<listcomp>)rm   indexc                 S   s   g | ]}|d  qS )r   r*   rr   or*   r*   r+   ru     s    c                 S   s   g | ]}|d  j qS )r   )rp   rw   r*   r*   r+   ru     s    r   )rq   rp   op_bw)r5   r   ra   re   NotImplementedErrorr;   rP   
apply_bmhkshaper   r6   Zsqueezer   rn   ro   r   Z	unsqueezerp   r.   Zcurrent_streamrO   	enumerateZwait_streamstreamselectrQ   stackr!   ry   )clsr-   ri   ctxZslice_oprq   Zn_groupsZmain_streamstreamsZoutsgroupr~   r;   rn   ro   r]   sr*   rt   r+   apply   sn   	



z
FwOp.applyc                 C   s   t |jtjvrtdt|\}}}}| j|j|j|j	t
|j|||||j|t|j|jt|jtr8|jjjnd t|jtttfrF|jjnd d\}}}	}
}}d }|rgt||d}|jdkrg|	|
f|_t|_||fS )Nrj   )r;   rn   ro   r]   cu_seqlens_qcu_seqlens_krF   rG   	dropout_pZcompute_log_sumexpcustom_mask_typescaleZseqlen_kwindow_size)rq   rp   r   )r5   r   ra   re   rz   rH   OPERATORr;   rn   ro   rK   pr`   r   rB   r   rC   Zseqlenr   r   r   _window_sizer!   	rng_stateBwOpry   )r   r-   ri   rD   rE   rF   rG   rq   rp   rng_seed
rng_offsetrs   r   r*   r*   r+   r{   
  sL   
	!

zFwOp.apply_bmhkrS   c                    sH   t t| |}t|}t|d|j| t|d|j| t||j |S )Nr;   ro   )	superra   not_supported_reasonsr@   r    r;   ro   rT   r   )r   rS   rL   r?   	__class__r*   r+   r   =  s   zFwOp.not_supported_reasons)5rV   rW   rX   rY   r$   r   r   rc   r	   str__annotations__r6   r8   r9   r:   rd   r<   SUPPORTED_MAX_Kr5   rJ   r   r   r   r   r   r   r   r   r_   r   r   re   r   r   SUPPORTS_DROPOUTSUPPORTS_CUSTOM_SCALESUPPORTS_DIFFERENT_VALUE_EMBEDZSUPPORTS_BMGHKVARLEN_LSE_PACKEDNAMErh   r   r^   classmethodr#   boolr
   r   r!   r   r{   r   __classcell__r*   r*   r   r+   ra      s^   
 ?2$ra   c                	       s   e Zd ZU ejZe reddndZejZej	Z	ej
Z
edejeeeeejejfZee ed< dZejZejZejZdZdZg dZe e! ed	< e"d
e#de e$ f fddZ%e"de&de#dejde'fddZ(  Z)S )r   rb   Z_efficient_attention_backwardNre   TFzcutlassB-ptrf   rh   rS   r'   c                    s  t t| |}t|}t|d|j| t|d|j| t|d|j| t||j	 t
|j	}|d ur|jr|jjdkrS|jdkrSg |jjd d |jjd R }n|jjd |jjdkrd|jjd nd|jjd |jjd f}t|j|kr|d	t|j d
| d |S )Nr;   rn   ro   rk   r   r   r   r2   z=Broadcasting the `attn_bias` tensor is not supported (shape: z/ expected: rM   )r   r   r   r@   r    r;   rn   ro   rT   r   rK   requires_gradrP   r|   tuplerQ   )r   rS   rL   r?   rR   Zexpected_bias_shaper   r*   r+   r   p  s0   
$


zBwOp.not_supported_reasonsr   r-   gradc                 C   s2  t |jtjvrtdt|\}}}}|jj}t	  }	}
|j
dkr/|jd us*J |j\}	}
t|j}tj|jjdk}| j|||j|j|jf||d urS|jnd|||||jd|d|j||j
|	|
t|j|jd t|jtttfr{|jjnd d\}}}}t|jtj	r|jjsd }t||||dS )	Nrj   g        )   rl   Fr0   )force_pad_inf)r]   Zbias_requires_gradr   r   rF   rG   Z	logsumexprq   r   Zphilox_seedZphilox_offsetr   r   Znum_splits_keyr   )ZdqZdkZdvdb)r5   r   r   re   rz   rH   r;   r<   r6   rJ   r   r   rK   r.   r7   r4   r   torn   ro   r   Zget_padded_lserq   r`   r   rB   r   r   r   r   r"   )r   r   r-   r   rD   rE   rF   rG   r<   r   r   Ztensor_biasr   Zgrad_qZgrad_kZgrad_vZ	grad_biasr*   r*   r+   r     s\   




	*z
BwOp.apply)*rV   rW   rX   ra   rY   r$   r   r   rc   rd   r   r5   r6   rJ   r   r   r   r   r   r_   r   re   r   r   r   ZSUPPORTS_ATTN_BIAS_GRADr   r   r   r   r   rh   r   r^   r   r#   r   r   r!   r"   r   r   r*   r*   r   r+   r   G  s:   
 &r   )5dataclassesr   enumr   	functoolsr   typingr   r   r   r   r	   r
   r   r6   commonr   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   Ztorch_attention_compatr$   r^   r   r,   r@   rJ   rH   rK   r   rT   rU   r`   ra   r   r*   r*   r*   r+   <module>   sH   $4$	


 
 '