o
    )ixE                  	   @   s  d dl mZ d dlmZ d dlmZmZmZmZm	Z	m
Z
mZmZ d dlZddlmZmZ ddlmZ dd	lmZmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z# dd
lm$Z$m%Z%m&Z&m'Z'm(Z(m)Z) de)de*fddZ+de)dee	ej, e	ej, e	ej, e*e*f fddZ-de	eej,ef  de	ej, fddZ.dee/ de	eej,ef  ddfddZ0G dd de*eZ1de	eej,ef  de*fddZ2eG dd de%Z3eG dd de$Z4dS )     )replace)Enum)AnyIterableListMappingOptionalSetTupleUnionN   )get_operatorregister_operator   	attn_bias)AttentionBiasAttentionBiasSubTensor4BlockDiagonalCausalLocalAttentionFromBottomRightMask%BlockDiagonalCausalLocalAttentionMaskBlockDiagonalCausalMask*BlockDiagonalCausalWithOffsetGappyKeysMask+BlockDiagonalCausalWithOffsetPaddedKeysMaskBlockDiagonalGappyKeysMaskBlockDiagonalMaskBlockDiagonalPaddedKeysMask0LowerTriangularFromBottomRightLocalAttentionMask"LowerTriangularFromBottomRightMaskLowerTriangularMask!LowerTriangularMaskWithTensorBias0PagedBlockDiagonalCausalWithOffsetPaddedKeysMaskPagedBlockDiagonalGappyKeysMask PagedBlockDiagonalPaddedKeysMask)AttentionBwOpBaseAttentionFwOpBasecheck_lastdim_alignment_stride1Context	GradientsInputsinpreturnc                 C   s   dS )Nr    )r)   r+   r+   `/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/xformers/ops/fmha/ck.py_minimum_gemm_alignment,   s   r-   c                 C   s   | j }t|tttttfr9|j| j	j
 |j| j	j
 |jj}|jj}|jj}|jj}t|tr4d n|jj}n
d }d }d}d}d }t|trT|jjd d }|| }|||||fS )N)r   
isinstancer   r   r   r"   r!   Z	k_seqinfotoquerydeviceZ	q_seqinfoZseqstartZ
max_seqlenseqlen)r)   r   
seqstart_k
seqstart_qmax_seqlen_qmax_seqlen_kr3   r+   r+   r,   _get_seqlen_info0   s:   

r8   r   c                 C   s2   t | trt | tr| jS d S t | tjr| S d S N)r/   r   r   Z
_subtensortorchTensorr   r+   r+   r,   _get_tensor_biasY   s   

r<   reasonsc                 C   s   t |}|d urQdt|jj }d}t|jd D ]}||| dkr5| d| d|  d d}q|r=| d	 |d
dkrS| d|  d d S d S d S )N   Fr   r   zattn_bias.stride(-2) % z != 0 (attn_bias.stride() = )TzHINT: To use an `attn_bias` with a sequence length that is not a multiple of 8, you need to ensure memory is aligned by slicing a bigger tensor. Example: use `attn_bias = torch.zeros([1, 1, 5, 8])[:,:,:,:5]` instead of `torch.zeros([1, 1, 5, 5])`r.   z/attn_bias.stride(-1) > 1 (attn_bias.stride() = z/) - you should call `.contiguous()` on the bias)	r<   r:   Zfinfodtypebitsrangendimstrideappend)r=   r   attn_bias_tensorZ	alignmentZshow_padding_hintdr+   r+   r,   _check_bias_alignmentd   s*   rH   c                   @   s   e Zd ZdZdZdZdZdS )_CustomMaskTypez*
    (Matches CustomMaskType in C++.)
    r   r   r   N)__name__
__module____qualname____doc__NoCustomMaskCausalFromTopLeftCausalFromBottomRightr+   r+   r+   r,   rI      s
    rI   biasc                 C   sF   t | tttfrttjS t | ttt	j
tttfrttjS ttjS r9   )r/   r   r   r   intrI   rO   r   r   r   &BlockDiagonalCausalFromBottomRightMaskr   r   r    rP   rN   )rQ   r+   r+   r,   _custom_mask_type   s(   


rT   c                       sx  e Zd ZU dZeddZdhZee e	d< e
je
jhZee
j e	d< dZede
jeeeeeeeeeeejejee e!e"fZ#e$e% e	d	< d
Z&d
Z'd
Z(d
Z)d
Z*dZ+e
j,de
jde
jdiZ-e.e
je,f e	d< e
j,de
jde
jdiZ/e.e
je,f e	d< g dZ0e1e2 e	d< e3de4de5de6e
je7e8 f fddZ9e3de4de5de6e
je7e8 f fddZ:e3de4de1e f fddZ;  Z<S ) FwOpz0xFormers' MHA kernel based on Composable Kernel.xformersZefficient_attention_forward_ckcudaSUPPORTED_DEVICESSUPPORTED_DTYPES   NSUPPORTED_ATTN_BIAS_TYPESTZckFga2U0*3?g~jtx?gy&1?
ERROR_ATOLgh㈵>g~jth?g{Gz?
ERROR_RTOL)    `   r>      rZ   _TEST_Kr)   needs_gradientr*   c                 C   s@  t |jtjvrtd|jjdv rtd|jjdv r#| j||dS |jjdks2J d|jj d }|j	
 d d	kr|j
 d d	ksJJ d
|j	 }|j	
 }|j	|d	 |d |d |d f|d	 |d |d |d f}|j }|j
 }|j|d	 |d |d |d f|d	 |d |d |d f}	n|j	dd}|jdd}	|jj\}
}
}}}
|j}t|jtrt|j}|d ur|jdkrt|dd}nt|jtjr|jjdkr|jdd}t||jdd||	|d}| j||d\}}|d||f}|d ur|jd||f}t|||d}||fS )NUnsupported attn_bias type)r   r      z Unsupported number of dimensions)   )rb      zquery has shape rd   r   z0key and value should be expanded in the same wayr   r   re   )r1   keyvaluer   )lseout)typer   rU   r[   NotImplementedErrorr1   rC   
apply_bmhkshaperg   rD   rh   sizeZ
as_stridedflattenr/   r   r<   r:   r;   r   Z	unflattenri   )clsr)   rb   ctxZk_shapeZk_striderg   Zv_shapeZv_striderh   _GZHqZattn_bias_replaceZbias_tensorrj   ri   r+   r+   r,   apply   sd   






z
FwOp.applyc                 C   s  t |jtjvrtdt|\}}}}}| j|j|j|j	t
|j||||j|t|j|j|t|jtttfr;|jjnd t|jttfrH|jjnd t|jttfrU|jjnd d\}}	}
}d }|rt||	|jdkrktnd d}|jdkrtj|
|gtjdd|_||fS )Nrc   )r1   rg   rh   r   r5   r4   r6   	dropout_pZcompute_logsumexpcustom_mask_typescaleseqlen_kwindow_sizeblock_tables	page_sizer   )rj   ri   Zop_bwcpu)r@   r2   )rk   r   rU   r[   rl   r8   OPERATORr1   rg   rh   r<   prT   rx   r/   r   r   r   _window_sizer"   r!   r{   r|   r&   BwOpr:   Ztensorint64	rng_state)rq   r)   rb   r4   r5   ry   r6   rs   rj   ri   rng_seed
rng_offsetrr   r+   r+   r,   rm     sl   
	

1
zFwOp.apply_bmhkrG   c                    sH   t t| |}t|}t|d|j| t|d|j| t||j |S )Nr1   rh   )	superrU   not_supported_reasonsr-   r%   r1   rh   rH   r   )rq   rG   r=   matmul_alignment_mn	__class__r+   r,   r   ]  s   zFwOp.not_supported_reasons)=rJ   rK   rL   rM   r   r~   rX   r	   str__annotations__r:   ZhalfZbfloat16rY   r@   SUPPORTED_MAX_Krk   r;   r   r   r   r   r   r   r   r   r   r   r   rS   r   r   r"   r    r!   r[   r   r   SUPPORTS_DROPOUTSUPPORTS_CUSTOM_SCALESUPPORTS_DIFFERENT_VALUE_EMBEDZSUPPORTS_PARTIALZSUPPORTS_BMGHKNAMEfloatr\   r   r]   ra   r   rR   classmethodr(   boolr
   r   r&   ru   rm   r   __classcell__r+   r+   r   r,   rU      st   
 
9G$rU   c                	       s   e Zd ZU ejZeddZejZejZdZ	e
dejeeeeeejejf	Zee ed< dZejZejZejZdZdZg dZe e! ed	< e"d
e#de e$ f fddZ%e"de&de#dejde'fddZ(  Z)S )r   rV   Zefficient_attention_backward_ckr`   Nr[   TZckB)r^   @   r_   r>   r`   ra   rG   r*   c                    s  t t| |}t|}t|d|j| t|d|j| t|d|j| t||j	 t
|j	}|d ur|jr|jjdkrS|jdkrSg |jjd d |jjd R }n|jjd |jjdkrd|jjd nd|jjd |jjd f}t|j|kr|d	t|j d
| d |S )Nr1   rg   rh   rd   r   r   r   re   z=Broadcasting the `attn_bias` tensor is not supported (shape: z/ expected: r?   )r   r   r   r-   r%   r1   rg   rh   rH   r   r<   requires_gradrC   rn   tuplerE   )rq   rG   r=   r   rF   Zexpected_bias_shaper   r+   r,   r     s0   
$


zBwOp.not_supported_reasonsrr   r)   gradc                 C   s8  t |jtjvrtdt|\}}}}}|jj}	d }
}|jdkrJ|j	d u s;|j	jt
jks;|j	jj dks;|j	jdkrCtd|j	 |j	 \}
}| j||	|j|j|jft|j||||||j|j|	|j|
|t|j|jt|jtttfr~|jjnd d\}}}}t|jt
jr|jjsd }t||||dS )	Nrc   r   g        r}   )r   zInvalid rng_state: )r   r5   r4   r6   r7   ry   Z	logsumexpoutputrv   r   r   rw   rx   rz   )ZdqZdkZdvdb) rk   r   r   r[   rl   r8   r1   r@   r   r   r:   r   r2   rn   tolistr~   r0   rg   rh   r<   ri   rj   rT   rx   r/   r   r   r   r   r;   r   r'   )rq   rr   r)   r   r4   r5   ry   r6   r7   r@   r   r   Zgrad_qZgrad_kZgrad_vZ	grad_biasr+   r+   r,   ru     sb   



	'z
BwOp.apply)*rJ   rK   rL   rU   rM   r   r~   rX   rY   r   rk   r:   r;   r   r   r   r   r   r   rS   r   r[   r   r   r   ZSUPPORTS_ATTN_BIAS_GRADr   r   r   ZSUPPORTS_UNPADDED_LSEr   ra   r   rR   r   r(   r   r   r&   r'   ru   r   r+   r+   r   r,   r   g  s6   
 
 &r   )5dataclassesr   enumr   typingr   r   r   r   r   r	   r
   r   r:   commonr   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   rR   r-   r;   r8   r<   r   rH   rI   rT   rU   r   r+   r+   r+   r,   <module>   sF   (L 

)

 
 D