o
    81 i0                     @   s|   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZmZ G dd dejZG dd dejZdS )	    N)	rearrange)index_first_axis	pad_inputunpad_input)convert_blockmaskflash_blocksparse_attn_funcc                       sD   e Zd ZdZ					d fdd	Z							dd	d
Z  ZS )FlashBlocksparseAttentiona{  Implement the scaled dot product attention with softmax.
    Arguments
    ---------
        softmax_temp: The temperature to use for the softmax attention.
                      (default: 1/sqrt(d_keys) where d_keys is computed at
                      runtime)
        attention_dropout: The dropout rate to apply to the attention
                           (default: 0.1)
    N           c           	         sn   t    tj|| _|| _|| _|d d d d }| j|}| 	d| t
| jdd}| 	d| d S )N      layoutF)causalblockmask_converted)super__init__hydrautilsZinstantiatesparsity_configsoftmax_temp	dropout_pZmake_layoutZregister_bufferr   r   )	selfr   r   attention_dropoutmax_seq_lengthdevicedtyper   r   	__class__ r/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/flash_attn/flash_blocksparse_attention.pyr      s   
	z"FlashBlocksparseAttention.__init__FTc	              
   C   st  |rJ |du s
J |j tjksJ |jsJ |du r|jd }	|jd }
|
d d d d }|d | jjd ksEJ |d | jjd k| jd|d d|d f }|du rt|d}|
}tjd|	d |
 |
tj|j	d}t
|||| jry| jnd|| j|d	}t|d
|	d}|dfS |j}|jd }t|d}t||\}}}}}t|dd|d}t
|||| jr| jnd|| j|d	}ttt|d||	|
d|d}|dfS |dusJ |}
|
d d d d }|d | jjd ksJ |d | jjd k| jd|d d|d f }|r!t
|||| jr| jnd|| j|d	}|dfS t
||| j| jr-| jnd|| j|dd}|dfS )a  Implements the multihead softmax attention.
        Arguments
        ---------
            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
            attn_mask: An implementation of BaseMask that encodes where each
                       query can attend to
            key_padding_mask: An implementation of BaseMask that encodes how
                         many query each sequence in the batch consists of
        Nr   r   r      zb s ... -> (b s) ...)stepr   r   r	   )softmax_scaler   z(b s) ... -> b s ...)bz b s three h d -> b s (three h d)z nnz (three h d) -> nnz three h d   threehznnz h d -> nnz (h d)zb s (h d) -> b s h d)r(   F)r"   r   convert_mask)r   torchZfloat16Zis_cudashaper   r   ZarangeZint32r   r   Ztrainingr   r   Zbool_matrixr   r   r   )r   qkv	attn_maskkey_padding_maskr   Z
cu_seqlensZmax_sneed_weightsr)   Z
batch_sizeZseqlenZseqlen_roundedZ	blockmaskoutputZkey_padding_mask_boolZnheadsxZx_unpadindices_Zoutput_unpadr   r   r   forward0   s   



	6

	&z!FlashBlocksparseAttention.forward)Nr	   r
   NN)NNFNNFT)__name__
__module____qualname____doc__r   r4   __classcell__r   r   r   r   r      s     r   c                       s<   e Zd Z							d	d fddZ	dd	d
Z  ZS )FlashBlocksparseMHATr	   Fr
   Nreturnc                    s   |sJ |	|
d}t    || _|| _|| _| j| dks"J d| j| | _| jdv s1J dtj|d| fd|i|| _t	|f||d|| _
tj||fd|i|| _d S )	N)r   r   r   z(self.kdim must be divisible by num_heads)r        @   z&Only support head_dim == 16, 32, or 64r%   bias)r   r   )r   r   	embed_dimr   	num_headsZhead_dimnnZLinearWqkvr   
inner_attnout_proj)r   r?   r@   r   r>   Zbatch_firstr   r   r   r   r   kwargsZfactory_kwargsr   r   r   r      s&   

zFlashBlocksparseMHA.__init__c           
      C   sH   |  |}t|dd| jd}| j|||| jd\}}	| t|d|	fS )Nz b s (three h d) -> b s three h dr%   r&   )r.   r/   r   zb s h d -> b s (h d))rB   r   r@   rC   r   rD   )
r   r1   Z
x_ignored_Zx_ignored_1_r-   r.   r/   r,   contextZattn_weightsr   r   r   r4      s   


zFlashBlocksparseMHA.forward)TTr	   Fr
   NN)r;   N)NNF)r5   r6   r7   r   r4   r9   r   r   r   r   r:      s    #r:   )mathr   r*   Ztorch.nnrA   Zeinopsr   Zflash_attn.bert_paddingr   r   r   Z+flash_attn.flash_blocksparse_attn_interfacer   r   Moduler   r:   r   r   r   r   <module>   s     