from __future__ import annotations

from typing import TYPE_CHECKING, Literal, overload

import numpy as np

import paddle
from paddle import _C_ops
from paddle.base.framework import in_dynamic_or_pir_mode
from paddle.base.layer_helper import LayerHelper
from paddle.base.wrapped_decorator import signature_safe_contextmanager
from paddle.device.cuda import get_device_capability

g_enable_math = None
g_enable_flash = None
g_enable_mem_efficient = None

if TYPE_CHECKING:
    from collections.abc import Generator

    from paddle import Tensor


def _get_arch_info():
    # Get the SM version of the current CUDA (or ROCm) device.
    cuda_version = paddle.version.cuda()
    if (
        cuda_version is not None and cuda_version != 'False'
    ) or paddle.is_compiled_with_rocm():
        major, minor = get_device_capability()
        arch = int(major) * 10 + int(minor)
        return arch
    else:
        raise ValueError(
            "Paddle is not compiled with CUDA, we cannot get SMVersion from device, "
            "please try to compile Paddle with CUDA"
        )

def check_flash_head_dim_constraints(query, dropout_p=0.0):
    # On SM86-SM89 devices the flash attention backward pass has extra
    # head_dim limits; report whether flash attention can be used here.
    arch = _get_arch_info()
    is_sm86_to_sm89 = 86 <= arch <= 89
    if not is_sm86_to_sm89:
        return True

    head_dim = query.shape[3]
    requires_grad = not query.stop_gradient
    if not requires_grad:
        return True

    is_head_dim_gt192 = head_dim > 192
    is_head_dim_lte224 = head_dim <= 224
    is_dropout = dropout_p > 0

    cond1 = is_head_dim_gt192 and is_head_dim_lte224
    cond2 = head_dim > 224 and is_dropout
    if cond1 or cond2:
        return False
    return True

def check_flash_causal_non_square_seqlens(query, key, is_causal):
    # Causal flash attention is only taken when seqlen_q equals seqlen_k;
    # otherwise the caller falls back to another implementation.
    if not is_causal:
        return True
    seqlen_q = query.shape[1]
    seqlen_k = key.shape[1]
    if seqlen_q == seqlen_k:
        return True
    return False

r)   c                 C  s8   t  }| j}|dkrtjtjg}||v S tjg}||v S )NP   )r   dtyper   Zfloat16Zbfloat16)r"   debugr   r+   Zsupported_dtypesr   r   r   check_dtypes_low_precision[   s   r-   returnboolc                 C  sl   dt  vrdS | jdkrdS | jd dkrdS t dk rdS t| |s&dS t| ||s.dS t| s4dS dS )NgpuF   r      r*   T)r   
get_devicendimr    r   r%   r)   r-   )r"   r'   	attn_maskdropoutr(   r   r   r   can_use_flash_attng   s   	


r7   c                 C  s:   dt  vrdS t dk rt dkrdS | jdkrdS dS )Nr0   F2   Z   r1   T)r   r3   r   r4   )r"   r   r   r   can_use_efficient   s   
r:   Tenable_mathenable_flashenable_mem_efficientGenerator[None, None, None]c                 c  sB    t }t }t}| a |a|azdV  W |a |a|adS |a |a|aw )z
    With the sdp_kernel context manager, different algorithm implementations can
    be selected for scaled_dot_product_attention.
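
    Examples:
        .. code-block:: python

            >>> # A minimal usage sketch; which backends are actually available
            >>> # depends on the device and on how Paddle was compiled.
            >>> # doctest: +SKIP('requires a CUDA device')
            >>> import paddle
            >>> import paddle.nn.functional as F
            >>> q = paddle.rand((1, 128, 2, 16), dtype='float16')
            >>> with paddle.nn.functional.flash_attention.sdp_kernel(
            ...     enable_math=False, enable_flash=True, enable_mem_efficient=False
            ... ):
            ...     out = F.scaled_dot_product_attention(q, q, q, None, 0.0, True)
            >>> # doctest: -SKIP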
    N)g_enable_mathg_enable_mem_efficientg_enable_flash)r;   r<   r=   Zoriginal_enable_mathZoriginal_enable_flashZoriginal_enable_mem_efficientr   r   r   
sdp_kernel   s   rB   xr   c                 C  s*   t | d}d|_t j|dd}d|_|S )Ng     T   )Zdiagonal)r   Z	full_liker!   Ztriu)rC   maskr   r   r   get_triangle_upper_mask   s
   rF   .r"   r'   valuerE   dropout_ratefloatcausalreturn_softmaxLiteral[False]trainingtuple[Tensor, None]c                 C     d S Nr   r"   r'   rG   rE   rH   rJ   rK   rM   r   r   r   _math_attention      
rR   Literal[True]tuple[Tensor, Tensor]c                 C  rO   rP   r   rQ   r   r   r   rR      rS   tuple[Tensor, Tensor | None]c                 C  rO   rP   r   rQ   r   r   r   rR      rS   c                 C  s   | j d }t| g d} t|g d}t|g d}tj| |d  |dd}	|dur1|	| }	|s9t|	}
nt }d|v rOt|	}|	| }	t|	}
ntj	|	}
|dkrbtj
|
||d	d
}
t|
|}t|g d}||rv|
fS dfS )z
    This is a basic implementation of scaled dot product attention composed of
    combinations of fundamental components.
    r   r      rD            T)rC   yZtranspose_yNxpur   Zupscale_in_train)rM   mode)r    r   	transposematmulFsoftmaxr3   rF   ZincubateZ softmax_mask_fuse_upper_triangler6   )r"   r'   rG   rE   rH   rJ   rK   rM   r$   productweightsplaceoutr   r   r   rR      s,   
r$   r   strc                 C  s   | dkrdS dS )Nr2   
flash_attnmem_efficientr   )r$   r   r   r   _select_sdp_cuda  s   ri   c                 C  s   t  }d|v r
dS d|v rdS d|v rdS tdu r$d|vr dS t| S tdu r4tdu r4tdu r4td	td
u rHtdu rBtdu rBdS d|vrHdS td
u rTtd
u rTt| S td
u rZdS dS )z
    There are currently three different implementation options available for
    scaled dot product attention, and the chosen approach depends on whether it
    is determined by the sdp_kernel configuration or specified through input values.
    r\   rg   iluvatar_gpu	metax_gpuNr0   mathF@No available backend for scaled_dot_product_attention was found.Trh   )r   r3   rA   ri   r?   r@   AssertionError)r$   rd   r   r   r   _select_sdp  s6   ro   c           	      C  s   t  }d|v r
dS d|v rdS d|v rdS tdu r<tdu r<tdu r<t| ||||}t| }d}|r4dS |r8dS |r<dS td	u rLtd	u rLtd	u rLtd
tdu r`td	u rZtd	u rZdS d|vr`dS tdu rotdu rot| j	d S tdu rudS dS )z9
    This sdp selection logic is aligned with the torch version.
    r\   rg   rj   rk   NTrh   rl   Frm   r0   r   )
r   r3   rA   r?   r@   r7   r:   rn   ri   r    )	r"   r'   r5   r6   r(   rd   Z	use_flashZuse_efficientZuse_mathr   r   r   _select_sdp_for_sdpa:  sJ   
rp   )fixed_seed_offsetrng_namerM   namer6   rq   Tensor | Nonerr   rs   
str | Nonec          
      C  rO   rP   r   
r"   r'   rG   r6   rJ   rK   rq   rr   rM   rs   r   r   r   flash_attentionp     rw   c          
      C  rO   rP   r   rv   r   r   r   rw     rx   c          
      C  rO   rP   r   rv   r   r   r   rw     rx    )rq   rr   rM   rs   softmax_scalec                C  s  | j d }t|}|dkr/dt v rd}ndt v r d}ntdgd r+d}n
tjjdgd }t s@|dks@J d|d	ksL|dksLJ d
|rV|dksVJ d|du sb|dksbJ d|dksn|dksnJ d|sx|dksxJ d|	du s|dksJ d|
du s|dksJ dt r|dkrt	| |||d|||| |
\}}}}||r|fS dfS |dkr|
du r| j d d }
t
| ||dddd|
|ddd	dddd\}}|dfS td| td#i t }|jdd}||}||}|tj}|tj}| |||d}||||d}|jd|||||| |dd ||r,|fS dfS |dkrIdd lm} || ||d|d|d!}|dfS t| ||||||d"S )$a>  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.

    Args:
        query(Tensor): The query tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        value(Tensor): The value tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        dropout(float): The dropout ratio.
        causal(bool): Whether to enable causal mode.
        return_softmax(bool): Whether to return softmax.
        fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
        training(bool): Whether it is in the training phase.
        rng_name(str): The name to select Generator.
        name(str|None, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        out(Tensor): The attention tensor.
                    4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
        softmax(Tensor): The softmax tensor. None if return_softmax is False.

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> paddle.seed(2023)
            >>> q = paddle.rand((1, 128, 2, 16))

            >>> output = paddle.nn.functional.flash_attention.flash_attention(q, q, q, 0.9, False, False)
            >>> print(output)
            (Tensor(shape=[1, 128, 2, 16], dtype=float32, place=Place(cpu), stop_gradient=True,
            [[[[0.34992966, 0.34456208, 0.45826620, ..., 0.39883569,
                0.42132431, 0.39157745],
               [0.76687670, 0.65837246, 0.69117945, ..., 0.82817286,
                0.76690865, 0.71485823]],
              ...,
              [[0.71662450, 0.57275224, 0.57053083, ..., 0.48108247,
                0.53336465, 0.54540104],
               [0.59137970, 0.51350880, 0.50449550, ..., 0.38860250,
                0.40526697, 0.60541755]]]]), None)
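            >>> # A causal sketch (assumes a CUDA build with float16 support; the
            >>> # call is otherwise identical to the example above):
            >>> # doctest: +SKIP('requires a CUDA device with flash attention support')
            >>> q = paddle.rand((1, 128, 2, 16), dtype='float16')
            >>> out, _ = paddle.nn.functional.flash_attention.flash_attention(
            ...     q, q, q, dropout=0.0, causal=True
            ... )
            >>> print(out.shape)
            [1, 128, 2, 16]
            >>> # doctest: -SKIP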

    rY   rg   r\   rX   rj   FLAGS_cudnn_deterministicFLAGS_flash_attn_versionz2flash attention 3 only support dynamic or pir moder   z*flash attention 3 does not support dropoutz1flash attention 3 does not support return softmaxNz6flash attention 3 does not support setting seed_offsetry   z3flash attention 3 does not support setting rng_namez3flash attention 3 does not support setting trainingz/flash attention 3 does not support setting namez8flash attention 2 does not support setting softmax_scaler   rZ   rD   Fr   !Invalid flash attention version: qZinput_param_name)r~   kvrq   re   ra   softmax_lseseed_offsetr6   rJ   rK   is_testrr   typeinputsoutputsattrsrh   memory_efficient_attentionZ	attn_biaspscalerM   rH   rJ   rK   rM   rg   )r    ro   r   r3   	get_flagsbase	frameworkr   r   rg   Zflash_attn_v3r   r   localsinput_dtype"create_variable_for_type_inferencefloat32int64	append_op-paddle.incubate.nn.memory_efficient_attentionr   rR   )r"   r'   rG   r6   rJ   rK   rq   rr   rM   rs   rz   r$   sdp_func_name
fa_versionresult_attentionresult_softmax_re   r   helperr+   ra   r   r   r   r   outputr   r   r   rw     s
  
O




	qkvc                C  rO   rP   r   r   r6   rJ   rK   rq   rr   rM   rs   r   r   r   flash_attn_qkvpacked     r   c                C  rO   rP   r   r   r   r   r   r     r   c                C  rO   rP   r   r   r   r   r   r     r   c             
   C  s  | j d }t|}	|	dkrst r*t| |d|||| |\}
}}}|
|r'|fS dfS tdi t }|jdd}||}||}|t	j
}|t	j}| |d}||||d}|jd|||||| |d	d
 ||rp|fS dfS | ddddddf ddd| j d g}| dddddf }| dddddf }|	dkrddlm} ||||d|d|d}|dfS t|||||||dS )a'
  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.
        Don't call this API if flash_attn is not supported.

    Args:
        qkv(Tensor): The query/key/value packed tensor in the Attention module.
                        5-D tensor with shape:
                        [batch_size, seq_len, num_heads/num_heads_k + 2, num_heads_k, head_dim].
                        The dtype can be float16 or bfloat16.
        dropout(float): The dropout ratio.
        causal(bool): Whether to enable causal mode.
        return_softmax(bool): Whether to return softmax.
        fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
        training(bool): Whether it is in the training phase.
        rng_name(str): The name to select Generator.
        name(str|None, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        - out(Tensor). The attention tensor. 4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim]. The dtype can be float16 or bfloat16.
        - softmax(Tensor). The softmax tensor. None if return_softmax is False.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('flash_attn need A100 compile')
            >>> import paddle

            >>> paddle.seed(2023)
            >>> q = paddle.rand((1, 128, 2, 16))
            >>> qkv = paddle.stack([q, q, q], axis=2)
            >>> output = paddle.nn.functional.flash_attn_qkvpacked(qkv, 0.9, False, False)
            >>> print(output)
            (Tensor(shape=[1, 128, 2, 16], dtype=float32, place=Place(cpu), stop_gradient=True,
            [[[[0.34992966, 0.34456208, 0.45826620, ..., 0.39883569,
                0.42132431, 0.39157745],
               [0.76687670, 0.65837246, 0.69117945, ..., 0.82817286,
                0.76690865, 0.71485823]],
              ...,
              [[0.71662450, 0.57275224, 0.57053083, ..., 0.48108247,
                0.53336465, 0.54540104],
               [0.59137970, 0.51350880, 0.50449550, ..., 0.38860250,
                0.40526697, 0.60541755]]]]), None)
            >>> # doctest: -SKIP
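            >>> # A grouped-query sketch that only illustrates the documented packed
            >>> # shape: 8 query heads sharing 2 key/value heads, so dim 2 is 8/2 + 2 = 6.
            >>> # doctest: +SKIP('flash_attn need A100 compile')
            >>> gqa_qkv = paddle.rand((1, 128, 6, 2, 16), dtype='float16')
            >>> out, _ = paddle.nn.functional.flash_attn_qkvpacked(gqa_qkv, 0.0, True, False)
            >>> print(out.shape)
            [1, 128, 8, 16]
            >>> # doctest: -SKIP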

    r   rg   Nr   r   r   )r   rq   r   r   r   r   rh   r   r   r   )r   )r    ro   r   r   r   r   r   r   r   r   r   r   r   reshaper   r   rR   )r   r6   rJ   rK   rq   rr   rM   rs   r$   r   r   r   r   r   r+   re   ra   r   r   r   r   r"   r'   rG   r   r   r   r   r   r     s   
E

.	cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr   c                 C  rO   rP   r   r"   r'   rG   r   r   r   r   r   r6   rJ   rK   rq   rr   rM   rs   r   r   r   flash_attn_unpaddedA     r   c                 C  rO   rP   r   r   r   r   r   r   U  r   c                 C  rO   rP   r   r   r   r   r   r   i  r   c                 C  s   t  r"t| |||||d|||||	|
| |\}}||
r|fS dfS td	i t }|jdd}||}||}|tj}|tj	}| |||||d}||||d}|j
d|||||||	|
| |dd ||
ro|fS dfS )
a  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.

    Args:
        query(Tensor): The query tensor in the Attention module.
                        3-D tensor with shape:
                        [total_seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        3-D tensor with shape:
                        [total_seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        value(Tensor): The value tensor in the Attention module.
                        3-D tensor with shape:
                        [total_seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index query.
        cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index key and value.
        max_seqlen_q(int): Maximum sequence length of query in the batch.
        max_seqlen_k(int): Maximum sequence length of key/value in the batch.
        scale(float): The scaling of QK^T before applying softmax.
        dropout(float, optional): The dropout ratio.
        causal(bool, optional): Whether to enable causal mode.
        return_softmax(bool, optional): Whether to return softmax.
        fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
        rng_name(str, optional): The name to select Generator.
        training(bool, optional): Whether it is in the training phase.
        name(str|None, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        out(Tensor): The attention tensor.
                    3-D tensor with shape: [total_seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
        softmax(Tensor): The softmax tensor. None if return_softmax is False.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> paddle.seed(2023)
            >>> q = paddle.rand((2, 128, 8, 16), dtype='float16')
            >>> cu = paddle.arange(0, 384, 128, dtype='int32')
            >>> qq = paddle.reshape(q, [256, 8, 16])
            >>> output = paddle.nn.functional.flash_attention.flash_attn_unpadded(qq, qq, qq, cu, cu, 128, 128, 0.25, 0.0, False, False)
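            >>> # ``cu`` above is the prefix sum of the per-sequence lengths with a
            >>> # leading zero: two sequences of length 128 give [0, 128, 256].
            >>> # A sketch with unequal lengths (40 and 216 tokens; values illustrative):
            >>> # doctest: +SKIP('flash_attn need A100 compile')
            >>> cu_var = paddle.to_tensor([0, 40, 256], dtype='int32')
            >>> out_var = paddle.nn.functional.flash_attention.flash_attn_unpadded(
            ...     qq, qq, qq, cu_var, cu_var, 216, 216, 0.25, 0.0, False, False
            ... )
            >>> # doctest: -SKIP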

    Nr   r~   r   )r~   r   r   r   r   rq   r   r   r   r   r6   rJ   rK   r   rr   r   )r   )r   r   r   r   r   r   r   r   r   r   r   )r"   r'   rG   r   r   r   r   r   r6   rJ   rK   rq   rr   rM   rs   r   r   r   r+   re   ra   r   r   r   r   r   r   r   r   }  sn   N

	r   r   rD   c                 C  s.   t | |||||||||	|
|||||||||S rP   )flash_attn_varlen_func)r"   r'   rG   r   r   r   r   	seqused_q	seqused_krz   rJ   qv	q_descale	k_descale	v_descalewindow_sizesoftcap
num_splitspack_gqa	sm_marginr   r   r   flash_attention_v3_varlen  s,   r   c                 C  s   dt  vs
J dt dgd rJ dt jjdgd dks&J dt s-J d|d	u s5J d
|	d	u rK| jd |d	urF|jd nd d }	t| |||||||||||||	|
|d |d |||d	u|d	urm|nd|\}}||fS )a
  
    The equation is:
    .. math::
        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V
    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.
    This is the varlen version of flash attention.
    Warning:
        This API only supports inputs with dtype float16 and bfloat16.
    Args:
        query(Tensor): The query tensor in the Attention module.
                        3-D tensor with shape:
                        [token_num, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        3-D tensor with shape:
                        [token_num, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        value(Tensor): The value tensor in the Attention module.
                        3-D tensor with shape:
                        [token_num, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index query.
        cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index key and value.
        causal(bool): Whether to enable causal mode.
        softmax_scale(float): The softmax scale of the attention.
        max_seqlen_q(int): Maximum sequence length of query in the batch. Note it's the padding length, not the max actual seqlen.
        max_seqlen_k(int): Maximum sequence length of key/value in the batch.
    Returns:
        out(Tensor): The attention tensor. 3-D tensor with shape: [token_num, num_heads, head_dim]. The dtype can be float16 or bfloat16.
        softmax(Tensor): The softmax tensor. None if return_softmax is False.
    Examples:
        .. code-block:: python
            >>> # doctest: +SKIP('flash_attn_v3 need H100 compile')
            >>> import paddle
            >>> paddle.seed(2023)
            >>> q = paddle.rand((10, 2, 128), dtype="bfloat16")
            >>> cu_seqlens_q = paddle.to_tensor([0, 10], dtype="int32")
            >>> max_seq_len_q = 10
            >>> output = paddle.nn.functional.flash_attention.flash_attention_v3_varlen(q, q, q, cu_seqlens_q, cu_seqlens_q, max_seqlen_q=max_seq_len_q, max_seqlen_k=max_seq_len_q, causal=True)
            >>> # doctest: -SKIP
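            >>> # A two-sequence sketch: lengths 4 and 6 packed into the same 10 tokens;
            >>> # cu_seqlens is the prefix sum of the lengths with a leading zero.
            >>> # doctest: +SKIP('flash_attn_v3 need H100 compile')
            >>> cu2 = paddle.to_tensor([0, 4, 10], dtype="int32")
            >>> out2 = paddle.nn.functional.flash_attention.flash_attention_v3_varlen(q, q, q, cu2, cu2, max_seqlen_q=6, max_seqlen_k=6, causal=True)
            >>> # doctest: -SKIP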
    r\   z.flash_attn_varlen_func is not supported on xpur{   z5flash_attn_varlen_func does not support deterministicr|   rY   zDFLAGS_flash_attn_version is 2, conflicts with flash_attn_varlen_funcz7flash_attn_varlen_func only support dynamic or pir modeNz2flash_attn_varlen_func does not support setting qvr   r   rZ   rD   F)	r   r3   r   r   r   r   r    r   Zflash_attn_v3_varlen)r"   r'   rG   r   r   r   r   r   r   rz   rJ   r   r   r   r   r   r   r   r   r   re   r   r   r   r   r   6  s`   C
r   varlen_paddedc                 C  rO   rP   r   r   r   r   r   r   r   r6   rJ   rK   rq   rr   r   rM   rs   r   r   r   flash_attn_varlen_qkvpacked     r   c                 C  rO   rP   r   r   r   r   r   r     r   c                 C  rO   rP   r   r   r   r   r   r     r   c                 C  s   t  r!t| |||	d||||||| |
|\}}||r|fS dfS td	i t }|jdd}||}||}|tj}|tj	}| |||	d}||||d}|j
d||||||||| |
dd ||rl|fS dfS )
a
  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.

    Args:
        qkv(Tensor): The padded query/key/value packed tensor in the Attention module. The padding part won't be computed
                        4-D tensor with shape:
                        [total_seq_len, num_heads/num_heads_k + 2, num_heads_k, head_dim].
                        The dtype can be float16 or bfloat16.
        cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index query.
        cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index key and value.
        max_seqlen_q(int): Maximum sequence length of query in the batch. Note it's the padding length, not the max actual seqlen
        max_seqlen_k(int): Maximum sequence length of key/value in the batch.
        scale(float): The scaling of QK^T before applying softmax.
        dropout(float, optional): The dropout ratio.
        causal(bool, optional): Whether to enable causal mode.
        return_softmax(bool, optional): Whether to return softmax.
        fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
        rng_name(str, optional): The name to select Generator.
        training(bool, optional): Whether it is in the training phase.
        name(str|None, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        - out(Tensor). The attention tensor. The tensor is padded by zeros. 3-D tensor with shape: [total_seq_len, num_heads, head_dim]. The dtype can be float16 or bfloat16.
        - softmax(Tensor). The softmax tensor. None if return_softmax is False.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('flash_attn need A100 compile')
            >>> import paddle
            >>> paddle.seed(2023)
            >>> q = paddle.rand((2, 128, 8, 16), dtype='float16')
            >>> cu = paddle.arange(0, 384, 128, dtype='int32')
            >>> qq = paddle.reshape(q, [256, 8, 16])
            >>> qkv = paddle.stack([qq, qq, qq], axis=2)
            >>> output = paddle.nn.functional.flash_attn_varlen_qkvpacked(qkv, cu, cu, 128, 128, 0.25, 0.0, False, False)
            >>> # doctest: -SKIP

    Nr   r   r   )r   r   r   rq   r   r   r   )r   )r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r6   rJ   rK   rq   rr   r   rM   rs   r   r   r   r+   re   ra   r   r   r   r   r   r   r   r     sh   F

r5   r#   r(   backendc	                 C  s  | j }	| j dkrtj| dd} |j dkrtj|dd}|j dkr'tj|dd}|dkrE|  rE| rE| rEtjjjj| |||||}
|du rTt	| ||||\}
}n| j
d }t| ||||}|jtjkr}t|tjd| jdtjtd | jd}|d	krt rd}d
}d}t| |||||||| |
\}
}}}ntdi t }|jdd}||}
||}|tj}|tj}| |||d}|
|||d}|jd	||||d
| ddd n^|dkr-ddlm} tj| j
d g| j
d  dd}dt| j
d  }| g d} |g d}|g d}|| ||||||}|g d}
n|dkr?t | |||||d
|d }
|	dkrKtj!|
dd}
|
S )as
  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.

    Args:
        query(Tensor): The query tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        3-D tensor with shape:
                        [seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        3-D tensor with shape:
                        [seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        value(Tensor): The value tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        3-D tensor with shape:
                        [seq_len, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        attn_mask(Tensor, optional): A float mask of the same type as query,
                        key, value that is added to the attention score.
        dropout_p(float, optional): The dropout ratio.
        is_causal(bool, optional): Whether to enable causal mode.
        training(bool, optional): Whether it is in the training phase.
        name(str|None, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        out(Tensor): The attention tensor.
                    4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim].
                    3-D tensor with shape: [seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('bfloat need V100 compile')
            >>> import paddle
            >>> q = paddle.rand((1, 128, 2, 16), dtype=paddle.bfloat16)
            >>> output = paddle.nn.functional.scaled_dot_product_attention(q, q, q, None, 0.9, False)
            >>> print(output)
            >>> # doctest: -SKIP
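            >>> # A sketch with an additive attention mask (mask shape assumed to be
            >>> # [batch_size, num_heads, seq_q, seq_k]; -inf entries hide those keys):
            >>> # doctest: +SKIP('bfloat need V100 compile')
            >>> mask = paddle.zeros((1, 2, 128, 128), dtype=paddle.bfloat16)
            >>> mask[:, :, :, 64:] = float('-inf')
            >>> masked_out = paddle.nn.functional.scaled_dot_product_attention(
            ...     q, q, q, attn_mask=mask, dropout_p=0.0, is_causal=False
            ... )
            >>> # doctest: -SKIP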
    rY   r   )ZaxisZp2pNr   r+   infrg   Fry   r~   r   )r~   r   r   r5   r   r   r   rh   )*variable_length_memory_efficient_attentionrD   int32g      ?r   rW   rl   r   )"r4   r   Z	unsqueezeZis_distdistributedZauto_parallelZring_attentionZRingFlashAttentionapplyrw   r    rp   r+   r/   whereZ	to_tensorrI   r   r   rg   r   r   r   r   r   r   r   ZHpaddle.incubate.nn.functional.variable_length_memory_efficient_attentionr   npsqrtr^   rR   Zsqueeze)r"   r'   rG   r5   r#   r(   rM   rs   r   Z
query_ndimre   r   r$   r   rq   rK   rr   r   r+   ra   r   r   r   r   r   Zseq_lensr   r   r   r   r   scaled_dot_product_attentiong  s   E


	





	
r   )
r6   rJ   r   return_softmax_lsereturn_seed_offsetrq   rr   rM   rs   rz   startend_row_indicesr   int | tuple | Noner   r   rz   float | Nonec       
         C  s
  |durt |tr||f}| jd }| jd }|du sJ d|rFtj|d d ||d  d dddd|df}tj||d|d}nHtjdd|dfdd}tj|d d ||d  d dd|dddddf< tj|d  ||d  dd|dddddf< tj|d|d	|d}|du rt	
| |||	d||d
| |

\}}}}n>|jtjksJ d|j t|jdksJ d|j |jd |jd ksJ d|jd  d|jd  |jd |jd ksJ d|jd  d|jd  |jd d|jd fv s	J d|r,|jd dkrd
}n5|jd dkr"d}n*td|jd  |jd dkr7d
}n|jd dkrBd}n
td|jd  dt v rVd}ntdgd rbd}n
tjjdgd }|dkr|du szJ dt	| ||||	||d
| |

\}}}}nW|dkr|dksJ d|rJ d|	du sJ d|
dksJ d |sJ d!|du sJ d"|du r| jd d# }t	| |||||\}}ntd$| |g}|r||g7 }|r||g7 }t|dkr|d S |S )%a{  
    FlashMask: Official Implementation

    This module provides the official implementation of the FlashMask algorithm as described in the paper. For more details, please refer to the paper available at: https://arxiv.org/abs/2410.01359.

    The core equation utilized in FlashMask is as follows:

    .. math::

        \text{result} = \text{softmax}\left(\frac{Q \cdot K^T}{\sqrt{d}} + M\right) \cdot V

    In this equation:

        - ``Q``, ``K``, and ``V`` are the input tensors to the attention module.
        - All these tensors share the same dimensions.
        - ``d`` denotes the size of the last dimension of these tensors.
        - ``M`` represents the column-wise sparse mask introduced by FlashMask.

    Args:
        query (Tensor):  The query tensor in the attention module.
            A 4-D tensor with shape [batch_size, q_seq_len, num_heads, head_dim].
            The dtype can be float16 or bfloat16.
        key (Tensor): The key tensor in the attention module.
            A 4-D tensor with shape [batch_size, k_seq_len, k_num_heads, head_dim].
            The dtype can be float16 or bfloat16.
        value (Tensor): The value tensor in the attention module.
            A 4-D tensor with shape [batch_size, k_seq_len, k_num_heads, head_dim].
            The dtype can be float16 or bfloat16.
        startend_row_indices(Tensor):
            A column-wise sparse attention mask row indices tensor.
            A 4-D tensor with shape [batch_size, k_num_heads, k_seq_len, {1, 2, 4}].
            The dtype must be int32. k_num_heads can be 1 or the same as key's num_heads. When k_num_heads is 1, it will be broadcast to match key's num_heads.
            Depending on the value of the causal parameter, startend_row_indices can take different shapes and meanings.

            - When `causal=True` and the shape is [batch_size, k_num_heads, k_seq_len, 1],
              indicating unidirectional attention. The value represents the starting row index of the left
              lower triangular mask in the dense mask. The value startend_row_indices[..., 0] indicates that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) will be masked.
            - When `causal=True` and the shape is [batch_size, k_num_heads, k_seq_len, 2],
              indicating unidirectional attention. The values represent the starting and ending row indices of
              the left lower triangular mask in the dense mask. The values startend_row_indices[..., 0:2] in startend_row_indices indicate that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) but above the startend_row_indices[..., 1]-th row (exclusive) will be masked.
            - When `causal=False` and the shape is [batch_size, k_num_heads, k_seq_len, 2],
              indicating bidirectional attention. The values represent the starting row index of the left
              lower triangular mask and the ending row index of the right upper triangular mask in the dense mask. The values startend_row_indices[..., 0:2] in startend_row_indices indicate that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) will be masked, and elements in the upper right triangle starting from the startend_row_indices[..., 1]-th row upwards (exclusive) will be masked.
            - When `causal=False` and the shape is [batch_size, k_num_heads, k_seq_len, 4] ,
              indicating bidirectional attention. The values represent the start and end row indices of the
              left lower triangular mask and the start and end row indices of the right upper triangular mask in the dense mask. The values startend_row_indices[..., 0:4] in startend_row_indices indicate that elements in the lower left triangle of the attention score matrix starting from the startend_row_indices[..., 0]-th row downwards (inclusive) but above the startend_row_indices[..., 1] row (exclusive) will be masked, and elements in the upper right triangle starting from the startend_row_indices[..., 2]-th row downwards (inclusive) but above the startend_row_indices[..., 3] row (exclusive) will be masked.

        dropout (float): The dropout ratio. Default is 0.0.
        causal (bool): Whether to enable causal mode. Default is False.
        window_size (int|tuple, optional): Indicates the window size of sliding window local attention.
            If causal mode is enabled, Query at position i will only attend to keys between [i - window_size, i] or [i - window_size[0], i].
            If causal mode is disabled, Query at position i will only attend to keys between [i - window_size, i + window_size] or [i - window_size[0], i + window_size[1]].
        return_softmax_lse (bool): Whether to return the log-sum-exp of the softmax. Default is False.
        return_seed_offset (bool): Whether to return the random seed offset. Default is False.
        fixed_seed_offset(Tensor, optional): With fixed seed, offset for dropout mask.
        rng_name (str): The name to select Generator.
        training (bool): Whether the module is in training mode. Default is True.
        name (str, optional): Name of the operation. Default is None. Normally, users do not need to set this property.
            For more information, refer to :ref:`api_guide_Name` .

    Returns:
        Tensor. The computed attention result with the same shape as the input `query`.

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.

    Hint:
        This API supports GQA.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('flash_attn need A100 compile')
            >>> import paddle
            >>> paddle.seed(2023)
            >>> q = paddle.rand((1, 10, 2, 32),dtype="bfloat16") # shape: [batch_size, seq_len, num_heads, head_dim]
            >>> k = paddle.rand((1, 10, 2, 32),dtype="bfloat16") # shape: [batch_size, seq_len, num_heads, head_dim]
            >>> v = paddle.rand((1, 10, 2, 32),dtype="bfloat16") # shape: [batch_size, seq_len, num_heads, head_dim]
            >>> startend_row_indices = paddle.to_tensor([8]*10 + [5]*10, dtype="int32").reshape([1, 2, 10, 1])
            >>> output = paddle.nn.functional.flashmask_attention(q, k, v, startend_row_indices, causal=True)
            >>> print(output)
            Tensor(shape=[1, 10, 2, 32], dtype=bfloat16, place=Place(gpu:0), stop_gradient=True,
                [[[[0.82421875, 0.27539062, 0.80859375, 0.98046875, 0.00251770,
                    0.41992188, 0.17285156, 0.11767578, 0.42773438, 0.31250000,
                    0.34570312, 0.70312500, 0.29296875, 0.44531250, 0.51562500,
                    0.96093750, 0.85546875, 0.15625000, 0.34765625, 0.98437500,
                    0.96484375, 0.45312500, 0.33593750, 0.56640625, 0.07714844,
                    0.43750000, 0.83984375, 0.66796875, 0.93750000, 0.24804688,
                    0.51171875, 0.55468750],
                    [0.54687500, 0.74609375, 0.43164062, 0.32421875, 0.10693359,
                    0.37304688, 0.53906250, 0.17187500, 0.57421875, 0.75000000,
                    0.13378906, 0.57031250, 0.19531250, 0.01403809, 0.29101562,
                    0.14257812, 0.07568359, 0.88671875, 0.75390625, 0.17089844,
                    0.87109375, 0.93359375, 0.89843750, 0.58203125, 0.75390625,
                    0.27539062, 0.67968750, 0.24804688, 0.57812500, 0.67578125,
                    0.92578125, 0.98046875]],

                    [[0.59765625, 0.62890625, 0.62109375, 0.75781250, 0.03295898,
                    0.64062500, 0.27929688, 0.20800781, 0.72265625, 0.52343750,
                    0.53125000, 0.61718750, 0.57421875, 0.56640625, 0.65625000,
                    0.48242188, 0.68359375, 0.42968750, 0.26562500, 0.86718750,
                    0.83203125, 0.40820312, 0.38281250, 0.59765625, 0.43945312,
                    0.22851562, 0.86328125, 0.51562500, 0.89453125, 0.62500000,
                    0.50390625, 0.67968750],
                    [0.34765625, 0.61328125, 0.58593750, 0.60156250, 0.43164062,
                    0.41601562, 0.71093750, 0.59765625, 0.53515625, 0.78125000,
                    0.13867188, 0.30664062, 0.48828125, 0.04394531, 0.24316406,
                    0.18847656, 0.10644531, 0.71093750, 0.69140625, 0.35937500,
                    0.44531250, 0.81640625, 0.44140625, 0.64062500, 0.81640625,
                    0.61328125, 0.72265625, 0.53125000, 0.49414062, 0.59765625,
                    0.54296875, 0.61328125]],

                    [[0.65234375, 0.47656250, 0.71875000, 0.64843750, 0.23828125,
                    0.61328125, 0.29101562, 0.26562500, 0.54296875, 0.60937500,
                    0.67187500, 0.67578125, 0.64062500, 0.41406250, 0.47656250,
                    0.40820312, 0.66406250, 0.39453125, 0.39453125, 0.62109375,
                    0.58593750, 0.31054688, 0.31835938, 0.45703125, 0.52343750,
                    0.43164062, 0.64453125, 0.49804688, 0.82812500, 0.48242188,
                    0.38476562, 0.59375000],
                    [0.44921875, 0.62109375, 0.50390625, 0.51562500, 0.51953125,
                    0.57812500, 0.78515625, 0.73437500, 0.60546875, 0.55078125,
                    0.30273438, 0.23339844, 0.60546875, 0.33007812, 0.23242188,
                    0.30468750, 0.34570312, 0.70703125, 0.72656250, 0.58593750,
                    0.40234375, 0.62109375, 0.62109375, 0.69531250, 0.66796875,
                    0.51562500, 0.45898438, 0.67968750, 0.48828125, 0.50000000,
                    0.54687500, 0.71875000]],

                    [[0.67578125, 0.50000000, 0.58203125, 0.62109375, 0.43554688,
                    0.69531250, 0.30273438, 0.24023438, 0.57812500, 0.63671875,
                    0.51171875, 0.52734375, 0.60546875, 0.45507812, 0.42382812,
                    0.46093750, 0.55859375, 0.34960938, 0.39453125, 0.57031250,
                    0.55078125, 0.47265625, 0.24609375, 0.51953125, 0.46093750,
                    0.49218750, 0.49609375, 0.60156250, 0.76953125, 0.57421875,
                    0.40429688, 0.57031250],
                    [0.45703125, 0.71093750, 0.58984375, 0.43164062, 0.54296875,
                    0.57031250, 0.72265625, 0.61328125, 0.64453125, 0.50781250,
                    0.28125000, 0.19531250, 0.60546875, 0.40625000, 0.18554688,
                    0.33203125, 0.40039062, 0.58593750, 0.79687500, 0.45507812,
                    0.32812500, 0.58203125, 0.70703125, 0.64453125, 0.53906250,
                    0.57421875, 0.48828125, 0.53515625, 0.49804688, 0.50000000,
                    0.48437500, 0.55468750]],

                    [[0.64453125, 0.43164062, 0.54687500, 0.53125000, 0.42187500,
                    0.71484375, 0.30273438, 0.21484375, 0.50390625, 0.69531250,
                    0.58203125, 0.51562500, 0.61328125, 0.41992188, 0.40039062,
                    0.46679688, 0.58984375, 0.39062500, 0.41992188, 0.49023438,
                    0.47851562, 0.47070312, 0.30078125, 0.50390625, 0.47656250,
                    0.44921875, 0.43164062, 0.63671875, 0.78125000, 0.60156250,
                    0.48242188, 0.58203125],
                    [0.52343750, 0.69921875, 0.58984375, 0.35156250, 0.49218750,
                    0.58593750, 0.71093750, 0.59375000, 0.66406250, 0.49414062,
                    0.24023438, 0.18554688, 0.66796875, 0.50000000, 0.23144531,
                    0.29882812, 0.49414062, 0.57031250, 0.70312500, 0.42773438,
                    0.35351562, 0.47460938, 0.73437500, 0.53125000, 0.47070312,
                    0.49609375, 0.50000000, 0.55078125, 0.50000000, 0.45898438,
                    0.45703125, 0.61328125]],

                    [[0.63671875, 0.41210938, 0.52734375, 0.56640625, 0.44531250,
                    0.64843750, 0.37890625, 0.31250000, 0.56640625, 0.62890625,
                    0.53125000, 0.51562500, 0.54296875, 0.50781250, 0.35546875,
                    0.41601562, 0.55468750, 0.36914062, 0.35937500, 0.45117188,
                    0.46875000, 0.49609375, 0.28710938, 0.50000000, 0.49609375,
                    0.50000000, 0.51562500, 0.57031250, 0.77734375, 0.62109375,
                    0.43164062, 0.50781250],
                    [0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ]],

                    [[0.62109375, 0.44531250, 0.46875000, 0.61328125, 0.39062500,
                    0.60156250, 0.41015625, 0.28710938, 0.58984375, 0.67968750,
                    0.55859375, 0.48632812, 0.51562500, 0.42382812, 0.37695312,
                    0.46679688, 0.54687500, 0.44921875, 0.33789062, 0.36328125,
                    0.49023438, 0.44140625, 0.25000000, 0.45312500, 0.43945312,
                    0.45507812, 0.46679688, 0.57812500, 0.65625000, 0.64062500,
                    0.42382812, 0.57031250],
                    [0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ]],

                    [[0.62500000, 0.47070312, 0.51562500, 0.61328125, 0.36718750,
                    0.66406250, 0.37890625, 0.28320312, 0.65625000, 0.66015625,
                    0.48632812, 0.53906250, 0.46679688, 0.47851562, 0.43359375,
                    0.45703125, 0.47070312, 0.39843750, 0.32617188, 0.37304688,
                    0.49023438, 0.50390625, 0.27148438, 0.46679688, 0.37695312,
                    0.49023438, 0.47265625, 0.58593750, 0.64453125, 0.60156250,
                    0.38476562, 0.62109375],
                    [0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ]],

                    [[0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ],
                    [0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ]],

                    [[0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ],
                    [0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        , 0.        , 0.        , 0.        ,
                    0.        , 0.        ]]]])
            >>> # doctest: -SKIP


    To convert FlashMask's `startend_row_indices` to `dense_mask`, use the code below:

    .. code-block:: python

        >>> import paddle
        >>> import numpy as np
        >>> def flashmask_to_densemask(startend_row_indices, dtype, causal=True):
        ...     if startend_row_indices is None:
        ...         return None
        ...     bz, num_head, seq_len, bound_num = startend_row_indices.shape
        ...     m = paddle.zeros((bz, num_head, seq_len, seq_len), dtype=dtype)
        ...     has_end = (causal and bound_num == 2) or ((not causal) and bound_num == 4)
        ...     for bi in range(bz):
        ...         for hi in range(num_head):
        ...             for j in range(seq_len):
        ...                 downstart = startend_row_indices[bi, hi, j, 0]
        ...                 if has_end:
        ...                     downend = startend_row_indices[bi, hi, j, 1]
        ...                     m[bi, hi, downstart:downend, j] = -np.inf
        ...                 else:
        ...                     m[bi, hi, downstart:, j] = -np.inf
        ...                 if causal:
        ...                     m[bi, hi, :j, j] = -np.inf
        ...                 else:
        ...                     if has_end:
        ...                         upstart = startend_row_indices[bi, hi, j, 2]
        ...                         upend = startend_row_indices[bi, hi, j, 3]
        ...                         m[bi, hi, upstart:upend, j] = -np.inf
        ...                     else:
        ...                         upend = startend_row_indices[bi, hi, j, 1]
        ...                         m[bi, hi, :upend, j] = -np.inf
        ...     return m
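        >>> # A usage sketch for the converter above (the indices follow the
        >>> # `Causal Document Mask` example further below; dtype is illustrative):
        >>> # doctest: +SKIP('Only example')
        >>> startend_row_indices = paddle.to_tensor(
        ...     [4, 4, 4, 4, 7, 7, 7, 10, 10, 10], dtype="int32"
        ... ).reshape([1, 1, 10, 1])
        >>> dense_mask = flashmask_to_densemask(startend_row_indices, "float32", causal=True)
        >>> print(dense_mask.shape)
        [1, 1, 10, 10]
        >>> # doctest: -SKIP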

    For `Causal Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> startend_row_indices = paddle.to_tensor([8]*10, dtype="int32").reshape([1, 1, 10, 1])
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[8],
                [8],
                [8],
                [8],
                [8],
                [8],
                [8],
                [8],
                [8],
                [8]]]])
        >>> # doctest: -SKIP


    For `Sliding Window Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [0, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [0, 0, 1, 1, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
          [0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> startend_row_indices = paddle.to_tensor([3, 4, 5, 6, 7, 8, 9, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[3 ],
                [4 ],
                [5 ],
                [6 ],
                [7 ],
                [8 ],
                [9 ],
                [10],
                [10],
                [10]]]])
        >>> # doctest: -SKIP

    For `Causal Document Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> startend_row_indices = paddle.to_tensor([4, 4, 4, 4, 7, 7, 7, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[4 ],
                [4 ],
                [4 ],
                [4 ],
                [7 ],
                [7 ],
                [7 ],
                [10],
                [10],
                [10]]]])
        >>> # doctest: -SKIP

    For `Document Mask`, where `causal=False`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> LTS = paddle.to_tensor([4, 4, 4, 4, 7, 7, 7, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> UTE = paddle.to_tensor([0, 0, 0, 0, 4, 4, 4, 7, 7, 7], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, UTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[4 , 0 ],
                [4 , 0 ],
                [4 , 0 ],
                [4 , 0 ],
                [7 , 4 ],
                [7 , 4 ],
                [7 , 4 ],
                [10, 7 ],
                [10, 7 ],
                [10, 7 ]]]])
        >>> # doctest: -SKIP

    For `Share Question Mask`, where `causal=True`, the values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 1, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 1, 1, 0],
          [1, 1, 1, 1, 0, 0, 0, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> startend_row_indices = paddle.to_tensor([10, 10, 10, 10, 7, 7, 7, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 1], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[10],
                [10],
                [10],
                [10],
                [7 ],
                [7 ],
                [7 ],
                [10],
                [10],
                [10]]]])
        >>> # doctest: -SKIP

    For `Global + Sliding Window Mask`, where `causal=False`, the attention mask and the corresponding values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 0, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 0, 0, 1, 1, 1, 0, 0, 0],
          [1, 1, 0, 0, 0, 1, 1, 1, 0, 0],
          [1, 1, 0, 0, 0, 0, 1, 1, 1, 0],
          [1, 1, 0, 0, 0, 0, 0, 1, 1, 1],
          [1, 1, 0, 0, 0, 0, 0, 0, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> LTS = paddle.to_tensor([10, 10, 4, 5, 6, 7, 8, 9, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> LTE = paddle.to_tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> UTS = paddle.to_tensor([0, 0, 0, 0, 2, 2, 2, 2, 2, 2], dtype="int32").reshape([1, 1, 10, 1])
        >>> UTE = paddle.to_tensor([0, 0, 0, 0, 3, 4, 5, 6, 7, 8], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, LTE, UTS, UTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 4], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[10, 10, 0 , 0 ],
                [10, 10, 0 , 0 ],
                [4 , 10, 0 , 0 ],
                [5 , 10, 0 , 0 ],
                [6 , 10, 2 , 3 ],
                [7 , 10, 2 , 4 ],
                [8 , 10, 2 , 5 ],
                [9 , 10, 2 , 6 ],
                [10, 10, 2 , 7 ],
                [10, 10, 2 , 8 ]]]])
        >>> # doctest: -SKIP

    For `Causal Blockwise Mask`, where `causal=True`, the attention mask and the corresponding values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> LTS = paddle.to_tensor([4, 4, 4, 4, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> LTE = paddle.to_tensor([7, 7, 7, 7, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, LTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[4 , 7 ],
                [4 , 7 ],
                [4 , 7 ],
                [4 , 7 ],
                [10, 10],
                [10, 10],
                [10, 10],
                [10, 10],
                [10, 10],
                [10, 10]]]])
        >>> # doctest: -SKIP

    For `Prefix LM Document Mask`, where `causal=False`, the attention mask and the corresponding values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
          [0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
          [0, 0, 0, 0, 0, 1, 1, 1, 1, 0],
          [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> LTS = paddle.to_tensor([3, 3, 3, 5, 5, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> UTE = paddle.to_tensor([0, 0, 2, 3, 3, 5, 5, 7, 8, 9], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, UTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[3 , 0 ],
                [3 , 0 ],
                [3 , 2 ],
                [5 , 3 ],
                [5 , 3 ],
                [10, 5 ],
                [10, 5 ],
                [10, 7 ],
                [10, 8 ],
                [10, 9 ]]]])
        >>> # doctest: -SKIP

    For `Prefix LM Causal Mask`, where `causal=False`, the attention mask and the corresponding values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> LTS = paddle.to_tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> UTE = paddle.to_tensor([0, 0, 0, 0, 0, 5, 6, 7, 8, 9], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, UTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[10, 0 ],
                [10, 0 ],
                [10, 0 ],
                [10, 0 ],
                [10, 0 ],
                [10, 5 ],
                [10, 6 ],
                [10, 7 ],
                [10, 8 ],
                [10, 9 ]]]])
        >>> # doctest: -SKIP

    For `QK-sparse Mask`, where `causal=True`, the attention mask and the corresponding values of `startend_row_indices` are as follows:

    .. code-block:: python

       [[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])

        >>> # doctest: +SKIP('Only example')
        >>> import paddle
        >>> LTS = paddle.to_tensor([10, 10, 2, 3, 4, 5, 6, 7, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> LTE = paddle.to_tensor([10, 10, 5, 5, 5, 5, 8, 8, 10, 10], dtype="int32").reshape([1, 1, 10, 1])
        >>> startend_row_indices = paddle.concat([LTS, LTE], axis=-1)
        >>> print(startend_row_indices)
        Tensor(shape=[1, 1, 10, 2], dtype=int32, place=Place(gpu:0), stop_gradient=True,
            [[[[10, 10],
                [10, 10],
                [2 , 5 ],
                [3 , 5 ],
                [4 , 5 ],
                [5 , 5 ],
                [6 , 8 ],
                [7 , 8 ],
                [10, 10],
                [10, 10]]]])

        >>> # doctest: -SKIP
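
    All of the patterns above follow the same decoding rule: for every key column, `LTS`/`LTE` give the half-open interval of query rows masked out on the lower-triangle side (a missing `LTE` means "to the end of the sequence"), and `UTS`/`UTE` the interval masked out on the upper-triangle side; when `causal=True` the causal mask is applied first. The reference helper below, which is only a readability aid and not part of the paddle API, expands `startend_row_indices` into the dense 0/1 masks shown above, assuming `seqlen_q == seqlen_k` and a single broadcast head:

    .. code-block:: python

        >>> # doctest: +SKIP('Only example')
        >>> import numpy as np
        >>> def flashmask_to_dense(startend_row_indices, causal):
        ...     # Illustrative decoder for the examples above, not part of the API.
        ...     idx = np.asarray(startend_row_indices)[0, 0]  # [seqlen_k, 1|2|4]
        ...     n = idx.shape[0]
        ...     rows = np.arange(n)[:, None]                  # query rows
        ...     cols = np.arange(n)[None, :]                  # key columns
        ...     if causal:
        ...         mask = rows >= cols                       # lower triangle incl. diagonal
        ...         lts = idx[:, 0][None, :]
        ...         lte = (idx[:, 1] if idx.shape[1] == 2 else np.full(n, n))[None, :]
        ...         mask &= ~((rows >= lts) & (rows < lte))
        ...     elif idx.shape[1] == 2:                       # [LTS, UTE]
        ...         lts, ute = idx[:, 0][None, :], idx[:, 1][None, :]
        ...         mask = (rows < lts) & (rows >= ute)
        ...     else:                                         # [LTS, LTE, UTS, UTE]
        ...         lts, lte = idx[:, 0][None, :], idx[:, 1][None, :]
        ...         uts, ute = idx[:, 2][None, :], idx[:, 3][None, :]
        ...         mask = ~((rows >= lts) & (rows < lte)) & ~((rows >= uts) & (rows < ute))
        ...     return mask.astype(np.int32)
        >>> idx = np.array([4, 4, 4, 4, 7, 7, 7, 10, 10, 10], dtype=np.int32).reshape([1, 1, 10, 1])
        >>> print(flashmask_to_dense(idx, causal=True))  # reproduces the causal document mask above
        >>> # doctest: -SKIP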

    Note:
        `window_size` cannot be used together with `startend_row_indices`.
        `startend_row_indices` must be a rank-4 `paddle.int32` tensor whose first
        dimension equals `batch_size`, whose third dimension equals `seqlen_k`, and
        whose head dimension is either 1 (broadcast) or equal to the number of key
        heads. Its last dimension must be 1 or 2 when `causal=True`, and 2 or 4 when
        `causal=False`. `flashmask_attention` does not support `softmax_scale` (use
        `flashmask_attention_v2` instead), while `flashmask_attention_v2` does not
        support `dropout`, seed offsets, `rng_name`, `training=False` or `name`.








	



calc_reduced_attention_scores(query, key, softmax_lse)
  
    The equation is:

    .. math::

        result=reduce\_sum(softmax(\frac{ Q * K^T }{\sqrt{d}}), dim=-2)

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.

    Args:
        query(Tensor): The query tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seqlen_q, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seqlen_k, num_heads, head_dim].
                        The dtype can be float16 or bfloat16.
        softmax_lse(Tensor): The logsumexp of each row returned by _C_ops.flash_attn().
                        3-D tensor with shape:
                        [batch_size, num_heads, seqlen_q_rounded], where seqlen_q_rounded = ceil(seqlen_q/128) * 128.
                        The dtype is float32.
    Returns:
        reduced_attention_scores(Tensor), the attention scores summed over seqlen_q.
        4-D tensor with shape: [batch_size, num_heads, 1, seqlen_k]. The dtype is float32.
    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('reduce_attn_scores need A100 compile')
            >>> import paddle
            >>> import numpy as np
            >>> import paddle._C_ops as _C_ops
            >>> from paddle.nn.functional.flash_attention import (
            >>>     calc_reduced_attention_scores
            >>> )
            >>> np.random.seed(2024)
            >>> q_shape = (5,1024,16,128)
            >>> k_shape = (5,2048,16,128)
            >>> place = paddle.CUDAPlace(0)
            >>> dtype = 'float16'
            >>> query = np.random.random(q_shape)
            >>> key = np.random.random(k_shape)
            >>> q = paddle.to_tensor(
            >>>     query, place=place, dtype=dtype, stop_gradient=True
            >>> )
            >>> k = paddle.to_tensor(
            >>>     key, place=place, dtype=dtype, stop_gradient=True
            >>> )
            >>> _, _, softmax_lse, _ = _C_ops.flash_attn(
            >>>     q,
            >>>     k,
            >>>     k,
            >>>     (None,), #fixed_seed_offset
            >>>     None, #attn_mask
            >>>     0.0, #dropout
            >>>     False, #causal
            >>>     False, #return_softmax
            >>>     False, #is_test
            >>>     "" #rng_name
            >>> )
            >>> reduced_attn_scores = calc_reduced_attention_scores(
            >>>     q,
            >>>     k,
            >>>     softmax_lse,
            >>> )
            >>> # doctest: -SKIP
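
    The reduced scores can also be reproduced, up to floating-point error, directly from the equation above. The sketch below reuses `q`, `k` and `softmax_lse` from the example and is only an illustration of the math, not of the kernel implementation:

    .. code-block:: python

        >>> # doctest: +SKIP('Only example')
        >>> qt = q.transpose([0, 2, 1, 3]).astype('float32')     # [batch_size, num_heads, seqlen_q, head_dim]
        >>> kt = k.transpose([0, 2, 1, 3]).astype('float32')     # [batch_size, num_heads, seqlen_k, head_dim]
        >>> scores = paddle.matmul(qt, kt, transpose_y=True) / (qt.shape[-1] ** 0.5)
        >>> lse = softmax_lse[:, :, :qt.shape[2]].unsqueeze(-1)  # drop the seqlen_q rounding padding
        >>> ref = paddle.exp(scores - lse).sum(axis=2, keepdim=True)  # [batch_size, num_heads, 1, seqlen_k]
        >>> # doctest: -SKIP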
    Note:
        calc_reduced_attention_scores() is for inference only.