import torch

from apex._autocast_utils import _cast_if_autocast_enabled
from apex.transformer.enums import AttnMaskType
from fused_softmax_lib import (
    scaled_masked_softmax_backward,
    scaled_masked_softmax_forward,
    scaled_masked_softmax_get_batch_per_block,
    scaled_upper_triang_masked_softmax_backward,
    scaled_upper_triang_masked_softmax_forward,
)


class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
    """
    Fused operation which performs the following three operations in sequence:
    1. Scale the tensor.
    2. Apply upper triangular mask (typically used in gpt models).
    3. Perform softmax.
    """

    @staticmethod
    def forward(ctx, inputs, scale):
        scale_t = torch.tensor([scale])
        softmax_results = scaled_upper_triang_masked_softmax_forward(inputs, scale_t[0])
        ctx.save_for_backward(softmax_results, scale_t)
        return softmax_results

    @staticmethod
    def backward(ctx, output_grads):
        softmax_results, scale_t = ctx.saved_tensors
        input_grads = scaled_upper_triang_masked_softmax_backward(
            output_grads, softmax_results, scale_t[0]
        )
        return input_grads, None


def scaled_upper_triang_masked_softmax(inputs, _, scale):
    b, np, sq, sk = inputs.size()
    assert sq == sk, "causal mask is only for self attention"

    # Flatten [b, np, sq, sk] -> [b * np, sq, sk] for the kernel, then restore.
    inputs = inputs.view(-1, sq, sk)
    args = _cast_if_autocast_enabled(inputs, scale)
    with torch.cuda.amp.autocast(enabled=False):
        probs = ScaledUpperTriangMaskedSoftmax.apply(*args)
    return probs.view(b, np, sq, sk)
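
# Example (illustrative): applying the causal wrapper above to a hypothetical
# batch of attention scores. Assumes the fused_softmax_lib CUDA extension is
# built and a CUDA device is present; the shapes and the 0.125 scale are made
# up for this sketch.
#
#   scores = torch.randn(2, 8, 128, 128, dtype=torch.float16, device="cuda")
#   probs = scaled_upper_triang_masked_softmax(scores, None, 0.125)
#   # The second (mask) argument is unused on the causal path, hence None.
#   assert probs.shape == scores.shape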

class ScaledMaskedSoftmax(torch.autograd.Function):
    """
    Fused operation which performs the following three operations in sequence:
    1. Scale the tensor.
    2. Apply the mask.
    3. Perform softmax.
    """

    @staticmethod
    def forward(ctx, inputs, mask, scale):
        scale_t = torch.tensor([scale])
        softmax_results = scaled_masked_softmax_forward(inputs, mask, scale_t[0])
        ctx.save_for_backward(softmax_results, scale_t)
        return softmax_results

    @staticmethod
    def backward(ctx, output_grads):
        softmax_results, scale_t = ctx.saved_tensors
        input_grads = scaled_masked_softmax_backward(output_grads, softmax_results, scale_t[0])
        return input_grads, None, None


def scaled_masked_softmax(inputs, mask, scale):
    # input is a 4D tensor (b, np, sq, sk)
    args = _cast_if_autocast_enabled(inputs, mask, scale)
    with torch.cuda.amp.autocast(enabled=False):
        return ScaledMaskedSoftmax.apply(*args)
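
# Example (illustrative): the padding-mask path. Here the mask is assumed to
# be a 4D tensor broadcastable to the score shape (e.g. [b, 1, sq, sk]) whose
# nonzero entries mark positions to be masked out; the exact convention is
# defined by the CUDA kernel. Shapes and scale are made up for this sketch.
#
#   scores = torch.randn(2, 8, 128, 128, dtype=torch.float16, device="cuda")
#   pad_mask = torch.zeros(2, 1, 128, 128, dtype=torch.uint8, device="cuda")
#   probs = scaled_masked_softmax(scores, pad_mask, 0.125)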
d Ze	dd Z
  ZS )FusedScaleMaskSoftmaxa  
    fused operation: scaling + mask + softmax

    Arguments:
        input_in_fp16: flag to indicate if input is in fp16 data format.
        input_in_bf16: flag to indicate if input is in bf16 data format.
        attn_mask_type: attention mask type (pad or causal)
        scaled_masked_softmax_fusion: flag to indicate whether the user wants softmax fusion
        mask_func: mask function to be applied.
        softmax_in_fp32: if true, softmax is performed at fp32 precision.
        scale: scaling factor used in input tensor scaling.
    """

    def __init__(
        self,
        input_in_fp16,
        input_in_bf16,
        attn_mask_type,
        scaled_masked_softmax_fusion,
        mask_func,
        softmax_in_fp32,
        scale,
    ):
        super().__init__()
        self.input_in_fp16 = input_in_fp16
        self.input_in_bf16 = input_in_bf16
        if self.input_in_fp16 and self.input_in_bf16:
            raise RuntimeError("both fp16 and bf16 flags cannot be active at the same time.")
        self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
        self.attn_mask_type = attn_mask_type
        self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
        self.mask_func = mask_func
        self.softmax_in_fp32 = softmax_in_fp32
        self.scale = scale
        if not (self.scale is None or softmax_in_fp32):
            raise RuntimeError("softmax should be in fp32 when scaled")

        # Pick the fused kernel matching the mask type; the torch fallback is
        # chosen per-call in forward() when the kernel is not applicable.
        if self.scaled_masked_softmax_fusion:
            if self.attn_mask_type == AttnMaskType.causal:
                self.fused_softmax_func = scaled_upper_triang_masked_softmax
            elif self.attn_mask_type == AttnMaskType.padding:
                self.fused_softmax_func = scaled_masked_softmax
            else:
                raise ValueError("Invalid attn_mask_type.")

    def forward(self, input, mask):
        # input is a 4D tensor [b, np, sq, sk]
        assert input.dim() == 4
        if self.is_kernel_available(mask, *input.size()):
            return self.forward_fused_softmax(input, mask)
        return self.forward_torch_softmax(input, mask)

    def is_kernel_available(self, mask, b, np, sq, sk):
        attn_batches = b * np
        if (
            self.scaled_masked_softmax_fusion  # user wants to fuse
            and self.input_in_float16  # input must be fp16 or bf16
            and (
                self.attn_mask_type == AttnMaskType.causal
                or (self.attn_mask_type == AttnMaskType.padding and mask is not None)
            )
            and 16 < sk <= 8192  # sk must be in (16, 8192]
            and sq % 4 == 0  # sq must be a multiple of 4
            and sk % 4 == 0  # sk must be a multiple of 4
            and attn_batches % 4 == 0  # b * np must be a multiple of 4
        ):
            if 0 <= sk <= 8192:
                batch_per_block = self.get_batch_per_block(sq, sk, b, np)
                if self.attn_mask_type == AttnMaskType.causal:
                    if attn_batches % batch_per_block == 0:
                        return True
                else:
                    if sq % batch_per_block == 0:
                        return True
        return False

    def forward_fused_softmax(self, input, mask):
        scale = self.scale if self.scale is not None else 1.0
        return self.fused_softmax_func(input, mask, scale)

    def forward_torch_softmax(self, input, mask):
        if self.input_in_float16 and self.softmax_in_fp32:
            input = input.float()
        if self.scale is not None:
            input = input * self.scale
        mask_output = self.mask_func(input, mask) if mask is not None else input
        probs = torch.nn.Softmax(dim=-1)(mask_output)
        if self.input_in_float16 and self.softmax_in_fp32:
            probs = probs.half() if self.input_in_fp16 else probs.bfloat16()
        return probs

    @staticmethod
    def get_batch_per_block(sq, sk, b, np):
        return scaled_masked_softmax_get_batch_per_block(sq, sk, b, np)
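
# Example (illustrative): constructing the module for padding masks and
# running one forward pass. The additive mask_func below is a hypothetical
# stand-in; callers supply their own. The fused CUDA path is taken only when
# is_kernel_available() holds, otherwise the pure-torch softmax is used.
if __name__ == "__main__":
    if torch.cuda.is_available():
        fused_softmax = FusedScaleMaskSoftmax(
            input_in_fp16=True,
            input_in_bf16=False,
            attn_mask_type=AttnMaskType.padding,
            scaled_masked_softmax_fusion=True,
            mask_func=lambda scores, mask: scores.masked_fill(mask.bool(), -10000.0),
            softmax_in_fp32=True,
            scale=None,
        )
        scores = torch.randn(2, 8, 128, 128, dtype=torch.float16, device="cuda")
        pad_mask = torch.zeros(2, 1, 128, 128, dtype=torch.uint8, device="cuda")
        probs = fused_softmax(scores, pad_mask)
        print(probs.shape)  # torch.Size([2, 8, 128, 128])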