o
    81 im                     @   s  d dl mZ d dlmZ d dlZd dlZd dlmZ d dl	m  m
Z d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZ d dlmZmZmZmZmZ G d	d
 d
ejjZ				d,dededee de dee de fddZ!G dd dej"Z#G dd dej"Z$G dd dej"Z%G dd dejjZ&						 	 		d-dededed ee d!ee d"e'd#e de d$e(d%e(dee de fd&d'Z)G d(d) d)ej*Z+G d*d+ d+ej*Z,dS ).    )partial)OptionalN)Tensor)ProcessGroup)
custom_fwd
custom_bwd)gelu_bwdrelu_bwd
sqrelu_bwd
sqrelu_fwd)all_gather_raw
all_reduceall_reduce_rawreduce_scatterreduce_scatter_rawc                   @   s0   e Zd Zee	dddZeedd ZdS )	FusedDenseFuncFNTc                 C   s,  |j | _|| _|| _|| _t r|jt d}|	 }|dur-|r-t
||dd\}}n|}t rI|jt d}|durG|jt dnd}|	 }|durW|rW|  |jdd |jd }	}
|	 }t||
g|jR  dkrxtdt|||}| jr| || n| | |s|S ||fS )z
        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
        with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
        dtypeNTZasync_op +fused_dense only supports matrix dims <= 2M)Zrequires_gradcompute_weight_gradientreturn_residualprocess_groupsequence_paralleltorchis_autocast_enabledtoget_autocast_gpu_dtype
contiguousr   waitshapenumelminRuntimeErrorFlinearsave_for_backward)ctxxweightbiasr   r   r   total_xhandle_xbatch_shapen	batch_dimoutput r3   f/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/flash_attn/ops/fused_dense.pyforward   s2   	
zFusedDenseFunc.forwardc                 G   s  |  }| jr|\}|  }| j}| j}| jr/| j\}}|d ur,|r,t||dd\}}	n	|}n| j\}d }|jd d }
|
 }|	||jd }| j
d r| jsZt|| }nt|	||jd ||}|j	g |
|jd R  }|d ur|r}tnt}|||dd\}}nd }| j
d r| jsJ |d ur|r|	  t|	||jd || j
d \}}nd }| j
d r|nd }|d ur| j
d r|  |||d d d fS )NTr   r   r         )r    r   r   r   r   saved_tensorsr   r"   r#   reshapeneeds_input_gradr&   r'   tr   addmmr   r   r!   fused_dense_cudalinear_bias_wgrad)r)   grad_outputargs
grad_inputr   r   r*   r+   r-   r.   r/   r1   	reduce_fnhandle_grad_inputZgrad_weightZ	grad_biasr3   r3   r4   backwardE   sP   




zFusedDenseFunc.backward)FNT__name__
__module____qualname__staticmethodr   r5   r   rD   r3   r3   r3   r4   r      s    'r   FTr*   r+   r,   r   r   r   c                 C   s   | j tjtjfv p| j tjkot }| jr,|jr,|d u s |jr,|r,t| |||||S |d u s2J t	
| ||}|s=|S || fS N)r   r   float16bfloat16float32r   is_cudar   applyr&   r'   )r*   r+   r,   r   r   r   dtype_eligibleoutr3   r3   r4   fused_dense_funcv   s   rR   c                       sF   e Zd Z				ddededededdf
 fd	d
ZdddZ  ZS )
FusedDenseTFNin_featuresout_featuresr,   r   returnc                    s    t  j|||||d || _d S )Nr,   devicer   )super__init__r   )selfrT   rU   r,   r   rX   r   	__class__r3   r4   rZ      s   	
zFusedDense.__init__c                 C   s   t || j| j| j|dS )z
        If process_group is not None, we're doing Tensor Parallel with sequence parallelism:
        we do an all_gather of x before doing the matmul.
        )r   r   )rR   r+   r,   r   )r[   r*   r   r3   r3   r4   r5      s   zFusedDense.forward)TFNNrJ   )rF   rG   rH   intboolrZ   r5   __classcell__r3   r3   r\   r4   rS      s"    rS   c                       F   e Zd Z					ddededededdf
 fd	d
Zdd Z  ZS )ColumnParallelLinearTr6   NrT   rU   r   r,   rV   c	                    s   t j|}	|| rtd| d| || }
|
|	 }|
|	 }|tt j||k  }t j||| |||d || _|| _	d S )Nzout_features () must be a multiple of rW   )
r   distributedget_world_size
ValueErrorr^   get_rankrY   rZ   r   r   )r[   rT   rU   r   r,   r   multiple_ofrX   r   
world_sizemultipledivmodlocal_multipler\   r3   r4   rZ      s   
zColumnParallelLinear.__init__c                 C   s   t || j| j| j| jdS )N)r   r   )rR   r+   r,   r   r   )r[   r*   r3   r3   r4   r5      s   zColumnParallelLinear.forwardTTr6   NN	rF   rG   rH   r^   r   r_   rZ   r5   r`   r3   r3   r\   r4   rb      s$    
rb   c                       ra   )RowParallelLinearTr6   NrT   rU   r   r,   rV   c	                    s   t j|}	t j|}
|| rtd| d| || }||	 }||	 }|tt j||k  }t j|| ||o>|
dk||d || _|| _	d S )Nzin_features (rc   r   rW   )
r   rd   re   rg   rf   r^   rY   rZ   r   r   )r[   rT   rU   r   r,   r   rh   rX   r   ri   Zrankrj   rk   rl   rm   r\   r3   r4   rZ      s"   

zRowParallelLinear.__init__c                 C   s*   t || j| j}| jrtnt}||| jS )z
        We're doing Tensor Parallel with sequence parallelism: we do the matmul and then
        a reduce_scatter of the result.
        )rR   r+   r,   r   r   r   r   )r[   r*   rQ   rB   r3   r3   r4   r5      s   zRowParallelLinear.forwardrn   ro   r3   r3   r\   r4   rp      s$    
 rp   c                   @   s<   e Zd Zee							d
ddZeedd	 ZdS )FusedMLPFuncgelu_approxTFr   Nc                    s  d|
  krdksJ  J |dv sJ |dkr|
dksJ |s"d}	|	dv s(J || _ || _|| _|	| _|| _|
| _t rF|jt	 d}|
 }|durZ|rZt||d	d
\}}n|}t rt	   fdd||fD \}}|dur{|j dnd}|dur|j dnd}|
 }|dur|
 nd}|
 }|dur|
 nd}|dur|r|  |jdd |jd }}| }t||g|j|jR  dkrtd|
dkrt|||}|dkrttjddn|dkrtntj}tjd ||}W d   n	1 s	w   Y  n|dk}t||||||||
^}}|r*|d }t|||}|	dks@|	dkrJ|dkrJ| ||||| n|	dkrX| |||| n|	dkre| |||| |jg ||jd R  }|sw|S ||fS )a  
        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
        with sequence parallelism: we do an all_gather of x before doing the matmul.
        If sequence_parallel=False, then the input is already gathered.

        checkpoint_lvl:
        0: no recomputation in the bwd
        1: recompute gelu_out / relu_out in the bwd
        2: recompute pre_act and gelu_out / relu_out in the bwd
        r      rr   relusqrelurv   r7   r   r6   r7   r   NTr   c                    s   g | ]}|j  d qS )r   )r   ).0ar   r3   r4   
<listcomp>.  s    z(FusedMLPFunc.forward.<locals>.<listcomp>r   r   rr   tanhZapproximatefuser2r   r6   ru   )r   r   r   checkpoint_lvl
activation	heuristicr   r   r   r   r    r   r!   r"   r#   r$   r%   r&   r'   r   gelur   ru   jitfuserr=   linear_act_forwardr9   r(   )r)   r*   weight1bias1weight2bias2r   save_pre_actr   r~   r   r   r   r-   r.   r/   r0   r1   pre_actactivation_fnoutput1Zis_gelurestoutput2r3   r   r4   r5      sv   



zFusedMLPFunc.forwardc                 G   sl  |  }| j}| j}|dkrttjddn|dkrtntj}| jr(|\}|  }| j	}| j
}| j^}	}
}}|d u s;|s=|	}|jd d }| }|dv r|d ur[|r[t|	|dd\}}|d	ksg|d
krl|dkrl|\}}ns|d
kr|\}tjd ||}W d    n1 sw   Y  nQ|dkr|\}|d ur|rt|	|\}}| jdkrt||
|}tjd ||}W d    n1 sw   Y  nt|||jd |
||dkd| j\}}|||jd }|||jd }|||jd }| jd rt||| jd \}}nd }| jd r|nd }| jdkrTt|| }|dkr.tn|dkr5tnt}tjd |||}W d    n	1 sNw   Y  nt||||dk| j\}}| jd sjd }| jd	 r| js}t||
 }nt|||jd ||
}|jg ||jd R  }|d ur|rtnt }|||dd\}}nd }| jdkr| jd
 r|d ur|r|dkr|!  t|||jd || jd \}}n8d }| jd r|nd }n+| jd
 r|d ur|r|dkr|!  t| |||jd  }nd }|d ur(| jd	 r(|!  |||||d d d d d d d fS )Nrr   r{   r|   rv   r   )r   r6   Tr   r   r6   ru   r}   r7      rs   )"r    r~   r   r   r&   r   r   ru   r   r   r   r8   r"   r#   r   r   r   r   r   r'   r=   r   r9   r:   r>   r;   r   r
   r	   Zbias_act_linear_dgrad_bgradr<   r   r   r!   )r)   r?   r@   r~   r   r   rA   r   r   r*   r   r   r   r-   r/   r1   r.   r   r   r   _Zgrad_weight2Z
grad_bias2Zgrad_output1Zactivation_grad_fnZgrad_pre_actZ
grad_bias1rB   rC   Zgrad_weight1r3   r3   r4   rD   [  s   



	



zFusedMLPFunc.backward)rr   TFr   r   NTrE   r3   r3   r3   r4   rq      s    _rq   rr   r   r   r   r   r   r   r~   r   c                 C   s  |dv sJ | j tjtjfv p| j tjkot }| p*| jd |dkr&dnd dk}| jrV|jrV|jrV|d u s;|jrV|d u sB|jrV|rV|rVt	| |||||||||	|
|S |
d u s\J t
| ||}|dkrntt
jdd	ntt
jd
d}||}t
|||}|s|S || fS )Nrt   r   ru         r   rr   r{   r|   T)Zinplace)r   r   rK   rL   rM   r   r"   rN   rq   rO   r&   r'   r   r   ru   )r*   r   r   r   r   r   r   r   r~   r   r   r   rP   Zdim_eligibler   r   r   r   r3   r3   r4   fused_mlp_func  sP   $r   c                       s<   e Zd Z										d fdd	Zdd	d
Z  ZS )FusedMLPNTrr   Fr   autoc                    s   |dv sJ |dv sJ |
|d}t    |p|}|p|d }|| _|| _|| _|dkr/|	nd| _tj||fd|i|| _tj||fd|i|| _	dS )	a  
        If process_group is not None, we're doing Tensor Parallel with sequence parallelism:
        we do an all_gather of x before doing the matmul, gelu, then matmul.
        Finally we do a reduce_scatter of the output.

        checkpoint_lvl (increasing lvl means slower but more memory saving):
            0: no recomputation in the bwd
            1: recompute gelu_out in the bwd
            2: recompute pre_act and gelu_out in the bwd
        heuristic:
            -1: don't fuse gemm + gelu (separate kernel)
            0..4: use this heuristic for the algo section in the fused gemm + gelu
            'auto': heuristic will be picked automatically:
                For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf.
                For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16.
                For H100, we set heuristic=-1 for both fp16 and bf16 as the fused cuBlasLt implementation
                is slower than the unfused version.
        return_residual: whether to return the input x along with the output. This is for
            performance reason: for post-norm architecture, returning the input allows us
            to fuse the backward of nn.Linear with the residual connection.
        rw   rt   rX   r   rs   rv   r   r,   N)
rY   rZ   r   r   r~   r   nnLinearfc1fc2)r[   rT   hidden_featuresrU   r   r   r   r   r~   r   rX   r   factory_kwargsr\   r3   r4   rZ     s   #

zFusedMLP.__init__c                 C   s   t  s|jnt  }| jdkr?| jdkr<t jddkr d}n"tt	t
t jjd}|dkr2dn|t jkr9d	nd}nd}n| j}t|| jj| jj| jj| jj| j| j| j| j||d
}| jrd|\}}|d urmt||}| jsr|S ||fS )Nr   rr   cuda)	   r   r   .   r   r   r6   )r   r   r   r~   r   r   )r   r   r   r   r   r   r   Zget_device_capabilitytuplemapr^   versionsplitrK   r   r   r+   r   r,   trainingr   r~   r   )r[   r*   r   r   r   cuda_verrQ   r3   r3   r4   r5   D  s6   

 
zFusedMLP.forward)
NNTTrr   Fr   r   NNrJ   )rF   rG   rH   rZ   r5   r`   r3   r3   r\   r4   r     s    0r   c                       sB   e Zd Z											ddef fddZd	d
 Z  ZS )ParallelFusedMLPNrr   Tr   r   r   c                    s   |	dv sJ |dv sJ |dusJ ||d}t    |p|}|p%|d }|| _|| _|| _|	| _|dkr8|
nd| _t|||fd|i|| _t	|||fd|i|| _
dS )	aT  
        process_group is required. We're doing Tensor Parallel with sequence parallelism:
        we do an all_gather of x before doing the matmul, gelu, then matmul.
        Finally we do a reduce_scatter of the output.

        checkpoint_lvl (increasing lvl means slower but more memory saving):
            0: no recomputation in the bwd
            1: recompute gelu_out in the bwd
            2: recompute pre_act and gelu_out in the bwd
        heuristic:
            -1: don't fuse gemm + gelu (separate kernel)
            0..4: use this heuristic for the algo section in the fused gemm + gelu
            'auto': heuristic will be picked automatically:
                For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf.
                For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16.
        rw   rt   Nr   rs   rv   r   r,   )rY   rZ   r   r   r   r~   r   rb   r   rp   r   )r[   rT   r   rU   r   r   r   r   r   r~   r   rX   r   r   r\   r3   r4   rZ   f  s4   

zParallelFusedMLP.__init__c                 C   s   t  s|jnt  }| jdkr4| jdkr1tttt j	j
d}|dkr'dn|t jkr.dnd}nd}n| j}t|| jj| jj| jj| jj| j| j| j|| j| jd}| jrXtnt}||| jS )	Nr   rr   r   r   r   r6   r   )r   r   r~   r   r   r   )r   r   r   r   r   r   r   r   r^   r   r   r   rK   r   r   r+   r   r,   r   r~   r   r   r   r   )r[   r*   r   r   r   rQ   rB   r3   r3   r4   r5     s,   

 zParallelFusedMLP.forward)NNrr   NTTTr   r   NN)rF   rG   rH   r   rZ   r5   r`   r3   r3   r\   r4   r   e  s     2r   )NFNT)	NNrr   TFr   r   NT)-	functoolsr   typingr   Zfused_dense_libr=   r   Ztorch.nnr   Ztorch.nn.functionalZ
functionalr&   r   Ztorch.distributedr   Zflash_attn.utils.torchr   r   Zflash_attn.ops.activationsr   r	   r
   r   Zflash_attn.utils.distributedr   r   r   r   r   ZautogradFunctionr   r_   rR   r   rS   rb   rp   rq   strr^   r   Moduler   r   r3   r3   r3   r4   <module>   s   	^
(+ g	

8R