o
    81 iQ                     @   s	  d dl mZ d dlmZ d dlZd dlZd dlmZ d dlZd dl	m
Z
 d dlm
  mZ d dlZzd dlZW n eyB   dZY nw eddefgZd dlmZmZ d dlmZmZmZmZmZmZ d d	lmZmZ d d
l mZ! d dl mZ" d dl#m$Z$ zd dl%m&Z' W n ey   dZ'Y nw dZ'e(dddkZ)ddddddZ*dXddZ+dd Z,dYddZ-dYd d!Z.e/d  d"Z0d#Z1dZ2ej3Z4e4ej5krej3ne4Z6d$Z7dZ8dZ9dZ:d#Z;dZ<dZ=d%Z>d&Z?d'Z@d(ZAd)gZBi ZCi ZDd*D ]ZAe@eA ZEeEZFeAZGeAd+koeGd,kZHeBD ]\Z>Z?d ZIdZJdZKe?ZLdZMejNe>eLeEeAe7e6dd-ZOejNe>e?eFeAe7e6dd-ZPejNe>e?eFeGe7e6dd-ZQd.d/ eOePeQfD \ZOZPZQeQR Sdd0T Sdd0U ZVe<sYeQneVZWeHriejNe>eLeEeGe7e6d1ndZXejNe>eLeEeGe7e6dd-ZYejNe>eLeEeGe7e6dd-ZZejNe>eLeEd2e7ej[d1Z\e9rd3d/ eOePeQfD \Z]Z^Z_ej`e>d2 e7ejad1eL Zbej`e>d2 e7ejad1e? Zce:dure?e: d ksJ d4d/ ePeQfD \ZdZeeej`e>e? e: e7ejad1d5e?e: d6ZfndZfd7D ]Z2egd8eAd9e2d:e?d; e+e>eEeLe?eHseAneAeG eGe2eJd<ZheduraeAd(krae4ej5kraeAeGkrae-eOSd2d%ePSd2d%eQSd2d%e2eJd  d=Zie.eOSd2d%ePSd2d%eQSd2d%eZSd2d%eYSd2d%e\Sd2d%e2eJd  d=Zje4ej5kreAeGkre9se*eeOePeQe1e2eJe;e0e8d>d?Zkne*ee]e^e_ebeceLe?e1e2eJe;e0e8d>d?ZkekjleCe2eAe>e?fd@f< emd2 e9seeeOePeQe1e2eJe;e=e0dd>dA\ZnZoneee]e^e_ebeceLe?e1e2eJe;e=e0dd>dA\ZnZoeojleDe2eAe>e?fd@f< eAd(kre4ej5kreAeGkre'durdBd/ eOePeQfD \ZpZqZremd2 e*e'epeqere2d2eseA e0e8dCd	ZtetjleCe2eAe>e?fdCf< edurbeAd(krbe4ej5krbeAeGkrbemd2 e*eie0e8dDdZueujleCe2eAe>e?fdEf< emd2 e*eje0e8dDdZvevjleDe2eAe>e?fdEf< emd2 e9se*e!eOe:du rtePnede:du r|eWneeeXe2eJe;eIeKe0e8dFdGZwne*e"e]e^e_ebeceLe?e2eJe;eIeKe0e8dFdHZwewjleCe2eAe>e?fdIf< e4ej5kreAeGkre)semd2 e9see!eOePeQe2eJe;e=e0ddFdA\ZnZxnee"e]e^e_ebeceLe?e2eJe;e=e0ddFdA\ZnZxexjleDe2eAe>e?fdIf< e4ej5kr(eAeGkr(egdJekjldK dLdMehekjl dN dOdP egdQeojldK dLdMdReh eojl dN dOdP eAd(kre4ej5kreAeGkre'durRegdSetjldK dLdMehetjl dN dOdP eduregdTeujldK dLdMeheujl dN dOdP egdUevjldK dLdMdReh evjl dN dOdP egdVewjldK dLdMehewjl dN dOdP e4ej5kreAeGkre)segdWexjldK dLdMdReh exjl dN dOdP qqqdS )Z    )
namedtuple)partialN)
NamedTupleZtimingmean)	rearrangerepeat)benchmark_forwardbenchmark_backwardbenchmark_combinedbenchmark_allbenchmark_fwd_bwdpytorch_profiler)flash_attn_funcflash_attn_varlen_func)r   )r   )do_bench)	attentionZ FLASH_ATTENTION_DISABLE_BACKWARDFALSETRUE   T )repeatsverbosedescc                   s"   t t fddd|dd S )Nc                      s    i S N r   argsfunckwargsr   a/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/hopper/benchmark_attn.py<lambda>;   s    ztime_fwd.<locals>.<lambda>   )ZwarmuprepgMbP?)Timingr   )r   r   r   r   r   r   r   r   r   time_fwd)   s   "r$   Fr&   c                 C   s   |rt d|| | d }n>|dkr|}n7tj|dd}	t|	| | |d  td}
t|	| | |d  t|d }||
 d    }| | d | | ||  S )Nr      r%   cuda)device   )	maxtorcharangemaximumZtensorminimumfloatr   item)batchnheadsseqlen_qseqlen_kheaddim	headdim_vcausalwindow_sizeZ
avg_seqlenZrow_idxZcol_leftZ	col_rightr   r   r   flops>   s   "&r:   c                 C   sb   | t jkr	tjjS | t jkrtjjS | t jkrtjjS | t j	kr$tjj
S | t jkr-tjjS td)NzUnsupported tensor data type.)r,   Zfloat16cudnn	data_typeZHALFbfloat16ZBFLOAT16float32FLOATint32ZINT32Zint64ZINT64
ValueError)Z
torch_typer   r   r   convert_to_cudnn_typeL   s   




rB   r&   c              
      s  | j \}}}}|j \}	}
}}	|j ||
||fksJ td us!J d| ||}}}t|tj|||dtj| jd}tjt| j	tj
jtj
jd  | }  | } | } jd| ||ddt| |pq|dk|dkry|sy|nd d	\}}|d
j   |d
tj
j        tjjtjjg       | ||||||||itj  dtjd fdd}|S )NCUDNN is not availabler*   )dtyper)   Zio_data_typeZintermediate_data_typeZcompute_data_typesdpaF      ?r   )nameqkvZis_inference
attn_scaleuse_causal_masksliding_window_lengthTr(   r)   rD   c                     s      S r   executer   r   grapho_gpuvariant_pack	workspacer   r   run   s   zcudnn_spda_setup.<locals>.run) shaper;   r,   
empty_likeemptyr>   r)   pygraphrB   rD   r<   r?   tensor_likedetachrF   mathsqrt
set_outputset_dim
set_stridestrideZset_data_typevalidatebuild_operation_graphcreate_execution_plans	heur_modeAFALLBACKcheck_supportbuild_plansget_workspace_sizeuint8)rI   rJ   rK   r8   window_size_leftbr3   r4   r6   _nheads_kr5   q_gpuk_gpuv_gpuZ	stats_gpuostatsrX   r   rS   r   cudnn_spda_setup[   sR   


rx   c                    sT  | j \}}	}
}|j \}}}}|j ||||fksJ |j ||	|
|fks$J |j ||	|
|fks/J |j ||	|
dfks:J td usBJ d| ||||f\}}}}}t|t| t|tjt| jtjjtjjd	|
 } 	|
 }	|
 }	|
 }	|
 }	|
 }jd| |||||dt| |p|dk|dkr|s|nd d
\}}}|dj   |d j    |dj       tjjtjjg     | ||||||||||||| |i	tj d	tjd
 fdd}|S )Nr*   rC   rE   sdpa_backwardrG   r   )
rH   rI   rJ   rK   rv   ZdOrw   rL   rM   rN   Tr(   rO   c                     s      fS r   rP   rR   Zdk_gpuZdq_gpuZdv_gpurT   rV   rW   r   r   rX      s   
z!cudnn_spda_bwd_setup.<locals>.run)rY   r;   r,   rZ   r\   rB   rD   r<   r?   r]   r^   ry   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   r[   rm   rn   )rI   rJ   rK   rv   gZlser8   ro   rp   r3   r4   r6   rq   rr   r5   rs   rt   ru   rU   Zg_gpurw   ZdqZdkZdvrX   r   rz   r   cudnn_spda_bwd_setup   sn   



r|   
   g        r(   r'       i      )r'   r~   )   @   i   )r)   rD   Zrequires_gradc                 C   s   g | ]}|  t qS r   )r^   torD   requires_grad_.0xr   r   r   
<listcomp>      r   rO   r*   c                 C   s   g | ]}t | d  qS )zb s h d -> (b s) h d)r   r^   r   r   r   r   r   r   %  r   c                 C   s   g | ]	}t |d tdqS )zb (n p) h d -> (b n) p h d)p)r   	page_sizer   r   r   r   r   0  s    z(b s) -> b s)s)FTz
### headdim = z, causal = z, seqlen = z ###)r8   r9   )r8   ro   ZFav2)r8   r9   softcapr   r   r   ZFlash2)r8   r9   r   deterministicr   r   r   c                 C   s$   g | ]}|  d d  qS )r*   r'   )r^   	transpose
contiguousr   r   r   r   r   r   R  s   $ ZTritonZCuDNNZcuDNNZFav3)	qvr8   r9   r   
num_splitspack_gqar   r   r   )r8   r9   r   r   r   r   r   r   ZFlash3z
Fav2 fwd: g     @@z.3fzms, g-q=z.1fz TFLOPSz
Fav2 bwd: g      @zTriton fwd: zCuDNN fwd: zCuDNN bwd: z
Fav3 fwd: z
Fav3 bwd: )Fr%   )Fr&   )ycollectionsr   	functoolsr   r_   ostypingr   r,   Ztorch.nnnnZtorch.nn.functionalZ
functionalFtimer;   ImportErrorr0   r#   Zeinopsr   r   Zflash_attn.utils.benchmarkr   r	   r
   r   r   r   Zflash_attn.flash_attn_interfacer   r   Zflash_attn_interfaceZflash_attn_func_v3Zflash_attn_varlen_func_v3Ztriton.testingr   Ztriton_fused_attentionr   Ztriton_attentiongetenvZDISABLE_BACKWARDr$   r:   rB   rx   r|   Zmanual_seedr   Z	dropout_pr8   r=   rD   Zfloat8_e4m3fnZ	dtype_genr)   r   Zvarlenr   r   Z
V_colmajorr   Z
batch_sizeZseqlendimr6   Zbs_seqlen_valsZtime_fZtime_br3   Z	nheads_kvr7   Zhas_qvr   r9   r   r4   Z	leftpad_kZrandnrI   rJ   rK   r^   r   r   r   Z
v_colmajorZv_fa3r   r{   rv   r>   rw   Zq_unpadZk_unpadZv_unpadr-   r@   Zcu_seqlens_qZcu_seqlens_kZk_pagedZv_pagedZ
page_tableprintZnFLOPSZ
cudnn_spdaZcudnn_spda_bwdZm0r   sleeprq   Zm0bZqtktvtr`   Zm3m2Zm2bm1Zm1br   r   r   r   <module>   s.    


7
E
  

&
 .L &

 

$
 


@(

*. 
*
*.*.