o
    81 i                     @   s   d Z ddlZddlm  mZ ddddejddd	Zdddddejd
ddZdddddejd
ddZdddddejd
ddZ	dddddejd
ddZ
dddejdddddZdddddZdS )z) Useful functions for writing test code.     N
    TFrepeatsdescverboseamp	amp_dtypec                   sX   |rt |d  fdd}tjd|||dt d}	|	|}
|r(t |
 |	|
fS )zCUse Pytorch Benchmark on the forward pass of an arbitrary function.z- Forward passc                     sD   t jd d | i | W d    d S 1 sw   Y  d S )NcudaZdevice_typeZdtypeenabled)torchautocast)inputskwinputsr   r	   fn f/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/flash_attn/utils/benchmark.pyamp_wrapper   s   "z&benchmark_forward.<locals>.amp_wrapperzfn_amp(*inputs, **kwinputs))Zfn_ampr   r   stmtglobalsZnum_threads)print	benchmarkTimerr   get_num_threadstimeit)r   r   r   r   r   r	   r   r   r   tmr   r   r   benchmark_forward   s   


r    gradr   r   r   r   r	   c                O   s   |rt |d tjd||d | |i |}	t|	tu r!|	d }	W d   n1 s+w   Y  |du r:t|	}n
|j|	jkrDtddd }
tj	d	|
||	|d
t
 d}||}|rbt | ||fS )zDUse Pytorch Benchmark on the backward pass of an arbitrary function.z- Backward passr
   r   r   N&Grad shape does not match output shapec                 W   s.   |D ]}t |tjrd |_q| j|dd d S )NTZretain_graph)
isinstancer   Tensorr"   backward)yr"   r   xr   r   r   f6   s
   zbenchmark_backward.<locals>.fzf(*inputs, y=y, grad=grad))r*   r   r(   r"   r   r   r   r   typetuple
randn_likeshapeRuntimeErrorr   r   r   r   r   r"   r   r   r   r   r	   r   r   r(   r*   r   r   r   r   r   benchmark_backward   s,   

r2   c                   s   |rt |d tjd d |i |}	t|	tu r!|	d }	W d   n1 s+w   Y  |du r:t|	}n
|j|	jkrDtd fdd}
tj	d	|
|||d
t
 d}||}|rgt | ||fS )LUse Pytorch Benchmark on the forward+backward pass of an arbitrary function.z- Forward + Backward passr
   r   r   Nr#   c                    s   |D ]}t |tjrd |_qtjd d |i |}t|tu r(|d }W d    n1 s2w   Y  |j| dd d S )Nr
   r   r   Tr$   )r%   r   r&   r"   r   r,   r-   r'   )r"   r   r   r)   r(   r   r   r   r*   `   s   zbenchmark_combined.<locals>.fzf(grad, *inputs, **kwinputs))r*   r   r   r"   r   r   r+   r1   r   r   r   benchmark_combinedH   s,   


r4   c          	   
   O   sF   t | g|R |||||d|t| g|R ||||||d|fS r3   r   r!   )r    r2   	r   r"   r   r   r   r   r	   r   r   r   r   r   benchmark_fwd_bwdu   s8   
	r7   c          	      O   sh   t | g|R |||||d|t| g|R ||||||d|t| g|R ||||||d|fS r5   )r    r2   r4   r6   r   r   r   benchmark_all   sT   
		r8   )trace_filenamer'   r   r	   cpur   c             	   O   s  |r0t jd||d | |i |}	t|	tu r|	d }	t |	}
W d   n1 s+w   Y  tdD ]D}|rF|D ]}t|t jrEd|_q:t jd||d | |i |}	t|	tu r`|	d }	W d   n1 sjw   Y  |rx|	j	|
dd q4|rt j
jjgng t j
jjg }t j
j|dddJ}|r|D ]}t|t jrd|_qt jd||d | |i |}	t|	tu r|	d }	W d   n1 sw   Y  |r|	j	|
dd W d   n1 sw   Y  |rt| jd	d
 |dur|| dS dS )zEWrap benchmark functions in Pytorch profiler to see CUDA information.r
   r   r   N   Tr$   )
activitiesZrecord_shapesZ
with_stack2   )Z	row_limit)r   r   r,   r-   r.   ranger%   r&   r"   r'   ZprofilerZProfilerActivityZCPUCUDAZprofiler   Zkey_averagestableZexport_chrome_trace)r   r9   r'   r   r	   r:   r   r   r   outg_r)   r<   Zprofr   r   r   pytorch_profiler   sf   rD   )r   r   c                O   sj   t j  t j  t j  | |i | t j  t j d }|r.t| d| d t j  |S )Ni  >z max memory: GB)r   r
   Zempty_cacheZreset_peak_memory_statsZsynchronizeZmax_memory_allocatedr   )r   r   r   r   r   Zmemr   r   r   benchmark_memory  s   




rF   )__doc__r   Ztorch.utils.benchmarkutilsr   Zfloat16r    r2   r4   r7   r8   rD   rF   r   r   r   r   <module>   sR   -0(38