o
    81 i                     @   s   d dl Z d dlZd dlZd dlm  mZ d dlZd dl	Z	d dl
Z
e	jddZejddd ejdedd	 ejd
edd	 ejddd ejddd e ZdddZdddZdd Zedkrge  dS dS )    NzProcess some integers.)descriptionz--causal
store_true)actionz--splits   )typedefaultz	--repeats
   z
--validatez--gqa Tc                 K   sH   |rt |d tjd| |dt d}||}|r t || ||fS )zCUse Pytorch Benchmark on the forward pass of an arbitrary function.z- Forward passzfn(**kwinputs))fnkwinputs)stmtglobalsZnum_threads)print	benchmarkTimertorchZget_num_threadsZtimeit)r
   repeatsZdescverboser   tm r   _/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/hopper/test_kvcache.pybenchmark_fa_kv_old   s   


r   c                 O   sl   t dD ]	}| |i | q|}tj  t }t |D ]	}| |i | qtj  t }|| | S )N   )ranger   cudaZsynchronizetime)r
   r   argskwargs_Znitersstartendr   r   r   benchmark_fa_kv"   s   

r"   c                      s  d} d}d}t j}d}d d}d}dg d}t|}|| }	g d	}
t||ks+J tfd
d|dd  D s<J ||ksBJ t fdd|
D sOJ t d t j| ||fd|d}t j| ||fd|d}t jd|| |fd|d}t j|
d gt jdd}t jdgt jdd}t j|d | |fd|d}t j|
dd  dg|	  t jdd}t j	|t jddd |d  }t
jr{tj|||||tt
jt
jdd\}}tj|||||tt
jdtt
jdd	\}}tj|||||tt
jt
jtt
jdd	\}}tj|||||tt
jt
jd}td td||    | td||     tj|||||tt
jt
jdd\}}td td||||  |j td||     td||     td||     tdt
j ttjt
j|||||tt
jt
jd	}ttjt
j|||||tt
jt
jd	}td ttjt
j|||||tt
jt
jd	}ttjt
j|||||tt
jt
jd	}td t
j|d! |d! ||  td"t
j|d! |d! ||  d S )#N@         i @  i         )i     r   )i   i (  i0  c                 3       | ]}| k V  qd S Nr   .0s)small_request_ntokensr   r   	<genexpr>K       zmain.<locals>.<genexpr>r   c                 3   r)   r*   r   r+   )cache_seqlenr   r   r/   M   r0   i:  r   )devicedtyper   )r3   r2   T)qk_cachev_cachecache_seqlenscache_batch_idxcausal
num_splitsreturn_softmax_lse)	r4   r5   r6   r7   r8   r9   r:   Zgqa_decodingr;   )r4   r5   r6   r7   r8   r9   r:   bigzdiff-maxz	diff-meanZsmallZlsezlse-dif-maxfa3)r   r4   r5   r6   r7   r8   r9   r:   zfa2 zbig (split, fa3, fa2, ratio):i@B zsmall (split, fa3, fa2, ratio):)r   Zfloat16lensumallZmanual_seedZrandnZtensorZint32Zrandpermr   validater=   Zflash_attn_with_kvcacheboolr9   ZsplitsZgqafa2r   absmaxitemmeanshaper   r"   ) Znheads_qZ	nheads_kvZheaddimr3   Z
num_cachesZntokensZmax_queries_per_batchZquery_seqlensZnum_queriesZnum_padding_queriesZcontext_seqlensr5   r6   Zq_buf_largeZcache_seqlen_largeZcache_idx_largeZq_buf_smallZcache_seqlens_smallZcache_idxs_smallZout0Zlse0Zout1_split1Zlse1_split1Zout1Zlse1Zout2Zout3Zlse_fa2Ztime_fa3_bigZtime_fa3_smallZtime_fa2_bigZtime_fa2_smallr   )r1   r.   r   main/   s  "














"rI   __main__)r   r	   T)r   )r   Zflash_attn_interfacer=   Z
flash_attnrC   Ztorch.utils.benchmarkutilsr   r   argparsemathArgumentParserparseradd_argumentint
parse_argsr   r   r"   rI   __name__r   r   r   r   <module>   s*    

 ;
