o
    81 i3                     @   sp   d dl Z d dlZd dlZd dlZd dlZd dlZd dlm  mZ dd Z	dd Z
dd Zedkr6e  dS dS )	    Nc                 C   s   | dkrdS d| d   > S )N   )
bit_length)x r   e/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/hopper/benchmark_split_kv.pyround_up_to_power_of_2
   s   r   c                 O   sP   t j  tdD ]	}| |i | q	tjd| ||dd}|d}|j}|S )N   zfn(*args, **kwargs))fnargskwargs)stmtglobals   )torchcudaZsynchronizerange	benchmarkTimertimeitmean)r	   r
   r   _tmeasurementZavg_timer   r   r   r      s   


r   c            +      C   sp  t jt j j} d}d}d}t j}d}t d dg}g }|t	dgg ddg t
dd	 |D }t
d
d	 |D }	|D ]s\}
}}}|| dksPJ td|
 d td| d| d| d|  || }|| }t j||	||fd|d}t j||	||fd|d}|du rtdddddddddddddd |D ]\}}}|| | | d  }|| | | d  }t|| }d!}|| }|| t||  }t j||||fd|d}t j|t jdd"d | }t j|g| t jdd"}ttj||||||d#d$ d$ }ttj||||||ddd%	d$ d$ }ttj||||||ddd%	d$ d$ }|r]d}td&} td|D ]o}!ttj||||||d|!d%	d$ d$ }"tj||||||d|!d%}#tj||||||ddd%}$|#|$  
  }%|#|$    }&t|%st|&s|%d'ks|&d(krtd)|! d*|% d+|&  |"| k r|"} |!}q?d}'td&}(td|D ]o}!ttj||||||d|!d%	d$ d$ }"tj||||||d|!d%}#tj||||||ddd%}$|#|$  
  }%|#|$    }&t|%st|&s|%d'ks|&d(krtd,|! d*|% d+|&  |"|(k r(|"}(|!}'q||' |  })||( }*|*d-kr]ttj||||||ddd%	d$ d$ }ttj||||||d|'d%	d$ d$ }(|du rtd.| d/| d0| d1|d2d3|(d2d4|d2d5|' d6||( d2d7|)d2d8|| d9 d2 |du rt|d|d|d|d:|d;|| d<|| d9 d: qqAd S )=N   Tr   *   )zLlama-3.1-70B@         i   )r      r      c                 s   s    | ]\}}}|V  qd S Nr   ).0r   reqsr   r   r   	<genexpr>P       zmain.<locals>.<genexpr>c                 s   s    | ]\}}}|V  qd S r    r   )r!   Zseqlenr   r   r   r   r#   Q   r$   r   z***zQHEADS:z
, KVHEADS:z
, HEADDIM:z, TP:r   )devicedtypeFCONTEXTz<9ZBSZz<5ZQLENz<6ZFA2z<10ZFA3ZRATIOz<7zGB/sr   r   )r&   r%   )qk_cachev_cachecache_seqlenscache_batch_idxcausalg     @@)r(   r)   r*   r+   r,   r-   Zpack_gqa
num_splitsinfgMb`?g-C6?z"Numerical error too high: Splits: z, Max: z, Mean: z(Numerical error too high (gqa): Splits: g?zCONTEXT:z, BSZ:z, QLEN:z, FA2:z.2fz, FA3 SPLIT MANUAL:z, FA3:z, FA3 NUM SPLITS:z, RATIO:z, EFF:z, GB/s:gMbP?z<10.2fz<9.2fz<7.2f)r   r   Zget_device_propertiesZcurrent_deviceZmulti_processor_countZbfloat16Zmanual_seedextend	itertoolsproductmaxprintZrandnr   mathceilZrandpermZint32Ztensorr   
flash_attnZflash_attn_with_kvcacheflash_attn_interfacefloatr   absitemr   isnan)+Znum_smsZ
max_splitsZcheck_all_splitsr-   r&   Z	tp_degreeZmodel_configsZall_batch_configsZ
num_cachesZcache_seqlenZ
model_nameZnheads_qZ	nheads_kvZheaddimr)   r*   Zcontext_seqlennum_requestsZquery_seqlenZbytes_kvZbytes_qZblockHZblockMZblockM_div_HZnum_work_tilesr(   Z
cache_idxsr+   Zfa2_time_heuristicZfa3_time_one_splitZfa3_time_gqa_heuristicZfa3_fastest_num_splitsZfa3_fastest_splitk_timer.   r   Zout0Zout1Zmax_diffZ	mean_diffZfa3_fastest_num_splits_gqaZfa3_fastest_splitk_time_gqaZ
efficiencyZheuristic_ratior   r   r   main#   s  

 2



,


,





 r>   __main__)r   r7   r8   r1   timer5   Ztorch.utils.benchmarkutilsr   r   r   r>   __name__r   r   r   r   <module>   s      )
