o
    81 i(                     @   sj  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 edddgZd	d
ddZd	d
dZd	d
dZddgZg dZddgZddgZddgZddgZdZdZdZdZeG dd dZde	e fddZde	e fddZde	e fddZded eddfd!d"Zd#e
e ddfd$d%Ze d&kre j!d'd(d)Z"e"j#d*d+d,dd-d. e"$ Z%ee%j& dS dS )/    N)
namedtuple)	dataclass)Path)ListOptionalKerneltemplatefilenamezcutlass::half_tzcutlass::bfloat16_tzcutlass::float_e4m3_t)fp16bf16Ze4m3)r
   r   P   Z   )@   `            FTz#include "flash_fwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template void run_mha_fwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {HEAD_DIM_V}, {SPLIT}, {PAGEDKV}, {SOFTCAP}, {PACKGQA}>(Flash_fwd_params &params, cudaStream_t stream);
#endif
a  #include "flash_fwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_SM8x
#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template void run_mha_fwd_<80, {DTYPE}, {HEAD_DIM}, {HEAD_DIM_V}, {SPLIT}, {PAGEDKV}, {SOFTCAP}, {PACKGQA}>(Flash_fwd_params &params, cudaStream_t stream);
template void run_mha_fwd_<86, {DTYPE}, {HEAD_DIM}, {HEAD_DIM_V}, {SPLIT}, {PAGEDKV}, {SOFTCAP}, {PACKGQA}>(Flash_fwd_params &params, cudaStream_t stream);
#endif
#endif
a#  #include "flash_bwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template<>
void run_mha_bwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {SOFTCAP}>(Flash_bwd_params &params, cudaStream_t stream) {{
    run_mha_bwd_hdim{HEAD_DIM}<{ARCH}, {DTYPE}, {SOFTCAP}>(params, stream);
}}
#endif
a  #include "flash_bwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_SM8x
#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template<>
void run_mha_bwd_<80, {DTYPE}, {HEAD_DIM}, {SOFTCAP}>(Flash_bwd_params &params, cudaStream_t stream) {{
    run_mha_bwd_hdim{HEAD_DIM}<80, {DTYPE}, {SOFTCAP}>(params, stream);
}}
template<>
void run_mha_bwd_<86, {DTYPE}, {HEAD_DIM}, {SOFTCAP}>(Flash_bwd_params &params, cudaStream_t stream) {{
    run_mha_bwd_hdim{HEAD_DIM}<86, {DTYPE}, {SOFTCAP}>(params, stream);
}}
#endif
#endif
c                   @   sz   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< ed
efddZed
efddZ	dS )r   smdtypehead_dim
head_dim_vsplitpaged_kvsoftcappackgqa	directionreturnc              
   C   s"  | j dkr_| jdkr;| jp| jp| j}tjt| jt| j	 | j
| jt| j t| j t| j t| dS tjt| j	 | j
| jt| j t| j t| j td dS | j dkr| jdkr~tjt| jt| j	 | j
t| j dS tjt| j	 | j
t| j dS d S )	Nfwdr   )ARCHDTYPEHEAD_DIM
HEAD_DIM_VSPLITPAGEDKVSOFTCAPPACKGQAT)r   r    r!   r"   r#   r$   r%   bwd)r   r   r    r$   )r   r    r$   )r   r   r   r   r   KERNEL_IMPL_TEMPLATE_FWD_SM90formatstr	DTYPE_MAPr   r   r   lowerr   KERNEL_IMPL_TEMPLATE_FWD_SM8xKERNEL_IMPL_TEMPLATE_BWD_SM90KERNEL_IMPL_TEMPLATE_BWD_SM8x)selfr    r0   c/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/hopper/generate_kernels.pyr   _   s2   



zKernel.templatec                 C   sz   d| j  d| j | j| jkrd| j nd d| j | jrdnd | jr&dnd | jr-dnd | jr4dnd d	| j d
S )NZflash_Z_hdim_ _paged_split_softcap_packgqa_sm.cu)	r   r   r   r   r   r   r   r   r   )r/   r0   r0   r1   r	   ~   s   zzKernel.filenameN)
__name__
__module____qualname__int__annotations__r)   boolpropertyr   r	   r0   r0   r0   r1   r   S   s   
 r   c                  c   s8   t t tttttt	D ]l\} }}}}}}|r&|dk s%|dkr&|s%|r&q|dks.| t
v r<t|| ||||||dd	V  |dkrR|dkrRt|| |d||||dd	V  |dkrz|dkrz| dv rzt|| |d||||dd	V  t|| |d	||||dd	V  qt t ttt	D ]\} }}}t|| ||d
d
|d
dd	V  qd S )Nr   r   )	r   r   r   r   r   r   r   r   r   r   r   r   )r   r
   r   i   Fr&   )	itertoolsproductr*   keysHEAD_DIMENSIONSr"   r#   r$   r%   SMDTYPE_MAP_FWD_SM8xr   DTYPE_MAP_BWD)r   r   r   r   r   r   r   r0   r0   r1   get_all_kernels   s   , rH   c              
   #   sN   t t tttttD ]\ dk rq fdd| D }t	|dkr_d  r5dnd r;dnd rAd	nd rGd
nd d d	}d
dd |D }t||V   fdd| D }t	|dkrd  rzdnd rdnd rd	nd rd
nd d d	}d
dd |D }t||V  qd S )Nr   c                    sb   g | ]-}|j d kr|j kr|jkr|jkr|jkr|jkr|jkr|j|jkr|qS r   	r   r   r   r   r   r   r   r   r   .0kr   r   r   r   r   r   r0   r1   
<listcomp>      b zbatch_hdim.<locals>.<listcomp>r   Zflash_fwd_hdimall_r4   r3   r5   r6   r7   r8   r9   
c                 S      g | ]	}d |j  dqS z
#include ""r	   rK   r0   r0   r1   rO          c                    sb   g | ]-}|j d kr|j kr|jkr|jkr|jkr|jkr|jkr|j|jkr|qS rI   rJ   rK   rN   r0   r1   rO      rP   Zflash_fwd_hdimdiff_c                 S   rR   rS   rU   rK   r0   r0   r1   rO      rV   )rA   rB   r*   rC   r"   r#   r$   r%   rE   lenjoinKERNEL_BATCHkernels_allZkernelsr	   r   r0   rN   r1   
batch_hdim   s    (BBr\   c                 #   sB   t t tttttD ]P\ dkrq fdd| D }t	|dkr]d d  r8dnd r>d	nd d
rEdnd d d}d
dd |D }t||V  qt t ttD ]7\ dk rqqg fdd| D }t	|dkrd d  d d}d
dd |D }t||V  qgd S )Nr   c                    sV   g | ]'}|j d kr|j kr|jkr|jkr|jkr|jkr|jkr|qS rI   )r   r   r   r   r   r   r   rK   r   r   r   r   r   r   r0   r1   rO      s   V z!batch_softcap.<locals>.<listcomp>r   Zflash_fwd_hdimr2   r4   r3   r5   Z_softcapallr7   r8   r9   rQ   c                 S   rR   rS   rU   rK   r0   r0   r1   rO      rV   c                    s8   g | ]}|j d kr|j kr|jkr|jkr|qS )r&   )r   r   r   r   rK   )r   r   r   r0   r1   rO      s   8 Zflash_bwd_hdimZ_softcapall_smc                 S   rR   rS   rU   rK   r0   r0   r1   rO      rV   )rA   rB   r*   rC   rD   r"   r#   r%   rE   rW   rX   rY   rZ   r0   r]   r1   batch_softcap   s(   (>r^   kernelautogen_dirc                 C   s   d}|| j  || j  d S )Na  // Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
// Splitting the different template instantiations to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"

)r	   
write_textr   )r_   r`   Zpreluder0   r0   r1   write_kernel   s   rb   
output_dirc                 C   sz   | d urt | nt tj} | jddd tt }|D ]}t||  qt|D ]}t||  q't|D ]}t||  q3d S )NT)parentsexist_ok)	r   __file__parentmkdirlistrH   rb   r\   r^   )rc   r[   r_   r0   r0   r1   main   s   
rj   __main__Zgenerate_kernelsz<Generate the flash_attention kernels template instantiations)progdescriptionz-oz--output_dirZinstantiationszEWhere to generate the kernels  will default to the current directory )defaultrequiredhelp)'argparserA   collectionsr   dataclassesr   pathlibr   typingr   r   rY   r*   rF   rG   rE   rD   r#   r"   r$   r%   r'   r,   r-   r.   r   rH   r\   r^   rb   r)   rj   r:   ArgumentParserparseradd_argument
parse_argsargsrc   r0   r0   r0   r1   <module>   sb   

/