o
    81 i3                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlm  mZ d dl	m
Z
mZ d dlmZmZmZ d dlmZmZ d dlmZ d dlmZmZ zd dlmZ W n eyZ   dZY nw zd dlmZ W n eyn   dZY nw zd dlZW n ey   dZY nw dd	 Zd?ddZ d@ddZ!dAddZ"dd Z#dd Z$e%d  dZ&dZ'ej(Z)g dZ*d
dgZ+g dZ,dZ-dZ.ddgedurdgng  Z/i Z0i Z1i Z2i Z3i Z4i Z5e+D ]xZ6e,D ]rZ7e*D ]l\Z8Z9ej:;  e6e7e8e9fZ<e-e7 Z=d d! e>d"D \Z?Z@ZAejBe?e@eAgd#d$ZCeCDejEZCe$e!eCe.e6e&d
d%ZFeFe0e<df< e!eCe.e6d&ZGedure?Hd'd#I Dej(ZJe@Hd'd#I Dej(ZKeAHd'd#I Ld d'd"d#Dej(ZMd'eNe7 ZOe$eeJeKeMe6eOd(d
d)d*	ZFe$eeJeKeMe6eOe&d
d)d*	ZFeFe0e<d)f< eeJeKeMLd d'd"d#e6eOP Hd'd#ZQejRjSeQeGd+d+d, e?De)e@De)eADe)Z?Z@ZAe?jTd- d. ZUejVd/gejWdd0ZXejVd/gejWdd0ZYejVd/gejWdd0ZZe$ee?e@eAeUe6d1eXeYeZe&d
d2ZFeFe0e<df< edureCDe)Z[e\d' e$e e[e9e9e6d&e&d
d3ZFeFe0e<df< e]d4e6 d5e7 d6e8 d7e9 d8	 e/D ]0Z^e#e"e8e9e7e=e6dd9e0e<e^f e3e<e^f< e]e^ d:e3e<e^f d;d<e0e<e^f d=  d> qqqqdS )B    N)	rearrangerepeat)benchmark_allbenchmark_forwardbenchmark_backward)benchmark_fwd_bwdbenchmark_combined)flash_attn_qkvpacked_func)flash_attn_func_flash_attn_forward)	attentionc                 C   s   | t jkr	tjjS | t jkrtjjS | t jkrtjjS | t j	kr$tjj
S | t jkr-tjjS | t jkr6tjjS | t jkr?tjjS td)NzUnsupported tensor data type.)torchZfloat16cudnn	data_typeZHALFbfloat16ZBFLOAT16float32FLOATZint32ZINT32Zint64ZINT64float8_e4m3fnZFP8_E4M3Zfloat8_e5m2ZFP8_E5M2
ValueError)Z
torch_type r   p/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/hopper/benchmark_flash_attention_fp8.pyconvert_to_cudnn_type"   s   






r   Fc                    sZ  | j \}}}}}td usJ dtj||||| j| jdt||||g|| | ||| dg}tj|||dtj| jd}	tjddddtj| jd}
tjddddtj| jd tj	t
| jtjjtjjdtj| ||||g|| | d ||| d dgdd}jdt|j t| t
| jd	}tj| ||||g|| | d ||| d dg|| d}jd
t|j t| t
| jd	}tj| ||||g|| | d ||| d dg|| d d}jdt|j t| t
| jd	}fdd}tjddddtjdd}| }| }| }| }| }| }j|||||||||ddt| |dd\}}}}|d|j |  |d|
j |
  |d j        tjjtjjg     ||||||||||||||||||||||
| itj dtj d fdd}|S )NzCUDNN is not availabledtypedevice   )Zio_data_typeZintermediate_data_typeZcompute_data_type   r   )Zstorage_offsetQ)namedimstrider   K   Vc                      s    j g dg dtjjdS )N)r   r   r   r   )r   r    r   )tensorr   r   r   r   )graphr   r   get_default_scale_tensorj   s
   z2cudnn_spda_setup.<locals>.get_default_scale_tensorcudaT      ?Zsdpa)qkv	descale_q	descale_k	descale_v	descale_sscale_sscale_oZis_inferenceZ
attn_scaleZuse_causal_maskr   F)r   r   c                     s      fS )N)execute)argskwargsZ
amax_o_gpur%   Zo_gpuZvariant_packZ	workspacer   r   run   s   zcudnn_spda_setup.<locals>.run)!shaper   r   Zzerosr   r   Z
as_stridedemptyr   Zpygraphr   r   r   r$   listr    ZonesZsdpa_fp8mathsqrtZ
set_outputZset_dimZ
set_stridevalidateZbuild_operation_graphZcreate_execution_plansZ	heur_modeAFALLBACKZcheck_supportZbuild_plansZget_workspace_sizeZuint8)qkvZseqlen_qZseqlen_kcausalb_nheadsheaddimZo_gpu_transposedZ	stats_gpuZ
amax_s_gpuZnew_qr)   Znew_kr*   new_vr+   r&   Zdefault_scale_gpur,   r-   r.   r/   r0   r1   oZamax_sZamax_or6   r   r5   r   cudnn_spda_setup4   s   







rG           Tc                 C   s   | j \}}}}}| jdd\}}	}
t|d}t|	d}	dt| }tj|| ||| j| jd}ttj	|||	d|dd	|d
}|rYt
tj||fd|jdd}||j|jd }tj|dd}t||}td||
}|j| jdS )z
    Arguments:
        qkv: (batch_size, seqlen, 3, nheads, head_dim)
        dropout_p: float
    Output:
        output: (batch_size, seqlen, nheads, head_dim)
    r"   r   zb t h d -> (b h) t dzb s h d -> (b h) d sr(   r   r   )betaalphaz(b h) t s -> b h t s)hg     )r   r   )r   zbhts,bshd->bthd)r7   Zunbindr   r:   r;   r   r8   r   r   ZbaddbmmZtriufulltoZsoftmaxFZdropoutZeinsum)r?   	dropout_pr@   
batch_sizeseqlenrB   rC   dr)   r*   r+   softmax_scaleZscoresZcausal_maskr   Zattention_dropoutputr   r   r   attention_pytorch   s    

rW   fwdc                 C   sT   |dv sJ d|  |d  | | |rdnd }|dkr|S |dkr&d| S d| S )	N)rX   bwdZfwd_bwd   r"   r   rX   rY   g      @g      @r   )batchrS   rD   rC   r@   modefr   r   r   flops   s   $$r^   c                 C   s   t |s| | d S dS )Nl    J)rH   )r:   isnan)Zfloptimer   r   r   
efficiency   s   ra   c                 O   s*   t d t| g|R i |}|d jS )Nr   )r`   sleepr   mean)funcr3   r4   time_fr   r   r   time_fwd   s   

rf      r'   ))    i   )   i   )      )rZ   i   )r"   i    )r   i @  )@         rk   ZPytorchZFlash3ZcuDNNc                 C   s&   g | ]}t jtttttt jd dqS )F)r   r   Zrequires_grad)r   ZrandnrR   rS   rC   rD   r   r   ).0rB   r   r   r   
<listcomp>   s   & rp   r   r"   rI   )r@   repeatsverbose)r@   r      ZTriton)rq   rr   Zdescg      ?)ZatolZrtolrM   g      r(   r   )rM   rM   )r@   Zwindow_sizer,   r-   r.   rq   rr   )rq   rr   z### causal=z
, headdim=z, batch_size=z	, seqlen=z ###)r\   z fwd: z.2fz TFLOPs/s, g     @@z ms, )F)rH   T)rX   )_pickler:   r`   r   Ztorch.nnnnZtorch.nn.functionalZ
functionalrP   Zeinopsr   r   Zflash_attn.utils.benchmarkr   r   r   r   r   Z
flash_attnr	   Zflash_attn_interfacer
   r   Ztriton_fused_attentionr   Zattention_tritonImportErrorZxformers.opsopsZxopsr   r   rG   rW   r^   ra   rf   Zmanual_seedrq   r   r   r   Zbs_seqlen_valsZcausal_valsZheaddim_valsr   rQ   methodsre   Ztime_bZtime_f_bZspeed_fZspeed_bZ	speed_f_br@   rD   rR   rS   r'   Zempty_cacheconfigrC   ranger)   r*   r+   stackr?   rO   r   r]   Zres_baselineZ	transpose
contiguousZq_transposedZk_transposedZpermuteZv_transposedr;   scaleZhalfrestestingZassert_closer7   rU   r$   r   r,   r-   r.   Zqkv_fp8rb   printmethodr   r   r   r   <module>   s   

y





$

"


"
*