o
    81 iSL                     @   s  d dl Z d dlmZmZ d dlZd dlZd dlZd dlZd dlZd dl	Z						d(ddZ
											d)d	d
Ze jdddge jdddge jdg de jdg de jdg de jdddge jdg ddd Ze jdddge jdddge jdg de jdg de jdg de jdddge jdg ddd Ze jdejge jddge jdddge jdddge jdg d e jdg de jdg de jd!ddge jdddge jdg d"d#d$ Ze jdejge jddge jdddge jdddge jdg d e jdg de jdg de jd!ddge jdddge jdg d"d%d& Zed'kre  dS dS )*    N)	rearrangerepeatr   c                 C   s  t tj| |tjdd}tj||tjd}|d ur3t |d}t|d|jd d}t||k|| d}|d u r9|nt |dd}	|d u rG| nt |dd}
|d dk ra|||	 |
 |d	  kS |d u rkt||n|	}	t	|t
||	 |
 |d	  |	k|||	 |
 |d  k S )
Ndevicedtypezs -> s 1zb -> b 1 1 1zs -> b 1 1 sr   )bl        r      )r   torchZarangelongr   shapewheresumZ	full_like
logical_orminimum)seqlen_qseqlen_kwindow_sizequery_padding_maskkey_padding_maskr   key_leftpadZrow_idxZcol_idxsksq r   d/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/hopper/test_attn_kvcache.pyconstruct_local_mask
   s*   	
r           FTc              	   C   s*  |r|	d df}	| j }|r|  | | } }}| jd |jd }}t|d| jd |jd  d}t|d| jd |jd  d}| jd }|sXtd| t| |}ntd| |t| }|
dkrt||
 }| }||
 }|dur|	t
| d	td
 |	d dks|	d dkrt|||	||| j|d}|	|td
 |dur|| }tj|dd|j }|	d dks|	d dkr|tj|dddd}|dur|t
| dd}dd|  }|dur|| d}n|}td||| }|dur	|	t
| dd |j|d|j|dfS )a  
    Arguments:
        q: (batch_size, seqlen_q, nheads, head_dim)
        k: (batch_size, seqlen_k, nheads_k, head_dim)
        v: (batch_size, seqlen_k, nheads_k, head_dim)
        query_padding_mask: (batch_size, seqlen_q)
        key_padding_mask: (batch_size, seqlen_k)
        attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k)
        dropout_p: float
        dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k)
        causal: whether to apply causal masking
        window_size: (int, int), left and right window size
        upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast
            output back to fp16/bf16.
        reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.)
            without changing the math. This is to estimate the numerical error from operation
            reordering.
    Output:
        output: (batch_size, seqlen_q, nheads, head_dim)
        attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout
    r   r
   zb s h d -> b s (h g) d   )gr   zbthd,bshd->bhtsNzb s -> b 1 1 sz-inf)r   )dimT)r    Zkeepdimr   zb s -> b 1 s 1      ?zbhts,bshd->bthdzb s -> b s 1 1r   )r   floatr   r   r   ZeinsummathsqrttanhZmasked_fill_r   r   r   ZsoftmaxtoZmasked_fillall)qkvr   r   Z	attn_biasZ	dropout_pZdropout_maskcausalr   ZsoftcapZupcastZreorder_opsr   Zdtype_ogr   r   dZscoresZ
local_maskZ	attentionZdropout_scalingZattention_dropoutputr   r   r   attention_ref-   sX   %
	
r/   r,   num_requestsr
      query_seqlen)r
      x   context_seqlen)i   i;  i  headdim)@         gqa_parallelznheads_kv, gqa_ratio)	r
   r
   r         r?   r
       )r=      )r3   r
   r
      )   r1   )r3   r   c              
   C   s   d}|}	|}
| | }t j|	|
| |fdt jd}t j|	|
| |fdt jd}t j||||fdt jd}t j|g| t jdd}t j  t||||d\}}tj	|||||dd|d\}}t j  || 
   dksnJ || 
   d	ks|J d S )
Ncudar   r   r   r,   r
   T)r)   k_cachev_cachecache_seqlensr,   
num_splitsreturn_softmax_lser:   gMbp?g-C6*?)r   randnbfloat16tensorint32rF   synchronizer/   flash_attn_interfaceflash_attn_with_kvcacheabsmaxitemmean)	nheads_kv	gqa_ratior0   r2   r5   r6   r,   r:   r   
num_cachescache_seqlennheads_qrI   rJ   r)   rK   out_ref_out_fa3lse_fa3r   r   r   test_flash_attn_kvcache_nosplit   s@   



 rb   r?   )i@  i  i  c                 C   sb  d}|}	|}
| | }t j|	|
| |fdt jd}t j|	|
| |fdt jd}t j||||fdt jd}|t j}|t j}|t j}t j|g| t jdd}t j  t	||||d\}}t jdgt j
dd}t jdgt j
dd}t jdgt j
dd}tj|||||dd||||d\}}t j  ||    d	ksJ ||    d
ksJ d S )NrF   r   rG   rH   r!   r
   T)r)   rI   rJ   rK   r,   rL   rM   r:   	descale_q	descale_k	descale_vg{Gz?Mb`?)r   rN   rO   r'   float8_e4m3fnrP   rQ   rF   rR   r/   float32rS   rT   rU   rV   rW   rX   )rY   rZ   r0   r2   r5   r6   r,   r:   r   r[   r\   r]   rI   rJ   r)   rK   r^   r_   rc   rd   re   r`   ra   r   r   r   #test_flash_attn_kvcache_nosplit_fp8   sN   



 ri   r   use_heuristic_only)r
   r3      cache_seqlen_rand)	r;   )r1   r
   )r   r   r>   )r1   r1   r<   )r?   	   rC   r@   c                  C   s$  d}d}|dkrd}n|}| | }|rd}nd}t j||| |fdt jd}t j||| |fdt jd}t j||||fdt jd}||
}||
}||
}t j|t jddd | }|rmt jd|d |ft jd|nt j|g| t jdd}t j	  t
j||||||dd	d
d	\}}td|d D ]}t
j|||||||d	|	|d
\}}t j	  td|| td|| td||||     td||||     td||||     td||||     |r||    dksJ ||    dksJ n||    dks&J ||    dks5J |   }|   }|   }|   }||    }||    }|tjkru|tjks||dks|J |tjkr|tjks|dksJ qd S )NrF   rD      r
   r8   r   rG   r"   TF)	r)   rI   rJ   rK   cache_batch_idxr,   rL   rM   r:   r   )
r)   rI   rJ   rK   ro   r,   rL   rM   r:   max_seqlen_k_hint
output-ref
output-fa3output-max-diffoutput-mean-difflse-max-difflse-mean-diff{Gz?MbP?rf   -C6?)r   rN   rO   r'   randpermrQ   randintrP   rF   rR   rS   rT   rangeprintrU   rV   rW   rX   r$   inf) rY   rZ   r0   r2   r5   r6   r,   rj   rl   r:   r   r   r[   r\   r]   
max_splitsrI   rJ   r)   
cache_idxsrK   r^   lse_refir`   ra   lse_max_reflse_mean_reflse_max_fa3lse_mean_fa3lse_max_difflse_mean_diffr   r   r   test_flash_attn_kvcache_output	  s   


<



 &&r   c           #      C   sl  d}d}|dkrd}n|}| | }|rd}nd}t j||| |fdt jd}t j||| |fdt jd}t j||||fdt jd}||
}||
}||
}t j|t jddd | }|rmt jd|d |ft jd|nt j|g| t jdd}t j	  t jd	gt j
dd}t jd	gt j
dd}t jd	gt j
dd}tj||||||dd
d|||d\}}td|d D ]}tj|||||||d
|	||||d\}}t j	  td|| td|| td||||     td||||     td||||     td||||     |r;||    dks+J ||    dks:J n||    dksJJ ||    dksYJ |   }|   }|   }|   } ||    }!||    }"|tjkr|tjks|!dksJ |tjkr| tjks|"dksJ qd S )NrF   rD   rn   r
   r8   r   rG   r"   r!   TF)r)   rI   rJ   rK   ro   r,   rL   rM   r:   rc   rd   re   r   )r)   rI   rJ   rK   ro   r,   rL   rM   r:   rp   rc   rd   re   rq   rr   rs   rt   ru   rv   g?rw   g{Gz?rf   rx   ry   )r   rN   rO   r'   rz   rQ   r{   rP   rF   rR   rh   rS   rT   r|   r}   rU   rV   rW   rX   r$   r~   )#rY   rZ   r0   r2   r5   r6   r,   rj   rl   r:   r   r   r[   r\   r]   r   rI   rJ   r)   r   rK   rc   rd   re   r^   r   r   r`   ra   r   r   r   r   r   r   r   r   r   "test_flash_attn_kvcache_output_fp8t  s   


<



 &&r   __main__)r   NNNN)NNNr   NFr   r   TFN)ZpytestZeinopsr   r   r   Z
flash_attnrS   	itertoolsr$   timer   r/   markZparametrizerb   ri   rO   r   rg   r   __name__mainr   r   r   r   <module>   s    
'
Z*0P
V
