o
    )iW{                  "   @   sX  d dl Z d dlmZ d dlmZmZ edrdndZe r!dndZ	e
 d	kZej	 	 d&d
ejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejf ddZejdedejdejdejfddZejdedejdejdejdejdejdejfddZe  				 	d'd!ed"e jd#e jfd$d%ZdS )(    N)current_platform)tltritonP      @         )      xstride_k_cache_blnum_queries_per_kvIN_PRECISIONBLOCK_MBLOCK_DMODELBLOCK_DMODEL_PADDED
BLOCK_SIZEBLOCK_NSLIDING_WINDOWnum_unroll_cachenum_unroll_requestSKIP_DECODE	USE_SINKS	MAX_Q_LENMAX_CTX_LENc3           Y   	   C   s  t d}3t d}4t d}5|4|% }6t ||3 }7t |
|3 }8t |
|3 d }9|9|8 }:|7|: };|/r:|:dkr:d S |'|5 }<t d|*}=t d|+}>t d|)}?|5|' t d|' }@|8|@d d d f  | |4|  |?d d d f |  }At t d|)|(k ddt j}Bt j| |A |Bd d d f |@d d d f |:k @ dd}C|0st j|'gtdt j	d}Dnt j|t j|'g|4t j
d |@|:k tddjt j	d}Dt j|'gdt j	d}Et j|'|)gt j	d}Ft jd|;|*|-d	D ]}Gt |G|*}Gt ||3|  |G|* |  }H|Hd d d f | |6|  |?d d d f | |  |G|=d d d f  |* |  |?d d d f | |   }I|Hd d d f |! |6|"  |?d d d f |#  |=d d d f |$  }J|G|* |;ks`|(|)kr}t j||I |Bd d d f |G|=d d d f  |;k @ dd}Knt ||I }K|Kj r|Kt j	t | |Cj}Ln|K}Lt j|'|*gt j	d}Mt j|C|L|M|&d
}Mt |G|=d d d f  |;k |Mtd}M|M|9 }M|,dkrt |;|@d d d f  |G|=d d d f   |,k |Md}Mt |Dt j|Mdd}Nt |M|Nd d d f  }Ot j|Odd}Pt |D|N }Q|F|Qd d d f  }F|G|* |;ks"|(|)kr?t j||J |Bd d d f |G|=d d d f  |;k @ dd}Rnt ||J }R|Rj r\|Rt j	t |	 |Cj}Sn|R}S|O|Sj}Ot j|O|S|F|&d
}F|E|Q |P }E|N}Dq|>d d d f | |6|  |?d d d f |  }I|>d d d f | |6|  |?d d d f |  }J||I }T||J }Ut |<|:k dd}Vt jd|V|5d  |' |+|.d	D ]}Gt |G|+}Gt j|T|8|G |  |Bd d d f |G|>d d d f  |:k @ dd}Lt j|'|+gt j	d}Mt j|C|L|M|&d
}M|M|9 }Mt |@d d d f |G|>d d d f  k|Mtd}M|,dkr<t |@d d d f |G|>d d d f   |,k |Md}Mt |Dt j|Mdd}Nt |M|Nd d d f  }Ot j|Odd}Pt |D|N }Q|F|Qd d d f  }Ft j|U|8|G |  |Bd d d f |G|>d d d f  |:k @ dd}S|O|Sj}Ot j|O|S|F|&d
}F|E|Q |P }E|N}Dq|F|Ed d d f  }F|8|@d d d f  | |4|  |?d d d f |  }W||W }Xt j|X|F|Bd d d f |@d d d f |:k @ d d S )Nr                 maskother-infdtype      ?)Zloop_unroll_factoraccZinput_precisioni)Zaxisr    )r   
program_idloadarangewheretoint1fullfloatfloat32Zint64zerosrangemultiple_ofr$   is_fp8dotmaximummaxexpsumstore)YQKVK_cacheV_cacheZsink_ptrB_Locsm_scalek_scalev_scaleB_Start_LocB_Seqlenr   Outstride_b_loc_bstride_b_loc_s
stride_qbs	stride_qh	stride_qd
stride_kbs	stride_kh	stride_kd
stride_vbs	stride_vh	stride_vd
stride_obs	stride_oh	stride_odstride_k_cache_bsstride_k_cache_hstride_k_cache_dr   stride_k_cache_xstride_v_cache_bsstride_v_cache_hstride_v_cache_dstride_v_cache_blr   r   r   r   r   r   r   r   r   r   r   r   r   r   	cur_batchcur_headstart_mcur_kv_headcur_batch_seq_lencur_batch_in_all_start_indexcur_batch_in_all_stop_indexcur_batch_query_lencur_batch_ctx_lenblock_start_locZ	offs_bs_noffs_noffs_doffs_moff_qdim_maskqm_il_ir'   start_nbnoff_koff_vk_loadkqkm_ijpl_ijalphav_loadvk_ptrsv_ptrs
block_maskoff_oout_ptrs r   m/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/attention/ops/prefix_prefill.py_fwd_kernel#   sh  
5





 

 

(
&
"r   c(           K   	   C   s  t d}(t d})t d}*|)|$ }+t |	|( },t ||( }-t ||( }.|%|* }/t d|'}0t d|&}1|*|% t d|% }2|.|2d d d f  | |)|  |1d d d f |  }3t j| |3 |2d d d f |-|, k dd}4t j|%gt jdtd }5t j|%gt jd}6t j|%|&gt jd}7td|,|'D ]}8t |8|'}8t j||(|  |8|0 |
 |  |8|0 |,k dd}9|9d d d f | |+|  |1d d d f | |  |8|0d d d f  |
 |  |1d d d f | |  }:|9d d d f |  |+|!  |1d d d f |"  |8|0d d d f  |
 |#  };t j||: |8|0d d d f  |,k dd}<t j|%|'gt jd}=|=t 	|4|<7 }=t 
|8|0d d d f  |,k |=td}=|=|9 }=t |=d}>t |5|>}?t j|=|?d d d f  }@t |@d}At j|5|? }B|B|6 |A }C|B}D|7|Dd d d f  }7t j||; |8|0d d d f  |,k dd}E|@|Ej}@|7t 	|@|E7 }7|C}6|?}5q|0d d d f | |+|  |1d d d f |  }:|0d d d f | |+|  |1d d d f |  };||: }F||; }Gt 
|/|-|, k dd}Htd|H|*d  |% |'D ]}8t |8|'}8t j|F|.|8 |  |8|0d d d f  |-|, k dd}<t j|%|'gt jd}=|=t 	|4|<7 }=|=|9 }=t 
|2d d d f |8|0d d d f  k|=td}=t |=d}>t |5|>}?t j|=|?d d d f  }@t |@d}At j|5|? }B|B|6 |A }C|B}D|7|Dd d d f  }7t j|G|.|8 |  |8|0d d d f  |-|, k dd}E|@|Ej}@|7t 	|@|E7 }7|C}6|?}5q|.|2d d d f  | |)|  |1d d d f |  }I||I }Jt j|J|7|2d d d f |-|, k d	 d S )
Nr   r   r   r   r   r#   infr"   r(   )r   r)   r*   r+   r2   r1   r0   r3   r4   r6   r,   r8   r7   mathr9   r:   r-   r$   r;   )Kr<   r=   r>   r?   r@   rA   rB   rE   rF   ZB_Ctxlen
block_sizer   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   r   rY   rZ   r[   r\   r]   r   r   r   r   r^   r_   r`   ra   rf   rb   rc   rg   rh   ri   rj   rk   rm   rn   ro   r'   rp   rq   rr   rs   ru   rv   rw   m_i_newrx   ry   rz   l_i_new	acc_scaler|   r}   r~   r   r   r   r   r   r   _fwd_kernel_flash_attn_v2%  s  
+






(
r   c-           Y   	   C   s  t d}-t d}.t d}/|.|& }0t |
|- }1t |	|- }2t |	|- d }3|3|2 }4|1|4 }5|,r:|4dkr:d S |(|/ }6t d|+}7t d|*}8|/|( t d|( }9|2|9d d d f  | |.|  |8d d d f |  }:t t d|*|)k ddt j};t j| |: |;d d d f |9d d d f |1|5 k @ dd}<t j|(gt jdt	d }=t j|(gt jd}>t j|(|*gt jd}?t ||. }@t d|(|6 |5 }Ad}Bt
d|5|+D ]}Ct |C|+}Ct j||-|  |C|7 | |  |C|7 |5k dd}D|Dd d d f | |0|  |8d d d f | |  |C|7d d d f  | |   |8d d d f | |!  }E|Dd d d f |" |0|#  |8d d d f |$  |C|7d d d f  | |%  }Ft j||E |;d d d f |C|7d d d f  |5k @ dd}G|Gj r|Gt jt | |<j}Hn|G}Ht j|(|+gt jd}It j|<|H|I|'d}It |C|7d d d f  |5k |It	d	}I|I|9 }It d|+d d d f |B |Ad d d f  |@ }Jt |Jdk|Ad d d f |1k @ |Jt	d	}J|I|J7 }I|B|+7 }Bt |Id}Kt |=|K}Lt j|I|Ld d d f  }Mt |Md}Nt j|=|L }O|O|> |N }P|O}Q|?|Qd d d f  }?t j||F |;d d d f |C|7d d d f  |5k @ dd}R|Rj rN|Rt jt | |<j}Sn|R}S|M|Sj}Mt j|M|S|?d
d}?|P}>|L}=q|7d d d f | |0|  |8d d d f |  }E|7d d d f | |0|  |8d d d f |  }F||E }T||F }Ut |6|1|5 k dd}Vt ||. }@t d|(|6 |5 }A|5}Bt
d|V|/d  |( |+D ]}Ct |C|+}Ct j|T|2|C |  |;d d d f |C|7d d d f  |1|5 k @ dd}Ht j|(|+gt jd}It j|<|H|Id
d}I|I|9 }It |9d d d f |C|7d d d f  k|It	d	}It d|+d d d f |B |Ad d d f  |@ }Jt |Jdk|Ad d d f |1k @ |Jt	d	}J|I|J7 }I|B|+7 }Bt |Id}Kt |=|K}Lt j|I|Ld d d f  }Mt |Md}Nt j|=|L }O|O|> |N }P|O}Q|?|Qd d d f  }?t j|U|2|C |  |;d d d f |C|7d d d f  |1|5 k @ dd}S|M|Sj}Mt j|M|S|?d
d}?|P}>|L}=q|?|>d d d f  }?|2|9d d d f  | |.|  |8d d d f |  }W||W }Xt j|X|?|;d d d f |9d d d f |1|5 k @ d d S )Nr   r   r   r   r   r#   r   r&   r"   ieeer(   )r   r)   r*   r+   r,   r-   r.   r2   r1   r0   r3   r4   r$   r5   r6   r8   r7   r   r9   r:   r;   )Yr<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   ZAlibi_slopesr   r   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   r   rY   rZ   r[   r\   r]   r   r   r   r   r   r   r   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   r'   Zalibi_slopeZalibi_start_qZalibi_start_krp   rq   rr   rs   rt   ru   rv   Zalibirw   r   rx   ry   rz   r   r   r{   r|   r}   r~   r   r   r   r   r   r   _fwd_kernel_alibi  sl  
1




 
  ( r   Fkv_cache_dtyperC   rD   c                    s  | j tju }tr|rdnd }d|v rJ|j tjt fv sJ |j tjt fv s*J |dv r3t }n|dkr;tj}ntd||	|}|	|}|j tjksZ|j tjkr^|dkr^td| j
d |j
d |j
d }}}||krw||ksyJ t|}|d u rd	|d
  }|	j
d | j
d  | j
d |j
d  } d t|ksJ |d u s|dkrd}|d urz|d u sJ d|rtd nt} t|f}t| g | ||||||||||	||j
d |j
d ||d|d| d| d| d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|dR |||||||tdd	 d S |
d u rdn|
}
i }t rddd} fdd}t| g | |||||||||||	|j
d ||d|d| d| d| d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|dR |j
d ||||||dddddd|d ud| d S )Nr   fp8)r   Zfp8_e4m3Zfp8_e5m2zUnsupported FP8 dtype:autozLkv_cache_dtype='auto' unsupported for            FP8 KV Cache prefill kernelr%   g      ?r   r   z%Sinks arg is not supported with alibir      r   )	r   r   r   r   r   r   r   	num_warps
num_stages)ZkpackZwaves_per_euc                    s    t | d fS )Nr   )r   cdiv)ZMETAbatchheadmax_input_lenr   r   <lambda>\  s    z'context_attention_fwd.<locals>.<lambda>r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r$   torchr1   	IS_TURINGZuint8r   Z	fp8_dtypeZfloat8_e5m2
ValueErrorviewshaper   Znext_power_of_2len
BASE_BLOCKr   r   Zstride	NUM_WARPSis_rocmr   )rm   ru   r|   or   Zk_cacheZv_cacheZb_locZb_start_locZ	b_seq_lenZmax_seq_lenr   rC   rD   Zalibi_slopesZsliding_windowrB   Zskip_decodeZsinksZq_dtype_is_f32r   Ztarget_dtypeZLqZLkZLvZ	Lk_paddedr   ZBLOCKgridZextra_kargsr   r   r   context_attention_fwd  s  



"

	
 !"$%&')3

	
 !#$%&'56r   )r   r   )NNNFN)r   Zvllm.platformsr   Zvllm.triton_utilsr   r   Zhas_device_capabilityr   r   r   Zget_device_capabilityr   ZjitZ	constexprr   intr   r   Zinference_modestrZTensorr   r   r   r   r   <module>   s   2%&'()*+,-./012  %&'( :'()*+,- x