# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass
from typing import Optional

import torch
import torch_xla.core.xla_builder as xb
import torch_xla.experimental.custom_kernel  # noqa: F401  # registers the XLA custom ops
from torch.library import impl
from torch_xla._internal.jax_workarounds import requires_jax
from torch_xla.experimental.custom_kernel import XLA_LIB

from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionLayer, AttentionType)
from vllm.attention.backends.utils import CommonAttentionState
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.utils import cdiv, next_power_of_2

logger = init_logger(__name__)

# TPU lanes are 128-wide, so head sizes are padded up to a multiple of 128.
TPU_HEAD_SIZE_ALIGNMENT = 128

# Maps the `kv_cache_dtype` config string to a torch dtype; "fp8" is an
# alias for the e4m3 variant.
TPU_STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.half,
    "bfloat16": torch.bfloat16,
    "float": torch.float,
    "fp8": torch.float8_e4m3fn,
    "fp8_e4m3": torch.float8_e4m3fn,
    "fp8_e5m2": torch.float8_e5m2,
    "int8": torch.int8,
    "uint8": torch.uint8,
}
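# Example: a model served with --kv-cache-dtype=fp8_e5m2 stores the cache as
# torch.float8_e5m2, while the default "auto" keeps the model's own dtype
# (handled in PallasAttentionBackendImpl.__init__ below).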


class PallasAttentionBackend(AttentionBackend):

    @staticmethod
    def get_name() -> str:
        return "PALLAS_VLLM_V1"

    @staticmethod
    def get_impl_cls() -> type["PallasAttentionBackendImpl"]:
        return PallasAttentionBackendImpl

    @staticmethod
    def get_metadata_cls() -> type["PallasMetadata"]:
        return PallasMetadata

    @staticmethod
    def get_state_cls() -> type["CommonAttentionState"]:
        return CommonAttentionState
    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> tuple[int, ...]:
        # Keys and values are interleaved along a combined head axis, and the
        # head size is padded up to the TPU alignment.
        padded_head_size = cdiv(
            head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
        return (num_blocks, block_size, num_kv_heads * 2, padded_head_size)
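    # Example (hypothetical sizes): with num_kv_heads=8 and head_size=96 the
    # head size pads up to 128, so each block is allocated as
    # (num_blocks, block_size, 16, 128).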
    @staticmethod
    def swap_blocks(
        src_kv_cache: torch.Tensor,
        dst_kv_cache: torch.Tensor,
        src_to_dst: torch.Tensor,
    ) -> None:
        raise RuntimeError("swap_blocks is not used for the TPU backend.")

    # The block tables in PallasMetadata dominate the Pallas kernel's SMEM
    # footprint (num_reqs * max_num_blocks_per_req int32 entries), so the
    # page size is bounded below to keep that footprint within budget. The
    # budget constants are folded into the compiled constants; the 512KB /
    # 4-byte values here are assumed from upstream vLLM.
    @staticmethod
    def get_min_page_size(vllm_config: VllmConfig) -> int:
        max_num_page_per_req = (1024 * 1024 // 2 //
                                vllm_config.scheduler_config.max_num_seqs // 4)
        min_page_size = cdiv(vllm_config.model_config.max_model_len,
                             max_num_page_per_req)
        # Round up to the next power of two.
        min_page_size = 1 << (min_page_size - 1).bit_length()
        return min_page_size

    @staticmethod
    def get_max_num_seqs(model_len: int, page_size: int) -> int:
        num_page_per_req = cdiv(model_len, page_size)
        return 1024 * 1024 // 2 // num_page_per_req // 4

    # TPU has limited scalar registers (SREGs); too small a page size spills
    # them easily, which hurts performance. Split max-model-len across ~16
    # pages while clamping the page size to [16, 256]. The exact thresholds
    # are folded constants; the values below are assumed.
    @staticmethod
    def get_page_size(vllm_config: VllmConfig) -> int:
        if vllm_config.model_config.max_model_len <= 1024:
            return 16
        page_size = next_power_of_2(
            vllm_config.model_config.max_model_len) // 16
        if page_size <= 16:
            return 16
        if page_size >= 256:
            return 256
        return page_size
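    # Example: for max_model_len=32768, next_power_of_2(32768) // 16 = 2048,
    # which the clamp reduces to the 256-token ceiling.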


@dataclass
class PallasMetadata:
    # Per-step scheduling state for the ragged paged-attention kernel. All
    # tensors live on the TPU device.
    slot_mapping: torch.Tensor
    block_tables: torch.Tensor
    context_lens: torch.Tensor
    query_start_loc: torch.Tensor
    num_seqs: torch.Tensor
    num_kv_update_slices: torch.Tensor
    num_slices_per_kv_cache_update_block: int


class PallasAttentionBackendImpl(AttentionImpl):

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: Optional[list[float]],
        sliding_window: Optional[int],
        kv_cache_dtype: str,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: Optional[str] = None,
    ) -> None:
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
        self.num_kv_heads = num_kv_heads
        self.sliding_window = sliding_window
        self.logits_soft_cap = logits_soft_cap
        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name

        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
        if alibi_slopes is not None:
            raise NotImplementedError("Alibi slopes is not supported.")

        if attn_type != AttentionType.DECODER:
            raise NotImplementedError("Encoder self-attention and "
                                      "encoder/decoder cross-attention "
                                      "are not implemented for "
                                      "PallasAttentionBackendImpl")

        self.kv_cache_quantized_dtype = None
        if kv_cache_dtype != "auto":
            self.kv_cache_quantized_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE.get(
                kv_cache_dtype.lower().strip())
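    # Example: kv_cache_dtype="fp8" resolves self.kv_cache_quantized_dtype to
    # torch.float8_e4m3fn; an unrecognized string resolves to None because
    # dict.get() is used without a default.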
    def forward(
        self,
        layer: AttentionLayer,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: PallasMetadata,
        output: Optional[torch.Tensor] = None,
        output_scale: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Forward pass with Pallas attention.

        Args:
            query: shape = [num_tokens, num_heads * head_size]
            key: shape = [num_tokens, num_kv_heads * head_size]
            value: shape = [num_tokens, num_kv_heads * head_size]
            kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
        if output_scale is not None:
            raise NotImplementedError(
                "fused output quantization is not yet supported for "
                "PallasAttentionBackendImpl")

        # Profiling runs hand in an empty KV cache; return a dummy output of
        # the right shape instead of invoking the kernel.
        if kv_cache.numel() == 0:
            if output is None:
                output = torch.ones_like(query)
            return output

        num_tokens, hidden_size = query.shape
        query = query.view(num_tokens, self.num_heads, self.head_size)
        key = key.view(-1, self.num_kv_heads, self.head_size)
        value = value.view(-1, self.num_kv_heads, self.head_size)

        if self.head_size % TPU_HEAD_SIZE_ALIGNMENT != 0:
            padded_head_size = cdiv(
                self.head_size,
                TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
            query = torch.nn.functional.pad(
                query, (0, padded_head_size - self.head_size), value=0.0)
            key = torch.nn.functional.pad(
                key, (0, padded_head_size - self.head_size), value=0.0)
            value = torch.nn.functional.pad(
                value, (0, padded_head_size - self.head_size), value=0.0)

        if self.kv_sharing_target_layer_name is None and kv_cache.numel() > 0:
            # Write the incoming keys and values into the paged KV cache.
            slot_mapping = attn_metadata.slot_mapping
            write_to_kv_cache(
                key, value, kv_cache, slot_mapping,
                attn_metadata.num_slices_per_kv_cache_update_block,
                attn_metadata.num_kv_update_slices,
                self.kv_cache_quantized_dtype,
                layer._k_scale_float, layer._v_scale_float)

        if self.kv_cache_quantized_dtype is not None and (
                layer._k_scale_float == 0.0 or layer._v_scale_float == 0.0):
            raise ValueError(
                "k_scale_float and v_scale_float must be non-zero")

        output = torch.ops.xla.ragged_paged_attention(
            query,
            kv_cache,
            attn_metadata.context_lens,
            attn_metadata.block_tables,
            attn_metadata.query_start_loc,
            attn_metadata.num_seqs,
            # None selects the kernel's own tuned block sizes and VMEM limit;
            # they can be overridden manually for debugging.
            num_kv_pages_per_block=None,
            num_queries_per_block=None,
            vmem_limit_bytes=None,
            use_kernel=True,
            sm_scale=self.scale,
            sliding_window=self.sliding_window,
            soft_cap=self.logits_soft_cap,
            k_scale=layer._k_scale_float,
            v_scale=layer._v_scale_float,
        )

        if self.head_size % TPU_HEAD_SIZE_ALIGNMENT != 0:
            output = output[:, :, :self.head_size]

        return output.reshape(num_tokens, hidden_size)


def write_to_kv_cache(
    key: torch.Tensor,
    value: torch.Tensor,
    kv_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    num_slices_per_kv_cache_update_block: int,
    num_kv_update_slices: torch.Tensor,
    kv_cache_quantized_dtype: Optional[torch.dtype] = None,
    k_scale: float = 1.0,
    v_scale: float = 1.0,
) -> None:
    """ Write the key and values to the KV cache.

    Args:
        key: shape = [num_tokens, num_kv_heads, head_size]
        value: shape = [num_tokens, num_kv_heads, head_size]
        kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]
        num_slices_per_kv_cache_update_block: int
    """
    _, page_size, num_combined_kv_heads, head_size = kv_cache.shape
    head_size = cdiv(head_size,
                     TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT

    if kv_cache_quantized_dtype is not None:
        dtype_info = torch.finfo(kv_cache_quantized_dtype)
        key = key.to(torch.float32) / k_scale
        # Clamp to the representable range before the narrowing cast.
        key = torch.clamp(key, dtype_info.min, dtype_info.max)
        key = key.to(kv_cache_quantized_dtype)
        value = value.to(torch.float32) / v_scale
        value = torch.clamp(value, dtype_info.min, dtype_info.max)
        value = value.to(kv_cache_quantized_dtype)

    kv = torch.cat([key, value], axis=-1).reshape(-1, num_combined_kv_heads,
                                                  head_size)

    # The update kernel works on a cache flattened to
    # [num_blocks * page_size, num_combined_kv_heads, head_size].
    torch.ops.xla.dynamo_set_buffer_donor_(kv_cache, True)
    kv_cache = kv_cache.flatten(0, 1)
    new_kv_cache = torch.ops.xla.kv_cache_update_op(
        kv, slot_mapping, kv_cache, num_kv_update_slices, page_size,
        num_slices_per_kv_cache_update_block)
    # The in-place copy is optimized away by the XLA compiler.
    kv_cache.copy_(new_kv_cache)


@requires_jax
def kv_cache_update_op_impl(kv: torch.Tensor, slot_mapping: torch.Tensor,
                            kv_cache: torch.Tensor,
                            num_kv_update_slices: torch.Tensor,
                            page_size: int, num_slices_per_block: int):
    from vllm.attention.ops.pallas_kv_cache_update import kv_cache_update
    new_kv_cache = xb.call_jax(
        kv_cache_update, (kv, slot_mapping, kv_cache, num_kv_update_slices), {
            "page_size": page_size,
            "num_slices_per_block": num_slices_per_block,
        })
    return new_kv_cache


XLA_LIB.define(
    "kv_cache_update_op(Tensor kv, Tensor slot_mapping, Tensor kv_cache, "
    "Tensor num_kv_update_slices, int page_size, int num_slices_per_block) "
    "-> Tensor")


@impl(XLA_LIB, "kv_cache_update_op", "XLA")
def kv_cache_update_op_xla(kv: torch.Tensor, slot_mapping: torch.Tensor,
                           kv_cache: torch.Tensor,
                           num_kv_update_slices: torch.Tensor, page_size: int,
                           num_slices_per_block: int) -> torch.Tensor:
    new_kv_cache = kv_cache_update_op_impl(kv, slot_mapping, kv_cache,
                                           num_kv_update_slices, page_size,
                                           num_slices_per_block)
    return new_kv_cache


@impl(XLA_LIB, "kv_cache_update_op", "CompositeExplicitAutograd")
def kv_cache_update_op_non_xla(kv: torch.Tensor, slot_mapping: torch.Tensor,
                               kv_cache: torch.Tensor,
                               num_kv_update_slices: torch.Tensor,
                               page_size: int,
                               num_slices_per_block: int) -> torch.Tensor:
    # Non-XLA registration used only for shape inference / tracing: the op
    # behaves as an identity on the flattened cache.
    return kv_cache


def dtype_bits(dtype: torch.dtype):
    if dtype.is_floating_point:
        try:
            return torch.finfo(dtype).bits
        except TypeError:
            pass
    elif dtype.is_complex:
        if dtype is torch.complex32:
            return 32
        if dtype is torch.complex64:
            return 64
        if dtype is torch.complex128:
            return 128
    else:
        try:
            return torch.iinfo(dtype).bits
        # torch.iinfo is not implemented for every integer-like dtype.
        except TypeError:
            pass
    str_dtype = str(dtype)
    # Fall back to parsing names such as "torch.int8" or "torch.uint4". The
    # exact parsing in the original is not recoverable from the compiled
    # module; taking the trailing digits as the bit width is an assumption.
    if str_dtype.startswith("torch.int") or str_dtype.startswith("torch.uint"):
        return int("".join(ch for ch in str_dtype if ch.isdigit()))
    raise TypeError(f"Getting the bit width of {dtype} is not supported")
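# Example: dtype_bits(torch.float8_e4m3fn) == 8 via torch.finfo, while a
# sub-byte dtype such as torch.uint4 (where available) falls through to the
# name parsing and yields 4.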


def get_dtype_packing(dtype):
    bits = dtype_bits(dtype)
    if 32 % bits != 0:
        raise ValueError(
            f"The bit width must be divisible by 32, but got bits={bits}, "
            f"dtype={dtype}")
    return 32 // bits


def get_page_size_bytes(block_size: int, num_kv_heads: int, head_size: int,
                        kv_cache_dtype: torch.dtype) -> int:
    """Returns the size in bytes of one page of the KV cache."""
    padded_head_size = cdiv(head_size,
                            TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
    num_combined_kv_heads = num_kv_heads * 2

    # Sub-32-bit dtypes are packed along the combined head axis for XLA's
    # implicit padding, so round the head count up to a packing multiple.
    packing = get_dtype_packing(kv_cache_dtype)
    num_combined_kv_heads = cdiv(num_combined_kv_heads, packing) * packing

    kv_cache_dtype_bits = dtype_bits(kv_cache_dtype)
    return (block_size * num_combined_kv_heads * padded_head_size *
            kv_cache_dtype_bits // 8)
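

# Worked example (hypothetical sizes): get_page_size_bytes(block_size=32,
# num_kv_heads=8, head_size=128, kv_cache_dtype=torch.bfloat16) gives
# packing = 32 // 16 = 2, num_combined_kv_heads = 16, and
# 32 * 16 * 128 * 16 // 8 = 131072 bytes per page.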