o
    )iz                     @   s8  d dl Z d dlmZ d dlmZ d dl mZ d dlmZmZm	Z	m
Z
mZmZmZmZ d dlmZ zd dlmZ d dlmZmZ d d	lmZ d d
lmZ dZW n eyf   es_dZdZdZdZd Zeddw d dlZd dlmZ d dlm Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z:m;Z; d dl<m=Z= e7e>Z?erd dl@mAZA G dd de#ZBeG dd dZCde4de	eDeCf fddZEde	eDeCf deCfd d!ZFG d"d# d#e(ZGeG d$d% d%e&ZHG d&d' d'e'eH ZIG d(d) d)e$ZJdS )*    N)defaultdict)contextmanager)	dataclass)TYPE_CHECKINGAnyDictListOptionalSetTupleType)MultiModalPlaceholderMap)"BatchDecodeWithPagedKVCacheWrapper)+CUDAGraphBatchDecodeWithPagedKVCacheWrapper!trtllm_batch_decode_with_kv_cache)#BatchPrefillWithPagedKVCacheWrapper)flash_attn_varlen_funci   z_FlashInfer is not installed. Please install it from https://github.com/flashinfer-ai/flashinfer)_custom_ops)AttentionBackendAttentionImplAttentionLayerAttentionMetadataAttentionMetadataBuilderAttentionStateAttentionType)PAD_SLOT_IDcompute_slot_mappingcompute_slot_mapping_start_idxis_block_tables_empty)	Attention)PagedAttention)
VllmConfigget_layers_from_vllm_config)init_logger)async_tensor_h2dget_kv_cache_torch_dtypemake_tensor_with_pad)use_trtllm_attention)ModelInputForGPUBuilderc                   @   s.  e Zd ZedefddZeded fddZeded fdd	Zeded
 fddZ	eded fddZ
edededededeedf f
ddZedeedf fddZedejdejdejddfddZedeej d ejddfd!d"Zedee fd#d$Zed%edejfd&d'ZdS )(FlashInferBackendreturnc                   C   s   dS )NZ
FLASHINFER r+   r+   r+   n/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/attention/backends/flashinfer.pyget_name;      zFlashInferBackend.get_nameFlashInferImplc                   C      t S N)r/   r+   r+   r+   r,   get_impl_cls?   r.   zFlashInferBackend.get_impl_clsr   c                   C   r0   r1   )FlashInferMetadatar+   r+   r+   r,   get_metadata_clsC   r.   z"FlashInferBackend.get_metadata_clsFlashInferMetadataBuilderc                   C   r0   r1   )r5   r+   r+   r+   r,   get_builder_clsG   r.   z!FlashInferBackend.get_builder_clsFlashInferStatec                   C   r0   r1   )r7   r+   r+   r+   r,   get_state_clsK   r.   zFlashInferBackend.get_state_cls
num_blocks
block_sizenum_kv_heads	head_size.c                 C   s   | d|||fS )N   r+   )r9   r:   r;   r<   r+   r+   r,   get_kv_cache_shapeO   s   z$FlashInferBackend.get_kv_cache_shapec                  C   s,   t  } | dv s
J | dkrd}|S d}|S )N)NHDHNDr?   )r      r=         )r   rA   rB   r=   rC   )r7   get_kv_cache_layout)cache_layoutstride_orderr+   r+   r,   get_kv_cache_stride_orderX   s   z+FlashInferBackend.get_kv_cache_stride_ordersrc_kv_cachedst_kv_cache
src_to_dstNc                 C   s   t | || d S r1   )r    swap_blocks)rH   rI   rJ   r+   r+   r,   rK   `   s   zFlashInferBackend.swap_blocks	kv_cachessrc_to_distsc                 C   s   t | | d S r1   )r    copy_blocks)rL   rM   r+   r+   r,   rN   h   s   zFlashInferBackend.copy_blocksc                   C   s   g dS )N)@         r+   r+   r+   r+   r,   get_supported_head_sizeso   s   z*FlashInferBackend.get_supported_head_sizeskv_cache_dtypec                 C   s*   | dv rt jS | dkrt jS td|  )N)fp8Zfp8_e4m3Zfp8_e5m2zUnrecognized FP8 dtype: )torchZfloat8_e4m3fnZfloat8_e5m2
ValueError)rS   r+   r+   r,   get_fp8_dtype_for_flashinfers   s
   z.FlashInferBackend.get_fp8_dtype_for_flashinfer)__name__
__module____qualname__staticmethodstrr-   r   r2   r4   r6   r8   intr   r>   rG   rU   TensorrK   r   rN   rR   dtyperW   r+   r+   r+   r,   r)   9   s^    
r)   c                   @   s.   e Zd ZU dZeed< ee ed< eed< dS )PerLayerParametersz
    Currently, FlashInfer backend only support models in which all layers share
    the same values for the following hyperparameters.
    window_leftlogits_soft_capsm_scaleN)rX   rY   rZ   __doc__r]   __annotations__r	   floatr+   r+   r+   r,   r`   }   s
   
 r`   vllm_configr*   c           
      C   sn   t | t}i }| D ])\}}|j}t|tsJ |j}|dur$|d nd}|j}|j}	t	|||	||< q|S )z`
    Scan all attention layers and determine some hyperparameters
    to use during `plan`.
    Nr   )
r"   r   itemsimpl
isinstancer/   sliding_windowrb   scaler`   )
rg   Zlayersper_layer_paramskeylayerrj   window_sizera   rb   rc   r+   r+   r,   get_per_layer_parameters   s   

rr   rn   c                 C   sF   t | dks
J dt|  }|d }|D ]
}||ks J dq|S )aJ  
    Currently, FlashInfer backend only support models in which all layers share
    the same values for the following hyperparameters:
    - `window_left`
    - `logits_soft_cap`
    - `sm_scale`

    So this function asserts that all layers share the same values for these
    hyperparameters and returns the global values.
    r   z'No attention layers found in the model.zFlashInfer backend currently only supports models in which all layers share the same values for the following hyperparameters: `window_left`, `logits_soft_cap`, `sm_scale`.)lenlistvalues)rn   Z
param_setsglobal_paramsparamsr+   r+   r,   infer_global_hyperparameters   s   
rx   c                   @   s   e Zd Zdd Zdd Zedd Zdd Zd	d
 Ze	de
fddZde
fddZ	dde
defddZ	ddefddZ	ddefddZdd ZdS )r7   c                 C   s8   || _ d| _d | _d | _d | _d | _| j j| _d | _d S )NF)runner_is_graph_capturing_workspace_buffer_decode_wrapper_prefill_wrapperglobal_hyperparametersrg   Z_kv_cache_layout)selfry   r+   r+   r,   __init__   s   

zFlashInferState.__init__c                 C   s(   | j d u rtjttj| jjd| _ | j S )Nr_   device)r{   rU   zeros FLASHINFER_WORKSPACE_BUFFER_SIZEZuint8ry   r   r   r+   r+   r,   _get_workspace_buffer   s   
z%FlashInferState._get_workspace_bufferc                  C   sP   ddl m}  | d urtd|  | S tj}|d u r td dS td| |S )Nr   )_KV_CACHE_LAYOUT_OVERRIDEzUsing KV cache layout %sz!Using default KV cache layout NHDr?   )Z vllm.v1.attention.backends.utilsr   loggerZ	info_onceenvsZVLLM_KV_CACHE_LAYOUT)r   rE   r+   r+   r,   rD      s   
z#FlashInferState.get_kv_cache_layoutc                 C   s$   | j d u rt|  |  | _ | j S r1   )r}   r   r   rD   r   r+   r+   r,   _get_prefill_wrapper   s
   
z$FlashInferState._get_prefill_wrapperc                 C   s^   | j d u r,| jj| jj}| jj| jj}tjp|| dk}t| 	 | 
 |d| _ | j S )NrC   )use_tensor_cores)r|   ry   model_configget_num_attention_headsparallel_configget_num_kv_headsr   "VLLM_FLASHINFER_FORCE_TENSOR_CORESr   r   rD   )r   num_qo_headsr;   r   r+   r+   r,   _get_decode_wrapper   s    

z#FlashInferState._get_decode_wrappermax_batch_sizec                 c   s    d| _ d | _tj|fttj| jjd| _tj	|tj
| jjd| _t| jjj| jjd| _|  | _tj|| jjj tj
| jjd| _tj|d tj
| jjd| _tj|tj
| jjd| _d V  d| _ | `| `| `| `| `| `| `| `d S )NTr   )r   rA   F)rz   _graph_decode_wrapperrU   fullr   longry   r   _graph_slot_mappingZonesint32_graph_seq_lens
from_numpygraph_block_tablesto_graph_block_tablesr   _graph_decode_workspace_bufferemptyZcache_configZnum_gpu_blocks_graph_indices_buffer_graph_indptr_buffer_graph_last_page_len_buffer)r   r   r+   r+   r,   graph_capture   sP   

zFlashInferState.graph_capture
batch_sizec                 C   s4   | j sJ | | j}| j|_| j|_|  |_|S r1   )	rz   	__class__ry   r   r{   r   r|   r   r}   )r   r   stater+   r+   r,   graph_clone  s   

zFlashInferState.graph_cloneFis_encoder_decoder_modelc                 C   s  | j sJ | jd |d  }| jd | }| jj| jj}| jj| jj}tj	p/|| dk}t
| j|| j||  || _| jjdrMt| jj}n
t| jj| jjj}tjd|d tjd}	tjd|tjd}
tj|f| jjtjd}tjd|d tjd}tt| j}| jjjd!i ddd| jd | dd d	d
ddd|ddddd| j d| j!d|	d|
d|d|d|d| jj" d| jjdd d|d| jj#d|d| jjjddd| jd d t$%|}|&  |S )"NrA   rC   rT   r   r_   num_prefillsslot_mapping"multi_modal_placeholder_index_mapsenable_kv_scales_calculationFnum_prefill_tokensnum_decode_tokensmax_prefill_seq_lenmax_decode_seq_lenseq_lens_tensorblock_tablespaged_kv_indptrpaged_kv_indicespaged_kv_last_page_lenr   r;   head_dim	page_sizeseq_start_locquery_start_locr   	data_typeq_data_typeuse_cuda_graphTdecode_wrapperprefill_wrapperr+   )'rz   r   r   ry   r   r   r   r   r   r   r   r   r   rD   r   rS   
startswithr)   rW   r%   r_   rU   Zaranger   r   r:   rx   rr   rg   Zattn_backendZmake_metadatar   r   r   get_head_sizer   dataclassesasdictbegin_forward)r   r   r   Z_indptr_bufferZ_last_page_len_bufferr   r;   r   rS   Zpaged_kv_indptr_tensor_hostZpaged_kv_indices_tensor_hostZ"paged_kv_last_page_len_tensor_hostZquery_start_loc_hostrv   attn_metadatar+   r+   r,   $graph_capture_get_metadata_for_batch   s   

	

z4FlashInferState.graph_capture_get_metadata_for_batchc                 C   s   |j |j|jdS )Nr   r   r   r   )r   r   r   r+   r+   r,   get_graph_input_buffersh  s   z'FlashInferState.get_graph_input_buffersc                 C   sJ   |j jjd }|d d | j|jdd |d d | j|jdd d S )Nr   r   TZnon_blockingr   )decode_metadatar   shapeZcopy_r   )r   Zinput_buffersr   r   Znum_total_blocksr+   r+   r,   prepare_graph_input_buffersq  s   
z+FlashInferState.prepare_graph_input_buffersc                 C   s   | j rJ | }|jj}|jjdk}|r?|r?|jd u r-|jjd }| jj|j	 |df j
}n|jjd }| jj|j	 |df j
}| |j_| |j_|j  d S )Nr   FT)rz   r   r   r   Zinputs_embedsinput_tokensr   ry   Zgraph_runnersZvirtual_engineZ
attn_stater   r   r   r   r   )r   Zmodel_inputr   r   Z	is_decoder   r+   r+   r,   r   }  s*   

zFlashInferState.begin_forwardN)F)rX   rY   rZ   r   r   r[   rD   r   r   r   r]   r   r   boolr   r   r   r   r+   r+   r+   r,   r7      s2    
!	
J

r7   c                       s  e Zd ZU eed< eed< dZee ed< dZeed< dZ	ee
 ed< dZee ed	< dZeej ed
< dZeej ed< dZeej ed< dZeej ed< dZeej ed< dZeej ed< dZeej ed< dZeej ed< dZee ed< dZee ed< dZee ed< dZee ed< dZejed< dZejed< edZejed< dZeed< dZ eed< dZ!ee" ed< dZ#ee" ed< d d! Z$d"d# Z%	d,d$ee&e'  d%e(e'e)f f fd&d'Z*e+d%ed  fd(d)Z,e+d%ed  fd*d+Z-  Z.S )-r3   r   r   rA   decode_query_lenTr   Nr   r   r   r   r   r   block_table_boundr   r   r   r   r;   r   r   r   r   cpur   Fis_profile_runrh   ra   rb   rc   c                 C   s@   t  }| jd ur| j|vrtd| dd| j dd S d S )NzOnly z are supported for head_dim,z
 received .)r)   rR   r   rV   )r   Zsupported_head_sizesr+   r+   r,   __post_init__  s   
z FlashInferMetadata.__post_init__c                 C   s`  | j dkr| jd u rd S | jd usJ | jd usJ | jd us!J | jd us(J | jd us/J | jd us6J | jd us=J | jd | jd  | _| jj	d d }|dksVJ | j
s| j| j| _| j| j| _| j| j| _| j| j| _| j| j| _| jj| j| jd | jd  | j| jd | j | j| j| j| jd| j| j| j| j| jd | jdkr.| jd usJ | jd usJ | jd usJ | j| j| _| j| j| _| j| j| _| jd ur| j| j| _| jd ur| j| j| _| jd usJ | jj| j| jd  | j| j| jd  | j| j| j| jd| j| j| j| j| jd d S d S )Nr   rA   T)causalrc   ra   rb   r   kv_data_typeNONE)Zpos_encoding_modera   rb   rc   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   Zplanr   r;   r   r   rc   ra   rb   r   r   r   r   )r   r   r+   r+   r,   r     s   




z FlashInferMetadata.begin_forwardskip_fieldsr*   c                    s.   |d u rt  }|d |d t |S )Nr   r   )setaddsuperasdict_zerocopy)r   r   r   r+   r,   r   6  s
   

z"FlashInferMetadata.asdict_zerocopyc                 C      | j dkrd S | S Nr   )r   r   r+   r+   r,   prefill_metadataA     
z#FlashInferMetadata.prefill_metadatac                 C   r   r   )r   r   r+   r+   r,   r   G  r   z"FlashInferMetadata.decode_metadatar1   )/rX   rY   rZ   r]   re   r   r	   r   r   r   r   r   r   r   rU   r^   r   r   r   r   r   r   r   r   r;   r   r   r   r_   r   r   r   ra   rb   rf   rc   r   r   r
   r\   r   r   r   propertyr   r   __classcell__r+   r+   r   r,   r3     sL   
 
J

r3   c                   @   sh   e Zd ZdddZdd Zddd	efd
dZdee defddZ	dee dee dedefddZ
dS )r5   input_builderr(   c                 C   s2   || _ |j| _|j| _|j| _d | _| jj| _d S r1   )r   ry   rl   r:   r~   rg   )r   r   r+   r+   r,   r   P  s   z"FlashInferMetadataBuilder.__init__c                 C   s   g | _ g | _g | _g | _g | _tt| _d| _d| _	d| _
g | _dg| _g | _d| _d| _| jd u rJtt| j}|| _|j| _|j| _|j| _d S d S )Nr   F)r   prefill_seq_lenscontext_lensr   curr_seq_lensr   r   multimodal_placeholder_mapsr   r   r   r   r   r   total_blocksr   r~   rx   rr   rg   ra   rb   rc   )r   Zinferred_paramsr+   r+   r,   prepare]  s0   
z!FlashInferMetadataBuilder.prepare
inter_dataz,ModelInputForGPUBuilder.InterDataForSeqGroupchunked_prefill_enabledc              
   C   s  |j }|j}|j}t|jdd |jD |j|j|j|j	|j
D ]\}}}}	}
}}| j	| |r[|j}|rF| D ]\}}| j| | q9|  jd7  _|  j|7  _| j| n|
dkshJ d|||
|  j|
7  _| j|	 g }|jr}|}n|s|s|dur|| | d }| j| t|}t||
|| j}t|| j||||| j|j |r|| _ dS || }| || qdS )zAdd a sequence group to the metadata. Specifically update/append
        1. context length.
        2. block table.
        3. slot mapping.
        c                 S   s   g | ]}t |qS r+   )rs   ).0tr+   r+   r,   
<listcomp>  s    z<FlashInferMetadataBuilder._add_seq_group.<locals>.<listcomp>rA   z+seq_len: {}, context_len: {}, query_len: {}N)	is_promptr   computed_block_numszipZseq_idsr   Zorig_seq_lensseq_lens
query_lensr   Zcurr_sliding_window_blocksappendZmulti_modal_placeholder_mapsri   r   extendr   r   r   formatr   r   Zprefix_cache_hitr   r   rl   r   r   r:   r   _update_paged_kv_tensors)r   r   r   r   r   r   Zseq_idZ	token_lenseq_lenZcurr_seq_lenZ	query_lenZcontext_lenZcurr_sliding_window_blockZmm_mapsmodalityZplaceholdersblock_tabler   Z	start_idxr+   r+   r,   _add_seq_group  sf   


z(FlashInferMetadataBuilder._add_seq_groupr   r   c                 C   s   |  j t|7  _ || j dkr|| j d n|| j }| j|d |  | j| jd |  || j }|dkr=| j}| j| d S )Nr   rA   rh   )r   rs   r:   r   r   r   r   r   )r   r   r   r   Zlast_page_lenr+   r+   r,   r     s   
z2FlashInferMetadataBuilder._update_paged_kv_tensorsr   r   cuda_graph_pad_sizer   c                 C   s  | j jD ]
}| || j j q| jj}|dk}t| jdd}t| jdd}	| j	}
t|| j
d dd}|r| jtg|  | jg |  || j }
| jjd| }|jd }t| jD ]#\}}|rt|}||krt|||d|f< q]|d| ||d|f< q]t|j|dd}| jd }| j|g|  | jdg|  n
t| jdtj|d}|dusJ t|tj|| jj}t|tj|| jj}t| jtj|| jj}tj|jd d tj|d	}tj|jd d tj|d	}d
d | j ! D }tj"|d|j#|dd d tj"|d|j#|dd d t| jdkrX| j$dg| j%t| j$   tj&| j$dtjd}tj&| jdtjd}tj&| jdtjd}tjt| jd dtjd}nd}d}d}d}| jj'(drpt)*| jj'}n
t+| jj'| jj,j#}t-d.i d|d| j
d|d|ddd| jd|
d|d|	d|d|d|d|d|d|d | jj,.| jj/d!| jj,0| jj/d"| jj,1 d#| j2d$|d%|d&|d'|d(| jj,j#d)|d*| j3d+| j4d,| j5d-| j6S )/a  Build attention metadata with on-device tensors.

        Args:
            seq_lens: The maybe padded sequence lengths of the input sequences.
            query_lens: The query lengths of the input sequences.
            cuda_graph_pad_size: The padding size for cuda graph.
                                 -1 if cuda graph is not used.
            batch_size: The maybe padded batch size.
        rh   r   )defaultNrA   Tr   )padr_   r   r   c                 S   s   i | ]	\}}||  qS r+   )Z	index_map)r   r   Zplaceholder_mapr+   r+   r,   
<dictcomp>)  s    z3FlashInferMetadataBuilder.build.<locals>.<dictcomp>)dimr_   outr   )r   r_   rT   r   r   r   r   r   Fr   r   r   r   r   r   r   r   r   r   r   r;   r   r   r   r   r   r   r   r   r   ra   rb   rc   r+   )7r   Zinter_data_listr   r   ry   r   maxr   r   r   r   r   r   r   r   r   r   r   	enumeraters   rU   r   r   r   r   r&   r]   r$   Z
pin_memoryr   r   r   r   ri   Zcumsumr_   r   r   tensorrS   r   r)   rW   r%   r   r3   r   r   r   r   r:   r   ra   rb   rc   )r   r   r   r   r   r   r   Zuse_captured_graphr   r   r   r   Zinput_block_tablesZ
max_blocksir   r9   r   Zlast_paged_kv_indptrr   Zquery_lens_tensorZslot_mapping_tensorr   r   Zplaceholder_index_mapsZpaged_kv_indices_tensorZpaged_kv_indptr_tensorZpaged_kv_last_page_len_tensorZblock_table_bound_tensorrS   r+   r+   r,   build  sL  












	

zFlashInferMetadataBuilder.buildN)r   r(   )rX   rY   rZ   r   r   r   r   r   r]   r   r  r+   r+   r+   r,   r5   N  s    
.
Cr5   c                   @   s   e Zd Zdejddfdededededeee  dee d	e	d
ee de	dee	 de
ddfddZ		ddedejdejdejdejdedeej deej dejfddZdS )r/   NF	num_headsr<   rm   r;   alibi_slopesrl   rS   rb   	attn_typekv_sharing_target_layer_name	use_iroper*   c                 C   s   |
d urt d|rtd || _|| _t|| _|| _|d ur)tj	|tj
d}|| _|d ur6|d dfnd| _|| _|| _| j| j | _|	tjkrOt dd S )Nz5KV sharing is not supported in V0 FLASHINFER backend.zgUsing irope in FlashInfer is not supported yet, it will fall back to global attention for long context.r   rA   r   )rh   rh   zaEncoder self-attention and encoder/decoder cross-attention are not implemented for FlashInferImpl)NotImplementedErrorr   Zwarning_oncer  r<   rf   rm   r;   rU   r  Zfloat32r	  rl   rS   rb   Znum_queries_per_kvr   DECODER)r   r  r<   rm   r;   r	  rl   rS   rb   r
  r  r  r+   r+   r,   r   z  s2   

zFlashInferImpl.__init__rp   queryro   valuekv_cacher   outputoutput_scalec	                 C   s6  |d urt d| j}	| j}
| j}| j}| j}| j}| j}| j}|j	\}}|
d|	|
}|
d||
}|
d||
}| dkrlt|||d d df |d d df |j ||j|j |drlt|}|
|}|j}|j}|j	d || ksJ d|j	 d| d| |j	d || ksJ d	|j	 d
| d| | }||d  }|d | }|d | }|d | }|j	d |ksJ |j	d |ksJ |d ur|d nd}d }|dkrtj|j	|j|jd}nd }t }|j }rT| dkrt||||j|j|j |j |d||d}nC|d usJ |j!d us J |j!j"s'J |j!j#|ks0J |j!j$|p7dks<J |j!j%|ksEJ |j!j&||j'| |j(|j)d}|j* }r|d usaJ |j+d usiJ |j+j#|ksrJ |j+j$|pydks~J |j+j%|ksJ t,||j-||j.|j|j/s|j+j&||j'| |j(|j)|d n$|j+j0}t12 dksJ t3||j'| ||j4|j5|j-|j(| |j)|d	 |d u r|d ur||}}n;|d u r|d ur||}}n+|d usJ |d usJ |d usJ |j6dksJ |7d}tj8||gdd}|
||S )NzAfused output quantization is not yet supported for FlashInferImplrh   r   rA   rT   zkey : z : #prefill tokens z : #decode tokens zvalue : z : #prefill toks z : #decode toks r   T)qkvZcu_seqlens_qZcu_seqlens_kZmax_seqlen_qZmax_seqlen_ksoftmax_scaler   rq   r	  g        )k_scalev_scale)r  r  r  r@   )	r  r  workspace_bufferr   r   Zmax_seq_lenZ
bmm1_scaleZ
bmm2_scaler  )r  )9r  r  r<   r;   rS   rm   rl   r	  rb   r   viewZnumelopsZreshape_and_cache_flashr   flattenZ_k_scaleZ_v_scaler   r)   rW   r   r   
contiguousrU   r   r_   r   rG   r   r   r   r   r   Z_causalZ_window_leftZ_logits_soft_capZ	_sm_scalerunZpermuteZ_k_scale_floatZ_v_scale_floatr   r   r'   r   r   r   Z_float_workspace_bufferr7   rD   r   r   r   r   Zsqueezecat)r   rp   r  ro   r  r  r   r  r  r  r<   r;   rS   r  rq   r	  rb   Z
num_tokensZhidden_sizeZtorch_dtyper   r   Zdecode_queryra   Zprefill_outputZdecode_outputrF   Zprefill_metaZdecode_metar  r+   r+   r,   forward  s  




	
zFlashInferImpl.forward)NN)rX   rY   rZ   r   r  r]   rf   r	   r   r\   r   r   r   rU   r^   r3   r!  r+   r+   r+   r,   r/   x  sf    
	

1	
r/   )Kr   collectionsr   
contextlibr   r   typingr   r   r   r   r	   r
   r   r   Zvllm.multimodalr   Z
flashinferr   Zflashinfer.decoder   r   Zflashinfer.prefillr   Zvllm.vllm_flash_attnr   r   ImportErrorrU   Z	vllm.envsr   Zvllmr   r  Z vllm.attention.backends.abstractr   r   r   r   r   r   r   Zvllm.attention.backends.utilsr   r   r   r   Zvllm.attention.layerr   Zvllm.attention.ops.paged_attnr    Zvllm.configr!   r"   Zvllm.loggerr#   Z
vllm.utilsr$   r%   r&   Zvllm.utils.flashinferr'   rX   r   Zvllm.worker.model_runnerr(   r)   r`   r\   rr   rx   r7   r3   r5   r/   r+   r+   r+   r,   <module>   sv   ($D



 [ 7  ,