o
    )i.                     @   sV  d dl Z d dlmZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. ee/Z0dZ1G dd dZ2dej3de.de4ej3ej3f fddZ5dS )    N)replace)Optional)	Attention)CompilationLevel
VllmConfigget_layers_from_vllm_config)get_pp_group)set_forward_context)init_logger)	get_model)supports_multimodal)Eagle3LlamaForCausalLM)current_platform)is_pin_memory_available)FlashAttentionMetadata)AiterFlashAttentionMetadata)TreeAttentionMetadataTreeAttentionMetadataBuilder)TritonAttentionMetadata)CommonAttentionMetadata)KVCacheConfig)SamplingMetadatac                   @   s   e Zd Z	d"dedejfddZ	d"dejdejdejd	ejd
ede	de
eej  dejfddZdedejdejdejd
edeej fddZd
edejdeeejf fddZdejddfddZe deddfddZdeddfd d!ZdS )#EagleProposerNvllm_configdevicec           
      C   s  || _ |j| _| jj| _| jj| _|| _|jj| _|jj| _|jj	| _	| jj
| _
|jj| _t| j| _| j | _|jj| _| j jjtjkoL| j jj | _tt| j jj| _tj| jtj |d| _!tj| jtj"|d| _#tj| j| jf| j|d| _$|jj%}tj|d |tj d| _tj| j| jf| j|d| _&| jj'}t()|| _*t+| j*d }dg| }| j*D ]}|t+|d   d7  < q|d g| _,|d g| _-t.d|D ]}	| j,/| j,d ||	   | j-/||	 ||	d    qtjdt+| j*d |tj d0|d| _1d S )N)dtyper      r   r   r   r   )2r   speculative_configdraft_model_configmethodrunnermodel_configr   max_model_lenZcache_config
block_sizenum_speculative_tokensZscheduler_configZmax_num_batched_tokensZmax_num_tokensnparangetoken_arange_npZget_hidden_sizehidden_sizeis_multimodal_modelZcompilation_configlevelr   Z	PIECEWISEZenforce_eageruse_cuda_graphlistreversedZcudagraph_capture_sizescudagraph_batch_sizestorchzerosint32	input_idsZint64	positionshidden_statesZmax_num_seqsinputs_embedsZspeculative_token_treeastliteral_evalZtree_choiceslencu_drafts_per_levelchild_drafts_per_levelrangeappendrepeattree_draft_pos_offsets)
selfr   r   r"   Zmax_batch_sizeZspec_token_tree
tree_depthZnum_drafts_per_levelnoder,    rD   e/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/v1/spec_decode/eagle.py__init__'   s   










zEagleProposer.__init__target_token_idstarget_positionstarget_hidden_statesnext_token_idscommon_attn_metadatasampling_metadata	mm_embedsreturnc              	   C   s  |j d }|j d }	|jdd  d }
| jdkr0t| jts J | j|}|j d | jks0J |dd  | jd |d < || j|
< | j	d usIJ | j	j
d d jj|dd}i }| jD ]}|||< q\| jrt|| jd krt| j|}n|}|| jd |< || jd |< | jr| jd | }| jj||pd d}|| jd |< | jd | }d }n	d }| jd | }t|| j|d' | j|| jd | | jd | |d}| jd	kr|}n|\}}W d    n1 sw   Y  ||
 }| j|d }||
 }||
 }t|tr| j|	||||d
}tj|ddS |jdd}| jdkr%|ddS t ! r6t|t"t#t$fs5J nt|t$s>J |g}| jrT|	| jd krT| j|	}n|	}|	|_%d|_&| j'd |	d  |_t(| jd D ]}|d ) }|d7 }|| j*k}t+|d|}| j,d7  _,| j-d7  _-t.|j,| j*|_,|j-/|d || j0 }|j1j2d|ddd}|d}|| j0 || j0  |_3|j3/|t4 || jd |	< || jd |	< || jd |	< | jr| j|}|| jd |	< | jd | }d }n	d }| jd | }t|| j|d | j|| jd | | jd | |d\}}W d    n	1 s,w   Y  |d |	 }| j|d |	 d }|jdd}|5| qmtj6|dd}|S )Nr   r   eagle3r   rK   Zdraft_index)Zmultimodal_embeddings
num_tokensr4   r5   r6   r7   Zdeepseek_mtp)
batch_sizelogitsr5   r6   rK   dimrW   index)7shapequery_start_locr!   
isinstancemodelr   Zcombine_hidden_statesr*   r4   r"   attn_groupsmetadata_builderbuild_for_draftingattn_layer_namesr-   r0   r   pad_for_cudagraphr5   r6   r+   Zget_input_embeddingsr7   r	   compute_logitsr   propose_treer1   catargmaxr&   viewr   Zis_rocmr   r   r   num_actual_tokensmax_query_lenr(   r=   intr$   wheremax_seq_lenseq_lensminmasked_fill_r%   block_tablegatherslot_mappingPADDING_SLOT_IDr>   stack)rA   rG   rH   rI   rJ   rK   rL   rM   rR   rT   Zlast_token_indicesattn_metadataper_layer_attn_metadata
layer_namenum_input_tokensr4   r7   Zret_hidden_stateslast_hidden_statesr6   Zsample_hidden_statesrU   r5   draft_token_ids_listdraft_token_idsZinput_batch_size_exceeds_max_model_lenZclamped_positionsblock_numbers	block_idsrD   rD   rE   propose|   s  










	zEagleProposer.proposerT   rU   r5   r6   c           "   	   C   s*  | j jd d j}t|tsJ | jd }|}| jd }	|	dkr+|jdd|d}
nt	j
||	ddj|d}
|
g}||dd}t	jd| jj| jjd}t	jd| jj| jjd}t	jd| jj| jjd}||d| jd |d d f  }t| j}t|d D ]}||d  }|| | jk}t	|d||d}|dkr|j|dd}|	dkr|j|	dd}t	j||
gdd}t	j||gdd}t	j||gdd}|}t||| jd |d   |j| || |d}|j||d d}i }| jD ]}|||< qt|j| j|_|j |d |d d ||| f }|| j! }|j"j#d|d}|| j! || j!  }t$||< |d|_%|j&}|d}|| jd |< |d| jd |< ||d| jd |< | j'ro|| j(d kro| j)*|}n|}t+|| j)|d	" | j,| jd | | jd | | jd | d d
\} }W d    n	1 sw   Y  |d | ||dd d | d f }| d | ||dd d | d f }!| j,-|!.|| dd }| j|d  }	|	dkr|jdd|d}
nt	j
||	ddj|d}
|/|
 | j|d  | }| j|d  }q|S )Nr   r   r   rV   r   )r[   rm   rh   ri   rP   rX   rQ   rS   )0r"   r^   r_   r\   r   r;   r<   rf   rg   r1   Ztopkindicesemptyr4   r   r   r5   r6   r@   r:   r=   r$   rk   Zrepeat_interleavere   r   r(   rm   r`   ra   rn   rl   ro   r%   rp   rq   rs   rr   rh   r-   r0   r   rb   r	   r]   rc   Zreshaper>   )"rA   rT   rU   r5   r6   rK   Ztree_attn_metadata_builderZtotal_num_draftsZlevel_num_draftsZnum_childrenr{   rz   Zdraft_hidden_statesZtree_input_idsZtree_positionsZtree_hidden_statesZflattened_draft_positionsrB   r,   Zdraft_positionsr}   Z	query_lenru   rv   rw   Zquery_positionsr~   r   rr   rR   r4   rx   ry   Zdraft_last_hidden_statesrD   rD   rE   rd   H  s4  














zEagleProposer.propose_treenum_rejected_tokensc                 C   s$  |j j}|j}|j| }|dd |dd  }|| }| }tj|jtjt	 d}	|	 }
t
j||
dd d |
d }t
|
dd |}| jd| | }t
|dd  |}|| }t|j|dd}t|	j|dd|j|dd|	||j|j||  |j|j| dd}||fS )	a&  
        This function is used to prepare the inputs for the spec decode.
        It updates to the common_attn_metadata to account for the rejected
        tokens (and newly sampled tokens). It also returns the token indices
        of the tokens that should be fed to the speculator.
        r   Nr   )r   Z
pin_memory)outT)Znon_blocking)r[   rm   query_start_loc_cpuseq_lens_cpunum_computed_tokens_cpunum_reqsrh   ri   block_table_tensorrr   Zcausal)r[   r   r   r   numpyr1   r2   rZ   r3   r   r'   Zcumsumr?   r)   Z
from_numpytor   r   r   maxitemr   rr   )rA   rK   r   r   r   Znew_seq_lens_cpuZnew_query_len_per_reqZnew_num_tokens_per_reqZnew_num_tokens_per_req_npZnew_query_start_loc_cpuZnew_query_start_loc_npZtotal_num_tokensZnew_query_start_locs_expandedZtoken_offestsZold_query_start_locs_expandedZtoken_indices_npZtoken_indicesZspec_common_attn_metadatarD   rD   rE   prepare_inputs  s`   



zEagleProposer.prepare_inputstarget_modelc                 C   s.  | j jj}tt| j t }ddlm} |d t	| j |d| _
W d    n1 s,w   Y  t| j t | }t|| _t|rP|jj| j
j_| }n|}t jdkrv| j
j
jjj|j
jjjkrvtd | j
j
`|j
j| j
j
_ntd | j jjdkrt|d	rtd
 |j| j
_d S d S d S )Nr   )set_model_tagZ
eagle_head)r   r#   r   zNAssuming the EAGLE head shares the same vocab embedding with the target model.zQThe EAGLE head's vocab embedding will be loaded separately from the target model.rO   lm_headz4Loading EAGLE LM head weights from the target model.)r   r   r    setr   r   keysZvllm.compilation.backendsr   r   r]   r.   ra   r   configZimage_token_indexZget_language_modelr   Z
world_sizeZembed_tokensweightrZ   loggerinfor!   hasattrr   )rA   r   r    Ztarget_attn_layer_namesr   Zdraft_attn_layer_namesZtarget_language_modelrD   rD   rE   
load_modelP  sP   






zEagleProposer.load_modelrR   c                 C   s   t d | j|d2 | jrd }| jd | }n	| jd | }d }| j|| jd | | jd | |d W d    d S 1 s=w   Y  d S )NrQ   rS   )r	   r   r+   r7   r4   r]   r5   r6   )rA   rR   r4   r7   rD   rD   rE   	dummy_run  s   "zEagleProposer.dummy_runkv_cache_configc                    sX   i  t |jD ]\}}|jD ]}| |< qqtt fdd| jD dks*J ddS )z
        Validate that all eagle layers belong to the same KVCacheGroup.
        Need this assumption to ensure all eagle layers can use the
        same AttentionMetadata.
        May extend to multiple AttentionMetadata in the future.
        c                    s   g | ]} | qS rD   rD   ).0rw   kv_cache_groupsrD   rE   
<listcomp>  s    z>EagleProposer.validate_same_kv_cache_group.<locals>.<listcomp>r   z9All eagle layers should belong to the same kv cache groupN)	enumerater   Zlayer_namesr:   r   ra   )rA   r   idZkv_cache_grouprw   rD   r   rE   validate_same_kv_cache_group  s   

z*EagleProposer.validate_same_kv_cache_group)N)__name__
__module____qualname__r   r1   r   rF   Tensorr   r   r   r.   r   rj   rd   tupler   nnModuler   Zinference_moder   r   r   rD   rD   rD   rE   r   %   sv    
a	

 M	

 *
_1r   rU   rL   rN   c                 C   s   |j r| }| jdd}||fS |jdk}t|d|j}| |dd | jdtjd}t	|}|
  ||jddd}|jsR|jdd}t|||}||fS )Nr   rV   g      ?r   )rW   r   )Z
all_greedyrf   temperaturer1   rk   Zdiv_rg   ZsoftmaxZfloat32Z
empty_likeZexponential_divZ
all_random)rU   rL   ZprobsrJ   Z	is_greedyr   qZgreedy_token_idsrD   rD   rE   #compute_probs_and_sample_next_token  s&   

r   )6r8   dataclassesr   typingr   r   r'   r1   Ztorch.nnr   Zvllm.attention.layerr   Zvllm.configr   r   r   Zvllm.distributed.parallel_stater   Zvllm.forward_contextr	   Zvllm.loggerr
   Z vllm.model_executor.model_loaderr   Zvllm.model_executor.modelsr   Z'vllm.model_executor.models.llama_eagle3r   Zvllm.platformsr   Z
vllm.utilsr   Z%vllm.v1.attention.backends.flash_attnr   Z(vllm.v1.attention.backends.rocm_aiter_far   Z$vllm.v1.attention.backends.tree_attnr   r   Z&vllm.v1.attention.backends.triton_attnr   Z vllm.v1.attention.backends.utilsr   Zvllm.v1.kv_cache_interfacer   Zvllm.v1.sample.metadatar   r   r   rs   r   r   r   r   rD   rD   rD   rE   <module>   sL        