o
    81 iw                     @   s  d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z
mZmZmZ d dlZd dlm  mZ d dlmZmZ d dlmZ d dlmZmZmZ z
d d	lmZmZ W n eyn   ed
ddgZedddgZY nw eG dd dZdd Z dd Z!d.ddZ"e# 									d/ddZ$d.ddZ%e# 										d0dd Z&G d!d" d"Z'ej(fd#ee)ef fd$d%Z*eG d&d' d'Z+e# 	(			)d1d*d+Z,	)d2d,d-Z-dS )3    N)
namedtuple)	dataclassfield)partial)CallableOptionalSequenceUnion)	rearrangerepeat)Tensor)ProfilerActivityprofilerecord_function)GreedySearchDecoderOnlyOutputSampleDecoderOnlyOutputr   	sequencesscoresr   c                   @   sd   e Zd ZU dZeed< eed< dZeed< dZeed< ee	dZ
e	ed< d	Zee ed
< dd Zd	S )InferenceParamszInference parameters that are passed to the main model in order
    to efficienly calculate and store the context during inference.
max_seqlenmax_batch_sizer   seqlen_offsetbatch_size_offsetdefault_factorykey_value_memory_dictNlengths_per_samplec                 C   s.   || _ || _d| _| jd ur| j  d S d S )Nr   )r   r   r   r   Zzero_)selfr   r    r   g/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/flash_attn/utils/generation.pyreset#   s   
zInferenceParams.reset)__name__
__module____qualname____doc__int__annotations__r   r   r   dictr   r   r   r   r    r   r   r   r   r      s   
 r   c                 C   s,   | t | |d d k }| |td dS )z<Set the logits for none top-k values to -inf. Done in-place.r   ).Nz-InfN)torchtopkmasked_fill_float)logitstop_kindices_to_remover   r   r   !modify_logits_for_top_k_filtering-   s   r0   c                 C   sh   |dks|dkr
dS t j| dd\}}|jddjdd}|d| k}|d||}| |td	 dS )
z<Set the logits for none top-p values to -inf. Done in-place.              ?NF)Z
descendingr(   dim   z-inf)r)   sortsoftmaxZcumsumZscatterr+   r,   )r-   top_pZsorted_logitsZsorted_indicesZcumulative_probsZsorted_indices_to_remover/   r   r   r   !modify_logits_for_top_p_filtering5   s   r9   r5   r1   r2   c                 C   s   |dkr
| j ddS |dkr|dksJ d|dkrVt|| d}tj| |dd\}}|dkr4|| }t|| |tj|jd |jdtj	tj
|dddd	jddf S |dkr^| | n|  }t|| tj	tj
|dddd	jddS )
zfSample from top-k logits.
    Arguments:
        logits: Tensor of shape (batch_size, vocab_size)
    r5   r(   r3   r1   r2   top-p should be in (0, 1].r   deviceZnum_samples)Zargmaxminsizer)   r*   r9   arangeshaper<   multinomialr7   squeezeclone)r-   r.   r8   temperatureZ
logits_topindicesr   r   r   sampleE   s(   

rG   Fc                    s  | j \ }durj d ndr1tdsd_tj ||	d_jj}|  nt d} 
fdd}	fd	d
}fdd}tjj	|d}tjj	|d}|rn|	dkrjtj
  |  g | g}}||d |s|||d | | j|d j d 7  _|||d | ||d |r{|r|  |	dkrtj
  tj  td||dd dkrtnt}|tj|ddt|dS )a  Decoding, either greedy or with top-k or top-p sampling.
    If top-k = 0, don't limit the number of candidates (pure sampling).
    Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first,
    then top-p.
    We assume that all sequences in the same batch have the same length.

    Arguments:
        input_ids: (batch, seq_len)
        max_length: int
        teacher_outputs (optional): (batch, seq_len). If provided, instead of sampling from the
            logits, the next token is taken from the teacher_outputs. Useful for testing.
    Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields:
        sequences: (batch, max_length)
        scores: tuples of (batch, vocab_size)
    Nr5   r   _decoding_cache)tensor_parallelr   r   c                    s   |j dk}|rtj df|j tj| jd}nd }r|s*| ||ddjjdd}nj| ||j jdd}d urC|dd f S |S )Nr   r5   dtyper<   position_idsinference_paramsnum_last_tokensr3   .)	r   r)   fulllongr<   r-   rC   rH   run)	input_idsrO   decodingrN   r-   )
batch_sizecgmodel
vocab_sizer   r   
get_logits   s0   
zdecode.<locals>.get_logitsc                    s@   d u s	 |j krt| d}n	d d |j f }|dS )Nr.   r8   rE   r5   )r   rG   	unsqueeze)r-   rO   token)teacher_output_lenteacher_outputsrE   r.   r8   r   r   sample_tokens   s   
zdecode.<locals>.sample_tokensc                    s<   |j dkrdS  d ur|  k rdS |j d krdS dS )Nr   FTr5   )r   all)current_tokenrO   )eos_token_id
max_lengthr   r   should_stop   s   
zdecode.<locals>.should_stop)enable_timingr(   #Prompt processing + decoding time: .0fmsr3   r   r   )rA   hasattrrH   update_graph_cacherO   r    r   r)   cudaEventdistributedbarrierrecordappendr   synchronizeprintZelapsed_timer   r   cattuple)rT   rX   rd   r.   r8   rE   rc   r_   rY   rI   rW   rf   	seqlen_ogrO   rZ   r`   re   startendr   r   
output_clsr   )rV   rW   rc   rd   rX   r^   r_   rE   r.   r8   rY   r   decodeb   sN   

	


r{   c                 C   s  | j \}}}|d }	|j ||	|fksJ |j ||	fksJ |jtjtjfv s(J |dkr4|dks4J d|dkr<| | n|  } |dkrH|| n| }|dkrbt|| d}t| | t|| t	| | t	|| tj
| dd}
tj
|dd}dd	 }tj||	|
jd
||| ||
ddddf |k}|jdd}t||	| jdd}tj|
ddddf | dd}tj||
ddddf gdd}t|jdt|d|ddd}tj|ddjdd}t|d}||dd|f< ||d fS )a  Algorithm 1 from [1]
    [1] Fast Inference from Transformers via Speculative Decoding
    Yaniv Leviathan, Matan Kalman, Yossi Matias
    https://arxiv.org/abs/2211.17192

    Arguments:
        logits: Tensor of shape (batch_size, seqlen + 1, vocab_size)
        logits_draft: Tensor of shape (batch_size, seqlen, vocab_size)
        tokens_draft: Tensor of shape (batch_size, seqlen)
    Return:
        tokens: Tensor of shape (batch_size, seqlen + 1)
        num_generated_tokens: Tensor of shape (batch_size), with value in [1, seqlen + 1].
            For each sequence in the batch, the number of valid tokens that were sampled by
            speculative sampling.
    r5   r1   r2   r:   r   r(   r3   c                 S   s   t | jdt |dddS )Nr(   z... -> ... 1r4   indexz... 1 -> ...)r
   gather)probstokensr   r   r   <lambda>   s    z$sample_speculative.<locals>.<lambda>r;   N)r>   z
b -> b 1 d)dr|   zb 1 d -> b dr=   )r   r5   )rA   rL   r)   Zint64int32rD   r>   r?   r0   r9   r7   Zrandr<   ra   wherer%   Zargminclampru   r
   r~   r   rB   rC   Fpad)r-   Zlogits_drafttokens_draftr.   r8   rE   batchZ
seqlen_p_1rY   seqlenr   Zprobs_draftr~   acceptedZaccepted_allZfirst_rejected_idxZ
probs_diffZresample_probsZresampler   r   r   r   sample_speculative   sF   



""r      c           /   
      s
  | j \}}|dksJ d|du sJ d|r[t|dsd|_t||j|||d|
d|_|jj}||| t|ds>d|_t||j|||td|d |
d|_|jj}||| nt||d	}t||d	}d% fdd	}d&dd}t|||d}t	t
fi |}t	|||d}t	|||d}t	||||d}t	||||d}|rddlm} |d}|r|
dkrtj  tj  t }| gg }}d}d} g }!||d kr|| dd\}"}#||" ||# nt||| d }$|| |$d\}%}&| |$7 } |r |tj| |%gdd|$d dj}'t|&|'ddddf     |tj| |%gdd||$d d}(|d7 }|rQ|tj| |%gdd|$d dj})t|(|)    t|(|&|%fi |\}"}*|!|*d  |rot|" t|* ||"ddd|*d f  ||(ddd|*d f  |*d  }+||+ d |_|+dkr|jd n|j|_|rtj| |d gdd},||,|*d  d dj}-t|d |-ddddf     	 |j|d krn.|j|d kr||d ddddf dd\}"}#||" ||# nt|||j d }$||d ddddf |$d\}%}&| |$7 } |rM|tj|,|%gdd|$d dj}'t|&|'ddddf     |tj|d ddddf |%gdd||$d d}(|d7 }|r|tj|,|%gdd|$d dj})t|(|)    t|(|&|%fi |\}"}*|!|*d  |rt|" t|* ||"ddd|*d f  ||(ddd|*d f  |*d  }+| j|+7  _|+dkr|jd n|j|_|rtj|,|d gdd},||,|*d  d dj}-t|d |-ddddf     q|rL|
dkrtj  tj  tdt | d dd td|  td t|!  |  d! d"d# tj|dd}tj|dd}|rv||j}-t||-dd|d df     |dkr}tnt }.|.||d$S )'a  
    TD: WIP, for my own understanding, lightly tested. Only support batch_size == 1 for now.

    Speculative decoding, either greedy or with top-k or top-p sampling.
    If top-k = 0, don't limit the number of candidates (pure sampling).
    Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first,
    then top-p.
    We assume that all sequences in the same batch have the same length.

    Arguments:
        input_ids: (batch, seq_len)
        max_length: int
    Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields:
        sequences: (batch, max_length)
        scores: tuples of (batch, vocab_size)
    r5   z>Speculative decoding implementation only supports batch_size=1Nz@Speculative decoding implementation doesn't support eos_token_idrH   )r5      )decoding_seqlensrI   r   rJ   Fc           
         s   |j dk}|r0| jd }	 tj| jd f|j tj| jd}|d d d f tj|tj| jd }nd }|r6|s@|| |||dj	}	n|| jd ksIJ |j
| ||j d d | d f }	 d urg|	dd  f S |	S )Nr   r5   TrK   rM   .)r   rA   r)   rQ   r   r<   r   r@   rR   r-   rH   rS   )
rT   rO   rX   rP   rW   rU   r   Zcache_seqlensrN   r-   rY   r   r   rZ   P  s<   




	z&decode_speculative.<locals>.get_logitsc                 S   s   |dksJ | gg }}t |D ]*}|||d |dddf  | j|d jd 7  _|||d d qtj|dd ddtj|ddfS )a  Sample `num_tokens` tokens from the model, given the previous logits.
        Also return the logits of the sampled tokens.
        Arguments:
            input_ids: (batch, seqlen)
        Return:
            tokens: (batch, num_tokens)
            scores: (batch, num_tokens), which contains @previous_logits and the logits of the next
                (num_tokens - 1) tokens. The logits of the last token isn't computed.
        r5   r(   Nr3   )rangerr   r   rA   r\   r)   ru   stack)rT   get_logits_fnrO   	sample_fn
num_tokensr   r   ir   r   r   r`   u  s   
 $z)decode_speculative.<locals>.sample_tokensr[   )rX   rW   )r   r   rO   r   )AutoTokenizerZgpt2)r   r3   )rP   r(   Trg   i  rh   ri   zNumber of calls to main model: zAcceptance rate: d   z.2f%rj   )r5   Fr5   )!rA   rk   rH   rl   rO   r    r   r   r'   r   rG   Ztransformersr   Zfrom_pretrainedr)   ro   rp   rm   rs   timerr   r>   ru   r-   rt   absmaxr   itemr   sumr   r   )/rT   rX   Zmodel_draftrd   Zspeculative_lookaheadr.   r8   rE   rc   rY   rI   rW   rf   debugrV   rw   Zinference_params_draftrO   rZ   r`   Zsampling_kwargsr   Zget_logits_mainZget_logits_draftZsample_tokens_mainZsample_tokens_draftr   Z	tokenizerrx   r   r   Znum_main_model_callsZnum_draft_tokensZnum_accepted_tokens_historyr   Z
scores_newZn_spec_tokensr   Zscores_draftZscores_draft_refr-   Z
logits_refZnum_generated_tokensZnum_generatedZcur_idsZ
scores_refrz   r   r   r   decode_speculative  sn  
!


	
%



$


($

	
$$


(D


"
(r   c                   @   s*   e Zd Zd
ddZ					ddd	ZdS )GenerationMixinNc                 K   s   t N)NotImplementedError)r   rV   r   rL   kwargsr   r   r   allocate_inference_cache7  s   z(GenerationMixin.allocate_inference_cacher5   r1   r2   Fc           
      K   s4   t || |f|||d|}	|sd |	_|r|	S |	jS )Nr[   )r{   r   r   )
r   rT   rd   r.   r8   rE   Zreturn_dict_in_generateZoutput_scoresr   outputr   r   r   generate:  s   zGenerationMixin.generater   )r5   r1   r2   FF)r!   r"   r#   r   r   r   r   r   r   r   6  s    
r   layersc                    sN   t jt jt jfv sJ | |d||ft|trt|} fdd|D S )Nr   c                    s   i | ]}|t j d qS ))r<   rL   )r)   empty).0r   r<   rL   Zkv_cache_shaper   r   
<dictcomp>Z  s    z,allocate_inference_cache.<locals>.<dictcomp>)r)   float16Zbfloat16Zfloat32
isinstancer%   r   )r   r   Znheadsheaddimr   r<   rL   r   r   r   r   M  s
   	
r   c                   @   sd   e Zd ZU dZeed< dZeed< dZdZe	e
dZe
ed< dZdZee ed< dZee ed< dS )	DecodingCGCacher   r   r   Nr   	callablesrO   rS   )r!   r"   r#   r   r%   r&   r   r<   rL   r   r'   r   mempoolrO   r   r   rS   r   r   r   r   r   r   ]  s   
 r   r   r   c	              
      sv   d u rt   tt|  }	|	j}
|d u r|	j}|
|f j jfks-| jks-| jkri  _d  _	d  _
t  |
| _ _|| _ _t| drU| |||}nt| jd| jj| jj }t||| jj| || jj|
|}tj|f|tj|
d}t|||||d _
tjj  _	|D ]}||f jvrt|  j
||| j	|d j||f< q fdd}| _d j
_ S )	Nr   Zhead_dimrK   )r   r   r   r   r   )decoding_seqlenr   	n_warmupsc                    s(   | j d d \}} j||f | ||S )Nr   )rA   r   )rT   rN   r   rV   r   cacher   r   dispatch  s   z$update_graph_cache.<locals>.dispatchr   )r   nextiter
parametersr<   rL   r   r   r   r   rO   gcZcollectrk   r   getattrconfigZhidden_sizeZnum_attention_headsZnum_hidden_layersr)   rQ   r   r   rm   ZgraphsZgraph_pool_handlecapture_graphrS   r   )rX   r   rV   rw   r   r   rI   rL   r   Zparam_exampler<   Z	inf_cacher   r   r   r   r   r   r   rl   i  sp   



	
rl   c                    s^  t t|  j}tj||fdtj|dtj||fdtj|dj}|| _jjd d < tj	
 }	|	tj	  tj	|	& t|D ]}
| |djqI|	  tj rctj  W d    n1 smw   Y  tj	 |	 tj	  tj	j |d | |djW d    n1 sw   Y   fdd}|_|S )Nr   rK   rM   )poolc                    s2   |j d d < |  |     S r   )r   Zcopy_ZreplayrD   )Znew_input_idsZnew_position_idsr   graphrO   rT   r-   rN   r   r   rS     s
   

zcapture_graph.<locals>.run)r   r   r   r<   r)   rQ   rR   r   r   rm   ZStreamZwait_streamZcurrent_streamstreamr   r-   rs   ro   Zis_initializedrp   Z	CUDAGraphr   )rX   rO   rV   r   r   r   r   r<   Zseqlen_offset_ogs_rS   r   r   r   r     sL   




r   )r5   r1   r2   )	r5   r1   r2   NNNr5   FF)
r   r5   r1   r2   NNr5   FFF)r   r5   Nr   )r5   Nr   ).r   r   collectionsr   dataclassesr   r   	functoolsr   typingr   r   r   r	   r)   Ztorch.nn.functionalnnZ
functionalr   Zeinopsr
   r   r   Ztorch.profilerr   r   r   Ztransformers.generationr   r   ImportErrorr   r0   r9   rG   Zinference_moder{   r   r   r   r   r%   r   r   rl   r   r   r   r   r   <module>   s~   

n;  +

L