o
    )iY                     @   s  U d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ eeZdZejed	< dZejed
< dZG dd dejZdejdee dedejdeej dejdejdedejfddZdejdejdedejfddZ	 	 d5dejdejdeded edejfd!d"Zdedee d#eeej f d$ej!dejf
d%d&Z"dedee dejdejdeej dejded$ej!dejfd'd(Z#e	j$dgd)d*d+ Z%e	j$dgd)d,ejfd-d.Z&e	j$dd gd)d/ejfd0d1Z'e	j$d2ejd,ejfd3d4Z(dS )6    )OptionalN)init_logger)tltriton)SamplingMetadata)apply_top_k_top_p)SpecDecodeMetadataPLACEHOLDER_TOKEN_IDGREEDY_TEMPERATURE    c                   @   sb   e Zd ZdZdedeej dejdejdedejfdd	Z	e
d
ejdedeee  fddZdS )RejectionSamplerau  
    The implementation strictly follows the algorithm described in
        https://arxiv.org/abs/2211.17192.
    However, we want to clarify the terminology used in the implementation:
    accepted tokens: tokens that are accepted based on the relationship
            between the "raw" draft and target probabilities.
    recovered tokens: tokens that are sampled based on the adjusted probability
        distribution, which is derived from both the draft and target
        probabilities.
    bonus tokens:
        If all proposed tokens are accepted, the bonus token is added to the
        end of the sequence. The bonus token is only sampled from the target
        probabilities. We pass in the bonus tokens instead of sampling them
        in the rejection sampler to allow for more flexibility in the
        sampling process. For example, we can use top_p, top_k sampling for
        bonus tokens, while spec decode does not support these sampling
        strategies.
    output tokens:
        Tokens are finally generated with the rejection sampler.
        output tokens = accepted tokens + recovered tokens + bonus tokens
    metadatadraft_probstarget_logitsbonus_token_idssampling_metadatareturnc              	   C   s>   |j tksJ t||j|}t|j|j|j |j||||}|S )a*  
        Args:
            metadata:
                Metadata for spec decoding.
            draft_probs (Optional[torch.Tensor]):
                Probability distribution for the draft tokens. Shape is
                [num_tokens, vocab_size]. Can be None if probabilities are
                not provided, which is the case for ngram spec decode.
            target_logits (torch.Tensor):
                Target model's logits probability distribution.
                Shape is [num_tokens, vocab_size]. Here, probabilities from
                different requests are flattened into a single tensor because
                this is the shape of the output logits.
                NOTE: `target_logits` can be updated in place to save memory.
            bonus_token_ids_tensor (torch.Tensor):
                A tensor containing bonus tokens. Shape is [batch_size, 1].
                Bonus tokens are added to the end of the sequence if all
                proposed tokens are accepted. We generate the bonus tokens
                outside of the rejection sampler with the default sampling
                strategy. It allows for more flexibility in the sampling
                process such as top_p, top_k sampling.
            sampling_metadata (vllm.v1.sample.metadata.SamplingMetadata):
                Additional metadata needed for sampling, such as temperature,
                top-k/top-p parameters, or other relevant information.
        Returns:
            output_token_ids (torch.Tensor):
                A tensor containing the final output token IDs.
        )max_spec_lenMAX_SPEC_LENcompute_probscu_num_draft_tokensrejection_sampledraft_token_idsnum_draft_tokens)selfr   r   r   r   r   target_probsoutput_token_ids r   l/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/v1/sample/rejection_sampler.pyforward.   s"   '
zRejectionSampler.forwardr   
vocab_sizec                    s6   |    }|tk||k @   fddt|D }|S )a  Parse the output of the rejection sampler.

        Args:
            output_token_ids: The sampled token IDs in shape
                [batch_size, max_spec_len + 1]. The rejected tokens are
                replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler
                and will be filtered out in this function.
            vocab_size: The size of the vocabulary.

        Returns:
            A list of lists of token IDs.
        c                    s    g | ]\}}| |    qS r   )tolist).0irowZ
valid_maskr   r   
<listcomp>   s    z1RejectionSampler.parse_output.<locals>.<listcomp>)cpunumpyr
   	enumerate)r   r!   Zoutput_token_ids_npZoutputsr   r&   r   parse_outputk   s   
zRejectionSampler.parse_outputN)__name__
__module____qualname____doc__r   r   torchTensorr   r    staticmethodintlistr+   r   r   r   r   r      s.    	

=
r   r   r   r   r   r   r   r   r   r   c                 C   sv  | j dksJ |d u s|j dksJ |j dksJ |j dks J t|}| jd }	|jd }
|j}|  s7J |d u sA| sAJ | sGJ | sMJ |j|	|
fksVJ tj||d ftj|d}|t	 |j
rnd }n|jtk}|js|jdd}t|f ||| ||||dd |j
r|S t|	||j|}t|||| ||||}t|f ||| ||||||||
|d u dd |S )	N      r   r	   dtypedevice)dim)	num_warps)NO_DRAFT_PROBSr;   )ndimlenshaper9   Zis_contiguousr0   emptyZint32Zfill_r
   
all_greedytemperaturer   Z
all_randomargmaxrejection_greedy_sample_kernelgenerate_uniform_probs
generatorssample_recovered_tokensrejection_random_sample_kernel)r   r   r   r   r   r   r   r   
batch_size
num_tokensr!   r9   r   	is_greedyZtarget_argmaxuniform_probsrecovered_token_idsr   r   r   r      s   





	r   logitsc                 C   s   | j dksJ |j dksJ |jr| S | jd }t|j||tdd}| |d d}|jdur8t|j||}d}|j	durFt|j	||}t
| ||} | jdtjd}|S )a  Compute probability distribution from logits based on sampling metadata.

    This function applies temperature scaling to the logits and converts
    them to probabilities using softmax. For greedy decoding, it returns
    the original logits.

    Args:
        logits: Input logits tensor to be converted to probabilities.
        cu_num_draft_tokens: Cumulative number of draft tokens.
        sampling_metadata: Metadata containing sampling parameters such as
            temperature and whether greedy sampling is used.

    Returns:
        torch.Tensor: Probability distribution (softmax of scaled logits)
            if non-greedy sampling is used, otherwise returns the
            original logits.
    r6   r5   r   )replace_from
replace_tor	   N)r:   r8   )r=   rA   r?   expand_batch_to_tokensrB   r   Zdiv_Z	unsqueezetop_ktop_pr   Zsoftmaxr0   float32)rN   r   r   rJ   rB   rR   rS   Zoutput_probr   r   r   r      s<   


r   xcu_num_tokensrJ   rO   rP   c              	   C   sF   | j d }|j d |ksJ | |}t|f || |||tdd |S )a  Expand [batch_size] tensor to [num_tokens] tensor based on the number of
    tokens per batch in cu_num_tokens.

    For example, if x = [a, b, c] and cu_num_tokens = [2, 5, 6], then
    num_tokens = 6, and expanded_x = [a, a, b, b, b, c].

    Args:
        x: [batch_size] tensor to expand.
        cu_num_tokens: [batch_size] tensor containing the cumulative number of
            tokens per batch. Each element represents the total number of
            tokens up to and including that batch.
        num_tokens: Total number of tokens.
        replace_from: int = 0
            Value to be replaced if it is found in x.
        replace_to: int = 0
            Value to replace with when replace_from is found.
    Returns:
        expanded_x: [num_tokens] tensor.
    r   r5   )MAX_NUM_TOKENSr;   )r?   Z	new_emptyexpand_kernelr   )rU   rV   rJ   rO   rP   rI   Z
expanded_xr   r   r   rQ   (  s   

	rQ   rF   r9   c           
      C   sj   t j| ft j|d}d}t|D ]"\}}|dkrq|| }||}	|	dur0||| j|	d |}q|S )a  
    Generates a batch of uniform random samples, with optional seeding
    if available.

    This method creates a tensor of shape `(num_tokens, )` filled
    with uniform random values in the range [0, 1). If `generators` is provided,
    the requests with their own seeds will use the provided `torch.Generator`
    for reproducibility. The samples for the other requests will be generated
    without a seed.

    Args:
        num_tokens : int
            Total number of tokens.
        num_draft_tokens : List[List[int]]
            Number of draft tokens per request.
        generators : Optional[Dict[int, torch.Generator]]
            A dictionary mapping indices in the batch to
            `torch.Generator` objects.
        device : torch.device
            The device on which to allocate the tensor.
    Returns:
        uniform_rand : torch.Tensor
            A tensor of shape `(num_tokens, )` containing uniform
            random values in the range [0, 1).
    r7   r   N	generator)r0   ZrandrT   r*   getZuniform_)
rJ   r   rF   r9   rL   	start_idxreq_idxnend_idxrZ   r   r   r   rE   Q  s   
rE   c                 C   s   t |}|jd }	tj||	ftj|d}
|
  |j D ]\}}|| dkr/|
| j|d qt|}t	|| f ||||||
|	t
|	|d u d	 |S )Nr	   r7   r   rY   )r<   )r>   r?   r0   r@   rT   Zexponential_rF   itemsZ
empty_likesample_recovered_tokens_kernelr   Znext_power_of_2)r   r   r   r   r   r   r   r9   rI   r!   qr$   rZ   rM   r   r   r   rG     s2   


rG   )Zdo_not_specializec                 C   s   t d}|d u rd}nt || }|sd S |dkrd}	n	t || d }	t || }
|
|	 }d}t|D ]*}|sbt ||	 | }t ||	 | }t | ||d   | | ||krbd}q8|s|t || }t | ||d   | | d S d S )Nr   Tr5   Fr   
program_idloadrangestore)output_token_ids_ptrcu_num_draft_tokens_ptrdraft_token_ids_ptrZtarget_argmax_ptrbonus_token_ids_ptris_greedy_ptrr   r]   rK   r\   r_   r   rejectedposdraft_token_idZtarget_argmax_idbonus_token_idr   r   r   rD     s>   

rD   r<   c                 C   sP  t d}t || }|rd S |dkrd}n	t || d }t || }|| }d}t|D ][}|st || | }|rCd}nt ||| |
  | }t ||| |
  | }t || | }|dkrs|| |krs|}nd}t || | }t | ||	d   | | q1|st || }t | ||	d   | | d S d S )Nr   r5   FTrc   )rh   ri   rj   draft_probs_ptrtarget_probs_ptrrk   Zrecovered_token_ids_ptrZuniform_probs_ptrrl   r   r!   r<   r]   rK   r\   r_   r   rm   rn   ro   
draft_probtarget_probZuniform_probZtoken_idrp   r   r   r   rH     sV   


rH   rW   c                 C   s   t d}|dkrd}n	t || d }t || }|| }	t || }
t |
|k||
}
t d|}t j| | | |
||	k d d S )Nr   r5   )mask)r   rd   re   wherearangerg   )Z
output_ptrZ	input_ptrZcu_num_tokens_ptrrO   rP   rW   r]   r\   r_   rJ   Zsrc_valoffsetr   r   r   rX     s   
	
rX   PADDED_VOCAB_SIZEc	                 C   s  t d}	|	dkrd}
n	t ||	 d }
t ||	 }||
 }t d}||kr+d S t d|}|rjt ||
 | }t ||
| |  | }t ||
| |  | d t j||
| |  | ||k dd}n,t j||
| |  | ||k dd}t j||
| |  | ||k dd}t || d}t j||	|  | ||k tdd}t j|| dd}t | |
 | | |rt ||
| |  | | d S d S )Nr   r5   )ru   otherz-infr	   )Zaxis)r   rd   re   rw   rg   maximumfloatrC   )rh   ri   rj   rq   rr   Zq_ptrr!   ry   r<   r]   r\   r_   r   rn   Zvocab_offsetro   Z	orig_probZprobrs   rt   rb   Zrecovered_idr   r   r   ra   8  sh   


ra   )r   r   ))typingr   r0   Ztorch.nnnnZvllm.loggerr   Zvllm.triton_utilsr   r   Zvllm.v1.sample.metadatar   Z$vllm.v1.sample.ops.topk_topp_samplerr   Zvllm.v1.spec_decode.metadatar   r,   loggerr
   Z	constexpr__annotations__r   r   Moduler   r1   r4   r3   r   r   rQ   dict	Generatorr9   rE   rG   ZjitrD   rH   rX   ra   r   r   r   r   <module>   s   p	
d
A
)
2

-
/>	