o
    á)iÞY  ã                   @   s  U d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ eeƒZdZejed	< dZejed
< dZG dd„ dejƒZdejdee dedejdeej dejdejdedejfdd„Zdejdejdedejfdd„Z	 	 d5dejdejdeded edejfd!d"„Zdedee d#eeej f d$ej!dejf
d%d&„Z"dedee dejdejdeej dejded$ej!dejfd'd(„Z#e	j$dgd)d*d+„ ƒZ%e	j$dgd)d,ejfd-d.„ƒZ&e	j$dd gd)d/ejfd0d1„ƒZ'e	j$d2ejd,ejfd3d4„ƒZ(dS )6é    )ÚOptionalN)Úinit_logger)ÚtlÚtriton)ÚSamplingMetadata)Úapply_top_k_top_p)ÚSpecDecodeMetadataéÿÿÿÿÚPLACEHOLDER_TOKEN_IDÚGREEDY_TEMPERATUREé    c                   @   sb   e Zd ZdZdedeej dejdejdedejfdd	„Z	e
d
ejdedeee  fdd„ƒZdS )ÚRejectionSamplerau  
    The implementation strictly follows the algorithm described in
        https://arxiv.org/abs/2211.17192.
    However, we want to clarify the terminology used in the implementation:
    accepted tokens: tokens that are accepted based on the relationship
            between the "raw" draft and target probabilities.
    recovered tokens: tokens that are sampled based on the adjusted probability
        distribution, which is derived from both the draft and target
        probabilities.
    bonus tokens:
        If all proposed tokens are accepted, the bonus token is added to the
        end of the sequence. The bonus token is only sampled from the target
        probabilities. We pass in the bonus tokens instead of sampling them
        in the rejection sampler to allow for more flexibility in the
        sampling process. For example, we can use top_p, top_k sampling for
        bonus tokens, while spec decode does not support these sampling
        strategies.
    output tokens:
        Tokens are finally generated with the rejection sampler.
        output tokens = accepted tokens + recovered tokens + bonus tokens
    ÚmetadataÚdraft_probsÚtarget_logitsÚbonus_token_idsÚsampling_metadataÚreturnc              	   C   s>   |j tksJ ‚t||j|ƒ}t|j|j|j |j||||ƒ}|S )a*  
        Args:
            metadata:
                Metadata for spec decoding.
            draft_probs (Optional[torch.Tensor]):
                Probability distribution for the draft tokens. Shape is
                [num_tokens, vocab_size]. Can be None if probabilities are
                not provided, which is the case for ngram spec decode.
            target_logits (torch.Tensor):
                Target model's logits probability distribution.
                Shape is [num_tokens, vocab_size]. Here, probabilities from
                different requests are flattened into a single tensor because
                this is the shape of the output logits.
                NOTE: `target_logits` can be updated in place to save memory.
            bonus_token_ids_tensor (torch.Tensor):
                A tensor containing bonus tokens. Shape is [batch_size, 1].
                Bonus tokens are added to the end of the sequence if all
                proposed tokens are accepted. We generate the bonus tokens
                outside of the rejection sampler with the default sampling
                strategy. It allows for more flexibility in the sampling
                process such as top_p, top_k sampling.
            sampling_metadata (vllm.v1.sample.metadata.SamplingMetadata):
                Additional metadata needed for sampling, such as temperature,
                top-k/top-p parameters, or other relevant information.
        Returns:
            output_token_ids (torch.Tensor):
                A tensor containing the final output token IDs.
        )Úmax_spec_lenÚMAX_SPEC_LENÚcompute_probsÚcu_num_draft_tokensÚrejection_sampleÚdraft_token_idsÚnum_draft_tokens)Úselfr   r   r   r   r   Útarget_probsÚoutput_token_ids© r   úl/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/v1/sample/rejection_sampler.pyÚforward.   s"   'ýø
zRejectionSampler.forwardr   Ú
vocab_sizec                    s6   |   ¡  ¡ }|tk||k @ ‰ ‡ fdd„t|ƒD ƒ}|S )aÀ  Parse the output of the rejection sampler.

        Args:
            output_token_ids: The sampled token IDs in shape
                [batch_size, max_spec_len + 1]. The rejected tokens are
                replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler
                and will be filtered out in this function.
            vocab_size: The size of the vocabulary.

        Returns:
            A list of lists of token IDs.
        c                    s    g | ]\}}|ˆ |    ¡ ‘qS r   )Útolist)Ú.0ÚiÚrow©Z
valid_maskr   r   Ú
<listcomp>€   s    ÿÿz1RejectionSampler.parse_output.<locals>.<listcomp>)ÚcpuÚnumpyr
   Ú	enumerate)r   r!   Zoutput_token_ids_npZoutputsr   r&   r   Úparse_outputk   s   ÿ
þzRejectionSampler.parse_outputN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   ÚtorchÚTensorr   r    ÚstaticmethodÚintÚlistr+   r   r   r   r   r      s.    þüúø	÷

ö=ÿþ
ýr   r   r   r   r   r   r   r   r   r   c                 C   sv  | j dksJ ‚|d u s|j dksJ ‚|j dksJ ‚|j dks J ‚t|ƒ}| jd }	|jd }
|j}|  ¡ s7J ‚|d u sA| ¡ sAJ ‚| ¡ sGJ ‚| ¡ sMJ ‚|j|	|
fksVJ ‚tj||d ftj|d}| t	¡ |j
rnd }n|jtk}|js|jdd}t|f ||| ||||dd |j
r|S t|	||j|ƒ}t|||| ||||ƒ}t|f ||| ||||||||
|d u dd |S )	Né   é   r   r	   ©ÚdtypeÚdevice)Údim)Ú	num_warps)ÚNO_DRAFT_PROBSr;   )ÚndimÚlenÚshaper9   Zis_contiguousr0   ÚemptyZint32Zfill_r
   Ú
all_greedyÚtemperaturer   Z
all_randomÚargmaxÚrejection_greedy_sample_kernelÚgenerate_uniform_probsÚ
generatorsÚsample_recovered_tokensÚrejection_random_sample_kernel)r   r   r   r   r   r   r   r   Ú
batch_sizeÚ
num_tokensr!   r9   r   Ú	is_greedyZtarget_argmaxÚuniform_probsÚrecovered_token_idsr   r   r   r   ‡   sˆ   


ý

ø
ü	øór   Úlogitsc                 C   s¬   | j dksJ ‚|j dksJ ‚|jr| S | jd }t|j||tdd}|  | d¡¡ d}|jdur8t|j||ƒ}d}|j	durFt|j	||ƒ}t
| ||ƒ} | jdtjd}|S )aÉ  Compute probability distribution from logits based on sampling metadata.

    This function applies temperature scaling to the logits and converts
    them to probabilities using softmax. For greedy decoding, it returns
    the original logits.

    Args:
        logits: Input logits tensor to be converted to probabilities.
        cu_num_draft_tokens: Cumulative number of draft tokens.
        sampling_metadata: Metadata containing sampling parameters such as
            temperature and whether greedy sampling is used.

    Returns:
        torch.Tensor: Probability distribution (softmax of scaled logits)
            if non-greedy sampling is used, otherwise returns the
            original logits.
    r6   r5   r   )Úreplace_fromÚ
replace_tor	   N)r:   r8   )r=   rA   r?   Úexpand_batch_to_tokensrB   r   Zdiv_Z	unsqueezeÚtop_kÚtop_pr   Zsoftmaxr0   Úfloat32)rN   r   r   rJ   rB   rR   rS   Zoutput_probr   r   r   r   ë   s<   
û
ý
ýr   ÚxÚcu_num_tokensrJ   rO   rP   c              	   C   sF   | j d }|j d |ksJ ‚|  |¡}t|f || |||tdd |S )a  Expand [batch_size] tensor to [num_tokens] tensor based on the number of
    tokens per batch in cu_num_tokens.

    For example, if x = [a, b, c] and cu_num_tokens = [2, 5, 6], then
    num_tokens = 6, and expanded_x = [a, a, b, b, b, c].

    Args:
        x: [batch_size] tensor to expand.
        cu_num_tokens: [batch_size] tensor containing the cumulative number of
            tokens per batch. Each element represents the total number of
            tokens up to and including that batch.
        num_tokens: Total number of tokens.
        replace_from: int = 0
            Value to be replaced if it is found in x.
        replace_to: int = 0
            Value to replace with when replace_from is found.
    Returns:
        expanded_x: [num_tokens] tensor.
    r   r5   )ÚMAX_NUM_TOKENSr;   )r?   Z	new_emptyÚexpand_kernelr   )rU   rV   rJ   rO   rP   rI   Z
expanded_xr   r   r   rQ   (  s   

ù	rQ   rF   r9   c           
      C   sj   t j| ft j|d}d}t|ƒD ]"\}}|dkrq|| }| |¡}	|	dur0|||… j|	d |}q|S )a×  
    Generates a batch of uniform random samples, with optional seeding
    if available.

    This method creates a tensor of shape `(num_tokens, )` filled
    with uniform random values in the range [0, 1). If `generators` is provided,
    the requests with their own seeds will use the provided `torch.Generator`
    for reproducibility. The samples for the other requests will be generated
    without a seed.

    Args:
        num_tokens : int
            Total number of tokens.
        num_draft_tokens : List[List[int]]
            Number of draft tokens per request.
        generators : Optional[Dict[int, torch.Generator]]
            A dictionary mapping indices in the batch to
            `torch.Generator` objects.
        device : torch.device
            The device on which to allocate the tensor.
    Returns:
        uniform_rand : torch.Tensor
            A tensor of shape `(num_tokens, )` containing uniform
            random values in the range [0, 1).
    r7   r   N©Ú	generator)r0   ZrandrT   r*   ÚgetZuniform_)
rJ   r   rF   r9   rL   Ú	start_idxÚreq_idxÚnÚend_idxrZ   r   r   r   rE   Q  s   ý
rE   c                 C   sš   t |ƒ}|jd }	tj||	ftj|d}
|
 ¡  |j ¡ D ]\}}|| dkr/|
| j|d qt |¡}t	|| f ||||||
|	t
 |	¡|d u d	 |S )Nr	   r7   r   rY   )r<   )r>   r?   r0   r@   rT   Zexponential_rF   ÚitemsZ
empty_likeÚsample_recovered_tokens_kernelr   Znext_power_of_2)r   r   r   r   r   r   r   r9   rI   r!   Úqr$   rZ   rM   r   r   r   rG   ƒ  s2   
ý€

÷rG   )Zdo_not_specializec                 C   sü   t  d¡}|d u rd}nt  || ¡}|sd S |dkrd}	n	t  || d ¡}	t  || ¡}
|
|	 }d}t|ƒD ]*}|sbt  ||	 | ¡}t  ||	 | ¡}t  | ||d   | |¡ ||krbd}q8|s|t  || ¡}t  | ||d   | |¡ d S d S )Nr   Tr5   F©r   Ú
program_idÚloadÚrangeÚstore)Úoutput_token_ids_ptrÚcu_num_draft_tokens_ptrÚdraft_token_ids_ptrZtarget_argmax_ptrÚbonus_token_ids_ptrÚis_greedy_ptrr   r]   rK   r\   r_   r   ÚrejectedÚposÚdraft_token_idZtarget_argmax_idÚbonus_token_idr   r   r   rD   °  s>   

ÿ€ÿþýrD   r<   c                 C   sP  t  d¡}t  || ¡}|rd S |dkrd}n	t  || d ¡}t  || ¡}|| }d}t|ƒD ][}|sŒt  || | ¡}|rCd}nt  ||| |
  | ¡}t  ||| |
  | ¡}t  || | ¡}|dkrs|| |krs|}nd}t  || | ¡}t  | ||	d   | |¡ q1|s¦t  || ¡}t  | ||	d   | |¡ d S d S )Nr   r5   FTrc   )rh   ri   rj   Údraft_probs_ptrÚtarget_probs_ptrrk   Zrecovered_token_ids_ptrZuniform_probs_ptrrl   r   r!   r<   r]   rK   r\   r_   r   rm   rn   ro   Ú
draft_probÚtarget_probZuniform_probZtoken_idrp   r   r   r   rH   à  sV   

ÿþ
ÿþÿ€ÿþýrH   rW   c                 C   sŒ   t  d¡}|dkrd}n	t  || d ¡}t  || ¡}|| }	t  || ¡}
t  |
|k||
¡}
t  d|¡}t j| | | |
||	k d d S )Nr   r5   )Úmask)r   rd   re   ÚwhereÚarangerg   )Z
output_ptrZ	input_ptrZcu_num_tokens_ptrrO   rP   rW   r]   r\   r_   rJ   Zsrc_valÚoffsetr   r   r   rX     s   
	
þrX   ÚPADDED_VOCAB_SIZEc	                 C   sž  t  d¡}	|	dkrd}
n	t  ||	 d ¡}
t  ||	 ¡}||
 }t  d¡}||kr+d S t  d|¡}|rjt  ||
 | ¡}t  ||
| |  | ¡}t  ||
| |  | d¡ t j||
| |  | ||k dd}n,t j||
| |  | ||k dd}t j||
| |  | ||k dd}t  || d¡}t j||	|  | ||k tdƒd}t j|| dd}t  | |
 | |¡ |rÍt  ||
| |  | |¡ d S d S )Nr   r5   )ru   Úotherz-infr	   )Zaxis)r   rd   re   rw   rg   ÚmaximumÚfloatrC   )rh   ri   rj   rq   rr   Zq_ptrr!   ry   r<   r]   r\   r_   r   rn   Zvocab_offsetro   Z	orig_probZprobrs   rt   rb   Zrecovered_idr   r   r   ra   8  sh   

ÿþÿýÿý
ÿÿýþþþra   )r   r   ))Útypingr   r0   Ztorch.nnÚnnZvllm.loggerr   Zvllm.triton_utilsr   r   Zvllm.v1.sample.metadatar   Z$vllm.v1.sample.ops.topk_topp_samplerr   Zvllm.v1.spec_decode.metadatar   r,   Úloggerr
   Z	constexprÚ__annotations__r   r   ÚModuler   r1   r4   r3   r   r   rQ   ÚdictÚ	Generatorr9   rE   rG   ZjitrD   rH   rX   ra   r   r   r   r   Ú<module>   sÌ   pþüûù	÷õóò
ñdÿþý
üAûÿþýüû
ú)ÿþýü
û2ÿþüúø
öõô
ó-
/ô>úø	÷