o
    )ij                     @   s  d dl Z d dlmZmZ d dlmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZm Z m!Z!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z- d dl+m.Z/ d dl0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z; e1e<Z=de3ded de	e>e? e@f fddZAG dd de%ZBG dd deBZCdS )    N)AsyncGeneratorMapping)AnyFinalLiteralOptionalUnioncast)Request)assert_neveroverride)ModelConfig)EngineClient)ChatTemplateContentFormatOption)RequestLogger)EmbeddingChatRequestEmbeddingCompletionRequestEmbeddingRequestEmbeddingResponseEmbeddingResponseDataErrorResponse	UsageInfo)EmbeddingServeContextOpenAIServingRequestPromptServeContextTextTokensPrompt)OpenAIServingModels)EmbedsPrompt)TokensPrompt)init_logger)EmbeddingOutputEmbeddingRequestOutputPoolingOutputPoolingRequestOutputRequestOutput)PoolingParams)
chunk_listoutputencoding_format)floatbase64returnc                 C   sF   |dkr| j S |dkrtj| j dd }t|dS t| d S )Nr*   r+   float32Zdtypezutf-8)	embeddingnparraytobytesr+   	b64encodedecoder   )r(   r)   Zembedding_bytes r5   u/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/entrypoints/openai/serving_embedding.py_get_embedding+   s   r7   c                       sH  e Zd Z fddZededee fddZedede	e
ef fddZdefd	d
ZdefddZdedededeeedf  fddZdee dedef fddZdefddZdede	eef dededeeeef  dedee	eef df fddZededee f fdd Z ededee f fd!d"Z!  Z"S )#EmbeddingMixinc                    sJ   t  j|i | | jj}t|o|j| _|r |jr |j| _d S d | _d S N)super__init__model_configpooler_configboolZenable_chunked_processingsupports_chunked_processingmax_embed_len)selfargskwargsr=   	__class__r5   r6   r;   <   s   zEmbeddingMixin.__init__ctxr,   c                    s   t t|}zV| |j|_| j|jI d H }t|jtrC| j	|j||jj
|jjp,|j|jdd|j|jjd	I d H \}|_|_W d S | j|j||jj|j|jjdI d H \|_|_W d S  ttfy| } ztd | t|W  Y d }~S d }~ww )NF)chat_templatechat_template_content_formatZadd_generation_promptZcontinue_final_messagetruncate_prompt_tokensadd_special_tokens)rI   rJ   z$Error in preprocessing prompt inputs)r	   r   Z_maybe_get_adaptersrequestlora_requestengine_clientZget_tokenizer
isinstancer   Z_preprocess_chatmessagesrG   rH   rI   rJ   request_promptsengine_promptsZ_preprocess_completioninput
ValueError	TypeErrorlogger	exceptioncreate_error_responsestr)rA   rF   Z	tokenizer_er5   r5   r6   _preprocessG   sN   



zEmbeddingMixin._preprocessc                 C   s   g }d}t tt |j}t|D ]#\}}t|}t|t|j	|j
jd}|j}	|| |t|	7 }qt||d}
t|j|j|j||
dS )Nr   )indexr/   )Zprompt_tokensZtotal_tokens)idcreatedmodeldatausage)r	   listr$   final_res_batch	enumerater"   Z	from_baser   r7   outputsrK   r)   prompt_token_idsappendlenr   r   
request_idZcreated_time
model_name)rA   rF   itemsZnum_prompt_tokensZfinal_res_batch_checkedidxZ	final_resZembedding_resitemrf   ra   r5   r5   r6   _build_responseu   s6   

zEmbeddingMixin._build_responsec                 C   s   | j jS )z?Get the model's effective maximum sequence length for chunking.)r<   max_model_len)rA   r5   r5   r6   _get_max_position_embeddings   s   z+EmbeddingMixin._get_max_position_embeddingsc                 C   s   t |ttfo	| jS )z<Check if chunked processing should be used for this request.)rN   r   r   r?   )rA   rK   r5   r5   r6   _should_use_chunked_processing   s   z-EmbeddingMixin._should_use_chunked_processingoriginal_prompt
prompt_idxNc                    s   g }|d }|   }tt||D ]=\}	}
|j d| d|	 }t|
d}d}t||
d}| j||||jd | jj	||||j|t
|jdd	d
}|| q|S )z1Process a single prompt using chunked processing.rf   z-prompt--chunk-)rf    promptrf   paramsrL   priorityr   rL   trace_headersrz   )rp   rd   r'   ri   EngineTokensPromptr   _log_inputsrL   rM   encodegetattrrK   rg   )rA   rF   rr   pooling_paramsr|   rs   
generatorsZ	token_idsmax_pos_embeddingsZ	chunk_idxZchunk_tokensZchunk_request_idZchunk_engine_promptZ
chunk_textZchunk_request_promptZoriginal_generatorr5   r5   r6   _process_chunked_request   s@   		z'EmbeddingMixin._process_chunked_request	input_ids
input_textc                    s   t |}t|ttfrT| |}|  }| jdurd}| j}nd}| j}d}	d}
||kr6t|	j	|||d||krN|rDt
d|| n
t|
j	d||dt||d	S t |||S )
z>Override to support chunked processing for embedding requests.Nzmaximum embedding input lengthzmaximum context lengthzThis model's {length_type} is {max_length_value} tokens. However, you requested {token_num} tokens in the input for embedding generation. Please reduce the length of the input.zThis model's {length_type} is {max_length_value} tokens. However, you requested {token_num} tokens in the input for embedding generation. Please reduce the length of the input or enable chunked processing.)length_typemax_length_value	token_numzOInput length %s exceeds max_position_embeddings %s, will use chunked processingz"maximum position embeddings lengthrv   )rh   rN   r   r   rq   rp   r@   ro   rS   formatrU   infor   r:   _validate_input)rA   rK   r   r   r   Zenable_chunkedr   r   r   Zvalidation_error_msgZchunked_processing_error_msgrD   r5   r6   r      sR   

zEmbeddingMixin._validate_inputc                 C   s   t |tod|v od|vS )z?Check if a prompt is a TextTokensPrompt (has prompt_token_ids).rf   Zprompt_embeds)rN   dict)rA   rw   r5   r5   r6   _is_text_tokens_prompt  s   z%EmbeddingMixin._is_text_tokens_promptengine_promptrequest_promptr   r|   prompt_indexc              
      s\   |j  d| }| j||||jd ttttf |}| jj||||j|t	|j
dddS )zACreate a generator for a single prompt using standard processing.-rx   rz   r   r{   )ri   r~   rL   r	   r   r}   EngineEmbedsPromptrM   r   r   rK   )rA   rF   r   r   r   r|   r   Zrequest_id_itemr5   r5   r6   _create_single_prompt_generator!  s$   
z.EmbeddingMixin._create_single_prompt_generatorc              
      s  t t|}| |j}|st |I dH S g }z|jdu r!dn	| |jjI dH }| 	|}t
|tr8|W S z	|d| j W n ty[ } z| t|W  Y d}~W S d}~ww |jdu rg| dW S |jdu rr| dW S |  }t|jD ]H\}}	|j| }
| |
rt t|
}t|d |kr| |||||I dH }|| q{t tttf |	}| |||
|||I dH }|| q{ddlm } || |_!W dS  t"y } z| t|W  Y d}~S d}~ww )z'Override to support chunked processing.NembedEngine prompts not availableRequest prompts not availablerf   r   )merge_async_iterators)#r	   r   rq   rK   r:   _prepare_generatorsraw_requestZ_get_trace_headersheaders_create_pooling_paramsrN   r   verifyr<   rS   rW   rX   rQ   rP   rp   rd   r   r   rh   r   extendr   r}   r   r   rg   
vllm.utilsr   result_generator	Exception)rA   rF   use_chunkedr   r|   r   rZ   r   ir   r   Ztext_tokens_promptZchunk_generatorsZengine_prompt_typed	generatorr   rD   r5   r6   r   B  st   













z"EmbeddingMixin._prepare_generatorsc              
      s  t t|}z|jdu r| dW S | |j}|s&t j|dI dH W S |jdu r1| dW S |j	du r<| dW S i }i }|j	2 z3 dH W \}}d|j
v r|j
d}zt||dd	  }W n ttfyq   |}Y nw ||vrdd
d
|j
dd
 d||< || }	t|ts| dt|j   W S t|jdr|jj}
nt|jdr|jj}
n| dt|jj   W S t|
tjstj|
tjd}
|jdu r| d  W S t|j}|
jtjd| }|	d du r||	d< n|	d  |7  < |	d  |7  < |	d  d	7  < qC|j
d}zt|d }W n ttfy*   |}Y nw t t|||< qC6 g }t|j}t|D ]}||v r|| }	|	d }|	d }|durt|tjrt|tt fr|d
kr|| }t!|d}|j| }| "|s| d| d  W S t t#|d }t|	d ||dd}|$| q?| d|   W S ||v r|$t t||  q?| d|   W S t t%t&t'tf  ||_(W dS  t)y } z| t*|W  Y d}~S d}~ww )zCollect and aggregate batch results
        with support for chunked processing.
        
        For chunked requests, performs online aggregation to 
        minimize memory usage.
        For regular requests, collects results normally.
        Nr   )rF   r   zResult generator not availablert   r   rw      r   )weighted_sumtotal_weightchunk_countri   z9Expected PoolingRequestOutput for chunked embedding, got r`   r/   zUnsupported output type: r.   z6prompt_token_ids cannot be None for chunked processingr   r   r   )r`   zChunked prompt z is not a TextTokensPromptrf   ri   T)ri   rf   re   finishedz&Failed to aggregate chunks for prompt zResult not found for prompt )+r	   r   rQ   rW   rq   rK   r:   _collect_batchrP   r   ri   splitintr\   rS   
IndexErrorrN   r$   type__name__hasattrre   r`   r/   torchZTensorZtensorr-   rf   rh   toranger*   r#   r   r   rg   rb   r   r%   rc   r   rX   )rA   rF   r   Zprompt_aggregatorsZshort_prompts_resultsZ
result_idxresultpartsrs   Z
aggregatorZembedding_dataweightZweighted_embeddingrc   Znum_promptsr   r   Zfinal_embeddingZpooling_output_datarr   Zoriginal_token_idsZpooling_request_outputrZ   rD   r5   r6   r     s  















P










zEmbeddingMixin._collect_batch)#r   
__module____qualname__r;   r   r   r   r   r[   r   r   rn   r   rp   r>   rq   r   r   rb   r   r$   r   rX   r   r   r}   r   r   r&   r   r%   r   r   r   __classcell__r5   r5   rD   r6   r8   :   sx    -
$
2E

!Or8   c                       s   e Zd ZdZdedededee dee	 de
dd	f fd
dZ		ddedee deeef f fddZedee dee f fddZedee deeef f fddZ  ZS )OpenAIServingEmbeddingZembdrM   r<   modelsrequest_loggerrG   rH   r,   Nc                   s$   t  j||||d || _|| _d S )N)rM   r<   r   r   )r:   r;   rG   rH   )rA   rM   r<   r   r   rG   rH   rD   r5   r6   r;   K  s   

zOpenAIServingEmbedding.__init__rK   r   c                    sR   |  |j}| j d| ||j }t||||| j| jd}t 	|I dH S )z
        Embedding API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/embeddings/create
        for the API specification. This API mimics the OpenAI Embedding API.
        r   )rK   r   rj   ri   rG   rH   N)
Z_get_model_namer_   request_id_prefixZ_base_request_idri   r   rG   rH   r:   handle)rA   rK   r   rj   ri   rF   rD   r5   r6   create_embedding]  s   	z'OpenAIServingEmbedding.create_embeddingrF   c                    s"   t  | }r
|S |jj|_d S r9   )r:   _validate_requestrK   rI   )rA   rF   errorrD   r5   r6   r   x  s   
z(OpenAIServingEmbedding._validate_requestc              
      sb   t  |}t|tr|S z
|d| j W |S  ty0 } z| t|W  Y d }~S d }~ww )Nr   )	r:   r   rN   r   r   r<   rS   rW   rX   )rA   rF   r   rZ   rD   r5   r6   r     s   
z-OpenAIServingEmbedding._create_pooling_paramsr9   )r   r   r   r   r   r   r   r   r   rX   r   r;   r   r
   r   r   r   r   r   r   r   r&   r   r   r5   r5   rD   r6   r   H  sJ    	

r   )Dr+   collections.abcr   r   typingr   r   r   r   r   r	   numpyr0   r   Zfastapir
   Ztyping_extensionsr   r   Zvllm.configr   Zvllm.engine.protocolr   Zvllm.entrypoints.chat_utilsr   Zvllm.entrypoints.loggerr   Z vllm.entrypoints.openai.protocolr   r   r   r   r   r   r   Z&vllm.entrypoints.openai.serving_enginer   r   r   r   r   Z&vllm.entrypoints.openai.serving_modelsr   Zvllm.inputs.datar   r   r   r}   Zvllm.loggerr    Zvllm.outputsr!   r"   r#   r$   r%   Zvllm.pooling_paramsr&   r   r'   r   rU   rb   r*   rX   r7   r8   r   r5   r5   r5   r6   <module>   sD    $
    