o
    {qiok                     @  s   d Z ddlmZ ddlZddlZddlmZmZmZ ddl	m
Z
mZmZmZmZ ddlZddlZddlmZ ddlmZ ddlmZmZmZ dd	lmZmZmZmZmZ dd
lm Z  e!e"Z#dddZ$G dd deeZ%dS )z#Base classes for OpenAI embeddings.    )annotationsN)IterableMappingSequence)AnyLiteralOptionalUnioncast)
Embeddings)run_in_executor)from_envget_pydantic_field_namessecret_from_env)	BaseModel
ConfigDictField	SecretStrmodel_validator)Self	num_textsinttokenslist[Union[list[int], str]]batched_embeddingslist[list[float]]indices	list[int]
skip_emptyboolreturnlist[Optional[list[float]]]c           	        s"  dd t | D }dd t | D t t|D ]% |r%t|  dkr%q||   |   |   t|   qg }t | D ]J |  }t|dkrV|d  qDt|dkrd||d  qDt   fddt| D }tdd |D d	 |fd
d|D  qD|S )Nc                 S     g | ]}g qS  r#   .0_r#   r#   g/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/langchain_openai/embeddings/base.py
<listcomp>       z7_process_batched_chunked_embeddings.<locals>.<listcomp>c                 S  r"   r#   r#   r$   r#   r#   r'   r(   #   r)      r   c                   s,   g | ]}t d d t|  D  qS )c                 s  s    | ]	\}}|| V  qd S Nr#   )r%   valweightr#   r#   r'   	<genexpr>A   s    zA_process_batched_chunked_embeddings.<locals>.<listcomp>.<genexpr>)sumzip)r%   	embedding)inum_tokens_in_batchtotal_weightr#   r'   r(   @   s    c                 s  s    | ]}|d  V  qdS )   Nr#   r%   r,   r#   r#   r'   r.   H       z6_process_batched_chunked_embeddings.<locals>.<genexpr>g      ?c                   s   g | ]}|  qS r#   r#   r6   )	magnituder#   r'   r(   I   s    )rangelenappendr/   r0   )	r   r   r   r   r   results
embeddings_resultZaverager#   )r2   r8   r3   r4   r'   #_process_batched_chunked_embeddings   s.   	
r?   c                   @  s  e Zd ZU dZedddZded< edddZded< dZd	ed
< dZ	ded< 	 eZ
ded< eedddddZded< 	 ededdddZded< 	 eeddddZded< eeddddZded< dZded< 	 ed ed!dddZd"ed#< 	 ed$ed%d&gdddZded'< 	 dZd(ed)< dZd*ed+< d,Zded-< 	 d.Zded/< 	 edd0d1Zd2ed3< 	 dZded4< dZd5ed6< 	 dZded7< 	 d8Zd5ed9< 	 eedZd:ed;< 	 d8Zd5ed<< 	 dZ d=ed>< dZ!d?ed@< dAZ"dedB< 	 dCZ#dedD< 	 dZ$dEedF< 	 dZ%dEedG< 	 dZ&d5edH< 	 e'dIddJdKZ(e)dLdMe*dodPdQZ+e)dRdMdpdTdUZ,e-dqdVdWZ.drd[d\Z/dd]dsdadbZ0dd]dsdcddZ1	dtdudedfZ2	dtdudgdhZ3dvdkdlZ4dvdmdnZ5dS )wOpenAIEmbeddingsu	  OpenAI embedding model integration.

    Setup:
        Install ``langchain_openai`` and set environment variable ``OPENAI_API_KEY``.

        .. code-block:: bash

            pip install -U langchain_openai
            export OPENAI_API_KEY="your-api-key"

    Key init args — embedding params:
        model: str
            Name of OpenAI model to use.
        dimensions: Optional[int] = None
            The number of dimensions the resulting output embeddings should have.
            Only supported in ``'text-embedding-3'`` and later models.

    Key init args — client params:
        api_key: Optional[SecretStr] = None
            OpenAI API key.
        organization: Optional[str] = None
            OpenAI organization ID. If not passed in will be read
            from env var ``OPENAI_ORG_ID``.
        max_retries: int = 2
            Maximum number of retries to make when generating.
        request_timeout: Optional[Union[float, Tuple[float, float], Any]] = None
            Timeout for requests to OpenAI completion API

    See full list of supported init args and their descriptions in the params section.

    Instantiate:
        .. code-block:: python

            from langchain_openai import OpenAIEmbeddings

            embed = OpenAIEmbeddings(
                model="text-embedding-3-large"
                # With the `text-embedding-3` class
                # of models, you can specify the size
                # of the embeddings you want returned.
                # dimensions=1024
            )

    Embed single text:
        .. code-block:: python

            input_text = "The meaning of life is 42"
            vector = embeddings.embed_query("hello")
            print(vector[:3])

        .. code-block:: python

            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Embed multiple texts:
        .. code-block:: python

            vectors = embeddings.embed_documents(["hello", "goodbye"])
            # Showing only the first 3 coordinates
            print(len(vectors))
            print(vectors[0][:3])

        .. code-block:: python

            2
            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Async:
        .. code-block:: python

            await embed.aembed_query(input_text)
            print(vector[:3])

            # multiple:
            # await embed.aembed_documents(input_texts)

        .. code-block:: python

            [-0.009100092574954033, 0.005071679595857859, -0.0029193938244134188]

    NT)defaultexcluder   clientasync_clientztext-embedding-ada-002strmodelOptional[int]
dimensionszOptional[str]
deploymentZOPENAI_API_VERSION)rA   api_version)default_factoryaliasopenai_api_versionbase_urlZOPENAI_API_BASE)rL   rK   openai_api_baseZOPENAI_API_TYPE)rK   openai_api_typeZOPENAI_PROXYopenai_proxyi  r   embedding_ctx_lengthapi_keyZOPENAI_API_KEYzOptional[SecretStr]openai_api_keyorganizationZOPENAI_ORG_IDZOPENAI_ORGANIZATIONopenai_organizationz%Union[Literal['all'], set[str], None]allowed_specialz4Union[Literal['all'], set[str], Sequence[str], None]disallowed_speciali  
chunk_sizer5   max_retriestimeout)rA   rL   z0Optional[Union[float, tuple[float, float], Any]]request_timeoutheadersr   tiktoken_enabledtiktoken_model_nameFshow_progress_bardict[str, Any]model_kwargsr   zUnion[Mapping[str, str], None]default_headersz!Union[Mapping[str, object], None]default_query   retry_min_seconds   retry_max_secondszUnion[Any, None]http_clienthttp_async_clientcheck_embedding_ctx_lengthZforbidr#   )extraZpopulate_by_nameZprotected_namespacesbefore)modevaluesr    c              
   C  s   t | }|di }t|D ]*}||v rd| d}t|||vr8td| d| d| d ||||< q|| }|rLd| d	}t|||d< |S )
z>Build extra kwargs from additional params that were passed in.rb   zFound z supplied twice.z	WARNING! z/ is not default parameter.
                    zJ was transferred to model_kwargs.
                    Please confirm that z is what you intended.zParameters za should be specified explicitly. Instead they were passed in as part of `model_kwargs` parameter.)	r   getlist
ValueErrorwarningswarnpopintersectionkeys)clsro   Zall_required_field_namesrl   
field_namemsgZinvalid_model_kwargsr#   r#   r'   build_extra  s.   
zOpenAIEmbeddings.build_extraafterr   c           
   
   C  s|  | j dv rd}t|| jr| j nd| j| j| j| j| j| j	d}| j
rD| js,| jrD| j
}| j}| j}d|d|d|}t|| js| j
ro| jsozddl}W n tyf } zd	}t||d}~ww |j| j
d
| _d| ji}tjdi ||j| _| js| j
r| jszddl}W n ty } zd	}t||d}~ww |j| j
d
| _d| ji}	tjdi ||	j| _| S )z?Validate that api key and python package exists in environment.)ZazureZazure_adZazureadzEIf you are using Azure, please use the `AzureOpenAIEmbeddings` class.N)rS   rU   rN   r[   rZ   rc   rd   zwCannot specify 'openai_proxy' if one of 'http_client'/'http_async_client' is already specified. Received:
openai_proxy=z
http_client=z
http_async_client=r   zRCould not import httpx python package. Please install it with `pip install httpx`.)proxyri   r#   )rP   rr   rT   Zget_secret_valuerV   rO   r\   rZ   rc   rd   rQ   ri   rj   rC   httpxImportErrorZClientopenaiZOpenAIr=   rD   ZAsyncClientZAsyncOpenAI)
selfrz   Zclient_paramsrQ   ri   rj   r~   eZsync_specificZasync_specificr#   r#   r'   validate_environment!  st   




z%OpenAIEmbeddings.validate_environmentc                 C  s(   d| j i| j}| jd ur| j|d< |S )NrF   rH   )rF   rb   rH   )r   paramsr#   r#   r'   _invocation_params^  s   

z#OpenAIEmbeddings._invocation_paramstexts	list[str]<tuple[Iterable[int], list[Union[list[int], str]], list[int]]c                 C  s  g }g }| j p	| j}| js^zddlm} W n ty"   d}t|w |j|d}t|D ]/\}	}
|j	|
dd}t
dt|| jD ]}|||| j  }||}|| ||	 qAq-ngzt|}W n tys   td}Y nw dd	 | j| jd
 D }t|D ]>\}	}
| jdr|
dd}
|r|j	|
fi |}n||
}t
dt|| jD ]}||||| j   ||	 qq| jrzddlm} |t
dt||}W n ty   t
dt||}Y n	w t
dt||}|||fS )a  Take the input `texts` and `chunk_size` and return 3 iterables as a tuple.

        We have `batches`, where batches are sets of individual texts
        we want responses from the openai api. The length of a single batch is
        `chunk_size` texts.

        Each individual text is also split into multiple texts based on the
        `embedding_ctx_length` parameter (based on number of tokens).

        This function returns a 3-tuple of the following:

        _iter: An iterable of the starting index in `tokens` for each *batch*
        tokens: A list of tokenized texts, where each text has already been split
            into sub-texts based on the `embedding_ctx_length` parameter. In the
            case of tiktoken, this is a list of token arrays. In the case of
            HuggingFace transformers, this is a list of strings.
        indices: An iterable of the same length as `tokens` that maps each token-array
            to the index of the original text in `texts`.
        r   )AutoTokenizerzCould not import transformers python package. This is needed for OpenAIEmbeddings to work without `tiktoken`. Please install it with `pip install transformers`. )Zpretrained_model_name_or_pathF)Zadd_special_tokensZcl100k_basec                 S  s   i | ]\}}|d ur||qS r+   r#   )r%   kvr#   r#   r'   
<dictcomp>  s
    z.OpenAIEmbeddings._tokenize.<locals>.<dictcomp>)rW   rX   Z001
 )tqdm)r_   rF   r^   Ztransformersr   r   rr   Zfrom_pretrained	enumerateencoder9   r:   rR   decoder;   tiktokenZencoding_for_modelKeyErrorZget_encodingrW   rX   itemsendswithreplaceZencode_ordinaryr`   Z	tqdm.autor   )r   r   rY   r   r   Z
model_namer   rz   Z	tokenizerr2   textZ	tokenizedjZtoken_chunkZ
chunk_textencodingZencoder_kwargstokenr   _iterr#   r#   r'   	_tokenizee  sn   



zOpenAIEmbeddings._tokenize)rY   enginekwargsr   c                  s   |pj }i j|||\}}}g }	|D ](}
jjdd||
|
|  i}t|ts5| }|	dd |d D  qt	t
|||	|j}d d fdd	fd
d|D S )ac  Generate length-safe embeddings for a list of texts.

        This method handles tokenization and embedding generation, respecting the
        set embedding context length and chunk size. It supports both tiktoken
        and HuggingFace tokenizer based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
        inputc                 s      | ]}|d  V  qdS r1   Nr#   r%   rr#   r#   r'   r.     r7   z<OpenAIEmbeddings._get_len_safe_embeddings.<locals>.<genexpr>dataNr    list[float]c                    sD    d u r j jdddi} t| ts|  } | d d d   S Nr    r   r   r1   r#   )rC   create
isinstancedict
model_dumpZaverage_embedded_cached_empty_embeddingclient_kwargsr   r#   r'   empty_embedding  s   
zBOpenAIEmbeddings._get_len_safe_embeddings.<locals>.empty_embeddingc                   s   g | ]}|d ur
|n  qS r+   r#   r%   r   r   r#   r'   r(     s    z=OpenAIEmbeddings._get_len_safe_embeddings.<locals>.<listcomp>r#   r    r   )rY   r   r   rC   r   r   r   r   extendr?   r:   r   r   r   r   rY   r   _chunk_sizer   r   r   r   r2   responser=   r#   r   r   r   r   r'   _get_len_safe_embeddings  s&   


	z)OpenAIEmbeddings._get_len_safe_embeddingsc                  s   |pj }i j|tdj||I dH \}}}g }	tdt||D ]+}
jjdd||
|
|  iI dH }t|t	sD|
 }|	dd |d D  q%tt|||	|j}d d fd	d
fdd|D I dH S )a  Asynchronously generate length-safe embeddings for a list of texts.

        This method handles tokenization and asynchronous embedding generation,
        respecting the set embedding context length and chunk size. It supports both
        `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
        Nr   r   c                 s  r   r   r#   r   r#   r#   r'   r.     r7   z=OpenAIEmbeddings._aget_len_safe_embeddings.<locals>.<genexpr>r   r    r   c                    sL    d u r$j jdddiI d H } t| ts|  } | d d d   S r   )rD   r   r   r   r   r   r   r#   r'   r   %  s   

zCOpenAIEmbeddings._aget_len_safe_embeddings.<locals>.empty_embeddingc                   s&   g | ]}|d ur|n  I d H qS r+   r#   r   r   r#   r'   r(   0  s   $ z>OpenAIEmbeddings._aget_len_safe_embeddings.<locals>.<listcomp>r#   r   )rY   r   r   r   r9   r:   rD   r   r   r   r   r   r?   r   r   r#   r   r'   _aget_len_safe_embeddings  s,   



z*OpenAIEmbeddings._aget_len_safe_embeddingsc           
      K  s   |p| j }i | j|}| jsDg }tdt||D ](}| jjdd||||  i|}t|ts5|	 }|
dd |d D  q|S tt| j}	| j|f|	|d|S )	a  Call out to OpenAI's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.
            kwargs: Additional keyword arguments to pass to the embedding API.

        Returns:
            List of embeddings, one for each text.
        r   r   c                 s  r   r   r#   r   r#   r#   r'   r.   J  r7   z3OpenAIEmbeddings.embed_documents.<locals>.<genexpr>r   r   rY   Nr#   )rY   r   rk   r9   r:   rC   r   r   r   r   r   r
   rE   rI   r   
r   r   rY   r   Zchunk_size_r   r=   r2   r   r   r#   r#   r'   embed_documents2  s,   


z OpenAIEmbeddings.embed_documentsc           
        s   |p| j }i | j|}| jsHg }tdt||D ]+}| jjdd||||  i|I dH }t|ts9|	 }|
dd |d D  q|S tt| j}	| j|f|	|d|I dH S )	a  Call out to OpenAI's embedding endpoint async for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.
            kwargs: Additional keyword arguments to pass to the embedding API.

        Returns:
            List of embeddings, one for each text.
        r   r   Nc                 s  r   r   r#   r   r#   r#   r'   r.   l  r7   z4OpenAIEmbeddings.aembed_documents.<locals>.<genexpr>r   r   r#   )rY   r   rk   r9   r:   rD   r   r   r   r   r   r
   rE   rI   r   r   r#   r#   r'   aembed_documentsT  s.   


z!OpenAIEmbeddings.aembed_documentsr   r   c                 K  s   | j |gfi |d S )a  Call out to OpenAI's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.
            kwargs: Additional keyword arguments to pass to the embedding API.

        Returns:
            Embedding for the text.
        r   )r   )r   r   r   r#   r#   r'   embed_queryv  s   
zOpenAIEmbeddings.embed_queryc                   s$   | j |gfi |I dH }|d S )a	  Call out to OpenAI's embedding endpoint async for embedding query text.

        Args:
            text: The text to embed.
            kwargs: Additional keyword arguments to pass to the embedding API.

        Returns:
            Embedding for the text.
        Nr   )r   )r   r   r   r=   r#   r#   r'   aembed_query  s   
zOpenAIEmbeddings.aembed_query)ro   ra   r    r   )r    r   )r    ra   )r   r   rY   r   r    r   )
r   r   r   rE   rY   rG   r   r   r    r   r+   )r   r   rY   rG   r   r   r    r   )r   rE   r   r   r    r   )6__name__
__module____qualname____doc__r   rC   __annotations__rD   rF   rH   rI   r   rM   rO   rP   rQ   rR   r   rT   rV   rW   rX   rY   rZ   r\   r]   r^   r_   r`   r   rb   r   rc   rd   rf   rh   ri   rj   rk   r   Zmodel_configr   classmethodr{   r   propertyr   r   r   r   r   r   r   r   r#   r#   r#   r'   r@   N   s   
 R


	<
g98#
"r@   )r   r   r   r   r   r   r   r   r   r   r    r!   )&r   
__future__r   loggingrs   collections.abcr   r   r   typingr   r   r   r	   r
   r   r   Zlangchain_core.embeddingsr   Zlangchain_core.runnables.configr   Zlangchain_core.utilsr   r   r   Zpydanticr   r   r   r   r   Ztyping_extensionsr   	getLoggerr   loggerr?   r@   r#   r#   r#   r'   <module>   s     

9