o
    `+ i2                     @  s  d dl mZ d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZ d dlmZmZ d dlmZ erTd d	lmZmZmZ d d
lmZ eeZ edddZ!G dd deeZ"G dd de"Z#G dd de$e
Z%eddG dd dZ&dddZ'dS )    )annotationsN)ABCabstractmethod)	dataclass)Enum)TYPE_CHECKINGAnyCallableLiteralOptionalTypeVarUnion)BaseDocumentTransformerDocument)Self)
CollectionIterableSequence)SetTSTextSplitter)boundc                   @  s   e Zd ZdZddedddfd@ddZedAddZ	dBdCddZdDd"d#Z	dEd'd(Z
dFd+d,ZedGd0d1Zed2de d3fdHd;d<ZdId>d?ZdS )Jr   z)Interface for splitting text into chunks.i     FT
chunk_sizeintchunk_overlaplength_functionCallable[[str], int]keep_separator$Union[bool, Literal['start', 'end']]add_start_indexboolstrip_whitespacereturnNonec                 C  s~   |dkrd| }t ||dk rd| }t |||kr+d| d| d}t ||| _|| _|| _|| _|| _|| _dS )ad  Create a new TextSplitter.

        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                            in each corresponding chunk (True='start')
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                              every document
        r   zchunk_size must be > 0, got z chunk_overlap must be >= 0, got zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_functionZ_keep_separator_add_start_index_strip_whitespace)selfr   r   r   r   r    r"   msg r-   i/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/langchain_text_splitters/base.py__init__!   s$   


zTextSplitter.__init__textstr	list[str]c                 C  s   dS )z$Split text into multiple components.Nr-   )r+   r0   r-   r-   r.   
split_textI   s    zTextSplitter.split_textNtexts	metadatasOptional[list[dict[Any, Any]]]list[Document]c                 C  s   |pi gt | }g }t|D ]=\}}d}d}| |D ]/}	t|| }
| jr@|| | j }||	td|}||
d< t |	}t	|	|
d}|
| qq|S )z&Create documents from a list of texts.r   start_index)page_contentmetadata)len	enumerater3   copydeepcopyr)   r'   findmaxr   append)r+   r4   r5   Z
_metadatas	documentsir0   indexZprevious_chunk_lenchunkr:   offsetZnew_docr-   r-   r.   create_documentsM   s    	zTextSplitter.create_documentsrB   Iterable[Document]c                 C  s:   g g }}|D ]}| |j | |j q| j||dS )zSplit documents.)r5   )rA   r9   r:   rG   )r+   rB   r4   r5   docr-   r-   r.   split_documentsa   s
   
zTextSplitter.split_documentsdocs	separatorOptional[str]c                 C  s(   | |}| jr| }|dkrd S |S )N )joinr*   strip)r+   rK   rL   r0   r-   r-   r.   
_join_docsi   s   
zTextSplitter._join_docssplitsIterable[str]c           
      C  sz  |  |}g }g }d}|D ]}|  |}|| t|dkr|nd | jkr|| jkr6td| d| j  t|dkr| ||}	|	d urK||	 || jkse|| t|dkr[|nd | jkr|dkr||  |d t|dkrt|nd 8 }|dd  }|| jkse|| t|dkr|nd | jkr|dkse|| ||t|dkr|nd 7 }q| ||}	|	d ur||	 |S )Nr   zCreated a chunk of size z%, which is longer than the specified    )r(   r;   r&   loggerwarningrQ   rA   r'   )
r+   rR   rL   Zseparator_lenrK   Zcurrent_doctotald_lenrI   r-   r-   r.   _merge_splitsq   sN   







zTextSplitter._merge_splits	tokenizerr   kwargsc              
     sn   zddl m} t |sd}t|d fdd	}W n ty- } zd
}t||d}~ww | dd|i|S )z>Text splitter that uses HuggingFace tokenizer to count length.r   )PreTrainedTokenizerBasezATokenizer received was not an instance of PreTrainedTokenizerBaser0   r1   r#   r   c                   s   t  | S N)r;   tokenizer0   r[   r-   r.   _huggingface_tokenizer_length   s   zNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_lengthz`Could not import transformers python package. Please install it with `pip install transformers`.Nr   r0   r1   r#   r   r-   )Z$transformers.tokenization_utils_baser]   
isinstancer%   ImportError)clsr[   r\   r]   r,   rb   errr-   ra   r.   from_huggingface_tokenizer   s   

z'TextSplitter.from_huggingface_tokenizergpt2allencoding_name
model_nameallowed_special'Union[Literal['all'], AbstractSet[str]]disallowed_special&Union[Literal['all'], Collection[str]]r   c              
     s   zddl }W n ty } zd}t||d}~ww |dur$||n||d fdd	}	t| trD|| d
}
i ||
}| dd|	i|S )z9Text splitter that uses tiktoken encoder to count length.r   NzCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.r0   r1   r#   r   c                   s   t j|  dS N)rm   ro   )r;   encoder`   rm   ro   encr-   r.   _tiktoken_encoder   s   z=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder)rk   rl   rm   ro   r   rc   r-   )tiktokenre   encoding_for_modelget_encoding
issubclassTokenTextSplitter)rf   rk   rl   rm   ro   r\   rv   rg   r,   ru   extra_kwargsr-   rs   r.   from_tiktoken_encoder   s(   



	z"TextSplitter.from_tiktoken_encoderSequence[Document]c                 K  s   |  t|S )z2Transform sequence of documents by splitting them.)rJ   list)r+   rB   r\   r-   r-   r.   transform_documents   s   z TextSplitter.transform_documents)r   r   r   r   r   r   r   r   r    r!   r"   r!   r#   r$   r0   r1   r#   r2   r^   )r4   r2   r5   r6   r#   r7   )rB   rH   r#   r7   )rK   r2   rL   r1   r#   rM   )rR   rS   rL   r1   r#   r2   )r[   r   r\   r   r#   r   )rk   r1   rl   rM   rm   rn   ro   rp   r\   r   r#   r   )rB   r}   r\   r   r#   r}   )__name__
__module____qualname____doc__r;   r/   r   r3   rG   rJ   rQ   rZ   classmethodrh   setr|   r   r-   r-   r-   r.   r      s2    (


*,c                      s8   e Zd ZdZdde dfd fddZdddZ  ZS )rz   z/Splitting text to tokens using model tokenizer.ri   Nrj   rk   r1   rl   rM   rm   rn   ro   rp   r\   r   r#   r$   c           
   
     sz   t  jdi | zddl}W n ty" } zd}t||d}~ww |dur-||}	n||}	|	| _|| _|| _dS )zCreate a new TextSplitter.r   NzCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.r-   )	superr/   rv   re   rw   rx   
_tokenizer_allowed_special_disallowed_special)
r+   rk   rl   rm   ro   r\   rv   rg   r,   rt   	__class__r-   r.   r/      s   	


zTokenTextSplitter.__init__r0   r2   c                   s2   d
 fdd}t  j j jj|d}t||dS )a  Splits the input text into smaller chunks based on tokenization.

        This method uses a custom tokenizer configuration to encode the input text
        into tokens, processes the tokens in chunks of a specified size with overlap,
        and decodes them back into text chunks. The splitting is performed using the
        `split_text_on_tokens` function.

        Args:
            text (str): The input text to be split into smaller chunks.

        Returns:
            List[str]: A list of text chunks, where each chunk is derived from a portion
            of the input text based on the tokenization and chunking rules.
        _textr1   r#   	list[int]c                   s    j j|  j jdS rq   )r   rr   r   r   )r   r+   r-   r.   _encode  s
   z-TokenTextSplitter.split_text.<locals>._encode)r   tokens_per_chunkdecoderr   )r0   r[   N)r   r1   r#   r   )	Tokenizerr'   r&   r   r   split_text_on_tokens)r+   r0   r   r[   r-   r   r.   r3     s   zTokenTextSplitter.split_text)rk   r1   rl   rM   rm   rn   ro   rp   r\   r   r#   r$   r   )r   r   r   r   r   r/   r3   __classcell__r-   r-   r   r.   rz      s    rz   c                   @  s|   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdS )Languagez"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlZsolcsharpcobolcluaperlhaskellelixir
powershellZvisualbasic6N)r   r   r   r   ZCPPZGOZJAVAZKOTLINZJSr   PHPPROTOPYTHONZRSTZRUBYZRUSTZSCALAZSWIFTMARKDOWNZLATEXHTMLZSOLZCSHARPCOBOLCZLUAZPERLZHASKELLZELIXIRZ
POWERSHELLZVISUALBASIC6r-   r-   r-   r.   r   &  s:    r   T)frozenc                   @  s8   e Zd ZU dZded< 	 ded< 	 ded< 	 ded< d	S )
r   zTokenizer data class.r   r   r   zCallable[[list[int]], str]r   zCallable[[str], list[int]]rr   N)r   r   r   r   __annotations__r-   r-   r-   r.   r   F  s   
 r   r0   r1   r[   r#   r2   c                 C  s   g }| | }d}t||j t|}||| }|t|k rN||| |t|kr0	 |S ||j|j 7 }t||j t|}||| }|t|k s|S )z6Split incoming text and return chunks using tokenizer.r   )rr   minr   r;   rA   r   r   )r0   r[   rR   Z	input_idsZ	start_idxZcur_idxZ	chunk_idsr-   r-   r.   r   T  s   
r   )r0   r1   r[   r   r#   r2   )(
__future__r   r=   loggingabcr   r   dataclassesr   enumr   typingr   r   r	   r
   r   r   r   Zlangchain_core.documentsr   r   Ztyping_extensionsr   collections.abcr   r   r   r   AbstractSet	getLoggerr   rU   r   r   rz   r1   r   r   r   r-   r-   r-   r.   <module>   s*    $

 I@ 