o
    {qib                     @  sj   d dl mZ d dlZd dlmZmZmZmZ d dlm	Z	m
Z
 G dd de
ZdddZG dd de
ZdS )    )annotationsN)AnyLiteralOptionalUnion)LanguageTextSplitterc                      s2   e Zd ZdZ		dd fddZdddZ  ZS )CharacterTextSplitterz(Splitting text that looks at characters.

F	separatorstris_separator_regexboolkwargsr   returnNonec                   s"   t  jdi | || _|| _dS )Create a new TextSplitter.N )super__init__
_separator_is_separator_regex)selfr   r   r   	__class__r   i/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/langchain_text_splitters/character.pyr      s   
zCharacterTextSplitter.__init__text	list[str]c                   sh    j r jnt j}t|| jd}d} j o#t fdd|D }d} js.|s. j} ||S )z=Split into chunks without re-inserting lookaround separators.keep_separator)z(?=z(?<!z(?<=z(?!c                 3  s    | ]	} j |V  qd S )N)r   
startswith).0pr   r   r   	<genexpr>%   s    
z3CharacterTextSplitter.split_text.<locals>.<genexpr> )r   r   reescape_split_text_with_regex_keep_separatorany_merge_splits)r   r   Zsep_patternsplitsZlookaround_prefixesZis_lookaroundZ	merge_sepr   r#   r   
split_text   s   
z CharacterTextSplitter.split_text)r
   F)r   r   r   r   r   r   r   r   r   r   r   r   )__name__
__module____qualname____doc__r   r-   __classcell__r   r   r   r   r	   	   s    r	   r   r   r   r   $Union[bool, Literal['start', 'end']]r   r   c                  s   |r]|rVt d| d|  |dkr# fddtdt d dD n fd	dtdt dD }t d dkrB| d
d  7 }|dkrNg | d
 n d g|}nt || }nt| }dd |D S )N()endc                       g | ]} |  |d    qS    r   r!   iZ_splitsr   r   
<listcomp>=        z*_split_text_with_regex.<locals>.<listcomp>r   r:      c                   r8   r9   r   r;   r=   r   r   r>   ?   r?   c                 S  s   g | ]}|d kr|qS )r%   r   )r!   sr   r   r   r>   L   s    )r&   splitrangelenlist)r   r   r   r,   r   r=   r   r(   4   s    "r(   c                      sZ   e Zd ZdZ			dd fddZd ddZd!ddZed"ddZe	d#ddZ
  ZS )$RecursiveCharacterTextSplitterzSplitting text by recursively look at characters.

    Recursively tries to split by different characters to find one
    that works.
    NTF
separatorsOptional[list[str]]r   r4   r   r   r   r   r   r   c                   s.   t  jdd|i| |pg d| _|| _dS )r   r   )r
   
 r%   Nr   )r   r   _separatorsr   )r   rH   r   r   r   r   r   r   r   V   s   
z'RecursiveCharacterTextSplitter.__init__r   r   r   c                 C  s(  g }|d }g }t |D ](\}}| jr|nt|}|dkr"|} nt||r4|}||d d } nq| jr:|nt|}t||| jd}	g }
| jrNdn|}|	D ]2}| || jk rb|
	| qR|
rq| 
|
|}|| g }
|sy|	| qR| ||}|| qR|
r| 
|
|}|| |S )z&Split incoming text and return chunks.rA   r%   r:   Nr   )	enumerater   r&   r'   searchr(   r)   Z_length_function_chunk_sizeappendr+   extend_split_text)r   r   rH   Zfinal_chunksr   Znew_separatorsr<   Z_sr   r,   Z_good_splitsrB   Zmerged_textZ
other_infor   r   r   rR   b   sD   

z*RecursiveCharacterTextSplitter._split_textc                 C  s   |  || jS )zSplit the input text into smaller chunks based on predefined separators.

        Args:
            text (str): The input text to be split.

        Returns:
            List[str]: A list of text chunks obtained after splitting.
        )rR   rL   )r   r   r   r   r   r-      s   	z)RecursiveCharacterTextSplitter.split_textlanguager   c                 K  s   |  |}| d|dd|S )a  Return an instance of this class based on a specific language.

        This method initializes the text splitter with language-specific separators.

        Args:
            language (Language): The language to configure the text splitter for.
            **kwargs (Any): Additional keyword arguments to customize the splitter.

        Returns:
            RecursiveCharacterTextSplitter: An instance of the text splitter configured
            for the specified language.
        T)rH   r   Nr   )get_separators_for_language)clsrS   r   rH   r   r   r   from_language   s   
z,RecursiveCharacterTextSplitter.from_languagec                 C  sR  | t jt jfv rg dS | t jkrg dS | t jkrg dS | t jkr'g dS | t jkr0g dS | t jkr9g dS | t jkrBg dS | t j	krKg dS | t j
krTg d	S | t jkr]g d
S | t jkrfg dS | t jkrog dS | t jkrxg dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrd}d| dd| dd| dd| dd| dd d!d"d#d$d%d&d'd(d)d*d+d,gS | t jv rd-|  d.}t|d-|  d/tt  }t|)0a
  Retrieve a list of separators specific to the given language.

        Args:
            language (Language): The language for which to get the separators.

        Returns:
            List[str]: A list of separators appropriate for the specified language.
        )
class z
void z
int z
float z
double 
if 
for 
while 
switch 
case r
   rJ   rK   r%   )
func 
var 
const 
type rX   rY   r[   r\   r
   rJ   rK   r%   )rW   
public 
protected 	
private 
static rX   rY   rZ   r[   r\   r
   rJ   rK   r%   )rW   ra   rb   rc   z

internal z
companion z
fun 
val r^   rX   rY   rZ   z
when r\   
else r
   rJ   rK   r%   )

function r_   
let r^   rW   rX   rY   rZ   r[   r\   	
default r
   rJ   rK   r%   )
enum 
interface z
namespace r`   rW   rg   r_   rh   r^   rX   rY   rZ   r[   r\   ri   r
   rJ   rK   r%   )rg   rW   rX   	
foreach rZ   
do r[   r\   r
   rJ   rK   r%   )
z	
message z	
service rj   z
option 
import z
syntax r
   rJ   rK   r%   )rW   
def z
	def r
   rJ   rK   r%   )z
=+
z
-+
z
\*+
z

.. *

r
   rJ   rK   r%   )ro   rW   rX   
unless rZ   rY   rm   z
begin z
rescue r
   rJ   rK   r%   )ro   z
defp z
defmodule z
defprotocol z

defmacro z
defmacrop rX   rp   rZ   r\   z
cond z
with rY   rm   r
   rJ   rK   r%   )z
fn r_   rh   rX   rZ   rY   z
loop 
match r_   r
   rJ   rK   r%   )rW   z
object ro   re   r^   rX   rY   rZ   rq   r\   r
   rJ   rK   r%   )r]   rW   
struct rj   rX   rY   rZ   rm   r[   r\   r
   rJ   rK   r%   )	z
#{1,6} z```
z	
\*\*\*+
z
---+
z
___+
r
   rJ   rK   r%   )z
\\chapter{z
\\section{z
\\subsection{z
\\subsubsection{z
\\begin{enumerate}z
\\begin{itemize}z
\\begin{description}z
\\begin{list}z
\\begin{quote}z
\\begin{quotation}z
\\begin{verse}z
\\begin{verbatim}z
\\begin{align}z$$$rK   r%   )z<bodyz<divz<pz<brz<liz<h1z<h2z<h3z<h4z<h5z<h6z<spanz<tablez<trz<tdz<thz<ulz<olz<headerz<footerz<navz<headz<stylez<scriptz<metaz<titler%   )rk   rj   z
implements z

delegate 
event rW   z

abstract ra   rb   rc   rd   z
return rX   z

continue rY   rl   rZ   r[   z
break r\   rf   
try z
throw 	
finally 
catch r
   rJ   rK   r%   )z
pragma z
using z

contract rk   z	
library z
constructor r`   rg   rt   z

modifier z
error rr   rj   rX   rY   rZ   z

do while z

assembly r
   rJ   rK   r%   )z
IDENTIFICATION DIVISION.z
ENVIRONMENT DIVISION.z
DATA DIVISION.z
PROCEDURE DIVISION.z
WORKING-STORAGE SECTION.z
LINKAGE SECTION.z
FILE SECTION.z
INPUT-OUTPUT SECTION.z
OPEN z
CLOSE z
READ z
WRITE z
IF z
ELSE z
MOVE z	
PERFORM z
UNTIL z	
VARYING z
ACCEPT z	
DISPLAY z

STOP RUN.rJ   rK   r%   )
z
local rg   rX   rY   rZ   z
repeat r
   rJ   rK   r%   )z	
main :: z
main = rh   z
in rm   z
where 
:: z
= 
data z	
newtype r`   rx   z
module rn   z
qualified z
import qualified rW   z

instance r\   z
| ry   z
= {z
, r
   rJ   rK   r%   )rg   z
param rX   rl   rY   rZ   r[   rW   ru   rw   rv   r
   rJ   rK   r%   z*(?:Public|Private|Friend|Global|Static)\s+z\n(?!End\s)z?Sub\s+z?Function\s+z?Property\s+(?:Get|Let|Set)\s+z?Type\s+z?Enum\s+z\n(?!End\s)If\s+z\nElseIf\s+z	\nElse\s+z\nSelect\s+Case\s+z	\nCase\s+z\nFor\s+z\nDo\s+z
\nWhile\s+z	\nWith\s+z\n\nz\nrK   r%   z	Language z is not implemented yet!z& is not supported! Please choose from )r   CZCPPZGOZJAVAZKOTLINZJSZTSPHPPROTOPYTHONZRSTZRUBYZELIXIRZRUSTZSCALAZSWIFTMARKDOWNZLATEXHTMLZCSHARPZSOLCOBOLZLUAZHASKELLZ
POWERSHELLZVISUALBASIC6_value2member_map_
ValueErrorrF   )rS   Zvismsgr   r   r   rT      s   



















$



&




z:RecursiveCharacterTextSplitter.get_separators_for_language)NTF)
rH   rI   r   r4   r   r   r   r   r   r   )r   r   rH   r   r   r   r.   )rS   r   r   r   r   rG   )rS   r   r   r   )r/   r0   r1   r2   r   rR   r-   classmethodrV   staticmethodrT   r3   r   r   r   r   rG   O   s    

*rG   )r   r   r   r   r   r4   r   r   )
__future__r   r&   typingr   r   r   r   Zlangchain_text_splitters.baser   r   r	   r(   rG   r   r   r   r   <module>   s    
+