o
    {qi                     @  s8   d dl mZ d dlmZ d dlmZ G dd deZdS )    )annotations)Any)TextSplitterc                      s8   e Zd ZdZ		dddd fddZdddZ  ZS )NLTKTextSplitterz"Splitting text using NLTK package.

englishF)use_span_tokenize	separatorstrlanguager   boolkwargsr   returnNonec             
     s   t  jdi | || _|| _|| _| jr | jdkr d}t|zddl}| jr3|j| j| _	W dS |jj
| _	W dS  tyM } zd}t||d}~ww )zInitialize the NLTK splitter. z6When use_span_tokenize is True, separator should be ''r   NzANLTK is not installed, please install it with `pip install nltk`. )super__init__
_separator	_language_use_span_tokenize
ValueErrornltktokenizeZ_get_punkt_tokenizer
_tokenizerZsent_tokenizeImportError)selfr	   r   r   r   msgr   err	__class__r   d/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/langchain_text_splitters/nltk.pyr      s"   	
zNLTKTextSplitter.__init__text	list[str]c           	      C  s   | j r=t| j|}g }t|D ]*\}\}}|dkr0||d  d }||| |||  }n||| }|| qn| j|| jd}| || jS )z&Split incoming text and return chunks.r      )r   )	r   listr   Zspan_tokenize	enumerateappendr   Z_merge_splitsr   )	r   r"   spansZsplitsistartendZprev_endZsentencer   r   r!   
split_text&   s   zNLTKTextSplitter.split_text)r   r   )
r	   r
   r   r
   r   r   r   r   r   r   )r"   r
   r   r#   )__name__
__module____qualname____doc__r   r,   __classcell__r   r   r   r!   r      s    r   N)
__future__r   typingr   Zlangchain_text_splitters.baser   r   r   r   r   r!   <module>   s    