o
    0 i                     @   sh   d dl mZmZ ddlmZ d dlZd dlZ			ddejdee dee deee	  d	ef
d
dZ
dS )    )ListOptional   )LLTokenizerNvocabn_vocab	eos_tokenslicesreturnc              	   C   s  t | }|du rt | }d}t|d }g }t|D ]?}t | |||dd}	|	dk r:td| d| d|	 |	|ks@J t|d|	 }
t 	| |}|t j
@ rWd	|
 }
||
 q|durrt||k rr|d
 t||k sgtt j jjtjj}tj|| |||dS )a  
    Create a new tokenizer from a llama.cpp vocab object.
    This is an expensive operation (~1s), so the result should be cached.

    Args:
        vocab: llama_cpp.llama_vocab_p - the vocab object to use
        n_vocab: int - override the size of the vocabulary
        eos_token: int - override the EOS token
        slices: List[str] - configuration for slicer optimization; pass [] to disable,
            or None to use the default configuration
    Ni @  r   r   TzError writing token z to buffer of size z	. Error:        )tokensZ	vocab_ptrZtokenize_fptrr   r	   )	llama_cppZllama_vocab_n_tokensZllama_vocab_eosctypesZcreate_string_bufferrangeZllama_token_to_piece
ValueErrorbytesZllama_token_get_attrZLLAMA_TOKEN_ATTR_CONTROLappendlencast_libZllama_tokenizeZc_void_pvaluer   Zfrom_llamacpp)r   r   r   r	   ZntokZ
buffer_lenbufferr   tokenntokattrZfptr r   _/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/llguidance/llamacpp.pylltokenizer_from_vocab   sF   



r   )NNN)typingr   r   r   r   r   r   Zllama_vocab_pintstrr   r   r   r   r   <module>   s$    
