o
    )i (                     @   s   d dl mZmZ d dlZd dlmZmZmZmZm	Z	m
Z
mZ d dlZddlmZ ddlmZmZmZ ddlmZmZ G dd	 d	ZG d
d dZdS )    )	dataclassfieldN)CallableDictHashableListOptionalTupleUnion   )LMFormatEnforcerException)CharacterLevelParserForceStopParserCharacterLevelParserConfig)TokenizerPrefixTreeTokenizerPrefixTreeNodec                	   @   sN   e Zd ZdZdeeeeef  de	ee gef de
eee f fddZdS )TokenEnforcerTokenizerDatazTokenEnforcerTokenizerData contains all of the preprocessing for preparing the TokenEnforcer to work with a 
    specific tokenizer. It does some calculations, so it is recommended to reuse it for multiple TokenEnforcersregular_tokensdecodereos_token_idc                 C   s@   || _ t|| _|| _|| _ddd | jjj D | _	dS )aA  
        Create the tokenizer data that the TokenEnforcer needs. This can be reused for multiple TokenEnforcers if they work with the same tokenizer.
        :param regular_tokens: A list of tuples (token_id, token_string, is_new_word_token) for all the regular (not special) tokens in the tokenizer vocabulary.
        Note that token_string is expected to include leading / trailing whitespaces if relevant.
        :param decoder: A function that decodes a list of token ids into a string.
        :param eos_token_id: The token id(s) of the end-of-string token(s).
         c                 s   s     | ]}t |d kr|V  qdS )r   N)len).0Z	token_str r   j/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/lmformatenforcer/tokenenforcer.py	<genexpr>   s    z6TokenEnforcerTokenizerData.__init__.<locals>.<genexpr>N)
r   r   tokenizer_treer   r   joinrootchildrenkeystokenizer_alphabet)selfr   r   r   r   r   r   __init__   s
   
$z#TokenEnforcerTokenizerData.__init__N)__name__
__module____qualname____doc__r   r	   intstrboolr   r
   r#   r   r   r   r   r      s    r   c                	   @   s   e Zd ZdZeG dd dZdedefddZde	e
 d	e	e
 fd
dZdeddfddZdedede	e
 dee fddZddde	e
 fddZdS )TokenEnforcerzTokenEnforcer provides a token filtering mechanism, given a CharacterLevelParser and some information about the tokenizer.
    It is the main entry point for extending lm-format-enforcer to new inference libraries. See __init__() and get_allowed_tokens()c                   @   sB   e Zd ZU eed< eedZee	 ed< eedZ
ee	 ed< dS )TokenEnforcer.OutputTensorStateparser)default_factoryallowed_tokenscurrent_word_tokensN)r$   r%   r&   r   __annotations__r   listr/   r   r(   r0   r   r   r   r   OutputTensorState#   s   
 r3   tokenizer_datar-   c                 C   sH   i | _ || _|j| _|j| _|j| _|j| _i | _t|jd}||_	dS )z
        Create a new TokenEnforcer.
        :param tokenizer_data: Per tokenizer data that the token enforcer needs in order to operate.
        :param parser: A CharacterLevelParser that defines the allowed strings.
        )alphabetN)
prefix_statesroot_parserr   r   r   r   allowed_token_cacher   r!   config)r"   r4   r-   r9   r   r   r   r#   )   s   
zTokenEnforcer.__init__token_sequencereturnc                 C   s   t |}|dd }|| jv r| j| jS || jvr/tj| jd}|| j|< | || |jS | j| }| ||}|| j|< | || |jS )a  
        Get a list of allowed tokens, given a list of tokens that were already generated.
        :param token_sequence: The tokens that were already generated, and the next token will be generated for.
        :return: A list of token ids that are allowed to be selected next.
        Nr-   )tupler6   r/   r+   r3   r7   _compute_allowed_tokens_apply_new_characters)r"   r:   Z
sent_tupleZprev_step_tuplestateZprev_step_state	new_stater   r   r   get_allowed_tokens:   s   	




z TokenEnforcer.get_allowed_tokensstate_tokensrA   r,   c                 C   s  zVg }|j  }|d ur|| jv r| j| |_W d S |j  }| |j | jj|| |j  r?|	t
| jtr:| jn| jg |sEtd||_|d urT|| j|< W d S W d S  ty^     ty   tjtjd | t|}td| d t
| jtr| jn| jg|_Y d S w )Nz+Parser reached state with no allowed tokens)levelz+Unknown LMFormatEnforcer Problem. Prefix: 'z'
Terminating the parser. Please open an issue at 
https://github.com/noamgat/lm-format-enforcer/issues with the prefix and CharacterLevelParser parameters)r-   	cache_keyr8   r/   shortcut_key_collect_allowed_tokensr   r   Zcan_endextend
isinstancer   r2   
ValueErrorr   	ExceptionloggingbasicConfigERRORr   	exception)r"   rD   rA   r/   rF   rG   prefixr   r   r   r?   X   s0   


 "z%TokenEnforcer._compute_allowed_tokens	tree_noder/   rG   c                 C   s   | |j | }|j }t||}t|trU|d dkrUt	|dks)J |\}}	}
}| j
j}t|jtd|
|	 }t|j||	 }| ||| |dg}|D ]}||}|j| }| |||d  qWd S )Nr   Zjson_freetext   ")rI   tokensZget_allowed_charactersr   r    setintersectionrJ   r>   r   r   Zjson_freetext_tokensminZmax_token_lenmaxZlookup_allowed_tokensadd_characterrH   )r"   r-   rR   r/   rG   Zallowed_charactersZrelevant_charactersZcharacters_to_explore_cur_lenmin_lenmax_lencacheZmin_remainingZmax_allowed_len	characterZnext_parserZnext_tree_noder   r   r   rH   x   s"   


z%TokenEnforcer._collect_allowed_tokensc           
      C   s   t j|jd}|d }|| jjv r|g|_| jj| }n|j|g |_| |j}| |j}|t|d  }|D ]-}z	|j	||_W q9 t
yf }	 ztd| d|	 d t |_W Y d }	~	q9d }	~	ww |S )Nr=   r<   zReceived an invalid character 'z+', switching to ForceStopParser (Exception:))r+   r3   r-   r   Znew_word_tokensr0   Ztokens_to_strsr   r   rZ   rL   rM   debugr   )
r"   rA   r:   rB   Z	new_tokenZnew_charactersZprev_decodedZnew_decodedr`   er   r   r   r@      s$   z#TokenEnforcer._apply_new_charactersN)r$   r%   r&   r'   r   r3   r   r   r#   r   r(   rC   r	   r?   r   r   r   rH   r@   r   r   r   r   r+       s    " r+   )dataclassesr   r   systypingr   r   r   r   r   r	   r
   rM   
exceptionsr   Zcharacterlevelparserr   r   r   Ztokenizerprefixtreer   r   r   r+   r   r   r   r   <module>   s    $