o
    )i 3                     @  sR  d dl mZ d dlZd dlZd dlZd dlmZ d dlZ	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ erYd dlZd dlmZ d dlm  m  mZ d dlmZ ned	e d
Zede dZede dZeeZ dZ!G dd dZ"d-ddZ#dd Z$e	%dZ&e	%dZ'd.ddZ(d/d!d"Z)d0d%d&Z*d1d'd(Z+d2d+d,Z,dS )3    )annotationsN)TYPE_CHECKING)LRUCache)Cache)init_logger)
LazyLoader)AnyTokenizerocoutlines_core
file_utilsztransformers.file_utilstokenization_gpt2z*transformers.models.gpt2.tokenization_gpt2c                   @  s   e Zd ZdZd	ddZdS )
OutlinesVocabularyzo
    Wrapper class for `outlines_core.Vocabulary`,
    which allows us to store a hash with the vocabulary
    
vocabularyoc.VocabularyreturnNonec                 C  s2   || _ t| d }t|d}|| _d S )Nutf-8   )innerhashlibsha256__repr__encode	hexdigestint_hash)selfr   hex_strZhash_int r   k/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/v1/structured_output/utils.py__init__-   s   

zOutlinesVocabulary.__init__N)r   r   r   r   )__name__
__module____qualname____doc__r    r   r   r   r   r   '   s    r   r   strc                  C  s   t d} t d}t jd}| r| S |rt j|ddS t j|r0|dkr0t j|ddS ddl}| }t j|ddS )	zFGet the context object that contains previously-computed return valuesZOUTLINES_CACHE_DIRXDG_CACHE_HOME~z.cacheZoutlines/r   N)osgetenvpath
expanduserjoinisdirtempfile
gettempdir)Zoutlines_cache_dirZxdg_cache_homeZhome_dirr/   tempdirr   r   r   get_outlines_cache_path8   s   

r2   c                  C  sf   t  } tjr.td t| ddd}tjd}|	dd}||kr&|
  |d| |S tdd	S )
z3Get the Cache instance to be used for index cachingzEnabling outlines cache. This is an unbounded on-disk cache. It may consume a lot of disk space and should not be used with untrusted clients.noner   )Zeviction_policyZ
cull_limitr
   __version__N   )maxsize)r2   envsZVLLM_V1_USE_OUTLINES_CACHEloggerwarningr   	importlibmetadataversiongetclearsetr   )	cache_dircacheZoutlines_versionZcached_versionr   r   r   get_outlines_cacheR   s   

rB   z^<0x[0-9A-F]{2}>$u   ^.{0,6}�+.{0,6}$	tokenizerr   eos_token_idr   dict[bytes, list[int]]c           
        s  dd t   D d fdd}i }g }   D ]n\}}| jv r&q||}|rt|ttfr8t|}n?d|v rrt	|srt
	|rStt|d	d
 dg}n$fdd|D }	d|	v rmtd| d| d| t|	}n|d}||kr||g | q|| q|S )zCreate a map from vocabulary tokens to lists of equivalent token ids.

    Returns:
        A Dict of token string -> equivalent token ids
    c                 S  s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>t   s    z'_reduced_vocabulary.<locals>.<dictcomp>tokenr%   r   c                   s8     | g}t| tu r| tjs| dkrd| S |S )Nz<0x20> )Zconvert_tokens_to_stringtyper%   
startswithr   ZSPIECE_UNDERLINE)rJ   string)rC   r   r   convert_token_to_stringy   s   
z4_reduced_vocabulary.<locals>.convert_token_to_stringu   �      r   c                   s   g | ]}  |qS r   )r=   rF   c)unicode_to_bytesr   r   
<listcomp>   s    z'_reduced_vocabulary.<locals>.<listcomp>NzCannot convert token `z` (z) to bytes: r   )rJ   r%   r   r%   )r   Zbytes_to_unicodeitemsZ	get_vocabZall_special_tokens
isinstancebytes	bytearrayre_replacement_seqmatchre_llama_byte_tokenr   RuntimeErrorr   
setdefaultappend)
rC   rD   rO   r   Zempty_token_idsrJ   Z	token_idxZ	token_strZtoken_bytesZ	byte_valsr   )rC   rT   r   _reduced_vocabularyj   sD   






r`   r   c              
   C  s   t | dr| jS z+t | dr| jdur| j}n
tdt|  dt| |}tt||}|| _|W S  t	yJ } ztdt|  d|d}~ww )z7Get the `Vocabulary` object for a given tokenizer.
    _outlines_vocabularyrD   Nz?Error during structured outputs setup for outlines: Tokenizer (zi) has no `eos_token_id` property, but `eos_token_id` is required for structured outputs to work properly.z,Cannot get the vocabulary of the tokenizer (z0). The tokenizer should have a get_vocab method.)
hasattrra   rD   
ValueErrorrL   r`   r   r	   Z
VocabularyAttributeError)rC   rD   Zreduced_vocabr   er   r   r   get_outlines_vocabulary   s:   


rf   grammar_strboolc                 C  sL   | rt | ts	dS | dD ]}tdd| }|sqd|v r# dS qdS )aV  
    Check if grammar appears to use Lark syntax.

    Args:
        grammar_str: Input grammar string

    Returns:
        bool: True if grammar appears to be in Lark format, False otherwise

    Examples:
        >>> grammar_is_likely_lark("rule: 'abc'")
        True
        >>> grammar_is_likely_lark("rule ::= 'abc'")
        False
    F
	(#|//).*$ z::=T)rW   r%   splitresubstrip)rg   liner   r   r   grammar_is_likely_lark   s   rq   c                   s  t | tstdt|  |  stdt }t }g }d.dd d/dd}d0dd} fdd| dD }d}t|dD ]F\}}	|	rK|	drLq@d|	v rz |	ddd  d}
|	|
 |du ri|
}|
dkrod}W q@ t
y } z	td| d|d}~ww q@|std|d|  d}g }t|dD ]\}}	|	sqz|d|	v r|	ds|r|| d d!|  |	dd\}
}|
 d}||d"| d#| td$d%|}||| | g}n5|	dr |std&| d'|	dd  }||d(| d#| td$d%|}||| || W q ty; } ztd)| d*t| |d}~ww |rL|| d d!|  || d+h }|rbtd,d-t| d|S )1a  
    Convert a Lark grammar string to EBNF format.

    EBNF reference:
    https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
    Lark grammar reference:
    https://lark-parser.readthedocs.io/en/latest/grammar.html

    Args:
        grammar_str: Input grammar in Lark format

    Returns:
        str: Converted grammar in EBNF format

    Examples:
        >>> print(convert_lark_to_ebnf("rule: 'hello'"))
        root ::= rule
        rule ::= "hello"
    zGrammar must be a string, got zGrammar string cannot be emptyrp   r%   r   c                 S  s   t dd|  S )z)Remove comments and whitespace from line.rj   rk   )rm   rn   ro   )rp   r   r   r   
clean_line  s   z(convert_lark_to_ebnf.<locals>.clean_linetext	rule_nameline_numr   r   c                 S  s<   |  dd dks|  dd dkrtd| d| dS )z Validate quote matching in text.'   r   "zMismatched quotes in z	 on line N)countrc   )rs   rt   ru   r   r   r   check_quotes  s
   $z*convert_lark_to_ebnf.<locals>.check_quotesset[str]c                 S  s,   t dd| } t dd| } tt d| S )z"Extract rule references from text.z"[^"]*"rk   z[+*?()|\[\]{}]rK   z\b[a-zA-Z_][a-zA-Z0-9_]*\b)rm   rn   r?   findall)rs   r   r   r   extract_references  s   z0convert_lark_to_ebnf.<locals>.extract_referencesc                   s   g | ]} |qS r   r   )rF   rp   rr   r   r   rU     s    z(convert_lark_to_ebnf.<locals>.<listcomp>ri   N   |:r   ?startzInvalid rule format on line z". Expected 'rule_name: definition'zNo valid rules found in grammar	root ::= z ::=  | zrule 'rv   z	'([^']*)'z"\1"zAlternative '|' on line z$ without a preceding rule definitionzalternative for rule 'zError on line z: rootz"Referenced rules are not defined: z, )rp   r%   r   r%   )rs   r%   rt   r%   ru   r   r   r   )rs   r%   r   r{   )rW   r%   rc   rL   ro   r?   rl   	enumeraterM   add
IndexErrorr_   r-   rm   rn   updatesorted)rg   Zdefined_rulesZreferenced_rulesoutput_linesrz   r}   linesZ
first_ruleru   rp   namere   Zcurrent_ruleZcurrent_definitionZ
definitionZalt_defZundefined_rulesr   r~   r   convert_lark_to_ebnf   s   






r   choice	list[str]c                   s8   ddd  fdd| D }dd	 d
d |D  }|S )Nsr%   r   c                 S  s   t dd| S )z+Escape special characters in a EBNF string.z(["\\])z\\\1)rm   rn   )r   r   r   r   escape_ebnf_stringn  s   z-choice_as_grammar.<locals>.escape_ebnf_stringc                 3  s    | ]} |V  qd S )Nr   rR   r   r   r   	<genexpr>s  s    z$choice_as_grammar.<locals>.<genexpr>r   r   c                 s  s    | ]	}d | d V  qdS )rx   Nr   rR   r   r   r   r   t  s    )r   r%   r   r%   )r-   )r   Zescaped_choicesZgrammarr   r   r   choice_as_grammarl  s   
r   )r   r%   )rC   r   rD   r   r   rE   )rC   r   r   r   )rg   r%   r   rh   )rg   r%   r   r%   )r   r   r   r%   )-
__future__r   r   importlib.metadatar:   r)   typingr   regexrm   Z
cachetoolsr   Z	diskcacher   Z	vllm.envsr7   Zvllm.loggerr   Z
vllm.utilsr   r
   r	   Ztransformers.file_utilsr   Z*transformers.models.gpt2.tokenization_gpt2modelsZgpt2r   Z!vllm.transformers_utils.tokenizerr   globalsr!   r8   ZCACHEr   r2   rB   compiler\   rZ   r`   rf   rq   r   r   r   r   r   r   <module>   sF   




E
 
 }