o
    0 i4Q                     @  s"  d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZmZmZmZmZmZ zd dlmZ W n eyC   dZY nw d dlZddlmZ eeZG dd	 d	ZeG d
d deZeG dd deeZG dd deZG dd deZG dd deZG dd deZ dS )    )annotationsN)Path)AnyCallableSequenceMappingIterableProtocolClassVarruntime_checkable)SentencePieceProcessor   )
GGUFWriterc                   @  s   e Zd ZU ded< ded< ded< ded< 			
	
d-d.ddZd/ddZd0d1ddZd2d d!Zd3d"d#Zd4d'd(Z	d3d)d*Z
d3d+d,Zd
S )5SpecialVocab	list[str]mergeszdict[str, bool]add_special_tokendict[str, int]special_token_idsz(str | Sequence[Mapping[str, str]] | Nonechat_templateFNpathstr | os.PathLike[str]load_mergesboolspecial_token_typesIterable[str] | Nonen_vocab
int | Nonec                 C  sL   i | _ i | _|| _|| _g | _d | _|d ur|| _nd| _| t| d S )N)ZbosZeosZunkseppadclsmask)	r   r   r   r   r   r   r   _loadr   )selfr   r   r   r    r$   V/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/gguf/vocab.py__init__   s   zSpecialVocab.__init__returnstrc                 C  s    d t| j| jp
d| jpdS )NzG<SpecialVocab with {} merges, special tokens {}, add special tokens {}>unset)formatlenr   r   r   r#   r$   r$   r%   __repr__-   s   zSpecialVocab.__repr__gwr   quietNonec                 C  sL  | j r|stdt| j  d || j  n| jr td | j D ]0\}}t	|d| dd }|d u rDtd| d| d q%|sQtd	| d
|  || q%| j
 D ]0\}}t	|d| dd }|d u rztd| d| d q[|std| d|  || q[| jd ur|std| j  || j d S d S )NzAdding z
 merge(s).zJAdding merges requested but no merges found, output may be non-functional.add_	_token_idz"No handler for special token type z	 with id  - skippingzSetting special token type z to Zadd_add__tokenzNo handler for add_z_token with value zSetting add_z
_token to zSetting chat_template to )r   loggerinfor+   Zadd_token_mergesr   warningr   itemsgetattrr   r   Zadd_chat_template)r#   r.   r/   typZtokidZ
id_handlervalueadd_handlerr$   r$   r%   add_to_gguf2   s6   



zSpecialVocab.add_to_ggufr   c                 C  s6   |  | | | | jr| js| | d S d S d S N)_try_load_from_tokenizer_json_try_load_from_config_jsonr   r   _try_load_merges_txt)r#   r   r$   r$   r%   r"   N   s
   

zSpecialVocab._loadc           	      C  s   |d }|  s
dS t|dddW}t|d }|ds&|d d}nd	}g }|D ]4}|d	7 }| }|s9q,|d d
}t|dkrRt	|j
 d| d q,||d  d|d	   q,W d    n1 skw   Y  || _dS )Nz
merges.txtFrutf-8encoding #r   r         z: Line z: Entry malformed, ignoring T)is_fileopennextstrip
startswithseeksplitr+   r5   r7   nameappendr   )	r#   r   Zmerges_filefp
first_lineline_numr   linepartsr$   r$   r%   rA   T   s0   

z!SpecialVocab._try_load_merges_txtr:   tidr   c              	   C  s~   t |tsd S |dk rtd| d| | jd u s|| jk r-|| jv r&d S || j|< d S td| d| d| j d d S )Nr   z%invalid value for special token type z: zSpecial token type z, id z out of range, must be under r3   )
isinstanceint
ValueErrorr   r   r5   r7   )r#   r:   rY   r$   r$   r%   _set_special_tokenm   s   


$zSpecialVocab._set_special_tokenc                   s  |d }|  rt|dd}t|}W d    n1 sw   Y  | jr}|di d}t|tr}|r}t|d trA|| _	n<t|d tryt
|d dkryt|d d trytdd	 |D rptd
ttdd  dd |D | _	ntd|di }ni }|d }|  sdS t|dd}t|}W d    n1 sw   Y  d }	|d }
|
  rt|
dd}t|d}	W d    n1 sw   Y  |d|	}|d u st|ttfr|| _n	td|d | jD ]O}|d| d}t|tr
|| j|< || d}t|tr| nt|tr0|d}t|ts-q| nqt fdd	|D d }| || qdS )Ntokenizer.jsonrC   rD   modelr   r   rI   c                 s  s"    | ]}|D ]}d |v V  qqdS )rJ   Nr$   ).0pairsr$   r$   r%   	<genexpr>   s     z=SpecialVocab._try_load_from_tokenizer_json.<locals>.<genexpr>z'Spaces in merges detected, encoding as rJ      c                 S      g | ]}d  dd |D qS )rJ   c                 S  re   )rF   c                 s  s,    | ]}|d krt t|d n|V  qdS )rJ   rd   N)chrord)r`   cr$   r$   r%   rc      s
    
zSSpecialVocab._try_load_from_tokenizer_json.<locals>.<listcomp>.<listcomp>.<genexpr>join)r`   partr$   r$   r%   
<listcomp>   s    
zISpecialVocab._try_load_from_tokenizer_json.<locals>.<listcomp>.<listcomp>ri   )r`   ra   r$   r$   r%   rl      s    z>SpecialVocab._try_load_from_tokenizer_json.<locals>.<listcomp>zUnknown tokenizer merges formatadded_tokensztokenizer_config.jsonTzchat_template.jsonr   z$Bad type for chat_template field in z - ignoringr1   r4   contentc                 3  s(    | ]}| d  kr| dV  qdS )rn   idN)get)r`   ZatokZ
tc_contentr$   r%   rc      s   & )rK   rL   jsonloadr   rp   rZ   listr(   r   r+   anyr5   r7   rf   rg   r\   r   r   r   r   dictrM   r]   )r#   r   Ztokenizer_filef	tokenizerr   rm   Ztokenizer_config_fileZtokenizer_configZchat_template_altZchat_template_filer   r:   	add_entryentryZentry_contentZmaybe_token_idr$   rq   r%   r?   y   sl   0



z*SpecialVocab._try_load_from_tokenizer_jsonc                 C  sr   |d }|  s
dS t|dd}t|}W d    n1 s w   Y  | jD ]}| ||| d q(dS )Nzconfig.jsonFrC   rD   r2   T)rK   rL   rr   rs   r   r]   rp   )r#   r   config_filerw   configr:   r$   r$   r%   r@      s   
z'SpecialVocab._try_load_from_config_json)FNN)r   r   r   r   r   r   r   r   r'   r(   )F)r.   r   r/   r   r'   r0   )r   r   r'   r0   )r   r   r'   r   )r:   r(   rY   r   r'   r0   )__name__
__module____qualname____annotations__r&   r-   r=   r"   rA   r]   r?   r@   r$   r$   r$   r%   r      s    
 




Gr   c                   @  s   e Zd ZU ded< ded< dS )	BaseVocabzClassVar[str]tokenizer_modelrR   N)r~   r   r   r   r$   r$   r$   r%   r      s   
 r   c                   @  sB   e Zd ZU ded< ded< ded< ded< dd
dZdddZdS )Vocabr[   
vocab_sizer   added_tokens_dictr   added_tokens_listr   fname_tokenizer	base_pathc                 C     d S r>   r$   )r#   r   r$   r$   r%   r&          zVocab.__init__r'   -Iterable[tuple[bytes, float, gguf.TokenType]]c                 C  r   r>   r$   r,   r$   r$   r%   
all_tokens   r   zVocab.all_tokensNr   r   r'   r   )r~   r   r   r   r&   r   r$   r$   r$   r%   r      s   
 
r   c                   @  s   e Zd ZdZdZdddZdS )NoVocabZno_vocabr'   r(   c                 C     dS )Nz3<NoVocab for a model without integrated vocabulary>r$   r,   r$   r$   r%   r-      s   zNoVocab.__repr__Nr}   )r~   r   r   r   rR   r-   r$   r$   r$   r%   r      s    r   c                   @  F   e Zd ZdZdZdddZdd	d
ZdddZdddZdddZ	dS )BpeVocabZgpt2Zbper   r   c              	     s  i }|d  }  rOt|dd}t| _W d    n1 s!w   Y  zt|d dd}t|}W d    n1 s?w   Y  W n[ tyN   Y nSw |d }t|dd}t|}W d    n1 siw   Y  |d }|d dks|d	d
s|d d dkrtd|d  _|d }d ur fdd|D }t j}tt	||t| }	t
| }
|	|
kr|t|
 d }tdt|
 d| d| d|
 t
| dd d}| _dd |D  _| _ jt j  _| _d S )Nz
vocab.jsonrC   rD   added_tokens.jsonr^   r_   typeBPEbyte_fallbackFdecoderZ	ByteLevelzCannot find GPT-2 BPE tokenizervocabrm   c                   s(   i | ]}|d   j vr|d  |d qS )rn   ro   )r   )r`   itemr,   r$   r%   
<dictcomp>	  s
    z%BpeVocab.__init__.<locals>.<dictcomp>r   zExpected the z1 added token ID(s) to be sequential in the range z - z; got c                 S     | d S Nr   r$   )Ztext_idxr$   r$   r%   <lambda>      z#BpeVocab.__init__.<locals>.<lambda>keyc                 S  s   g | ]\}}|qS r$   r$   )r`   textidxr$   r$   r%   rl         z%BpeVocab.__init__.<locals>.<listcomp>)existsrL   rr   rs   r   FileNotFoundErrorrp   r+   rt   rangesortedvaluesr\   r8   r   r   vocab_size_baser   r   )r#   r   rm   r   rw   tokenizer_jsonr   addedr   Zexpected_idsZ
actual_idsZexpected_end_idr8   r$   r,   r%   r&      sX   



zBpeVocab.__init__r'   r   c                 c  sB    dd | j  D }t| j D ]\}}|| dtjjfV  qd S )Nc                 S     i | ]\}}||qS r$   r$   r`   Zencoded_tokro   r$   r$   r%   r     s    z'BpeVocab.bpe_tokens.<locals>.<dictcomp>g        )r   r8   	enumerategguf	TokenTypeNORMAL)r#   reverse_vocabi_r$   r$   r%   
bpe_tokens  s
   zBpeVocab.bpe_tokensc                 c  ,    | j D ]}d}|d|tjjfV  qd S N     @rC   )r   encoder   r   CONTROLr#   r   scorer$   r$   r%   rm   "  
   
zBpeVocab.added_tokensc                 c  "    |   E d H  |  E d H  d S r>   )r   rm   r,   r$   r$   r%   r   '     zBpeVocab.all_tokensr(   c                 C     d| j  dt| j dS )Nz<BpeVocab with  base tokens and  added tokens>r   r+   r   r,   r$   r$   r%   r-   +     zBpeVocab.__repr__Nr   r   r}   )
r~   r   r   r   rR   r&   r   rm   r   r-   r$   r$   r$   r%   r      s    

4

r   c                   @  r   )SentencePieceVocabllamaZspmr   r   c                   sB  t d u rtdi }|d  } r;zt|d dd}t|}W d    n1 s+w   Y  W n ty:   Y nw |jd  } sHtdt  | _| j	t
| | j fdd| D  ttt  }t  }||krtd	| d
| || _ fdd|D | _| _| jt| j | _|| _d S )Nzsentencepiece is not installedztokenizer.modelr   rC   rD   zCannot find tokenizer.modelc                   s   i | ]\}}| kr||qS r$   r$   )r`   piecero   )r   r$   r%   r   G  s    z/SentencePieceVocab.__init__.<locals>.<dictcomp>zExpected new token IDs z to be sequential; got c                   s   g | ]} | qS r$   r$   )r`   ro   )
new_tokensr$   r%   rl   P  r   z/SentencePieceVocab.__init__.<locals>.<listcomp>)r   RuntimeErrorr   rL   rr   rs   r   parentsentencepiece_tokenizerZLoadFromFiler(   r   r8   rt   r   r+   r   keysr\   r   r   r   r   )r#   r   rm   r   rw   Zexpected_new_idsZactual_new_idsr$   )r   r   r%   r&   3  s6   

zSentencePieceVocab.__init__r'   r   c                 c  s    | j }t| D ]?}||}|d}||}tjj}|	|r(tjj
}||r1tjj}||r:tjj}||rCtjj}|||fV  q
d S )NrC   )r   r   r   Z	IdToPiecer   ZGetScorer   r   r   Z	IsUnknownUNKNOWNZ	IsControlr   ZIsUnusedZUNUSEDZIsByteBYTE)r#   rx   r   r   r   r   toktyper$   r$   r%   sentencepiece_tokensU  s"   






z'SentencePieceVocab.sentencepiece_tokensc                 c  r   r   )r   r   r   r   USER_DEFINEDr   r$   r$   r%   rm   m  r   zSentencePieceVocab.added_tokensc                 c  r   r>   )r   rm   r,   r$   r$   r%   r   r  r   zSentencePieceVocab.all_tokensr(   c                 C  r   )Nz<SentencePieceVocab with r   r   r   r,   r$   r$   r%   r-   v  r   zSentencePieceVocab.__repr__Nr   r   r}   )
r~   r   r   r   rR   r&   r   rm   r   r-   r$   r$   r$   r%   r   /  s    

"

r   c                   @  sb   e Zd ZdZdZd!ddZd"d	d
Zd#ddZd$ddZd"ddZ	dd Z
d"ddZd%ddZd S )&LlamaHfVocabr   Zhfftr   r   c              
     s  |d }t |dd}t|}W d    n1 sw   Y  |d }|d dko5|ddo5|d	d
 }|r<td|sV|d dksR|d	drR|d d dkrVtdzddlm} W n tyo } ztd|d }~ww |j	||d
d _
 j
jsJ g  _t  _t  _t j
  dd dD ]\}	}
|
 j
jkr j|	 |
 j|	<  j|
 q fdd j
jD  _t j
j _ j
j _ jt j  _| _d S )Nr^   rC   rD   r_   r   r   Zignore_mergesFr   Tz'Llama 3 must be converted with BpeVocabr   r   zCannot find Llama BPE tokenizerr   )AutoTokenizerzsTo use LlamaHfVocab, please install the `transformers` package. You can install it with `pip install transformers`.)	cache_dirZlocal_files_onlyc                 S  r   r   r$   )xr$   r$   r%   r     r   z'LlamaHfVocab.__init__.<locals>.<lambda>r   c                   s   i | ]
}| j  | qS r$   )rx   	get_vocab)r`   tokr,   r$   r%   r     s    z)LlamaHfVocab.__init__.<locals>.<dictcomp>)rL   rr   rs   rp   	TypeErrorr   Ztransformersr   ImportErrorZfrom_pretrainedrx   Zis_fastr   rv   r   setadded_tokens_idsr   Zget_added_vocabr8   r   rS   addZall_special_tokensspecialsZall_special_idsspecial_idsr   r+   r   )r#   r   r   rw   r   r   Z	is_llama3r   er   Ztokidxr$   r,   r%   r&   ~  s`   



zLlamaHfVocab.__init__r'   r   c                 c  sf    dd | j   D }t| jD ]}|| jv rq|| d}|| || ||| j	fV  qd S )Nc                 S  r   r$   r$   r   r$   r$   r%   r     s    
z*LlamaHfVocab.hf_tokens.<locals>.<dictcomp>rC   )
rx   r   r8   r   r   r   r   get_token_scoreget_token_typer   )r#   r   token_id
token_textr$   r$   r%   	hf_tokens  s   

zLlamaHfVocab.hf_tokensr   r[   r   bytesr   set[int]gguf.TokenTypec                 C  s,   t d|r
tjjS ||v rtjjS tjjS )Ns   <0x[0-9A-Fa-f]{2}>)re	fullmatchr   r   r   r   r   )r#   r   r   r   r$   r$   r%   r     s   zLlamaHfVocab.get_token_typefloatc                 C  r   )Nr   r$   )r#   r   r$   r$   r%   r     s   zLlamaHfVocab.get_token_scorec                 c  sb    | j D ]*}|| jv r| | j| d| j}| | j| }ntjj}d}|d||fV  qd S )N    r   rC   )	r   r   r   r   r   r   r   r   r   )r#   r   r   r   r$   r$   r%   rm     s   

zLlamaHfVocab.added_tokensc                 C  s   d| j jv pd| j jv S )Nz<0x0A>
)rx   r   r,   r$   r$   r%   has_newline_token  s   zLlamaHfVocab.has_newline_tokenc                 c  r   r>   )r   rm   r,   r$   r$   r%   r     r   zLlamaHfVocab.all_tokensr(   c                 C  r   )Nz<LlamaHfVocab with r   r   r   r,   r$   r$   r%   r-     r   zLlamaHfVocab.__repr__Nr   r   )r   r[   r   r   r   r   r'   r   )r   r[   r'   r   r}   )r~   r   r   r   rR   r&   r   r   r   rm   r   r   r-   r$   r$   r$   r%   r   z  s    

B



r   )!
__future__r   r   loggingrr   ospathlibr   typingr   r   r   r   r   r	   r
   r   Zsentencepiecer   r   r   Zgguf_writerr   	getLoggerr~   r5   r   r   r   r   r   r   r   r$   r$   r$   r%   <module>   s2    (
 6
KK