o
    0 iE                     @   s  d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlmZmZmZ ddlmZmZmZmZ dd	lmZ eeZdddZdedefddZdd Z G dd dZ!G dd de!Z"dedefddZ#G dd dZ$G dd de$Z%G dd de$Z&G d d! d!e$Z'G d"d# d#e$Z(G d$d% d%e$Z)G d&d' d'e$Z*G d(d) d)e$Z+G d*d+ d+e$Z,G d,d- d-e$Z-G d.d/ d/e$Z.G d0d1 d1e$Z/G d2d3 d3e$Z0G d4d5 d5e0Z1G d6d7 d7e0Z2G d8d9 d9e0Z3G d:d; d;e0Z4G d<d= d=e0Z5G d>d? d?e0Z6G d@dA dAe0Z7G dBdC dCe0Z8G dDdE dEe0Z9G dFdG dGe0Z:G dHdI dIe0Z;G dJdK dKe0Z<G dLdM dMe0Z=G dNdO dOe0Z>G dPdQ dQe0Z?G dRdS dSe0Z@G dTdU dUe$ZAG dVdW dWe0ZBG dXdY dYe$ZCG dZd[ d[e$ZDG d\d] d]e$ZEG d^d_ d_e0ZFG d`da dae0ZGG dbdc dce0ZHG ddde dee$ZIG dfdg dge0ZJG dhdi die0ZKG djdk dke0ZLdldm ZMG dndo doZNi dpe1dqe-dre2dse%dteBdueEdve3dweCdxe*dye%dze/d{e4d|e%d}e%d~e%de%de%i de1de'de*de+de%de%de-de9de-de-de%deIde5de6de(de%de-i de7de)de>de,de%de;de<de%de-de.de8de%de?de@deAde9de:e&eFeHeHeGeHdZOdde	fddZPdS )z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)Optional)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR c                 C   sj   t  rddlm} |S t r.dd l}t|jjtdk r&ddl	m} |S ddl	m
} |S tt| )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   Zgoogle.protobufr   parseprotobuf__version__Ztransformers.utilsr   ImportErrorr   format)error_messager   Zgoogle r   o/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.pyimport_protobuf#   s   r    add_prefix_spacereturnc                 C   s$   | rd}t |ddsd}|S d}|S )NalwayslegacyTfirstnever)getattr)r!   original_tokenizerprepend_schemer   r   r   _get_prepend_scheme4   s   r*   c           
         s   |d u}|r
t |n }g }| D ]<\}}g }tdt|D ]}|d | ||d  }}	| v r>|	 v r>|||	|f qt| fddd}|| qt|dd |d}dd |D }|S )	Nr   c                        | d   | d  fS Nr   r   r   xvocabr   r   <lambda>I       z!generate_merges.<locals>.<lambda>keyc                 S   s   | d t | d t | d fS )N   r   r   )lenvalr   r   r   r1   L   s    r4   reversec                 S   s   g | ]
}|d  |d fqS r   r   r   .0r8   r   r   r   
<listcomp>M       z#generate_merges.<locals>.<listcomp>)dictitemsranger6   appendsortedextend)
r0   vocab_scoresr:   mergesmergeZpiece_scorelocalindexpiece_lpiece_rr   r/   r   generate_merges>   s   rM   c                   @   sB   e Zd ZdZdefddZd	deeeef e	e f fddZ
dS )
SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 C   s.   t | d ddlm} | | _| j| d S )Nr   r   )SentencePieceProcessor)r   r   rP   spLoad)selfrO   rP   r   r   r   __init__V   s   
zSentencePieceExtractor.__init__Nr"   c                    s2   | j   fddt  D }t||}||fS )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        c                       i | ]}  ||qS r   Zid_to_piecer=   rJ   rQ   r   r   
<dictcomp>c   r2   z2SentencePieceExtractor.extract.<locals>.<dictcomp>)rQ   rB   GetPieceSizerM   rS   rF   r0   rG   r   rY   r   extract]   s   
zSentencePieceExtractor.extractN)__name__
__module____qualname____doc__strrT   tupler@   intlistr]   r   r   r   r   rN   Q   s    (rN   c                   @   s0   e Zd Zddeeeef ee f fddZdS )GemmaSentencePieceExtractorNr"   c                    sH   | j   fddt  D }d|vr|d|d< t||}||fS )rU   c                    rV   r   rW   rX   rY   r   r   rZ   q   r2   z7GemmaSentencePieceExtractor.extract.<locals>.<dictcomp>	<0x09>)rQ   rB   r[   getrM   r\   r   rY   r   r]   k   s   
z#GemmaSentencePieceExtractor.extractr^   )	r_   r`   ra   rd   r@   rc   re   rf   r]   r   r   r   r   rg   j   s    (rg   piecec                 C   s&   t | dk p| d dkp| d   S )Nr5   ,)r6   isdigit)rk   r   r   r   check_number_comma{   s   &rp   c                   @   s"   e Zd Zdd ZdefddZdS )	Converterc                 C   s
   || _ d S r^   )r(   )rS   r(   r   r   r   rT      s   
zConverter.__init__r"   c                 C   s   t  r^   )NotImplementedErrorrS   r   r   r   	converted   s   zConverter.convertedN)r_   r`   ra   rT   r   rt   r   r   r   r   rq      s    rq   c                   @      e Zd ZdefddZdS )BertConverterr"   c           
      C      | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )N	unk_tokenFbasic_tokenizerTZ
clean_textZhandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr(   r0   r   r   rc   ry   hasattrrz   tokenize_chinese_charsr|   do_lower_caser   BertNormalizer
normalizerr	   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr
   TemplateProcessingpost_processorr   decoder
rS   r0   	tokenizerr   r|   r   clssepr   r   r   r   r   rt      :   



zBertConverter.convertedNr_   r`   ra   r   rt   r   r   r   r   rv          rv   c                   @   ru   )SplinterConverterr"   c              
   C   sZ  | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}t| j j}d}	| j j}
| j j}| j j}| j d}| j jdkrx| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}tj| d| d|||
f||f||f|	|fgd|_tjdd|_|S )Nrx   Frz   Tr{   .rightr~    r   r   r   r   r   r   )r(   r0   r   r   rc   ry   r   rz   r   r|   r   r   r   r   r	   r   r   r   r   Zquestion_tokenr   r   question_token_idconvert_tokens_to_idsZpadding_sider
   r   r   r   r   )rS   r0   r   r   r|   r   r   r   questiondotr   r   r   Zdot_token_idr   r   r   r   rt      sL   



$"
zSplinterConverter.convertedNr   r   r   r   r   r      r   r   c                   @   ru   )FunnelConverterr"   c           
      C   rw   )Nrx   Frz   Tr{   z:2 $A:0 r   r   r   r   r   r   r   r   r   r   r   rt      r   zFunnelConverter.convertedNr   r   r   r   r   r      r   r   c                   @   ru   )MPNetConverterr"   c           
   
   C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	| d
||f||	fgd|_tjdd|_|S )Nrx   Frz   Tr{   r~   r   z:0 r   r   r   r   r   r   r   r   r   r   rt     s:   



zMPNetConverter.convertedNr   r   r   r   r   r     r   r   c                   @   ru   )OpenAIGPTConverterr"   c              	   C   s   | j j}t| j j }| j j}tt||d t|ddd}|	t|d ur/|
t|g tjdd|_t |_tjdd|_|S )N</w>F)r0   rG   dropoutry   end_of_word_suffixfuse_unkT)r}   suffix)r(   encoderrf   	bpe_rankskeysry   r   r   rc   Ztoken_to_idadd_special_tokensr   r   r   r	   r   r   r   
BPEDecoderr   rS   r0   rG   ry   r   r   r   r   rt   /  s&   
zOpenAIGPTConverter.convertedNr   r   r   r   r   r   .  r   r   c                	   @   B   e Zd Z	ddeeeef  deeeeef   de	fddZ
dS )GPT2ConverterNr0   rG   r"   c              	   C   s   |s| j j}|st| j j}tt||d dddd}t| j dd}tj|d|_	t
 |_t| j ddrP| j j}| j j}tj| d| d||fgd	|_|S tjdd
|_|S )Nr   Fr0   rG   r   continuing_subword_prefixr   r   r!   r!   Zadd_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r(   r   rf   r   r   r   r'   r	   	ByteLevelr   r   r   	bos_tokenbos_token_idr
   r   r   )rS   r0   rG   r   r!   Zbosr   r   r   r   rt   J  s:   
zGPT2Converter.convertedNNr_   r`   ra   r   r@   rc   re   rf   rd   r   rt   r   r   r   r   r   I      r   c                   @   ru   )HerbertConverterr"   c                 C   s   d}d}| j j}t| j j }||d d v r|dd  }tt||d | j j|d}tj	ddd|_
t |_tj|d|_tj| j j| j jf| j j| j jfd	|_|S )
Nz	#version:r   r   r   )r   ry   r   F)r}   r|   r   )r   r   )r(   r   rf   r   r   r   r   ry   r   r   r   r	   r   r   r   r   r   r
   ZBertProcessingr   r   r   r   r   )rS   Ztokenizer_info_strZtoken_suffixr0   rG   r   r   r   r   rt   r  s.   

zHerbertConverter.convertedNr   r   r   r   r   r   q  r   r   c                	   @   r   )Qwen2ConverterNr0   rG   r"   c                 C   s   |s| j j}|st| j j }tt||d d ddddd}t |_	t
t
jtddddt
jt| j ddddg|_t |_tjdd	|_|S )
Nr   F)r0   rG   r   ry   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr!   r!   	use_regexr   )r(   r   rf   r   r   r   r   r   NFCr   r	   SequenceSplitr   r   r'   r   r   r   r
   r   )rS   r0   rG   r   r   r   r   rt     sD   

zQwen2Converter.convertedr   r   r   r   r   r   r     r   r   c                   @   ru   )RobertaConverterr"   c              	   C   sv   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tj|j|jf|j|jf|j	dd|_|S )Nr   Fr   r   Tr   r   r!   r   )r(   r   rf   r   r   r   r   r	   r   r!   r   r   r   r
   RobertaProcessingr   r   r   r   r   rS   otr0   rG   r   r   r   r   rt     s,   


zRobertaConverter.convertedNr   r   r   r   r   r     r   r   c                   @   ru   )RoFormerConverterr"   c           
      C   s   ddl m} | jj}tt|t| jjd}d}d}t| jdr*| jj	j
}| jj	j}tjdd||d|_tj|||_t| jj}t| jj}| jj}| jj}	tj| d| d	| d| d
| d||f||	fgd|_tjdd|_|S )Nr   )JiebaPreTokenizerrx   Frz   Tr{   r~   r   r   r   r   r   r   )Z"models.roformer.tokenization_utilsr   r(   r0   r   r   rc   ry   r   rz   r|   r   r   r   r   r	   ZPreTokenizerZcustomr   r   r   r   r   r
   r   r   r   r   )
rS   r   r0   r   r|   r   r   r   r   r   r   r   r   rt     s8   

zRoFormerConverter.convertedNr   r   r   r   r   r     r   r   c                   @   ru   )DebertaConverterr"   c              	   C   s~   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjddd| j dfd| j dfgd	|_|S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r(   r   rf   r   r   r   r   r	   r   r!   r   r   r   r
   r   r   r   r   r   r   r   rt     s.   
	zDebertaConverter.convertedNr   r   r   r   r   r     r   r   c                       sn   e Zd ZdZeZi Z fddZdd Zdd Z	dd	 Z
d
d Zdd Zdd Zdd ZdefddZ  ZS )SpmConverterFc                    s   t | d t j|  t }| }t| jjd}||	  W d    n1 s+w   Y  || _
| j
jjrB| jsDtd d S d S d S )Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superrT   r    
ModelProtoopenr(   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)rS   args	model_pb2mf	__class__r   r   rT   &  s   
zSpmConverter.__init__c                 C      dd |j D S )Nc                 S      g | ]}|j |jfqS r   rk   scorer=   rk   r   r   r   r>   <  r2   z&SpmConverter.vocab.<locals>.<listcomp>piecesrS   r   r   r   r   r0   ;     zSpmConverter.vocabc                 C   s   |j jS r^   )r   unk_idr   r   r   r   r   >     zSpmConverter.unk_idc           	   	      s   |j j} |}|dkrtt| | jd}n-|dkrD  jj	
|\}}dd t|D }tt|||j jd jd d}ntd fd	d
t|jD }|dd
 t|dd dD  |S )Nr   r   r   r5   c                 S      i | ]	\}\}}||qS r   r   r=   iwordr   r   r   r   rZ   P      z*SpmConverter.tokenizer.<locals>.<dictcomp>Try   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmc                    8   g | ]\}}|j d v r||j|j dkp|j jv fqS )      r  typerk   r   r=   idprs   r   r   r>   e  
    
z*SpmConverter.tokenizer.<locals>.<listcomp>c                 S       g | ]\}}}t |d |dqS F
normalizedspecialr   r=   r  tokenr  r   r   r   r>   k      c                 S      | d S Nr   r   r-   r   r   r   r1   m      z(SpmConverter.tokenizer.<locals>.<lambda>r3   )r   
model_typer0   r   r   r   r   SpmExtractorr(   r   r]   	enumerater   	unk_piece	Exceptionr   
add_tokensrD   )	rS   r   r  rF   r   _rG   	bpe_vocabspm_added_tokensr   rs   r   r   A  sF   

zSpmConverter.tokenizerc                 C   sJ   |j j}tjdddttddg}|st|S tt|g| S )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr   StripReplacer   r   PrecompiledrS   r   r#  Z_normalizersr   r   r   r   s  s   
zSpmConverter.normalizerc                 C      t || j}tj||dS Nreplacementr)   )r*   r(   r	   	MetaspacerS   r+  r!   r)   r   r   r   r   ~     zSpmConverter.pre_tokenizerc                 C      d S r^   r   rs   r   r   r   r        zSpmConverter.post_processorc                 C   r(  r)  )r*   r(   r   r,  r-  r   r   r   r     r.  zSpmConverter.decoderr"   c                 C   s   |  | j}| | j}|d ur||_d}d}t| jdr!| jj}| ||}|d ur.||_| |||_|  }|r>||_|S )Nr!  Tr!   )	r   r   r   r   r(   r!   r   r   r   )rS   r   r   r+  r!   r   r   r   r   r   rt     s    zSpmConverter.converted)r_   r`   ra   r   rN   r  r   rT   r0   r   r   r   r   r   r   r   rt   __classcell__r   r   r   r   r   !  s    2r   c                   @   $   e Zd Zdd Zdd Zdd ZdS )AlbertConverterc                 C   r   )Nc                 S   2   g | ]}t |jr|j|jfn|j|jd  fqS d   rp   rk   r   r   r   r   r   r>         $z)AlbertConverter.vocab.<locals>.<listcomp>r   r   r   r   r   r0        zAlbertConverter.vocabc                 C      t ddt ddg}| jjs|t   |t   | jjr)|t   |j	j
}|r7|t | |t tdd t |S Nz``"z''r   r   r   r%  r(   keep_accentsrC   NFKDStripAccentsr   	Lowercaser"  r#  r&  r   r   rS   r   Zlist_normalizersr#  r   r   r   r        


zAlbertConverter.normalizerc                 C   ,   t jddd| jdfd| jdfgdS Nr   r   r   r   r   r
   r   r(   r   rs   r   r   r   r        zAlbertConverter.post_processorNr_   r`   ra   r0   r   r   r   r   r   r   r3        r3  c                   @      e Zd Zdd Zdd ZdS )BarthezConverterc                 C      d}|S Nr  r   rS   r   r   r   r   r   r        zBarthezConverter.unk_idc                 C   rD  Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   rF  rs   r   r   r   r     rG  zBarthezConverter.post_processorN)r_   r`   ra   r   r   r   r   r   r   rK    s    rK  c                   @   r2  )CamembertConverterc                 C   2   g d}|dd |j dd  D 7 }|dg7 }|S )N))z
<s>NOTUSED        <pad>rU  )z</s>NOTUSEDrU  z<unk>rU  )z<unk>NOTUSEDic                 S   r   r   r   r   r   r   r   r>     r2   z,CamembertConverter.vocab.<locals>.<listcomp>r   z<mask>rU  r   rS   r   r0   r   r   r   r0     s   
zCamembertConverter.vocabc                 C      dS rM  r   r   r   r   r   r        zCamembertConverter.unk_idc                 C   rD  rP  rF  rs   r   r   r   r     rG  z!CamembertConverter.post_processorNr_   r`   ra   r0   r   r   r   r   r   r   rS    s    rS  c                   @   r2  )DebertaV2Converterc                 C   sH   g }| j jr|tjdd t|| j }|tj||d t|S )Nr   )r   r*  )r(   Zsplit_by_punctrC   r	   Punctuationr*   r,  r   )rS   r+  r!   Zlist_pretokenizersr)   r   r   r   r     s   
z DebertaV2Converter.pre_tokenizerc                 C   sd   g }| j jr|t  |t  |jj}|r"|t| |t	t
dd t|S )Nr   r   )r(   r   rC   r   rA  r$  r"  r#  r&  r%  r   r   rB  r   r   r   r     s   
zDebertaV2Converter.normalizerc                 C   rD  rE  rF  rs   r   r   r   r   
  rG  z!DebertaV2Converter.post_processorN)r_   r`   ra   r   r   r   r   r   r   r   r^    s    r^  c                   @   r2  )MBartConverterc                 C   >   g d}|dd |j dd  D 7 }|g d7 }|dg7 }|S )NrQ  rU  rV  rR  rU  rX  c                 S   r   r   r   r   r   r   r   r>     r2   z(MBartConverter.vocab.<locals>.<listcomp>r  )Zar_ARrU  cs_CZrU  de_DErU  en_XXrU  Zes_XXrU  et_EErU  fi_FIrU  Zfr_XXrU  gu_INrU  hi_INrU  it_ITrU  Zja_XXrU  kk_KZrU  ko_KRrU  lt_LTrU  lv_LVrU  Zmy_MMrU  ne_NPrU  Znl_XXrU  ro_ROrU  ru_RUrU  si_LKrU  tr_TRrU  vi_VNrU  zh_CNrU  rY  r   rZ  r   r   r   r0     s
   
zMBartConverter.vocabc                 C   r[  rM  r   r   r   r   r   r   <  r0  zMBartConverter.unk_idc                 C   rD  )Nz$A </s> en_XXz$A $B </s> en_XXrk  rR  r   rF  rs   r   r   r   r   ?  rG  zMBartConverter.post_processorNr]  r   r   r   r   r`    s    &r`  c                   @   r2  )MBart50Converterc                 C   ra  )Nrb  c                 S   r   r   r   r   r   r   r   r>   R  r2   z*MBart50Converter.vocab.<locals>.<listcomp>r  )4re  rf  rh  rj  rl  rm  ro  rq  rr  rt  rv  rx  ry  r{  r}  r  r  r  r  r  r  r  r  r  r  )af_ZArU  )az_AZrU  )bn_INrU  )fa_IRrU  )he_ILrU  )hr_HRrU  )id_IDrU  )ka_GErU  )Zkm_KHrU  )mk_MKrU  )ml_INrU  )mn_MNrU  )mr_INrU  )pl_PLrU  )ps_AFrU  )Zpt_XXrU  )sv_SErU  )sw_KErU  )ta_INrU  )te_INrU  )th_THrU  )Ztl_XXrU  )uk_UArU  )ur_PKrU  )xh_ZArU  )gl_ESrU  )sl_SIrU  rY  r   rZ  r   r   r   r0   K  s
   
zMBart50Converter.vocabc                 C   r[  rM  r   r   r   r   r   r   W  r0  zMBart50Converter.unk_idc                 C   rD  )Nzen_XX $A </s>zen_XX $A $B </s>rk  rR  r   rF  rs   r   r   r   r   Z  rG  zMBart50Converter.post_processorNr]  r   r   r   r   r  J  s    r  c                   @   r2  )NllbConverterc                 C   (   g d}|dd |j dd  D 7 }|S )Nrb  c                 S   r   r   r   r   r   r   r   r>   m  r2   z'NllbConverter.vocab.<locals>.<listcomp>r  r   rZ  r   r   r   r0   f     zNllbConverter.vocabc                 C   r[  rM  r   r   r   r   r   r   p  r0  zNllbConverter.unk_idc                 C   rD  )Nzeng_Latn $A </s>zeng_Latn $A $B </s>Zeng_LatnrR  r   rF  rs   r   r   r   r   s  rG  zNllbConverter.post_processorNr]  r   r   r   r   r  e      
r  c                   @   r2  )SeamlessM4TConverterc                 C   r  )N)rV  rX  rc  rd  c                 S   r   r   r   r   r   r   r   r>     r2   z.SeamlessM4TConverter.vocab.<locals>.<listcomp>r  r   rZ  r   r   r   r0     r  zSeamlessM4TConverter.vocabc                 C   s   | j jS r^   )r(   Zunk_token_idr   r   r   r   r     r   zSeamlessM4TConverter.unk_idc                 C   rD  )Nz__eng__ $A </s>z__eng__ $A $B </s>Z__eng__rR  r   rF  rs   r   r   r   r     rG  z#SeamlessM4TConverter.post_processorNr]  r   r   r   r   r  ~  r  r  c                   @   r2  )XLMRobertaConverterc                 C   rT  )Nrb  c                 S   r   r   r   r   r   r   r   r>     r2   z-XLMRobertaConverter.vocab.<locals>.<listcomp>r  rY  r   rZ  r   r   r   r0     s   
zXLMRobertaConverter.vocabc                 C   rL  rM  r   rN  r   r   r   r     rO  zXLMRobertaConverter.unk_idc                 C   rD  rP  rF  rs   r   r   r   r     rG  z"XLMRobertaConverter.post_processorNr]  r   r   r   r   r        r  c                   @   r2  )XLNetConverterc                 C   r   )Nc                 S   r4  r5  r7  r   r   r   r   r>     r8  z(XLNetConverter.vocab.<locals>.<listcomp>r   r   r   r   r   r0     r9  zXLNetConverter.vocabc                 C   r:  r;  r=  rB  r   r   r   r     rC  zXLNetConverter.normalizerc                 C   rD  )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   rF  rs   r   r   r   r     rG  zXLNetConverter.post_processorNrH  r   r   r   r   r    rI  r  c                   @      e Zd ZdS )ReformerConverterNr_   r`   ra   r   r   r   r   r        r  c                   @   rJ  )RemBertConverterc                 C   s   t ddt ddt tddg}| jjs%|t   |t   | jjr0|t 	  |j
j}|r>|t | t |S r;  )r   r%  r   r(   r>  rC   r?  r@  r   rA  r"  r#  r&  r   rB  r   r   r   r     s   


zRemBertConverter.normalizerc                 C   rD  rE  rF  rs   r   r   r   r     rG  zRemBertConverter.post_processorN)r_   r`   ra   r   r   r   r   r   r   r    s    r  c                   @   r  )BertGenerationConverterNr  r   r   r   r   r    r  r  c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
PegasusConverterc                 C   s   | j jdf| j jdfg}| j jd ur|| j jdfg7 }| j jd ur2| j j| j jk r2|| j jdfg7 }|dd td| j jD 7 }|dd |jdd  D 7 }|S )NrU  c                 S      g | ]
}d | ddfqS )z<unk_>g      Yr   r=   r   r   r   r   r>     r?   z*PegasusConverter.vocab.<locals>.<listcomp>r5   c                 S   r   r   r   r   r   r   r   r>     r2   )	r(   	pad_token	eos_tokenZmask_token_sentZ
mask_tokenZmask_token_idoffsetrB   r   rZ  r   r   r   r0      s   

zPegasusConverter.vocabc                 C   s   |j j| jj S r^   )r   r   r(   r  r   r   r   r   r     r   zPegasusConverter.unk_idc                 C   s(   t || j}tt tj||dgS r)  )r*   r(   r	   r   ZWhitespaceSplitr,  r-  r   r   r   r     s   zPegasusConverter.pre_tokenizerc                 C   s0   | j j}|| j jfg}tjd|gdd|g|dS )N$A$Br   )r(   r  eos_token_idr
   r   )rS   eosr   r   r   r   r     s   
zPegasusConverter.post_processorN)r_   r`   ra   r0   r   r   r   r   r   r   r   r    s
    	r  c                   @   rJ  )T5Converterc                 C   s:   | j j}dd |jD }|dd t|d ddD 7 }|S )Nc                 S   r   r   r   r   r   r   r   r>   *  r2   z%T5Converter.vocab.<locals>.<listcomp>c                 S   r  )z
<extra_id_r  rU  r   r  r   r   r   r>   +  r?   r   rl   )r(   Z
_extra_idsr   rB   )rS   r   Znum_extra_idsr0   r   r   r   r0   (  s   zT5Converter.vocabc                 C   &   t jddgg dd| jdfgdS Nr  rR  )r  rR  r  rR  r   rF  rs   r   r   r   r   .     zT5Converter.post_processorN)r_   r`   ra   r0   r   r   r   r   r   r  '  s    r  c                   @      e Zd Zdd ZdS )UdopConverterc                 C   r  r  rF  rs   r   r   r   r   9  r  zUdopConverter.post_processorNr_   r`   ra   r   r   r   r   r   r  8      r  c                   @   ru   )WhisperConverterr"   c           	   	   C   s   | j j}t| j j }tt||d dddd}tj| j j	d|_
t |_| j j}| j |}| j j}| j j}ddd |D }tj| d| d	| d
| d||fgt||d|_|S )Nr   Fr   r   r   c                 S   s   g | ]}| d qS )r   r   r=   r  r   r   r   r>   Z  s    z.WhisperConverter.converted.<locals>.<listcomp>z $A:0 r   z $A:0 $B:1 r   r   )r(   r   rf   r   r   r   r   r	   r   r!   r   r   r   Zprefix_tokensconvert_ids_to_tokensr  r  joinr
   r   zipr   )	rS   r0   rG   r   Zprefix_token_idsprefixesr  r  Zprefix_templater   r   r   rt   D  s8   
	zWhisperConverter.convertedNr   r   r   r   r   r  C  r   r  c                   @   r  )BigBirdConverterc                 C   rD  rE  rF  rs   r   r   r   r   h  rG  zBigBirdConverter.post_processorNr  r   r   r   r   r  g  r  r  c                   @   ru   )CLIPConverterr"   c              
   C   s   | j j}t| j j }| j j}tt||d dddt|d}t	
t	 t	tddt	 g|_t
tjtddd	d
tjddg|_t |_tj| j j| j jf| j j| j jfddd|_|S )Nr   r   Fr0   rG   r   r   r   r   ry   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r   )r(   r   rf   r   r   ry   r   r   rc   r   r   r   r%  r   rA  r   r	   r   r   r   r   r   r
   r   r  r  r   r   r   r   r   r   r   rt   t  sD   


zCLIPConverter.convertedNr   r   r   r   r   r  s  r   r  c                   @   ru   )LayoutLMv2Converterr"   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )Nrx   FTrz   r{   r~   r   r   r   r   r   r   r   r   r   r   r   rt     r   zLayoutLMv2Converter.convertedNr   r   r   r   r   r    r   r  c                   @   ru   )BlenderbotConverterr"   c              	   C   st   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjd|j d|j|jfgd|_|S )Nr   Fr   r   z$A:0 r   )r   r   )r(   r   rf   r   r   r   r   r	   r   r!   r   r   r   r
   r   r  r  r   r   r   r   r   rt     s*   

zBlenderbotConverter.convertedNr   r   r   r   r   r    r   r  c                   @   r2  )XGLMConverterc                 C   s4   g d}|dd |j dd  D 7 }|g d7 }|S )Nrb  c                 S   r   r   r   r   r   r   r   r>     r2   z'XGLMConverter.vocab.<locals>.<listcomp>r  ))z<madeupword0>rU  )z<madeupword1>rU  )z<madeupword2>rU  )z<madeupword3>rU  )z<madeupword4>rU  )z<madeupword5>rU  )z<madeupword6>rU  r   rZ  r   r   r   r0     s   zXGLMConverter.vocabc                 C   rL  rM  r   rN  r   r   r   r     rO  zXGLMConverter.unk_idc                 C   rD  )Nz</s> $Az</s> $A </s> </s> $BrQ  rR  r   rF  rs   r   r   r   r     rG  zXGLMConverter.post_processorNr]  r   r   r   r   r    r  r  c                   @   sF   e Zd ZdZeZddhZ	 dd Zdd Zdd	 Z	d
d Z
dd ZdS )GemmaConverterTz<start_of_turn>z<end_of_turn>c                 C      t ddS Nr   r!  )r   r%  r   r   r   r   r        zGemmaConverter.normalizerc                 C   s|   | j jdf| j jdf| j jdfg}|dd |jdd  D 7 }tdd |D s<tdd t|D d }|d ur<d||< |S )	NrU  c                 S   r   r   r   r   r   r   r   r>     r2   z(GemmaConverter.vocab.<locals>.<listcomp>r  c                 s   s    | ]	}|d  dkV  qdS )r   rh   Nr   )r=   r.   r   r   r   	<genexpr>  s    z'GemmaConverter.vocab.<locals>.<genexpr>c                 s   s$    | ]\}}|d  dkr|V  qdS )r   ri   Nr   )r=   r   r.   r   r   r   r    s   " )rh   rU  )r(   r  r  r   r   anynextr  )rS   r   r0   Zoverride_indexr   r   r   r0     s   


zGemmaConverter.vocabc                 C   r  )Nr   Zmerged_with_previous)r	   r   rS   r+  r!   r   r   r   r      r  zGemmaConverter.pre_tokenizerc                 C   rL  rM  r   rN  r   r   r   r   #  rO  zGemmaConverter.unk_idc                 C   s    t t ddt  t  gS )Nr!  r   )r   r   r%  ByteFallbackFuser  r   r   r   r   '  s   
zGemmaConverter.decoderN)r_   r`   ra   r   rg   r  r   r   r0   r   r   r   r   r   r   r   r    s    
r  c                   @   s@   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dS )LlamaConverterTc                 C   sN   | j ddf| j ddf| j ddfg}|dd |jdd  D 7 }|S )Nr   rU  r   r5   c                 S   r   r   r   r   r   r   r   r>   :  r2   z(LlamaConverter.vocab.<locals>.<listcomp>r  )r(   r  r   rZ  r   r   r   r0   4  s   zLlamaConverter.vocabc                 C   rL  r  r   rN  r   r   r   r   =  rO  zLlamaConverter.unk_idc                 C   <   t ddt  t  g}|r|t jdddg7 }t |S Nr!  r   r   )contentr  r   r%  r  r  r$  r   rS   r+  r!   sequencer   r   r   r   A     

zLlamaConverter.decoderc                 C   sT   t | jddr(g }t | jddr|tjddg7 }|tjdddg7 }t|S d S )Nr$   Tr!   r!  )prependr   )patternr  )r'   r(   r   Prependr%  r   )rS   r   r  r   r   r   r   K  s   
zLlamaConverter.normalizerc                 C   s.   t | jddst|| j}tj||ddS d S )Nr$   TFr+  r)   split)r'   r(   r*   r	   r,  r-  r   r   r   r   T  s   zLlamaConverter.pre_tokenizerc                 C   r/  r^   r   rs   r   r   r   r   Z  r\  zLlamaConverter.post_processorN)
r_   r`   ra   r   r0   r   r   r   r   r   r   r   r   r   r  1  s    	
	r  c                   @   ru   )MarkupLMConverterr"   c           	   
   C   s   | j }|j}t|j }tt||d ddd| j jd}tj	|j
d|_t	 |_t| j j}t| j j}| j j}| j j}tj| d| | d| d| ||f||fgd|_|S )Nr   Fr  r   z $A z $B r   )r(   r   rf   r   r   r   r   ry   r	   r   r!   r   r   r   rc   r   r   r   r   r
   r   r   )	rS   r   r0   rG   r   r   r   r   r   r   r   r   rt   `  s8   
	zMarkupLMConverter.convertedNr   r   r   r   r   r  _  r   r  c                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )MoshiConverterTNc                 K   sf   t | d t| | t }| }t|d}||  W d    n1 s)w   Y  || _d S Nr   r   	r   rq   rT   r    r   r   r   r   r   )rS   r   Zmodel_max_lengthkwargsr   r   r   r   r   r   rT     s   

zMoshiConverter.__init__c                 C   s:   |j j}tddg}|st|S tt|g| S r  )r"  r#  r   r%  r   r&  r'  r   r   r   r     s   

zMoshiConverter.normalizerc                 C   r  r  r  r  r   r   r   r     r  zMoshiConverter.decoderc                 C   s   d}t j||ddS )Nr%   Fr  )r	   r,  r-  r   r   r   r     s   zMoshiConverter.pre_tokenizerr^   )r_   r`   ra   r   rT   r   r   r   r   r   r   r   r    s    


r  c                   @   sR   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd ZdS )HeliumConverterTNc                 K   sf   t | d t| | t }| }t|d}||  W d    n1 s)w   Y  || _d S r  r  )rS   r   r  r   r   r   r   r   r   rT     s   

zHeliumConverter.__init__c                    s     |}tt| | jd} fddt|jD }|dd t|dd dD  |t	dd	d	d
g |j
ddd |S )Nr   c                    r   r  r  r  rs   r   r   r>     r	  z-HeliumConverter.tokenizer.<locals>.<listcomp>c                 S   s"   g | ]\}}}t |d |ddqS )FT)r  r  Zsingle_wordr  r  r   r   r   r>     s    c                 S   r  r  r   r-   r   r   r   r1     r  z+HeliumConverter.tokenizer.<locals>.<lambda>r3   
Fr  rW  r  )r  Zpad_id)r0   r   r   r   r   r  r   r  rD   r   Zenable_padding)rS   r   rF   r   r  r   rs   r   r     s&   

zHeliumConverter.tokenizerc                 C   sB   g }|j D ]}|jdkr|d|jfg7 }q||j|jfg7 }q|S )Nz<0x0A>r  )r   rk   r   )rS   r   r0   rk   r   r   r   r0     s   

zHeliumConverter.vocabc                 C   rL  r  r   rN  r   r   r   r     rO  zHeliumConverter.unk_idc                 C   s8   t ddt  t  g}|t jdddg7 }t |S r  r  r  r   r   r   r     s   

zHeliumConverter.decoderc                 C   s   t t dt ddgS r  )r   r   r  r%  r   r   r   r   r     s   zHeliumConverter.normalizerc                 C   s   t t ddgS )Nr  
contiguous)r	   r   r   r  r   r   r   r     s   zHeliumConverter.pre_tokenizerc                 C   s   t jddgg ddgdS )NrQ  r  )rQ  r  rQ  r  )rQ  r   r   )r
   r   rs   r   r   r   r     s   zHeliumConverter.post_processorr^   )r_   r`   ra   r   rT   r   r0   r   r   r   r   r   r   r   r   r   r    s    
		r  c                   @   s"   e Zd ZdZdddZdd ZdS )ParakeetConverterTNc                 G   sl   || _ t| d t| | t }| }t|d}||  W d    n1 s,w   Y  || _	d S r  )
r   r   rq   rT   r    r   r   r   r   r   )rS   r   r   r   r   r   r   r   r   rT   
  s   

zParakeetConverter.__init__c              	      s     |}  j|\}}dd t|D }tt|||jjd j	d d} fddt|j
D }|dd t|dd	 d
D  |S )Nc                 S   r   r   r   r   r   r   r   rZ     r   z/ParakeetConverter.tokenizer.<locals>.<dictcomp>Tr   c                    r   r  r  r  rs   r   r   r>   (  r	  z/ParakeetConverter.tokenizer.<locals>.<listcomp>c                 S   r
  r  r  r  r   r   r   r>   .  r  c                 S   r  r  r   r-   r   r   r   r1   0  r  z-ParakeetConverter.tokenizer.<locals>.<lambda>r3   )r0   r  r   r]   r  r   r   r   r  r   r   r  rD   )rS   r   rF   r  rG   r  r   r  r   rs   r   r     s,   

zParakeetConverter.tokenizerr^   )r_   r`   ra   r   rT   r   r   r   r   r   r    s    
r  c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r   )chr)r=   nr   r   r   r>   L  s    z$bytes_to_unicode.<locals>.<listcomp>)rf   rB   ordrC   r@   r  )bscsr   br   r   r   bytes_to_unicode8  s   L
r  c                   @   sF   e Zd ZdZ				dddZdefdd	Zd
d ZdefddZ	dS )TikTokenConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                 K   s4   || _ || _|| _t|tr| | _d S || _d S r^   )r   r  r!   
isinstancer@   r   additional_special_tokens)rS   r   r  r!   r	  r  r   r   r   rT   U  s   zTikTokenConverter.__init__tiktoken_urlc                    s   zddl m} W n ty   tdw || t fddg }i }  D ]P\}}|||< t|dkr:q)g }tdt|D ]%}|d | ||d  }	}
|	 v rh|
 v rh|	|
  v rh||	|
|f qCt	| fddd	d
}|
| q)t	|dd d	d
}fdd|D }||fS )Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c                    s   d  fdd| dD S )Nr   c                    s   g | ]} t | qS r   )r  )r=   charbyte_encoderr   r   r>   r  r2   zdTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>zlatin-1)r  decode)r  r  r   r   token_bytes_to_stringq  s   zPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringr   c                    r+   r,   r   r-   )r   r   r   r1     r2   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>Fr9   c                 S   r  )Nr5   r   r7   r   r   r   r1     r  c                    s$   g | ]} |d   |d fqS r;   r   r<   )r  r   r   r>     s   $ zETikTokenConverter.extract_vocab_merges_from_model.<locals>.<listcomp>)Ztiktoken.loadr  r  
ValueErrorr  rA   r6   rB   rC   rD   rE   )rS   r
  r  rG   r0   r  ZrankrI   rJ   rK   rL   r   )r   r  r  r   extract_vocab_merges_from_modelf  s6   z1TikTokenConverter.extract_vocab_merges_from_modelc                 C   s:   |  | j\}}tt||dd}t|jdrd|j_|S )NF)r   ignore_mergesT)r  r   r   r   r   rO   r  )rS   rF   rG   r   r   r   r   r     s
   zTikTokenConverter.tokenizerr"   c                 C   sh   |   }ttjt| jdddtj| jddg|_t	 |_
|dd | jD  tjdd|_|S )Nr   Fr   r   c                 S   s   g | ]	}t |d ddqS )FTr  r  r  r   r   r   r>     r   z/TikTokenConverter.converted.<locals>.<listcomp>r   )r   r	   r   r   r   r  r   r!   r   r   r   r   r	  r
   r   )rS   r   r   r   r   rt     s   
zTikTokenConverter.converted)Nr  FN)
r_   r`   ra   rb   rT   rc   r  r   r   rt   r   r   r   r   r  P  s    
r  ZAlbertTokenizerZBartTokenizerZBarthezTokenizerZBertTokenizerZBigBirdTokenizerZBlenderbotTokenizerZCamembertTokenizerZCLIPTokenizerZCodeGenTokenizerZConvBertTokenizerZDebertaTokenizerZDebertaV2TokenizerZDistilBertTokenizerZDPRReaderTokenizerZDPRQuestionEncoderTokenizerZDPRContextEncoderTokenizerZElectraTokenizerZFNetTokenizerZFunnelTokenizerZGPT2TokenizerZHerbertTokenizerZLayoutLMTokenizerZLayoutLMv2TokenizerZLayoutLMv3TokenizerZLayoutXLMTokenizerZLongformerTokenizerZLEDTokenizerZLxmertTokenizerZMarkupLMTokenizerZMBartTokenizerZMBart50TokenizerZMPNetTokenizerZMobileBertTokenizerZMvpTokenizerZNllbTokenizerZOpenAIGPTTokenizerZPegasusTokenizerZQwen2TokenizerZRealmTokenizerZReformerTokenizerZRemBertTokenizerZRetriBertTokenizerZRobertaTokenizerZRoFormerTokenizerZSeamlessM4TTokenizerZSqueezeBertTokenizerZT5TokenizerZUdopTokenizerZWhisperTokenizerZXLMRobertaTokenizerZXLNetTokenizer)ZSplinterTokenizerZXGLMTokenizerZLlamaTokenizerZCodeLlamaTokenizerZGemmaTokenizerZPhi3TokenizerFc                 C   sn   | j j}|tv r|st| }||  S ztd t| j| jd W S  t	y6   t
dtt  w )a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    zConverting from Tiktoken)r   r	  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r   r_   SLOW_TO_FAST_CONVERTERSrt   loggerinfor  r   r	  r  r  rf   r   )Ztransformer_tokenizerZfrom_tiktokenZtokenizer_class_nameZconverter_classr   r   r   convert_slow_tokenizer  s&   

r  )r   )F)Qrb   r   typingr   	packagingr   Z
tokenizersr   r   r   r   r   r	   r
   Ztokenizers.modelsr   r   r   utilsr   r   r   r   Zutils.import_utilsr   Z
get_loggerr_   r  r    boolrc   r*   rM   rN   rg   rp   rq   rv   r   r   r   r   r   r   r   r   r   r   r   r3  rK  rS  r^  r`  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r   <module>   sT  $


'2''(.' %!5% ($+'4.&)Y1O	
 !"#$%&'()*+,-./01234=