o
    rqi"                     @   s   d Z ddlmZmZmZmZ ddlZddlZddlZddl	Z	ddl
mZ ddlmZmZmZmZ ddlZddlZddlmZmZ ddlmZ e Zdd	iZG d
d deZdS )zTokenization classes for QWen.    )absolute_importdivisionprint_functionunicode_literalsN)open)ListOptionalTupleUnion)
AddedTokenPreTrainedTokenizer)
get_logger
vocab_fileqwen.tiktokenc                       s  e Zd ZdZ	 eZ									d+ fdd	Zd	d
 Zdd Zdd Z	de
dee
 fddZde
dee
 fddZdee
 de
fddZedd Zdede
fddZde
defdd Zedee
 fd!d"Zedee fd#d$Zd%d& Z	d,d'eeee f d(ede
fd)d*Z  ZS )-QWenTokenizerzQWen tokenizer.replaceN<|endoftext|>FTc              	      s  t |trt|dddn|}t |trt|dddn|}t |tr(t|dddn|}t |tr6t|dddn|}t j|||||||	d |	| _|d urN|ntd| _|| _d}d}d}d}|
rv|||d	d
dddft	dd t
dD  }n|||f}d}dtddfdd}||}dd t|t|dD }|| _tj||||d}t|t| |jksJ t|t|  d|j d|| _| j| _dd | j D | _|| _| jj| _|| | _|| | _d S )NF)lstriprstrip)errors	unk_token	bos_token	eos_token	pad_tokenadd_prefix_spaceadd_bos_tokeng   mBZQwenr   z<|im_start|>z
<|im_end|>z<R>z<S>z<X>z<mask>z<sep>c                 S   s   g | ]}d | dqS )z<extra_> .0ir   r   n/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/modelscope/models/nlp/qwen/tokenization.py
<listcomp>W   s    z*QWenTokenizer.__init__.<locals>.<listcomp>   zn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+tiktoken_bpe_filereturnzdict[bytes, int]c                 S   s*   t | d }dd dd | D D S )Nrbc                 S   s    i | ]\}}t |t|qS r   )base64	b64decodeint)r   tokenZrankr   r   r!   
<dictcomp>a   s    zEQWenTokenizer.__init__.<locals>.load_tiktoken_bpe.<locals>.<dictcomp>c                 s   s    | ]	}|r|  V  qd S N)split)r   liner   r   r!   	<genexpr>c   s    zDQWenTokenizer.__init__.<locals>.load_tiktoken_bpe.<locals>.<genexpr>)r   read
splitlines)r$   contentsr   r   r!   load_tiktoken_bpe_   s   z1QWenTokenizer.__init__.<locals>.load_tiktoken_bpec                 S      i | ]\}}||qS r   r   )r   indexr*   r   r   r!   r+   h   s    z*QWenTokenizer.__init__.<locals>.<dictcomp>)start)pat_strmergeable_ranksspecial_tokensz != z in encodingc                 S   r4   r   r   )r   kvr   r   r!   r+   z       )
isinstancestrr   super__init__r   r)   max_lenr   tuplerange	enumeratelenr9   tiktokenZEncodingn_vocabr8   encoderitemsdecoder	tokenizerZ	eot_tokenZeod_idZim_start_idZ	im_end_id)selfr   r   rA   r   r   r   r   r   r   Zadd_more_sp_tokenskwargsnameZ	ENDOFTEXTZIMSTARTZIMENDr9   ZPAT_STRr3   r8   enc	__class__r   r!   r@   !   s   		


zQWenTokenizer.__init__c                 C      | j jS r,   rK   rG   rL   r   r   r!   __len__   s   zQWenTokenizer.__len__c                 C   s   | j S r,   )r8   rT   r   r   r!   	get_vocab   s   zQWenTokenizer.get_vocabc                 C   s   g }t |tr|| jv r| j| S | j|S |D ]}|| jv r)|| j|  q|| j| qt|| jkrFt	d
t|| j |S )NzToken indices sequence length is longer than the specified maximum  sequence length for this model ({} > {}). Running this sequence through the model will result in indexing errors)r=   r>   r9   rH   getappendrE   rA   loggerwarningformat)rL   tokensidsr*   r   r   r!   convert_tokens_to_ids   s   



z#QWenTokenizer.convert_tokens_to_idssave_directoryr%   c                 K   s   t j|d}t|ddd)}| j D ]\}}t|dd t	| d }|
| qW d   |fS 1 s:w   Y  |fS )z
        Save only the vocabulary of the tokenizer (vocabulary + added tokens).

        Returns:
            `Tuple(str)`: Paths to the files saved.
        r   wutf8)encoding 
N)ospathjoinr   r8   rI   r'   	b64encodedecoder>   write)rL   r_   rM   	file_pathr`   r:   r;   r.   r   r   r!   save_vocabulary   s    
zQWenTokenizer.save_vocabularytextc                 K   s6   g }t d|}| j|D ]
}|| j|  q|S )a  
        Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`.

        Args:
            text (`str`):
                The sequence to be encoded.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific encode method. See details in
                [`~PreTrainedTokenizerBase.__call__`]

        Returns:
            `List[str]`: The list of tokens.
        NFC)unicodedata	normalizerK   Zencode_ordinaryrX   rJ   )rL   rm   rM   r\   tr   r   r!   tokenize   s
   zQWenTokenizer.tokenizer\   c                    s0   d |}t fdd|D jd jd}|S )z
        Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
        often want to remove sub-word tokenization artifacts at the same time.
         c                    s   g | ]} j | qS r   )Zbyte_decoder)r   crT   r   r!   r"      r<   z:QWenTokenizer.convert_tokens_to_string.<locals>.<listcomp>zutf-8)r   )rg   	bytearrayri   r   )rL   r\   rm   r   rT   r!   convert_tokens_to_string   s
   
z&QWenTokenizer.convert_tokens_to_stringc                 C   rR   r,   rS   rT   r   r   r!   
vocab_size   s   zQWenTokenizer.vocab_sizer5   c                 C   s    || j jkr	| jS | j |gS r,   )rK   rG   r   ri   )rL   r5   r   r   r!   _convert_id_to_token   s   z"QWenTokenizer._convert_id_to_tokenr*   c                 C   s&   | j |d| jj| jddd S )z*Converts a token to an id using the vocab.zUTF-8all)Zallowed_specialr   )rH   rW   encoderK   r   )rL   r*   r   r   r!   _convert_token_to_id   s   z"QWenTokenizer._convert_token_to_idc                 C      dd | j  D }|S )z
        `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.

        Convert tokens of `tokenizers.AddedToken` type to string.
        c                 S   s   g | ]}t |qS r   )r>   )r   sr   r   r!   r"      s    z4QWenTokenizer.all_special_tokens.<locals>.<listcomp>)r9   keys)rL   Zall_toksr   r   r!   all_special_tokens   s   z QWenTokenizer.all_special_tokensc                 C   r|   )zy
        `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
        c                 S   s   g | ]}|qS r   r   )r   r;   r   r   r!   r"      s    z1QWenTokenizer.all_special_ids.<locals>.<listcomp>)r9   values)rL   Zall_idsr   r   r!   all_special_ids   s   zQWenTokenizer.all_special_idsc                 K   s   t )a  
        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.
        )NotImplementedError)rL   rm   rM   r   r   r!   	_tokenize   s   zQWenTokenizer._tokenize	token_idsskip_special_tokensc                    s2   t |tr|g}|r fdd|D } j|S )Nc                    s   g | ]	}| j vr|qS r   )r   r   rT   r   r!   r"      s    z)QWenTokenizer._decode.<locals>.<listcomp>)r=   r)   rK   ri   )rL   r   r   rM   r   rT   r!   _decode   s
   
zQWenTokenizer._decode)	r   Nr   r   r   NFFT)F)__name__
__module____qualname____doc__VOCAB_FILES_NAMESZvocab_files_namesr@   rU   rV   r^   r>   r	   rl   r   rr   rv   propertyrw   r)   rx   r{   r   r   r   r
   boolr   __classcell__r   r   rP   r!   r      sJ    _

	r   )r   
__future__r   r   r   r   r'   loggingre   ro   ior   typingr   r   r	   r
   jsonrF   Ztransformersr   r   Zmodelscope.utils.loggerr   rY   r   r   r   r   r   r!   <module>   s   