import os
from typing import Dict, List, Optional, Union

from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
from transformers.utils import PaddingStrategy


class SPTokenizer:

    def __init__(self, model_path: str):
        # Reload the SentencePiece model from disk.
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # BOS / EOS / PAD token ids.
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.unk_id()
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

        # Extra special tokens appended after the SentencePiece vocabulary.
        special_tokens = ['[MASK]', '[gMASK]', '[sMASK]', 'sop', 'eop']
        self.special_tokens = {}
        self.index_special_tokens = {}
        for token in special_tokens:
            self.special_tokens[token] = self.n_words
            self.index_special_tokens[self.n_words] = token
            self.n_words += 1

    def tokenize(self, s: str):
        return self.sp_model.EncodeAsPieces(s)

    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)

    def decode_tokens(self, tokens: List[str]) -> str:
        text = self.sp_model.DecodePieces(tokens)
        return text

    def convert_token_to_id(self, token):
        """ Converts a token (str) into an id using the vocab. """
        if token in self.special_tokens:
            return self.special_tokens[token]
        return self.sp_model.PieceToId(token)

    def convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        if index in self.index_special_tokens or index in [
                self.eos_id, self.bos_id, self.pad_id
        ] or index < 0:
            return ''
        return self.sp_model.IdToPiece(index)
r   c                       sD  e Zd ZddiZg dZd1 fdd	Zdd Zed	efd
dZ	edd Z
ed	efddZedd Zedd Zdd Zdd Zdd Zdd Zdee d	efddZd2d d!Zd"d# Zd2d$d%Z	d2d&ee d'eee  d	ee fd(d)Zdejddfd*eeeef ef d+ee d,ed-ee d.ee  d	e!fd/d0Z"  Z#S )3ChatGLM2Tokenizer
vocab_fileztokenizer.model)Z	input_idsattention_maskposition_idsleftc                    sJ   d| _ || _t|| _| jj| jj| jjd| _t j	dd|i| d S )NZGLMTokenizer)z<bos><eos><pad>padding_sider   )
namer>   r   	tokenizerr   r   r   r   superr   )r   r>   rD   kwargs	__class__r   r   r   J   s   
zChatGLM2Tokenizer.__init__c                 C   s@   || j v r
| j | S || jj v sJ | d| j | jj | S )Nz is not a special token for )r   rF   rE   r1   r   r   r   get_commandV   s   

 zChatGLM2Tokenizer.get_commandr'   c                 C      dS )Nz<unk>r   r   r   r   r   	pad_token\      zChatGLM2Tokenizer.pad_tokenc                 C   
   |  dS )NrC   rK   rM   r   r   r   pad_token_id`      
zChatGLM2Tokenizer.pad_token_idc                 C   rL   )Nz</s>r   rM   r   r   r   	eos_tokend   rO   zChatGLM2Tokenizer.eos_tokenc                 C   rP   )NrB   rQ   rM   r   r   r   eos_token_idh   rS   zChatGLM2Tokenizer.eos_token_idc                 C   s   | j jS r"   )rF   r   rM   r   r   r   r   l   s   zChatGLM2Tokenizer.vocab_sizec                    s(    fddt  jD }| j |S )z Returns vocab as a dict c                    s   i | ]}  ||qS r   )_convert_id_to_token).0irM   r   r   
<dictcomp>r   s    
z/ChatGLM2Tokenizer.get_vocab.<locals>.<dictcomp>)ranger   updateZadded_tokens_encoder)r   Zvocabr   rM   r   	get_vocabp   s
   
zChatGLM2Tokenizer.get_vocabc                 K   r!   r"   )rF   r#   )r   r.   rH   r   r   r   	_tokenizey   r$   zChatGLM2Tokenizer._tokenizec                 C   r!   r0   )rF   r2   r1   r   r   r   _convert_token_to_id|      z&ChatGLM2Tokenizer._convert_token_to_idc                 C   r!   )r3   )rF   r7   r5   r   r   r   rV      r_   z&ChatGLM2Tokenizer._convert_id_to_tokenr-   c                 C   r!   r"   )rF   r/   )r   r-   r   r   r   convert_tokens_to_string   r$   z*ChatGLM2Tokenizer.convert_tokens_to_stringNc                 C   s   t j|rt j|| jd }n|}t| jd}| }W d   n1 s(w   Y  t|d}|| W d   |fS 1 sDw   Y  |fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        r>   rbNwb)	r   r   isdirjoinvocab_files_namesopenr>   readwrite)r   Zsave_directoryZfilename_prefixr>   ZfinZ	proto_strwriterr   r   r   save_vocabulary   s   

z!ChatGLM2Tokenizer.save_vocabularyc                 C   s   |  d|  dg}|S )Nr   r   rQ   )r   prefix_tokensr   r   r   get_prefix_tokens   s   z#ChatGLM2Tokenizer.get_prefix_tokensc                 C   sX   |d u rg }d}t |D ]\}\}}|d|d ||7 }q|dt|d |7 }|S )Nr4   u    [Round {}]
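    # Illustrative call: save_vocabulary('/tmp') copies the backing
    # 'tokenizer.model' bytes to /tmp/tokenizer.model and returns that path.
    # Note that filename_prefix is accepted but ignored by this implementation.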

问：{}

答：{}

r   u   [Round {}]

问：{}

答：)	enumerateformatlen)r   queryhistorypromptrX   Z	old_queryresponser   r   r   build_prompt   s   
zChatGLM2Tokenizer.build_prompttoken_ids_0token_ids_1c                 C   s0   |   }|| }|dur|| | dg }|S )a  
    def build_inputs_with_special_tokens(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [
                self.get_command('<eos>')
            ]
        return token_ids_0
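    # Resulting layout (illustrative): a single sequence encodes as
    # [gMASK] sop X with no trailing eos; a pair encodes as
    # [gMASK] sop A B <eos>.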
    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        assert self.padding_side == 'left'

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (
                max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of)
                          + 1) * pad_to_multiple_of

        needs_to_be_padded = (padding_strategy != PaddingStrategy.DO_NOT_PAD
                              and len(required_input) != max_length)

        # Initialize attention mask and position ids if not present.
        if 'attention_mask' not in encoded_inputs:
            encoded_inputs['attention_mask'] = [1] * seq_length

        if 'position_ids' not in encoded_inputs:
            encoded_inputs['position_ids'] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if 'attention_mask' in encoded_inputs:
                encoded_inputs['attention_mask'] = \
                    [0] * difference + encoded_inputs['attention_mask']
            if 'position_ids' in encoded_inputs:
                encoded_inputs['position_ids'] = \
                    [0] * difference + encoded_inputs['position_ids']
            encoded_inputs[self.model_input_names[0]] = \
                [self.pad_token_id] * difference + required_input

        return encoded_inputs