o
    rqi                     @   s|   d dl mZmZ d dlZd dlmZ d dlmZ dedefddZ	d	e
dedefd
dZG dd deZG dd deZdS )    )ListUnionN)AutoTokenizer)GPT2TokenizerFaststart_extra_idmax_lenc                    sv   dt dtf fdd}d}d}| D ]!}|dkr(|d7 }| kr'|||}d}q|||}d}|| }q|||}|S )	z Encode whitespaces to extra tokens in GPT-J.

    >>> encode_whitespaces('a\n  b\n   c', 10, 10)
    'a\n<|extratoken_10|>b\n<|extratoken_11|>c'
    acc_lentextc                    sX   | dkr|S | dkr|d S |  ksJ d  d|  d |  }d| d}|| S )	Nr       zMax whitespace run length z, but found    <|extratoken_|> )r   r	   Zextra_idZextra_tokenr   r   r   o/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/modelscope/models/nlp/codegeex/tokenizer.pypush_acc_space   s   z*encode_whitespaces.<locals>.push_acc_spacer    r   r
   )intstr)r	   r   r   r   r   reschr   r   r   encode_whitespaces	   s   




r   r	   c                 C   s@   t d|d D ]}|d | }d| d}| |d| } q| S )z Decode the whitespace-encoded strings produced by encode_whitespace.

    >>> text = 'a\n  b\n   c'
    >>> s, l = 10, 10
    >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l)
    True
    r   r
   r   r   r   )rangereplace)r	   r   r   ltoken_idtokenr   r   r   decode_whitespaces,   s
   r   c                   @   s   e Zd Z		ddedee defddZdefd	d
ZdefddZdedefddZ	dd Z
defddZdefddZdeeef fddZdd Zdd ZdS ) Code13BDictionaryN	dict_fileextra_token_idspad_to_vocab_sizec                 C   s   t  | _t  | _d| _g | _| dd | dd | dd | dd | | |d u r9dd tdd	D }|D ]}| |d q;|dkrO| | d S d S )
Nr   z<s>z<pad>z</s>z<unk>c                 S   s   g | ]}t |qS r   )r   .0xr   r   r   
<listcomp>O   s    z.Code13BDictionary.__init__.<locals>.<listcomp>iQ  i  )	dict_idx_count_num_symbols_symbols_add_symbol
_load_dictr   _pad_to_vocab_size)selfr!   r"   r#   r   r   r   r   __init__=   s    
zCode13BDictionary.__init__
vocab_sizec                 C   sB   |t |  }|dkrd S td|d D ]}| d|d qd S )Nr   r
   zvocab_pad_token{})lenr   r-   format)r0   r2   Znum_padir   r   r   r/   X   s   z$Code13BDictionary._pad_to_vocab_sizec                 C   sr   t |d*}|D ]}| }|dks|drq| \}}| |t| qW d    d S 1 s2w   Y  d S )Nrr   #)openstrip
startswithsplitr-   r   )r0   r!   flinesymcountr   r   r   r.   _   s   "zCode13BDictionary._load_dictr>   r?   c                 C   s4   | j | j|< || j|< | j| |  j d7  _ d S )Nr
   )r+   r)   r*   r,   append)r0   r>   r?   r   r   r   r-   h   s   
zCode13BDictionary._add_symbolc                 C   s   | j S N)r+   r0   r   r   r   __len__n   s   zCode13BDictionary.__len__c                 C   
   | j | S rA   )r)   )r0   r>   r   r   r   indexq      
zCode13BDictionary.indexidxc                 C   rD   rA   )r,   )r0   rG   r   r   r   stringt   rF   zCode13BDictionary.stringr   c                 C   s   t |tr	t|}| |S rA   )
isinstancer   r   rE   )r0   r   r   r   r   	map_tokenw   s   

zCode13BDictionary.map_tokenc                    s    fdd|D S )Nc                    s   g | ]}  |qS r   )rJ   r%   r   rB   r   r   r'   }   s    z0Code13BDictionary.map_tokens.<locals>.<listcomp>r   )r0   tokensr   rB   r   
map_tokens|   s   zCode13BDictionary.map_tokensc                    s     fdd|D }dd |D S )Nc                    s"   g | ]}|d kr
dn  |qS )iP  Z50256)rH   rK   rB   r   r   r'      s    z3Code13BDictionary.decode_tokens.<locals>.<listcomp>c                 S   s   g | ]}| d st|qS )Zvocab_pad_token)r:   r   r$   r   r   r   r'      s    r   )r0   rL   decodedr   rB   r   decode_tokens   s   
zCode13BDictionary.decode_tokens)Nr    )__name__
__module____qualname__r   r   r   r1   r/   r.   r-   rC   rE   rH   r   rJ   rM   rO   r   r   r   r   r   ;   s&    
	r   c                   @   sN   e Zd Z						ddedededed	ef
d
dZdefddZdd ZdS )CodeGeeXTokenizerNEleutherAI/gpt-j-6B
   codegeex-13b	tokenizertokenizer_pathr   r   r!   c                 C   s|   |d ur|nt || _|dvrtd| d|| _|| _|| _|d ur4| jdkr0t|ddnd | _nd | _| jj	| _	d S )N)rV   codegeex-python-13bzInvalid mode z5, choose from ['codegeex-13b', 'codegeex-python-13b']rY   i   )r#   )
r   Zfrom_pretrainedrW   
ValueErrorr   r   moder   	code_dictZeos_token_id)r0   rW   rX   r   r   r[   r!   r   r   r   r1      s&   	

zCodeGeeXTokenizer.__init__codec                 C   sr   | j dkrt|| j| j}| j|ddj}|S | j dkr7t|| j| j}| j| j|}t	
|dd}|S )NrV   F)Zis_split_into_wordsrY   r
   r    )r[   r   r   r   rW   	input_idsr\   rM   encodetorchZ
LongTensorZreshape)r0   r]   r^   r   r   r   encode_code   s   

zCodeGeeXTokenizer.encode_codec                 C   sr   | j dkr| jj|dd}t|| j| j}|S | j dkr7| j| d g}| jj|dd}t|| j| j}|S )NrV   F)Zskip_special_tokensrY   r   )	r[   rW   decoder   r   r   r\   rO   tolist)r0   r^   r	   Zoutput_coder   r   r   decode_code   s   

zCodeGeeXTokenizer.decode_code)NrT   rU   rU   rV   N)	rP   rQ   rR   r   r   r   r1   ra   rd   r   r   r   r   rS      s(    
rS   )typingr   r   r`   Ztransformersr   Ztransformers.models.gpt2r   r   r   r   r   objectr   rS   r   r   r   r   <module>   s   #L