o
    pi@                     @  s   d dl mZ d dlZd dlZd dlmZmZ d dlZd dl	m
Z
 d dlmZ er6d dlmZ ed Zed Zg ZdZd	ZG d
d deZdS )    )annotationsN)TYPE_CHECKINGLiteral)_check_exists_and_download)DatasetNGRAMSEQtraintestz<https://dataset.bj.bcebos.com/imikolov%2Fsimple-examples.tgzZ 30177ea32e27c525793142b6bf2c8e2dc                   @  s   e Zd ZU dZded< ded< ded< ded	< ded
< ded< 						d&d'ddZd(ddZd)ddZd*ddZd+d"d#Z	d,d$d%Z
dS )-Imikolova  
    Implementation of imikolov dataset.

    Args:
        data_file(str|None): path to data tar file, can be set None if
            :attr:`download` is True. Default None.
        data_type(str): 'NGRAM' or 'SEQ'. Default 'NGRAM'.
        window_size(int): sliding window size for 'NGRAM' data. Default -1.
        mode(str): 'train' 'test' mode. Default 'train'.
        min_word_freq(int): minimal word frequencies for building word dictionary. Default 50.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True

    Returns:
        Dataset: instance of imikolov dataset

    Examples:

        .. code-block:: python

            >>> # doctest: +TIMEOUT(60)
            >>> import paddle
            >>> from paddle.text.datasets import Imikolov

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, src, trg):
            ...         return paddle.sum(src), paddle.sum(trg)


            >>> imikolov = Imikolov(mode='train', data_type='SEQ', window_size=2)

            >>> for i in range(10):
            ...     src, trg = imikolov[i]
            ...     src = paddle.to_tensor(src)
            ...     trg = paddle.to_tensor(trg)
            ...
            ...     model = SimpleNet()
            ...     src, trg = model(src, trg)
            ...     print(src.item(), trg.item())
            2076 2075
            2076 2075
            675 674
            4 3
            464 463
            2076 2075
            865 864
            2076 2075
            2076 2075
            1793 1792

    
str | None	data_file_ImikolovDataType	data_typeintwindow_size_ImikolovDataSetModemodemin_word_freqdict[str, int]word_idxNr   r   2   TdownloadboolreturnNonec                 C  s   |  dv sJ d| |  | _| dv sJ d| | | _|| _|| _|| _| jd u rA|s8J dt|tt	d|| _| 
|| _|   d S )Nr   z,data type should be 'NGRAM', 'SEQ', but got r
   z(mode should be 'train', 'test', but got z;data_file is not set and downloading automatically disabledZimikolov)upperr   lowerr   r   r   r   r   URLMD5_build_work_dictr   
_load_anno)selfr   r   r   r   r   r    r&   d/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/paddle/text/datasets/imikolov.py__init__c   s(   	



zImikolov.__init__c                 C  sb   |d u r	t t}|D ]#}|  D ]
}||  d7  < q|d  d7  < |d  d7  < q|S )N   <s><e>)collectionsdefaultdictr   stripsplit)r%   f	word_freqlwr&   r&   r'   
word_count   s   
zImikolov.word_countcutoffc                   s   d}d}t  jQ}||}||} | |}d|v r%|d=  fdd| D }t|dd d}tt| \}	}
t	tt|	t
t|	}t|	|d< W d    |S 1 s^w   Y  |S )	Nz$./simple-examples/data/ptb.train.txtz$./simple-examples/data/ptb.valid.txt<unk>c                   s   g | ]}|d   j kr|qS )r)   )r   ).0xr%   r&   r'   
<listcomp>   s    z-Imikolov._build_work_dict.<locals>.<listcomp>c                 S  s   | d  | d fS )Nr)   r   r&   )r8   r&   r&   r'   <lambda>   s    z+Imikolov._build_work_dict.<locals>.<lambda>)key)tarfileopenr   extractfiler4   itemssortedlistzipdictrangelen)r%   r5   Ztrain_filenameZtest_filenametfZtrainfZtestfr1   Zword_freq_sortedwords_r   r&   r9   r'   r#      s&   



zImikolov._build_work_dictc              	     sp  g _ tj}dj d}||}jd  |D ]}jdkrfjdks-J ddg|	 
 d}t|jkre fd	d
|D }tjt|d D ]}j t||j |  qSqjdkr|	 
 } fdd
|D }jd g|}g |jd }jdkrt|jkrqj ||f qtdW d    d S 1 sw   Y  d S )Nz./simple-examples/data/ptb.z.txtr6   r   r   zInvalid gram lengthr*   r+   c                      g | ]	}j | qS r&   r   getr7   r3   ZUNKr%   r&   r'   r:          z'Imikolov._load_anno.<locals>.<listcomp>r)   r	   c                   rJ   r&   rK   rM   rN   r&   r'   r:      rO   r   zUnknown data type)datar=   r>   r   r   r?   r   r   r   r.   r/   rF   rE   appendtupleAssertionError)r%   rG   filenamer0   r2   iZsrc_seqZtrg_seqr&   rN   r'   r$      s2   


 
"zImikolov._load_annoidx1tuple[npt.NDArray[np.int_], npt.NDArray[np.int_]]c                 C  s   t dd | j| D S )Nc                 S  s   g | ]}t |qS r&   )nparray)r7   dr&   r&   r'   r:      s    z(Imikolov.__getitem__.<locals>.<listcomp>)rR   rP   )r%   rV   r&   r&   r'   __getitem__   s   zImikolov.__getitem__c                 C  s
   t | jS N)rF   rP   r9   r&   r&   r'   __len__   s   
zImikolov.__len__)Nr   r   r   r   T)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r\   )r5   r   r   r   )r   r   )rV   r   r   rW   )r   r   )__name__
__module____qualname____doc____annotations__r(   r4   r#   r$   r[   r]   r&   r&   r&   r'   r   $   s(   
 7
'


r   )
__future__r   r,   r=   typingr   r   numpyrX   Zpaddle.dataset.commonr   Z	paddle.ior   Znumpy.typingZnptr   r   __all__r!   r"   r   r&   r&   r&   r'   <module>   s   