o
    pi                     @  s   d dl mZ d dlZd dlZd dlZd dlZd dlmZmZ d dl	Z
d dlmZ d dlmZ er@d dlmZ d dlmZ ed Zg ZdZd	ZG d
d deZdS )    )annotationsN)TYPE_CHECKINGLiteral)_check_exists_and_download)Dataset)Patterntraintestz6https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gzZ 7c2ac02c03563afcf9b574c7e56c153ac                   @  s   e Zd ZU dZded< ded< ded< ded	< ded
< 				d&d'ddZd(ddZd)ddZd*ddZd+d"d#Z	d,d$d%Z
dS )-Imdba  
    Implementation of `IMDB <https://www.imdb.com/interfaces/>`_ dataset.

    Args:
        data_file(str|None): path to data tar file, can be set None if
            :attr:`download` is True. Default None.
        mode(str): 'train' 'test' mode. Default 'train'.
        cutoff(int): cutoff number for building word dictionary. Default 150.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True.

    Returns:
        Dataset: instance of IMDB dataset

    Examples:

        .. code-block:: python

            >>> # doctest: +TIMEOUT(75)
            >>> import paddle
            >>> from paddle.text.datasets import Imdb

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, doc, label):
            ...         return paddle.sum(doc), label


            >>> imdb = Imdb(mode='train')

            >>> for i in range(10):
            ...     doc, label = imdb[i]
            ...     doc = paddle.to_tensor(doc)
            ...     label = paddle.to_tensor(label)
            ...
            ...     model = SimpleNet()
            ...     image, label = model(doc, label)
            ...     print(doc.shape, label.shape)
            [121] [1]
            [115] [1]
            [386] [1]
            [471] [1]
            [585] [1]
            [206] [1]
            [221] [1]
            [324] [1]
            [166] [1]
            [598] [1]

    
str | None	data_file_ImdbDataSetModemodedict[str, int]word_idxlistdocslabelsNr	      TcutoffintdownloadboolreturnNonec                 C  sj   |  dv sJ d| |  | _|| _| jd u r)|s J dt|ttd|| _| || _|   d S )Nr   z(mode should be 'train', 'test', but got z>data_file is not set and downloading automatically is disabledZimdb)	lowerr   r   r   URLMD5_build_work_dictr   
_load_anno)selfr   r   r   r    r"   `/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/paddle/text/datasets/imdb.py__init__c   s   


zImdb.__init__c           
        s   t t}td}| |D ]}|D ]
}||  d7  < qq fdd| D }t|dd d}tt	| \}}t
tt	|tt|}	t||	d< |	S )	Nz/aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$   c                   s   g | ]
}|d   kr|qS )r%   r"   ).0xr   r"   r#   
<listcomp>   s    z)Imdb._build_work_dict.<locals>.<listcomp>c                 S  s   | d  | d fS )Nr%   r   r"   )r'   r"   r"   r#   <lambda>   s    z'Imdb._build_work_dict.<locals>.<lambda>)key<unk>)collectionsdefaultdictr   recompile	_tokenizeitemssortedr   zipdictrangelen)
r!   r   Z	word_freqpatterndocword
dictionarywords_r   r"   r(   r#   r      s   

zImdb._build_work_dictr8   Pattern[str]list[list[str]]c              	   C  s   g }t | jB}| }|d urBt||jr2|||	 
dd tjd   | }|d usW d    |S W d    |S 1 sMw   Y  |S )Ns   
zlatin-1)tarfileopenr   nextr   matchnameappendextractfilereadrstrip	translatestringpunctuationencoder   split)r!   r8   dataZtarftfr"   r"   r#   r1      s,   


zImdb._tokenizec                   s   t dj d}t dj d}jd  g _g _|D ]}j fdd|D  jd q$|D ]}j fdd|D  jd	 q@d S )
NzaclImdb/z/pos/.*\.txt$z/neg/.*\.txt$r,   c                      g | ]	}j | qS r"   r   getr&   wZUNKr!   r"   r#   r)          z#Imdb._load_anno.<locals>.<listcomp>r   c                   rP   r"   rQ   rS   rU   r"   r#   r)      rV   r%   )r/   r0   r   r   r   r   r1   rE   )r!   Zpos_patternZneg_patternr9   r"   rU   r#   r       s   
zImdb._load_annoidx1tuple[npt.NDArray[np.int_], npt.NDArray[np.int_]]c                 C  s"   t | j| t | j| gfS N)nparrayr   r   )r!   rW   r"   r"   r#   __getitem__   s   "zImdb.__getitem__c                 C  s
   t | jS rY   )r7   r   )r!   r"   r"   r#   __len__   s   
zImdb.__len__)Nr	   r   T)
r   r   r   r   r   r   r   r   r   r   )r   r   r   r   )r8   r>   r   r?   )r   r   )rW   r   r   rX   )r   r   )__name__
__module____qualname____doc____annotations__r$   r   r1   r    r\   r]   r"   r"   r"   r#   r   '   s"   
 5



r   )
__future__r   r-   r/   rJ   r@   typingr   r   numpyrZ   Zpaddle.dataset.commonr   Z	paddle.ior   r   Znumpy.typingZnptr   __all__r   r   r   r"   r"   r"   r#   <module>   s"   