o
    pi.                     @   s   d Z ddlZddlZddlZddlZddlZddlmZ g Z	dZ
dZdd Zdd	 Zed
dddddd Zed
dddddd Zed
dddddd Zed
dddddd Zed
dddddd ZdS )a  
IMDB dataset.

This module downloads IMDB dataset from
http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
of 25,000 highly polar movie reviews for training, and 25,000 for testing.
Besides, this module also provides API for building dictionary.
    N)
deprecatedz6https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gzZ 7c2ac02c03563afcf9b574c7e56c153ac                 c   s    t tjjtdt@}| }|durEt	| 
|jr5|| ddtjd  V  | }|dusW d   dS W d   dS 1 sPw   Y  dS )zQ
    Read files that match the given pattern.  Tokenize and yield each file.
    imdbNs   
zlatin-1)tarfileopenpaddledatasetcommondownloadURLMD5nextboolmatchnameextractfilereadrstrip	translatestringpunctuationencodelowersplit)patternZtarftf r   Z/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/paddle/dataset/imdb.pytokenize&   s"   
"r   c           	         s   t t}t| D ]}|D ]
}||  d7  < qq	 fdd| D }t|dd d}tt| \}}ttt|t	t
|}t
||d< |S )z
    Build a word dictionary from the corpus. Keys of the dictionary are words,
    and values are zero-based IDs of these words.
       c                    s   g | ]
}|d   kr|qS )r   r   ).0xcutoffr   r   
<listcomp>J   s    zbuild_dict.<locals>.<listcomp>c                 S   s   | d  | d fS )Nr   r   r   )r    r   r   r   <lambda>L   s    zbuild_dict.<locals>.<lambda>)key<unk>)collectionsdefaultdictintr   itemssortedlistzipdictrangelen)	r   r"   Z	word_freqdocword
dictionarywords_word_idxr   r!   r   
build_dict?   s   
r7   z2.0.0zpaddle.text.datasets.Imdbr   z>Please use new dataset API which supports paddle.io.DataLoader)ZsinceZ	update_tolevelreasonc                    sB   d g  fdd}||  d || d  fdd}|S )Nr&   c                    s0   t | D ]}| fdd|D |f qd S )Nc                    s   g | ]} | qS r   )get)r   wUNKr6   r   r   r#   _   s    z0reader_creator.<locals>.load.<locals>.<listcomp>)r   append)r   outlabelr1   r<   r   r   load]   s    zreader_creator.<locals>.loadr   r   c                   3   s     E d H  d S )Nr   r   )INSr   r   readerd   s   zreader_creator.<locals>.readerr   )Zpos_patternZneg_patternr6   rA   rC   r   )rB   r=   r6   r   reader_creatorS   s   rD   c                 C      t tdtd| S )a  
    IMDB training set creator.

    It returns a reader creator, each sample in the reader is an zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Training reader creator
    :rtype: callable
    zaclImdb/train/pos/.*\.txt$zaclImdb/train/neg/.*\.txt$rD   recompiler6   r   r   r   trainj   
   rJ   c                 C   rE   )a  
    IMDB test set creator.

    It returns a reader creator, each sample in the reader is an zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Test reader creator
    :rtype: callable
    zaclImdb/test/pos/.*\.txt$zaclImdb/test/neg/.*\.txt$rF   rI   r   r   r   test   rK   rL   c                   C   s   t tddS )za
    Build a word dictionary from the corpus.

    :return: Word dictionary
    :rtype: dict
    z/aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$   )r7   rG   rH   r   r   r   r   	word_dict   s   
rN   c                   C   s   t jjtdt d S )Nr   )r   r   r   r	   r
   r   r   r   r   r   fetch   s   rO   )__doc__r'   rG   r   r   Zpaddle.dataset.commonr   Zpaddle.utilsr   __all__r
   r   r   r7   rD   rJ   rL   rN   rO   r   r   r   r   <module>   s^   	



