o
    + i0"                     @  s   d dl mZ d dlZd dlmZmZmZ d dlZd dl	m
Z
 d dlmZ er0d dlmZ ed Zg ZdZdZd	Zd
ZdZdZdZdZG dd deZdS )    )annotationsN)TYPE_CHECKINGLiteraloverload)_check_exists_and_download)DatasettraintestgenzJhttp://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgzZ 7d7897317ddd8ba0ae5c5fa7248d3ff5z/http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgzZ 0791583d57d5beb693b9414c5b36798cz<s>z<e>z<unk>   c                   @  s   e Zd ZU dZded< ded< ded< ded	< ded
< ded< ded< ded< 				d.d/ddZd0ddZd1ddZd2dd Ze		!d3d4d%d&Z
e		!d3d5d)d&Z
e		!d3d6d+d&Z
d7d-d&Z
dS )8WMT14a  
    Implementation of `WMT14 <http://www.statmt.org/wmt14/>`_ test dataset.
    The original WMT14 dataset is too large and a small set of data for set is
    provided. This module will download dataset from
    http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz .

    Args:
        data_file(str|None): path to data tar file, can be set None if
            :attr:`download` is True. Default None.
        mode(str): 'train', 'test' or 'gen'. Default 'train'.
        dict_size(int): word dictionary size. Default -1.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True.

    Returns:
        Dataset: Instance of WMT14 dataset
            - src_ids (np.array) - The sequence of token ids of source language.
            - trg_ids (np.array) - The sequence of token ids of target language.
            - trg_ids_next (np.array) - The next sequence of token ids of target language.
    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.text.datasets import WMT14

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, src_ids, trg_ids, trg_ids_next):
            ...         return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next)

            >>> wmt14 = WMT14(mode='train', dict_size=50)

            >>> for i in range(10):
            ...     src_ids, trg_ids, trg_ids_next = wmt14[i]
            ...     src_ids = paddle.to_tensor(src_ids)
            ...     trg_ids = paddle.to_tensor(trg_ids)
            ...     trg_ids_next = paddle.to_tensor(trg_ids_next)
            ...
            ...     model = SimpleNet()
            ...     src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next)
            ...     print(src_ids.item(), trg_ids.item(), trg_ids_next.item())
            91 38 39
            123 81 82
            556 229 230
            182 26 27
            447 242 243
            116 110 111
            403 288 289
            258 221 222
            136 34 35
            281 136 137

    _Wmt14DataSetModemode
str | None	data_fileint	dict_sizezlist[list[int]]src_idstrg_idstrg_ids_nextdict[str, int]src_dicttrg_dictNr	   TdownloadboolreturnNonec                 C  st   |  dv sJ d| |  | _|| _| jd u r)|s J dt|ttd|| _|dks1J d|| _|   d S )Nr   z1mode should be 'train', 'test' or 'gen', but got z>data_file is not set and downloading automatically is disabledZwmt14r   z*dict_size should be set as positive number)lowerr   r   r   	URL_TRAIN	MD5_TRAINr   
_load_data)selfr   r   r   r    r$   f/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/text/datasets/wmt14.py__init__p   s   


zWMT14.__init__c                   s  ddd}g _ g _g _tjjdd}d	d
 |D }t|dks&J |||d j_	dd
 |D }t|dksAJ |||d j_
j dj   fdd
|D }|D ]s}||D ]k}| }| d}t|dkr|qh|d }| }fdd
tg|tD }	|d }
|
 }fdd
|D }t|	dkst|dkrqhg |j
t }j
t g|}j |	 j| j| qhqaW d    d S 1 sw   Y  d S )Nsizer   r   r   c                 S  s8   i }t | D ]\}}||k r|||  < q |S |S N)	enumeratestripdecode)fdr'   Zout_dict
line_countliner$   r$   r%   Z	__to_dict   s   z#WMT14._load_data.<locals>.__to_dictr)r   c                 S     g | ]}|j d r|j qS )zsrc.dictnameendswith.0Z	each_itemr$   r$   r%   
<listcomp>       
z$WMT14._load_data.<locals>.<listcomp>   r   c                 S  r0   )ztrg.dictr1   r4   r$   r$   r%   r6      r7   /c                   s   g | ]}|j  r|j qS r$   r1   r4   )	file_namer$   r%   r6      r7   	r   c                      g | ]	} j |tqS r$   )r   getUNK_IDXr5   wr#   r$   r%   r6      s    c                   r<   r$   )r   r=   r>   r?   rA   r$   r%   r6      s    P   )r'   r   r   r   )r   r   r   tarfileopenr   lenextractfiler   r   r   r   r+   r*   splitSTARTENDappend)r#   Z_WMT14__to_dictfnamesr2   r.   Z
line_splitZsrc_seqZ	src_wordsr   Ztrg_seqZ	trg_wordsr   r   r$   )r:   r#   r%   r"      sV   
	

"zWMT14._load_dataidxGtuple[npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_]]c                 C  s.   t | j| t | j| t | j| fS r(   )nparrayr   r   r   )r#   rM   r$   r$   r%   __getitem__   s   zWMT14.__getitem__c                 C  s
   t | jS r(   )rE   r   rA   r$   r$   r%   __len__   s   
zWMT14.__len__.reverseLiteral[True]%tuple[dict[int, str], dict[int, str]]c                 C     d S r(   r$   r#   rS   r$   r$   r%   get_dict      zWMT14.get_dictLiteral[False]%tuple[dict[str, int], dict[str, int]]c                 C  rV   r(   r$   rW   r$   r$   r%   rX      rY   Mtuple[dict[str, int], dict[str, int]] | tuple[dict[int, str], dict[int, str]]c                 C  rV   r(   r$   rW   r$   r$   r%   rX      s   Fc                 C  s>   | j | j}}|rdd | D }dd | D }||fS )a  
        Get the source and target dictionary.

        Args:
            reverse (bool): whether to reverse key and value in dictionary,
                i.e. key: value to value: key.

        Returns:
            Two dictionaries, the source and target dictionary.

        Examples:

            .. code-block:: python

                >>> from paddle.text.datasets import WMT14
                >>> wmt14 = WMT14(mode='train', dict_size=50)
                >>> src_dict, trg_dict = wmt14.get_dict()

        c                 S     i | ]\}}||qS r$   r$   r5   kvr$   r$   r%   
<dictcomp>      z"WMT14.get_dict.<locals>.<dictcomp>c                 S  r]   r$   r$   r^   r$   r$   r%   ra     rb   )r   r   items)r#   rS   r   r   r$   r$   r%   rX      s
   )Nr	   r   T)
r   r   r   r   r   r   r   r   r   r   )r   r   )rM   r   r   rN   )r   r   ).)rS   rT   r   rU   )rS   rZ   r   r[   )rS   r   r   r\   )F)__name__
__module____qualname____doc____annotations__r&   r"   rQ   rR   r   rX   r$   r$   r$   r%   r   -   s8   
 9

>
r   )
__future__r   rC   typingr   r   r   numpyrO   Zpaddle.dataset.commonr   Z	paddle.ior   Znumpy.typingZnptr   __all__ZURL_DEV_TESTZMD5_DEV_TESTr    r!   rH   rI   ZUNKr>   r   r$   r$   r$   r%   <module>   s(   