o
    + i."                     @  s   d dl mZ d dlZd dlZd dlmZmZmZ d dlZ	d dl
mZ d dlmZ er4d dlmZ ed Zg Zg dZdZd	ZG d
d dZG dd dZG dd deZdS )    )annotationsN)TYPE_CHECKINGAnyLiteral)_check_exists_and_download)Datasettraintest)         #   -   2   8   z3https://dataset.bj.bcebos.com/movielens%2Fml-1m.zipZ c4d9eecfca2ab87c1945afe126590906c                   @  sP   e Zd ZU dZded< ded< ded< dd
dZdd ZdddZdddZdS )	MovieInfozM
    Movie id, title and categories information are stored in MovieInfo.
    intindex	list[str]
categoriesstrtitlereturnNonec                 C  s   t || _|| _|| _d S N)r   r   r   r   )selfr   r   r    r   j/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/text/datasets/movielens.py__init__.   s   

zMovieInfo.__init__c                   s2   | j g fdd| jD fdd| j D gS )z/
        Get information from a movie.
        c                   s   g | ]} | qS r   r   ).0c)categories_dictr   r   
<listcomp>9   s    z#MovieInfo.value.<locals>.<listcomp>c                   s   g | ]} |   qS r   )lower)r    w)movie_title_dictr   r   r#   :   s    )r   r   r   split)r   r"   r&   r   )r"   r&   r   value3   s   zMovieInfo.valuec                 C  s   d| j  d| j d| j dS )Nz<MovieInfo id(z	), title(z), categories()>)r   r   r   r   r   r   r   __str__=   s   zMovieInfo.__str__c                 C  s   |   S r   )r+   r*   r   r   r   __repr__@      zMovieInfo.__repr__N)r   r   r   r   r   r   r   r   r   r   	__name__
__module____qualname____doc____annotations__r   r(   r+   r,   r   r   r   r   r   %   s   
 


r   c                   @  sX   e Zd ZU dZded< ded< ded< ded< dddZdd ZdddZdddZdS )UserInfozK
    User id, gender, age, and job information are stored in UserInfo.
    r   r   boolis_maleagejob_idr   genderr   r   c                 C  s2   t || _|dk| _tt || _t || _d S )NM)r   r   r7   	age_tabler8   r9   )r   r   r:   r8   r9   r   r   r   r   N   s   

zUserInfo.__init__c                 C  s$   | j g| jrdndg| jg| jggS )z.
        Get information from a user.
        r   r   )r   r7   r8   r9   r*   r   r   r   r(   T   s
   zUserInfo.valuec              	   C  s6   | j rdnd}d| j d| dt| j  d| j d	S )Nr;   Fz<UserInfo id(z
), gender(z), age(z), job(r)   )r7   r   r<   r8   r9   )r   r:   r   r   r   r+   _   s   (zUserInfo.__str__c                 C  s   t | S r   )r   r*   r   r   r   r,   c   r-   zUserInfo.__repr__N)
r   r   r:   r   r8   r   r9   r   r   r   r.   r/   r   r   r   r   r5   D   s   
 

r5   c                   @  s   e Zd ZU dZded< ded< ded< ded	< d
ed< ded< ded< ded< ded< 					d(d)ddZd*ddZd*d d!Zd+d$d%Zd,d&d'Z	dS )-	Movielensa  
    Implementation of `Movielens 1-M <https://grouplens.org/datasets/movielens/1m/>`_ dataset.

    Args:
        data_file(str|None): path to data tar file, can be set None if
            :attr:`download` is True. Default None.
        mode(str): 'train' or 'test' mode. Default 'train'.
        test_ratio(float): split ratio for test sample. Default 0.1.
        rand_seed(int): random seed. Default 0.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True.

    Returns:
        Dataset: instance of Movielens 1-M dataset.

    Examples:

        .. code-block:: python

            >>> # doctest: +TIMEOUT(75)
            >>> import paddle
            >>> from paddle.text.datasets import Movielens

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, category, title, rating):
            ...         return paddle.sum(category), paddle.sum(title), paddle.sum(rating)


            >>> movielens = Movielens(mode='train')

            >>> for i in range(10):
            ...     category, title, rating = movielens[i][-3:]
            ...     category = paddle.to_tensor(category)
            ...     title = paddle.to_tensor(title)
            ...     rating = paddle.to_tensor(rating)
            ...
            ...     model = SimpleNet()
            ...     category, title, rating = model(category, title, rating)
            ...     print(category.shape, title.shape, rating.shape)
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []

    _MovieLensDataSetModemode
str | None	data_filefloat
test_ratior   	rand_seedzdict[int, MovieInfo]
movie_infozdict[str, int]r&   r"   zdict[int, UserInfo]	user_infozlist[list[float]]dataNr	   皙?r   Tdownloadr6   r   r   c                 C  s~   |  dv sJ d| |  | _|| _| jd u r)|s J dt|ttd|| _|| _|| _tj	
| |   |   d S )Nr   z(mode should be 'train', 'test', but got z>data_file is not set and downloading automatically is disabledZ	sentiment)r$   r@   rB   r   URLMD5rD   rE   nprandomseed_load_meta_info
_load_data)r   rB   r@   rD   rE   rJ   r   r   r   r      s"   


zMovielens.__init__c              
   C  s  t d}i | _i | _i | _i | _t| j}|	 D ]}t
|tjs&J t }t }|dR}t|D ]E\}}|jdd}| d\}	}
}|d}|D ]}|| qQ||
d}
t|	||
d| jt|	< |
 D ]	}||  qqq6W d    n1 sw   Y  t|D ]	\}}|| j|< qt|D ]	\}}|| j|< q|d	,}|D ]!}|jdd}| d\}}}}}t||||d
| jt|< qW d    n1 sw   Y  qW d    d S 1 sw   Y  d S )Nz^(.*)\((\d+)\)$zml-1m/movies.datlatinencoding::|r   )r   r   r   zml-1m/users.dat)r   r:   r8   r9   )recompilerF   r&   r"   rG   zipfileZipFilerB   infolist
isinstanceZipInfosetopen	enumeratedecodestripr'   addmatchgroupr   r   r$   r5   )r   patternpackageinfoZtitle_word_setZcategories_setZ
movie_fileilineZmovie_idr   r   r!   r%   Z	user_fileuidr:   r8   Zjob_r   r   r   rP      sT   

"zMovielens._load_meta_infoc           
   
   C  s  g | _ | jdk}t| jt}|dV}|D ]K}|jdd}tj | j	k |krb|
 d\}}}}t|}t|}t|d d }| j| }| j| }	| j |	 || j| j |gg  qW d    n1 smw   Y  W d    d S W d    d S 1 sw   Y  d S )Nr
   zml-1m/ratings.datrR   rS   rU      g      @)rH   r@   rY   rZ   rB   r_   ra   rM   rN   rD   rb   r'   r   rC   rF   rG   appendr(   r"   r&   )
r   Zis_testrg   Zratingrj   rk   Zmov_idrl   Zmovusrr   r   r   rQ      s6   



PzMovielens._load_dataidxtuple[npt.NDArray[Any], ...]c                 C  s   | j | }tdd |D S )Nc                 S  s   g | ]}t |qS r   )rM   array)r    dr   r   r   r#     s    z)Movielens.__getitem__.<locals>.<listcomp>)rH   tuple)r   rp   rH   r   r   r   __getitem__  s   
zMovielens.__getitem__c                 C  s
   t | jS r   )lenrH   r*   r   r   r   __len__	  s   
zMovielens.__len__)Nr	   rI   r   T)rB   rA   r@   r?   rD   rC   rE   r   rJ   r6   r   r   )r   r   )rp   r   r   rq   )r   r   )
r0   r1   r2   r3   r4   r   rP   rQ   ru   rw   r   r   r   r   r>   g   s*   
 7

'
r>   )
__future__r   rW   rY   typingr   r   r   numpyrM   Zpaddle.dataset.commonr   Z	paddle.ior   Znumpy.typingZnptr?   __all__r<   rK   rL   r   r5   r>   r   r   r   r   <module>   s"   #