o
    + i5                     @  s   d dl mZ d dlmZ erd dlZd dlmZ d dlZd dl	Z	d dlZd dl
mZ d dlmZ g ZdZdZdZd	Zd
ZdZdZdZdZdZd ZG dd deZdS )    )annotations)TYPE_CHECKINGN)_check_exists_and_download)DatasetzBhttp://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gzZ 387719152ae52d60422c016e92a742fcz:http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txtZ ea7fb7d4c75cc6254716f0177a506baaz:http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txtZ 0d2977293bbb6cbefab5b0f97db1e77cz<http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txtZ d8c7f03ceb5fc2e5a0fa7503a4353751z1http://paddlemodels.bj.bcebos.com/conll05st%2FembZ bf436eb0faa1f6f9103017f8be57cdb7c                   @  s   e Zd ZU dZded< ded< ded< ded< ded< ded	< ded
< ded< ded< ded< ded< 						d,d-ddZd.ddZd.ddZd/ddZd0d#d$Z	d1d%d&Z
d2d(d)Zd3d*d+ZdS )4	Conll05sta	  
    Implementation of `Conll05st <https://www.cs.upc.edu/~srlconll/soft.html>`_
    test dataset.

    Note: only support download test dataset automatically for that
          only test dataset of Conll05st is public.

    Args:
        data_file(str|None): path to data tar file, can be set None if
            :attr:`download` is True. Default None
        word_dict_file(str|None): path to word dictionary file, can be set None if
            :attr:`download` is True. Default None
        verb_dict_file(str|None): path to verb dictionary file, can be set None if
            :attr:`download` is True. Default None
        target_dict_file(str|None): path to target dictionary file, can be set None if
            :attr:`download` is True. Default None
        emb_file(str|None): path to embedding dictionary file, only used for
            :code:`get_embedding` can be set None if :attr:`download` is
            True. Default None
        download(bool): whether to download dataset automatically if
            :attr:`data_file` :attr:`word_dict_file` :attr:`verb_dict_file`
            :attr:`target_dict_file` is not set. Default True

    Returns:
        Dataset: instance of conll05st dataset

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.text.datasets import Conll05st

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, pred_idx, mark, label):
            ...         return paddle.sum(pred_idx), paddle.sum(mark), paddle.sum(label)


            >>> conll05st = Conll05st()

            >>> for i in range(10):
            ...     pred_idx, mark, label= conll05st[i][-3:]
            ...     pred_idx = paddle.to_tensor(pred_idx)
            ...     mark = paddle.to_tensor(mark)
            ...     label = paddle.to_tensor(label)
            ...
            ...     model = SimpleNet()
            ...     pred_idx, mark, label= model(pred_idx, mark, label)
            ...     print(pred_idx.item(), mark.item(), label.item())
            >>> # doctest: +SKIP('label will change')
            65840 5 1991
            92560 5 3686
            99120 5 457
            121960 5 3945
            4774 5 2378
            14973 5 1938
            36921 5 1090
            26908 5 2329
            62965 5 2968
            97755 5 2674

    
str | None	data_fileword_dict_fileverb_dict_filetarget_dict_fileemb_filedict[str, int]	word_dictpredicate_dict
label_dictlist	sentences
predicateslabelsNTdownloadboolc                 C  s  || _ | j d u r|sJ dt|ttd|| _ || _| jd u r.|s%J dt|ttd|| _|| _| jd u rE|s<J dt|tt	d|| _|| _
| j
d u r\|sSJ dt|ttd|| _
|| _| jd u rs|sjJ dt|ttd|| _| | j| _| | j| _| | j
| _|   d S )Nz>data_file is not set and downloading automatically is disabledZ	conll05stzCword_dict_file is not set and downloading automatically is disabledzCverb_dict_file is not set and downloading automatically is disabledzEtarget_dict_file is not set and downloading automatically is disabledz=emb_file is not set and downloading automatically is disabled)r   r   DATA_URLDATA_MD5r	   WORDDICT_URLWORDDICT_MD5r
   VERBDICT_URLVERBDICT_MD5r   TRGDICT_URLTRGDICT_MD5r   EMB_URLEMB_MD5
_load_dictr   r   _load_label_dictr   
_load_anno)selfr   r	   r
   r   r   r    r%   h/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/text/datasets/conll05.py__init__}   sp   	






zConll05st.__init__filenamestrreturnc           	      C  s   i }t  }t|dR}t|D ]%\}}| }|dr&||dd   q|dr4||dd   qd}|D ]}||d| < |d7 }||d| < |d7 }q9||d< W d    |S 1 s_w   Y  |S )NrB-   I-r      O)setopen	enumeratestrip
startswithadd)	r$   r(   dZtag_dictfilineindextagr%   r%   r&   r"      s*   




zConll05st._load_label_dictc                 C  sR   i }t |d}t|D ]
\}}||| < qW d    |S 1 s"w   Y  |S )Nr+   )r2   r3   r4   )r$   r(   r7   r8   r9   r:   r%   r%   r&   r!      s   
zConll05st._load_dictNonec              
     s  t | j}|d}|d}g | _g | _g | _tj|d(}tj|d}g }g }g }t	||D ]\}	}
|	
  }	|

   }
t|
dkr&tt|d D ]  fdd|D }|| qUt|dkrg }|d D ]}|dkr~|| qst|dd  D ]\ }d	}d
}g }d}|D ]t}|dkr|s|d	 q|dkr|r|d|  q|dkr|d|  d
}q|ddkr|ddkr|d|d }|d|  d
}q|ddkr|ddkr|d|d }|d|  d}qtd| | j| | j|   | j| qg }g }g }q4||	 ||
 q4W d    n	1 s<w   Y  W d    n	1 sLw   Y  |  |  |  d S )Nz2conll05st-release/test.wsj/words/test.wsj.words.gzz2conll05st-release/test.wsj/props/test.wsj.props.gz)fileobjr   c                   s   g | ]}|  qS r%   r%   ).0xr9   r%   r&   
<listcomp>   s    z(Conll05st._load_anno.<locals>.<listcomp>r/   -r0   F *r.   z*)()r,   TzUnexpected label: )tarfiler2   r   extractfiler   r   r   gzipGzipFilezipr4   decodesplitlenrangeappendr3   findRuntimeErrorclose)r$   tfZwfpfZ
words_fileZ
props_filer   r   Zone_segwordlabelZa_kind_labelZ	verb_listr@   ZlblZcur_tagZis_in_bracketZlbl_seqZ	verb_wordlr%   rA   r&   r#      s   


 
 9zConll05st._load_annoidxinttuple[npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_]]c                   s   j | } j| } j| }t|}|d}dgt| }|dkr0d||d < ||d  }nd}|dkrCd||d < ||d  }	nd}	d||< || }
|t|d k rbd||d < ||d  }nd}|t|d k ryd||d < ||d  }nd} fdd|D } j|	tg| } j|tg| } j|
tg| } j|tg| } j|tg| } j|g| } fd	d|D }t	
|t	
|t	
|t	
|t	
|t	
|t	
|t	
|t	
|f	S )
NzB-Vr   r/   Zbosr-   Zeosc                   s   g | ]	} j |tqS r%   )r   getUNK_IDXr?   wr$   r%   r&   rB   X  s    z)Conll05st.__getitem__.<locals>.<listcomp>c                   s   g | ]} j |qS r%   )r   r^   r`   rb   r%   r&   rB   a  s    )r   r   r   rP   r;   r   r^   r_   r   nparray)r$   r[   Zsentence	predicater   Zsen_lenZ
verb_indexmarkZctx_n1Zctx_n2Zctx_0Zctx_p1Zctx_p2Zword_idxZ
ctx_n2_idxZ
ctx_n1_idxZ	ctx_0_idxZ
ctx_p1_idxZ
ctx_p2_idxZpred_idxZ	label_idxr%   rb   r&   __getitem__(  sT   



zConll05st.__getitem__c                 C  s
   t | jS )N)rP   r   rb   r%   r%   r&   __len__o  s   
zConll05st.__len__5tuple[dict[str, int], dict[str, int], dict[str, int]]c                 C  s   | j | j| jfS )aD  
        Get the word, verb and label dictionary of Wikipedia corpus.

        Examples:

            .. code-block:: python

                >>> from paddle.text.datasets import Conll05st

                >>> conll05st = Conll05st()
                >>> word_dict, predicate_dict, label_dict = conll05st.get_dict()

        )r   r   r   rb   r%   r%   r&   get_dictr  s   zConll05st.get_dictc                 C  s   | j S )a  
        Get the embedding dictionary file.

        Examples:

            .. code-block:: python

                >>> from paddle.text.datasets import Conll05st

                >>> conll05st = Conll05st()
                >>> emb_file = conll05st.get_embedding()

        )r   rb   r%   r%   r&   get_embedding  s   zConll05st.get_embedding)NNNNNT)r   r   r	   r   r
   r   r   r   r   r   r   r   )r(   r)   r*   r   )r*   r=   )r[   r\   r*   r]   )r*   r\   )r*   ri   )r*   r)   )__name__
__module____qualname____doc____annotations__r'   r"   r!   r#   rg   rh   rj   rk   r%   r%   r%   r&   r   .   s6   
 B
I


H
G
r   )
__future__r   typingr   numpyrc   Znumpy.typingZnptrK   rI   Zpaddle.dataset.commonr   Z	paddle.ior   __all__r   r   r   r   r   r   r   r   r   r    r_   r   r%   r%   r%   r&   <module>   s.   