o
    W+ iSS                     @   s>  d dl Z d dlmZmZ d dlZd dlZd dlZd dlm	Z	m
Z
mZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z) d dl*m+Z+ e+ Z,dZ-dZ.G dd dej/Z0G dd de0Z1G dd dej/Z2dS )    N)DictUnion)ArrowBasedBuilderDatasetDatasetDictGeneratorBasedBuilderIterableDatasetIterableDatasetDict)is_remote_filesystem)DatasetInfo)camelcase_to_snakecase)csv)FileLock)
map_nested)HubApi)DatasetContextConfig)ExternalDatasetNativeIterableDataset)DataStreamingDownloadManager)get_subdir_hash_from_split)DEFAULT_DATASET_NAMESPACEDatasetPathNameDownloadMode)
get_logger	delimiter,c                       sz   e Zd Zdef fddZefddZddefdefdd	Zd
d Z	dd Z
dd Zdd Zdd ZdefddZ  ZS )CsvDatasetBuilderdataset_context_configc                    s.  |j | _ |j| _|j| _|j| _|j| _|j| _|jj| _|jj| _|j	| _
ti | _tj| j| j| j | jtj| _t| _t| j
v rI| j
t | _| jpSt|jj }t|| jd}ddlm}m   fdd| j D }||}t j d| j| j||d| j
 | j | j!_"t#| j | _$ti | _%d S )Nsplitversionr   )DataFilesDictDataFilesListc                    s    i | ]\}}| |gd dqS )N)Zorigin_metadata .0kvr"   r#   z/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/modelscope/msdatasets/download/dataset_builder.py
<dictcomp>C   s    z.CsvDatasetBuilder.__init__.<locals>.<dictcomp>)	cache_dirconfig_namehash
data_filesr#   )&dataset_namecache_root_dir	namespacer    subset_namer   data_meta_configmeta_data_fileszip_data_filesconfig_kwargsinput_config_kwargsdictsplit_path_dictospathjoinr   	META_NAMEcache_build_dirDEFAULT_CSV_DELIMITERcsv_delimiterDELIMITER_NAMElisttarget_dataset_structurekeysr   Zdatasets.data_filesr!   r"   itemsZfrom_local_or_remotesuper__init__infobuilder_namer   namelocal_meta_csv_paths)selfr   r   sub_dir_hashr!   r.   	__class__r(   r)   rG   (   sP   







zCsvDatasetBuilder.__init__c                 C   s    t j| j| jdd|d}|S )NFT)with_version	with_hashr1   )r:   r;   r<   _cache_dir_root_relative_data_dir)rL   r1   builder_data_dirr#   r#   r)   _build_cache_dirU   s   z"CsvDatasetBuilder._build_cache_dirTreturnc                 C   s   |du r| j jn| d| j j }| j}| j}|r!tj|| j}|r.tj|t| jj	}|r>|r>t
|tr>tj||}|S )zRelative path of this dataset in cache_dir:
        Will be:
            self.name/self.config.version/self.hash/
        or if a namespace has been specified:
            self.namespace___self.name/self.config.version/self.hash/
        NZ___)rH   rI   configr-   r:   r;   r<   Z	config_idstrr    
isinstance)rL   rP   rQ   r1   rT   Zbuilder_configr-   r#   r#   r)   rS   ]   s   "

z$CsvDatasetBuilder._relative_data_dirc              	   C   sz   | j jstd|| j j}|| j}g }| D ]\}}t|tr'|g}|t	j
|||||dd q|S )Nz7At least one data file must be specified, but got none.)filesbase_dirrJ   
gen_kwargs)rW   r.   
ValueErrordownload_and_extractr5   rE   rY   rX   appenddatasetsSplitGeneratorZ
iter_filesget)rL   
dl_managerr.   r5   splits
split_namerZ   r#   r#   r)   _split_generatorss   s&   
z#CsvDatasetBuilder._split_generatorsc                 #   s(   | j jd urt| j jjnd }|rdd t|j|jD nd }t|D ]l\}}t	j
|d|| jd}g }|jjD ]}	|	drE||	 q9z.t|D ]'\}
}|D ]}	 rb||	  fdd||	< qQtjj||d}||
f|fV  qKW q% ty } ztd	| d
t| d|   d }~ww d S )Nc                 S   s   i | ]	\}}||  qS r#   )Zto_pandas_dtype)r%   rJ   dtyper#   r#   r)   r*      s    z6CsvDatasetBuilder._generate_tables.<locals>.<dictcomp>T)iteratorrh   r   :FILEc                       t j | S Nr:   r;   r<   xr[   r#   r)   <lambda>       z4CsvDatasetBuilder._generate_tables.<locals>.<lambda>)schemazFailed to read file 'z' with error z: )rW   featurespars   typezipnamestypes	enumeratepdread_csvr@   Z_engineendswithr`   applyTablefrom_pandasr^   loggererror)rL   rZ   r[   rs   rh   Zfile_idxfileZcsv_file_readertransform_fields
field_nameZ	batch_idxdfpa_tableer#   rp   r)   _generate_tables   sN   



z"CsvDatasetBuilder._generate_tablesc                 K   s*  |j j}|j j}|stj}|j j}|stj}| j}|stj}g }|tj ||j j	 || || || tj
|}	tj|tj|	d }
t|
8 tj|}|ro|tjjkrotd| j d| d td| j d| d | j||d W d    d S 1 sw   Y  d S )N.lockReusing dataset  ()Generating dataset )rd   download_mode)download_configr+   r   r   ZLOCK_FILE_NAME_ANYr    r2   r`   ZDATA_FILES_NAMEr/   ZLOCK_FILE_NAME_DELIMITERr<   r:   r;   stripr   existsr   REUSE_DATASET_IF_EXISTSvaluer   warningrJ   rH   _download_and_prepare)rL   r   rd   download_kwargstarget_cache_dirrf   Zversion_namer2   Zlock_file_namesZlock_file_name	lock_pathdata_existsr#   r#   r)   download_and_prepare   sD   




"z&CsvDatasetBuilder.download_and_preparec                    sd   dd l }|jj |tjjkr|j dd tj dd  fdd| j	
 D | _|| j| _d S )Nr   T)ignore_errors)exist_okc                    s   i | ]\}}|t | qS r#   )r   fetch_meta_files_from_urlr$   r   r#   r)   r*          z;CsvDatasetBuilder._download_and_prepare.<locals>.<dictcomp>)shutilr   r+   r   ZFORCE_REDOWNLOADr   rmtreer:   makedirsr4   rE   rK   r_   r5   r9   )rL   rd   r   r   r#   r   r)   r      s   

z'CsvDatasetBuilder._download_and_preparec              
      s   t j|d| jd}g }|j D ]}|dr|| q| j|d |D ]M}t	 t
rVt dkrV|jd t krQtd| d|jd  dt  d	 q& ||< q&t	 trk rk||  fd
d||< q&td|  q&tj|}t|dS )NFri   r   rj    r   z,Number of lines in meta-csv file for split 'z' (z&) does not match number of data-files(z)!c                    rk   rl   rm   rn   Zbase_extracted_dirr#   r)   rq      rr   z;CsvDatasetBuilder._convert_csv_to_dataset.<locals>.<lambda>zNothing to do for field )Zarrow_table)r{   r|   r@   columnstolistr}   r`   r9   rc   rY   rB   lenshaper   r   rX   r~   r   ru   r   r   r   )rL   rf   Zcsv_file_pathr   r   r   Zpa_datar#   r   r)   _convert_csv_to_dataset   s<   





z)CsvDatasetBuilder._convert_csv_to_datasetc                    s   t  fdd j D S )Nc                    s   i | ]\}}|  ||qS r#   )r   r$   rL   r#   r)   r*      r   z0CsvDatasetBuilder.as_dataset.<locals>.<dictcomp>)r   rK   rE   r   r#   r   r)   
as_dataset   s   zCsvDatasetBuilder.as_dataset)__name__
__module____qualname__r   rG   r   rU   rX   rS   rg   r   r   r   r   r   r   __classcell__r#   r#   rN   r)   r   &   s    -	
'r   c                   @   s2   e Zd ZdefddZdd Zdd Zdd	 Zd
S )TaskSpecificDatasetBuilderr   c                 C   s   |j | _|j| _|j| _|j| _|j| _| jpt|jj	 }t
|| jd| _|jj| _|jj| _d | _d | _td|j i| _tj|j| _|  | _|jj| _d S )Nr   rI   )r/   rJ   r2   r1   r   r    rB   r3   rC   rD   r   r-   r4   r.   r5   r9   rW   r   	from_dictrH   r:   r;   
expanduserr0   rR   rU   
_cache_dirZmeta_args_map_config_kwargs)rL   r   r   r#   r#   r)   rG     s.   


z#TaskSpecificDatasetBuilder.__init__c                 K   s   t j| j| jt jdd }t|; t j| j}|r:|t	j
kr:td| j d| j d 	 W d    d S td| j d| j d W d    n1 sRw   Y  | j|d d S )N_r   r   r   r   r   )rd   )r:   r;   r<   rR   r   replacesepr   r   r   r   r   r   rJ   rH   r   )rL   r   rd   r   r   r   r#   r#   r)   r     s   
z/TaskSpecificDatasetBuilder.download_and_preparec                 C   s   | | j| _d S rl   )r_   r5   r9   )rL   rd   r#   r#   r)   r   '  s   
z0TaskSpecificDatasetBuilder._download_and_preparec                 C   s   t | j| jS rl   )r   r9   r   r   r#   r#   r)   r   +  s   z%TaskSpecificDatasetBuilder.as_datasetN)r   r   r   r   rG   r   r   r   r#   r#   r#   r)   r      s
    r   c                       s   e Zd Zdef fddZededejfddZde	de
eeef ef fdd	Zde	fd
dZdefddZdd ZdeddfddZedededefddZ  ZS )IterableDatasetBuilderr   c                    s  |j | _ |j| _|j| _|j| _|j| _|j| _|jj| _|jj| _|j	| _
|j| _tj| j| j| j | jtj| _t| _t| j
v rH| j
t | _| jpRt|jj }t|| jd}t jd| j| j | j|d d| j
 | j | j_t| j | _d | _|jj | _ d S )Nr   )r+   r/   r,   r-   r.   r#   )!r/   r0   r1   r    r2   r   r3   r4   r5   r6   r7   stream_batch_sizer:   r;   r<   r   r=   r>   r?   r@   rA   rB   rC   rD   r   rF   rG   rH   rI   r   rJ   meta_csv_dfmeta_cache_dir)rL   r   r   rM   rN   r#   r)   rG   1  sJ   




zIterableDatasetBuilder.__init__rV   c                 C   s   t | d}|S )N)r   )r   )r   Zbuilder_instancer#   r#   r)   get_builder_instanceY  s   z+IterableDatasetBuilder.get_builder_instancerd   c                 C   s   t | ttfstd| j dt| j }|s$tdt| jj	 d| 
| dd | |D }|jj}|d u r>|}n||v rG|| }ntd| dt| t| j|d	d
}t |trdt|}|S )NzBuilder z is not streamable.z(Loading a streaming dataset cached in a z is not supported yet.c                 S   s   i | ]}|j |qS r#   )rJ   )r%   Zsgr#   r#   r)   r*   n  s    z?IterableDatasetBuilder.as_streaming_dataset.<locals>.<dictcomp>zBad split: z. Available splits: T)Z	map_tuple)rY   r   r   r^   rJ   r
   Z_fsNotImplementedErrorrv   r   Z_check_manual_downloadrg   r   r   rB   r   _as_streaming_dataset_singler8   r	   )rL   rd   is_localZsplits_generatorsr   splits_generatorZstreaming_datasetsr#   r#   r)   as_streaming_dataset`  s6   


z+IterableDatasetBuilder.as_streaming_datasetc              	   C   s*  g }d}d}| j rtt| j  }| jrtt| j }|r<|s<| j  D ]\}}|tj||g |dd q'|S |rh|rh| j D ] \}}t	|t
rQ|g}| j |}|tj||||dd qE|S |s|r| j D ]\}}t	|t
r}|g}|tj|d||dd qq|S d| j d)Nr   )metarZ   rd   r\   +Neither column meta nor data file found in z#.json, specify at least one column.)r4   nextitervaluesr5   rE   r`   ra   rb   rY   rX   rc   r/   )rL   rd   re   Zmeta_data_fileZzip_data_filerf   meta_file_urlrZ   r#   r#   r)   rg     sb   '

z(IterableDatasetBuilder._split_generatorsc                 C   s    |  |}t|| j|j| jdS )N)rH   r   r   )Z _get_examples_iterable_for_splitr   rH   rJ   r   )rL   r   Zex_iterabler#   r#   r)   r     s   
z3IterableDatasetBuilder._as_streaming_dataset_singlec                 k   s   | d}| d}| d}t }d}d}|r0ttt|}|dr0d}tj|d }|rG|sG| 	| t
j| j}	d|	fV  d S |ro|ro| 	| |ra|| j| j| j|}
|
|j_t
j| j}	d|	fV  d S |s|rt
jd	|i}	d|	fV  d S d
| j d)Nr   rZ   rd   Fr   z.zipTr   z
Input:FILEr   z.json .)rc   r   rX   r   r   r}   r:   r;   splitext_get_meta_csv_dfru   r   r   r   Z&get_dataset_access_config_for_unzippedr/   r1   r    r   Z
oss_configZfrom_pydict)rL   r]   r   rZ   rd   Zhub_apiZis_zipZzip_file_namezip_filer   Zoss_config_for_unzippedr#   r#   r)   r     s<   





z'IterableDatasetBuilder._generate_tablesr   Nc                 C   s<   | j d u s	| j jrt|| j}tj|d| jd| _ d S d S )NFr   )r   emptyr   r   r   r{   r|   r@   )rL   r   Zmeta_csv_file_pathr#   r#   r)   r     s   z'IterableDatasetBuilder._get_meta_csv_dfheaderstextsr   c                 C   sT   i }|  |} tdt| D ]}g }|D ]}|| ||  q||| | < q|S )Nr   )r   ranger   r`   )r   r   r   residxZcol_listliner#   r#   r)   trans_data_to_mapping  s   
z,IterableDatasetBuilder.trans_data_to_mapping)r   r   r   r   rG   staticmethodr   Csvr   r   r   r   rX   r   r   rg   r   r   r   r   rB   r   r   r#   r#   rN   r)   r   /  s*    (
(3
)	 r   )3r:   typingr   r   ra   Zpandasr{   Zpyarrowru   r   r   r   r   r   r	   Zdatasets.filesystemsr
   Zdatasets.infor   Zdatasets.namingr   Zdatasets.packaged_modulesr   Zdatasets.utils.filelockr   Zdatasets.utils.py_utilsr   Zmodelscope.hub.apir   Z4modelscope.msdatasets.context.dataset_context_configr   Z!modelscope.msdatasets.dataset_clsr   r   Z/modelscope.msdatasets.download.download_managerr   Z)modelscope.msdatasets.utils.dataset_utilsr   Zmodelscope.utils.constantr   r   r   Zmodelscope.utils.loggerr   r   rA   r?   r   r   r   r   r#   r#   r#   r)   <module>   s4     Z0