o
    rqi~s                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlmZ d d	lmZmZ d d
lmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z;m<Z<m=Z= e0 Z>e(dddddfde?de?dee? dee? deee?df dee@ dee dee? dee? fddZAe'dddddfde?de?dee? dee?edf dee? deeee?f  dee@ dee de?fd d!ZBde(ddddddd"d#e?de?d$e?dee? dee? deee?df dee@ dee dee? d%e@dee? fd&d'ZCde?de?fd(d)ZDdde*fde?de?de?d$e?fd*d+ZE	d>de?de?de?d,ee? fd-d.ZFd/d0 ZG					d?d1e?de?d2e?ded3eee?e?f  d4eHd%e@d5eee2  d,e?fd6d7ZI			d@d1e?de?d2e?d4eHded3eee?e?f  d%e@d5eee2  fd8d9ZJ	d>d1e?de?d2e?ded3eee?e?f  f
d:d;ZK		dAd5eee2  fd<d=ZLdS )B    N)ThreadPoolExecutor)partial)	CookieJar)Path)DictListOptionalTypeUnion)Retry)tqdm)HubApiModelScopeConfig)API_FILE_DOWNLOAD_CHUNK_SIZEAPI_FILE_DOWNLOAD_RETRY_TIMESAPI_FILE_DOWNLOAD_TIMEOUT	FILE_HASHMODELSCOPE_DOWNLOAD_PARALLELS)MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MBTEMPORARY_FOLDER_NAME)DEFAULT_DATASET_REVISIONDEFAULT_MODEL_REVISIONREPO_TYPE_DATASETREPO_TYPE_MODELREPO_TYPE_SUPPORT)get_dataset_cache_rootget_model_cache_root)
get_logger   )ProgressCallbackTqdmCallback)FileDownloadErrorInvalidParameterNotExistError)ModelFileSystemCache)file_integrity_validationget_endpointmodel_id_to_group_owner_nameFmodel_id	file_pathrevision	cache_dir
user_agentlocal_files_onlycookies	local_dirreturnc                 C   s   t | |t||||||d	S )aF  Download from a given URL and cache it if it's not already present in the local cache.

    Given a URL, this function looks for the corresponding file in the local
    cache. If it's not there, download it. Then return the path to the cached
    file.

    Args:
        model_id (str): The model to whom the file to be downloaded belongs.
        file_path(str): Path of the file to be downloaded, relative to the root of model repo.
        revision(str, optional): revision of the model file to be downloaded.
            Can be any of a branch, tag or commit hash.
        cache_dir (str, Path, optional): Path to the folder where cached files are stored.
        user_agent (dict, str, optional): The user-agent info in the form of a dictionary or a string.
        local_files_only (bool, optional):  If `True`, avoid downloading the file and return the path to the
            local cached file if it exists. if `False`, download the file anyway even it exists.
        cookies (CookieJar, optional): The cookie of download request.
        local_dir (str, optional): Specific local directory path to which the file will be downloaded.

    Returns:
        string: string of local file or if networking is off, last version of
        file cached on disk.

    Raises:
        NotExistError: The file is not exist.
        ValueError: The request parameter error.

    Note:
        Raises the following errors:

            - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
            if `use_auth_token=True` and the token cannot be found.
            - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError)
            if ETag cannot be determined.
            - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            if some parameter value is invalid
    	repo_typer*   r+   r,   r-   r.   r/   )_repo_file_downloadr   )r(   r)   r*   r+   r,   r-   r.   r/    r4   c/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/modelscope/hub/file_download.pymodel_file_download*      .r6   
dataset_idc                 C   s   t | |t||||||d	S )a  Download raw files of a dataset.
    Downloads all files at the specified revision. This
    is useful when you want all files from a dataset, because you don't know which
    ones you will need a priori. All files are nested inside a folder in order
    to keep their actual filename relative to that folder.

    An alternative would be to just clone a dataset but this would require that the
    user always has git and git-lfs installed, and properly configured.

    Args:
        dataset_id (str): A user or an organization name and a dataset name separated by a `/`.
        file_path (str): The relative path of the file to download.
        revision (str, optional): An optional Git revision id which can be a branch name, a tag, or a
            commit hash. NOTE: currently only branch and tag name is supported
        cache_dir (str, Path, optional): Path to the folder where cached files are stored, dataset file will
            be save as cache_dir/dataset_id/THE_DATASET_FILES.
        local_dir (str, optional): Specific local directory path to which the file will be downloaded.
        user_agent (str, dict, optional): The user-agent info in the form of a dictionary or a string.
        local_files_only (bool, optional): If `True`, avoid downloading the file and return the path to the
            local cached file if it exists.
        cookies (CookieJar, optional): The cookie of the request, default None.
    Raises:
        ValueError: the value details.

    Returns:
        str: Local folder path (string) of repo snapshot

    Note:
        Raises the following errors:
        - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
        if `use_auth_token=True` and the token cannot be found.
        - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if
        ETag cannot be determined.
        - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
        if some parameter value is invalid
    r1   )r3   r   )r8   r)   r*   r+   r/   r,   r-   r.   r4   r4   r5   dataset_file_downloadd   r7   r9   )r2   r*   r+   r,   r-   r.   r/   disable_tqdmrepo_idr2   r:   c             
   C   s  |st }|tvrtd|tf t| |||d\}
}|r0||}|d ur,td |S tdt }dt	j
|di}|d u rCt	 }g }|j| |d}d }|t kr|j| |||d}|j| |d	|d u rfd
n||d}|D ],}|d dkrvqm|d |kr||r|d }td| d ||  S |} nqmn|tkr t| \}}|st}d}d}	 z|j| |dd	|||d}W n ty } ztd|  d|  W Y d }~nJd }~ww d
}|D ]0}|d dkrq|d |kr||r	|d }td| d ||  S |}d	} nqt||k s|rn|d7 }q|d u r-td|| f |t kr:t| |||}n|tkrL|j|d ||||d}ntd| t|||
|||S )Nz'Invalid repo type: %s, only support: %s)r/   r+   r2   z>File exists in local cache, but we're not sure it's up to datezCannot find the requested files in the cached path and outgoing traffic has been disabled. To enable look-ups and downloads online, set 'local_files_only' to False.z
user-agent)r,   )r;   r2   )r*   r.   endpointTF)r(   r*   	recursiveZuse_cookiesr<   r	   treer   NamezFile z8 already in cache with identical hash, skip downloading!r   d   /)r;   r*   	root_pathr=   page_number	page_sizer<   zGet dataset: z file list failed, error: z"The file path: %s not exist in: %s)	file_nameZdataset_name	namespacer*   r<   zInvalid repo type )r   r   r"   $create_temporary_directory_and_cacheZget_file_by_pathloggerwarning
ValueErrorr   r   Zget_user_agentZget_cookiesZget_endpoint_for_readZget_valid_revisionZget_model_filesexistsdebugZget_file_by_infor   r'   r   Zget_dataset_files	Exceptionerrorlenr#   get_file_download_urlZget_dataset_file_urldownload_file)r;   r)   r2   r*   r+   r,   r-   r.   r/   r:   temporary_cache_dircacheZcached_file_pathZ_apiheadersZ
repo_filesr<   Zfile_to_download_metaZ	repo_filerE   group_or_ownernamerC   rD   Zdataset_fileseZis_existZurl_to_downloadr4   r4   r5   r3      s   








#

r3   c                 C   s   |  tjjr| tjj} tj| }tj| }|dkr d S | ds,tj|d}t|\}}|	dd}tj| ||}tj|||}tj
|rstj
|sutd| d|  z	t|| W d S  tyr   Y d S w d S d S )NZdatasetsZhub.___zLegacy cache dir exists: z
, move to )endswithospathsepstripdirnamebasenamejoinr'   replacerK   rH   infoshutilmoverM   )r+   r(   Zlegacy_cache_root	base_namerU   rV   rR   Zlegacy_cache_dirr4   r4   r5   !move_legacy_cache_to_standard_dir%  s4   


rg   c           	      C   s   |t krt }n|tkrt }ntd| t| \}}|d ur-tj|t	}t
|}n)|d u r8|}t||  t|trAt|}tj|t	||}|dd}t
|||}tj|dd ||fS )Nz7repo_type only support model and dataset, but now is : rX   rY   Texist_ok)r   r   r   r   rJ   r'   r[   r\   ra   r   r$   rg   
isinstancer   strrb   makedirs)	r(   r/   r+   r2   Zdefault_cache_rootrU   rV   rR   rS   r4   r4   r5   rG   H  s.   



rG   r<   c                 C   s8   t j|}t j|}d}|st }|j|| ||dS )a  Format file download url according to `model_id`, `revision` and `file_path`.
    e.g., Given `model_id=john/bert`, `revision=master`, `file_path=README.md`,
    the resulted download url is: https://modelscope.cn/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md

    Args:
        model_id (str): The model_id.
        file_path (str): File path
        revision (str): File revision.
        endpoint (str): The remote endpoint

    Returns:
        str: The file url.
    zQ{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path})r<   r(   r*   r)   )urllibparse
quote_plusr&   format)r(   r)   r*   r<   Zdownload_url_templater4   r4   r5   rP   h  s   rP   c              
   C   s  | \}}}}}}}}|d u ri nt |}	tt j|	d< ttddgd}
|d||f  }	 zd}tj	
|r^t|d}|dtj}|D ]}|| qGW d    n1 sYw   Y  || }||kriW d S d	||f |	d
< t|d0}tj|d|	|td}|jtdD ]}|r|| |D ]	}|t| qqW d    W d S 1 sw   Y  W d S  ty } z|
jd||d}
td||f  |
  W Y d }~nd }~ww q/)NX-Request-IDr   GETtotalbackoff_factorallowed_methods_%s_%sTr   rbbytes=%s-%sRangeab+streamrT   r.   timeout
chunk_sizerN   z-Downloading: %s failed, reason: %s will retry)copydeepcopyrk   uuiduuid4hexr   r   r[   r\   rK   openseekioSEEK_ENDupdaterequestsgetr   iter_contentr   writerO   rM   	incrementrH   rI   sleep)paramsZmodel_file_pathprogress_callbacksstartendurlrE   r.   rT   get_headersretrypart_file_namepartial_lengthfcallbackZdownload_startrchunkrW   r4   r4   r5   download_part_with_retry  sn   


r   r   rE   rT   	file_sizer   c	                    s   |d u rg n|  }|s|t  fdd|D }d}	g }
tj| }tjtj|dd tt	|	 D ]}||	 }|d |	 d }|
|||||  ||f q9|d k rl|
|||d d |  ||f t
td}t|dd	}t|t|
 W d    n1 sw   Y  |D ]}|  qt }ttj| d
N}|
D ]@}|d d|d |d f  }t|d}	 |dt }|sn|| || qW d    n1 sw   Y  t| qW d    | S 1 sw   Y  | S )Nc                       g | ]}| qS r4   r4   .0r   rE   r   r4   r5   
<listcomp>      
z%parallel_download.<locals>.<listcomp>i   
Trh   r      download)max_workersthread_name_prefixwbr   rw         rx   )r   appendr    r[   r\   ra   rl   r_   rangeintminr   r   listmapr   r   hashlibsha256r   readr   r   r   remove	hexdigest)r   r/   rE   r.   rT   r   r:   r   r<   Z	PART_SIZEtasksr)   idxr   r   Z	parallelsexecutorr   hash_sha256Zoutput_filetaskr   Z	part_filer   r4   r   r5   parallel_download  sb   	





r   c              
      st  |du rg n|  }|s|t  fdd|D }|du r!i nt |}tt j|d< tj	
| }	tjtj	|	dd td| |	 d}
t }ttd	d
gd}	 zdkr}t|	d |D ]}|d	 qdW d   n1 svw   Y  W nd}tj	|	rd}
t|	d}|dtj}|D ]}|| qW d   n1 sw   Y  |krW ntd|d	 f |d< t|	d9}tj| d||td}|  |jtdD ]}|r|D ]	}|t| q| | |
s|| qW d   n1 sw   Y  W n" t!y% } zd}
|j"d
| |d}|#  W Y d}~nd}~ww qW|D ]}|$  q)|
r6dS |% S )aj  Download remote file, will retry 5 times before giving up on errors.

    Args:
        url(str):
            actual download url of the file
        local_dir(str):
            local directory where the downloaded file stores
        file_name(str):
            name of the file stored in `local_dir`
        file_size(int):
            The file size.
        cookies(CookieJar):
            cookies used to authentication the user, which is used for downloading private repos
        headers(Dict[str, str], optional):
            http headers to carry necessary info when requesting the remote file
        disable_tqdm(bool, optional): Disable the progress bar with tqdm.
        progress_callbacks(List[Type[ProgressCallback]], optional):
            progress callbacks to track the download progress.

    Raises:
        FileDownloadError: File download failed.

    Nc                    r   r4   r4   r   r   r4   r5   r     r   z'http_get_model_file.<locals>.<listcomp>rq   Trh   downloading %s to %sFr   rr   rs   r   zw+rx   ry   rz   r{   r|   r   r   )&r   r   r    r   rk   r   r   r   r[   r\   ra   rl   r_   rH   rL   r   r   r   r   r   r   rK   r   r   r   r   r   r   raise_for_statusr   r   rO   r   rM   r   r   r   r   )r   r/   rE   r   r.   rT   r:   r   r   Ztemp_file_pathZ	has_retryr   r   r   r   r   r   r   rW   r4   r   r5   http_get_model_file  s   !




/r   c                 C   s  d}t tjd|dd}|du ri nt|}tt j|d< | }t	
d| |j ttdd	gd
}		 zS| }
d|
 |d< tj| d||td}|  |jd}|dur[t|nd}tddd||
d| d d}|jtdD ]}|r|t| || qq|  W n ty } z|	jd	| |d}	|	  W Y d}~nd}~ww q4W d   n1 sw   Y  t	
d| | tj !|j}||krt"|j d|||f }t	#| t$|t%|jtj &|| dS )aj  Download remote file, will retry 5 times before giving up on errors.

    Args:
        url(str):
            actual download url of the file
        local_dir(str):
            local directory where the downloaded file stores
        file_name(str):
            name of the file stored in `local_dir`
        cookies(CookieJar):
            cookies used to authentication the user, which is used for downloading private repos
        headers(Dict[str, str], optional):
            http headers to carry necessary info when requesting the remote file

    Raises:
        FileDownloadError: File download failed.

    r   F)modedirdeleteNrq   r   r   rr   rs   Tz	bytes=%d-rz   r|   zContent-LengthBi   zDownloading [])unitZ
unit_scaleZunit_divisorrt   initialZdescr   r   zstoring %s in cache at %sz}File %s download incomplete, content_length: %s but the                     file downloaded length: %s, please download again)'r   tempfileNamedTemporaryFiler   r   rk   r   r   r   rH   rL   rV   r   r   tellr   r   r   r   rT   r   r   r   r   r   rO   r   closerM   r   r   r[   r\   getsizer   rN   r!   rb   ra   )r   r/   rE   r.   rT   rt   Ztemp_file_managerr   	temp_filer   Zdownloaded_sizer   content_lengthprogressr   rW   Zdownloaded_lengthmsgr4   r4   r5   http_get_fileP  s   



(
r   c              
   C   s   t d d |d k r'tdkr't| ||d ||d u rd n| |d ||d}nt| ||d |d ||||d}tj||d }	t|v r_|t }
|d urZ||
krYt	d t
|	|
 nt
|	|
 |||	S )Ni  Sizer   r   )rT   r.   r   r:   r   )r   rT   r.   r:   r   zLMismatched real-time digest found, falling back to lump-sum hash computation)r   r   r   get_dictr   r[   r\   ra   r   printr%   Zput_file)r   Z	file_metarR   rS   rT   r.   r:   r   Zfile_digestr   Zexpected_hashr4   r4   r5   rQ     sH   


rQ   )N)NNFNN)NFN)FN)Mr   r   r   r[   rd   r   rm   r   concurrent.futuresr   	functoolsr   http.cookiejarr   pathlibr   typingr   r   r   r	   r
   r   Zrequests.adaptersr   Z	tqdm.autor   Zmodelscope.hub.apir   r   Zmodelscope.hub.constantsr   r   r   r   r   r   r   Zmodelscope.utils.constantr   r   r   r   r   Zmodelscope.utils.file_utilsr   r   Zmodelscope.utils.loggerr   r   r   r    errorsr!   r"   r#   Zutils.cachingr$   Zutils.utilsr%   r&   r'   rH   rk   boolr6   r9   r3   rg   rG   rP   r   r   r   r   r   rQ   r4   r4   r4   r5   <module>   sv  $	
=	
>	

 $
#
0

:

o
Y
