o
    1 i+                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlZd dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlm Z  er^d dl!m"Z" e #e$Z%eG dd deZeG dd deZee ddG dd dZ&dS )    N)	dataclass)cached_property)Path)TYPE_CHECKINGListOptionalUnion)CheckpointConfigFailureConfigScalingConfig)
RuntimeEnv)_DEPRECATED)StorageContext)FAIL_FAST_DEPRECATION_MESSAGE%TRAINER_RESOURCES_DEPRECATION_MESSAGE)date_str)	PublicAPI)UserCallbackc                       sz   e Zd ZU dZdZee ed< dZe	e
 ed< dZee ed<  fddZe fd	d
Zedd Zedd Z  ZS )r   aJ	  Configuration for scaling training.

    Args:
        num_workers: The number of workers (Ray actors) to launch.
            Each worker will reserve 1 CPU by default. The number of CPUs
            reserved by each worker can be overridden with the
            ``resources_per_worker`` argument. If the number of workers is 0,
            the training function will run in local mode, meaning the training
            function runs in the same process.
        use_gpu: If True, training will be done on GPUs (1 per worker).
            Defaults to False. The number of GPUs reserved by each
            worker can be overridden with the ``resources_per_worker``
            argument.
        resources_per_worker: If specified, the resources
            defined in this Dict is reserved for each worker.
            Define the ``"CPU"`` and ``"GPU"`` keys (case-sensitive) to
            override the number of CPU or GPUs used by each worker.
        placement_strategy: The placement strategy to use for the
            placement group of the Ray actors. See :ref:`Placement Group
            Strategies <pgroup-strategy>` for the possible options.
        accelerator_type: [Experimental] If specified, Ray Train will launch the
            training coordinator and workers on the nodes with the specified type
            of accelerators.
            See :ref:`the available accelerator types <accelerator_types>`.
            Ensure that your cluster has instances with the specified accelerator type
            or is able to autoscale to fulfill the request. This field is required
            when `use_tpu` is True and `num_workers` is greater than 1.
        use_tpu: [Experimental] If True, training will be done on TPUs (1 TPU VM
            per worker). Defaults to False. The number of TPUs reserved by each
            worker can be overridden with the ``resources_per_worker``
            argument. This arg enables SPMD execution of the training workload.
        topology: [Experimental] If specified, Ray Train will launch the training
            coordinator and workers on nodes with the specified topology. Topology is
            auto-detected for TPUs and added as Ray node labels. This arg enables
            SPMD execution of the training workload. This field is required
            when `use_tpu` is True and `num_workers` is greater than 1.
    Ntrainer_resourcesFuse_tputopologyc                    s   | j d ur	tt| jr| jrtd| js| jdkrtd| jr+| jdkr+td| jrA| jdkrA| js:td| j	sAtd| jdkrKt
d t   d S )	Nz6Cannot specify both `use_gpu=True` and `use_tpu=True`.r   z`use_tpu` is False but `TPU` was found in `resources_per_worker`. Either set `use_tpu` to True or remove `TPU` from `resources_per_worker.z`use_tpu` is True but `TPU` is set to 0 in `resources_per_worker`. Either set `use_tpu` to False or request a positive number of `TPU` in `resources_per_worker.   zY`topology` must be specified in ScalingConfig when `use_tpu=True`  and `num_workers` > 1.z``accelerator_type` must be specified in ScalingConfig when `use_tpu=True` and `num_workers` > 1.zRunning in local mode. The training function will run in the same process. If you are using it and running into issues please file a report at https://github.com/ray-project/ray/issues.)r   DeprecationWarningr   Zuse_gpur   
ValueErrornum_tpus_per_workerZnum_workersr   Zaccelerator_typeloggerinfosuper__post_init__self	__class__ c/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/ray/train/v2/api/config.pyr   J   s4   

zScalingConfig.__post_init__c                    s    | j d u r| jrddiS t jS )NTPUr   )Zresources_per_workerr   r   _resources_per_worker_not_noner   r!   r#   r$   r&   u   s   
z,ScalingConfig._resources_per_worker_not_nonec                 C   s   i S Nr#   r   r#   r#   r$   _trainer_resources_not_none}   s   z)ScalingConfig._trainer_resources_not_nonec                 C   s   | j ddS )z%The number of TPUs to set per worker.r%   r   )r&   getr   r#   r#   r$   r      s   z!ScalingConfig.num_tpus_per_worker)__name__
__module____qualname____doc__r   r   dict__annotations__r   r   boolr   strr   propertyr&   r(   r   __classcell__r#   r#   r!   r$   r      s   
 &+
r   c                   @   s:   e Zd ZU dZeZeeef e	d< dZ
ee	d< dd ZdS )r
   ah  Configuration related to failure handling of each training run.

    Args:
        max_failures: Tries to recover a run from training worker errors at least this many times.
            Will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        controller_failure_limit: [DeveloperAPI] The maximum number of controller failures to tolerate.
            Setting to -1 will lead to infinite controller retries.
            Setting to 0 will disable controller retries. Defaults to -1.
    	fail_fastcontroller_failure_limitc                 C   s   | j tkr	ttd S r'   )r4   r   r   r   r   r#   r#   r$   r      s   
zFailureConfig.__post_init__N)r*   r+   r,   r-   r   r4   r   r0   r1   r/   r6   intr   r#   r#   r#   r$   r
      s
   
 r
   Zstable)Z	stabilityc                   @   s   e Zd ZU dZdZee ed< dZee ed< dZ	ee
jj ed< dZee ed< dZee ed< dZeed  ed	< dZeeeef  ed
< eZeed< eZeed< eZeed< eZeed< eZeed< dd ZedefddZ dS )	RunConfigaD  Runtime configuration for training runs.

    Args:
        name: Name of the trial or experiment. If not provided, will be deduced
            from the Trainable.
        storage_path: [Beta] Path where all results and checkpoints are persisted.
            Can be a local directory or a destination on cloud storage.
            For multi-node training/tuning runs, this must be set to a
            shared storage location (e.g., S3, NFS).
            This defaults to the local ``~/ray_results`` directory.
        storage_filesystem: [Beta] A custom filesystem to use for storage.
            If this is provided, `storage_path` should be a path with its
            prefix stripped (e.g., `s3://bucket/path` -> `bucket/path`).
        failure_config: Failure mode configuration.
        checkpoint_config: Checkpointing configuration.
        callbacks: [DeveloperAPI] A list of callbacks that the Ray Train controller
            will invoke during training.
        worker_runtime_env: [DeveloperAPI] Runtime environment configuration
            for all Ray Train worker actors.
    Nnamestorage_pathstorage_filesystemfailure_configcheckpoint_configr   	callbacksworker_runtime_envsync_configverbosestopprogress_reporterlog_to_filec                    s  ddl m} | jd u r|| _| jst | _| jst | _t| jtr(| j	 | _d}g d}|D ]}t
| |tkr@t||q0| jsKdt  | _| jpOg | _| jpUi | _ddlm  t fdd| jD smtd	t| jts}td
| jj dt| jtstd| jj dd S )Nr   )DEFAULT_STORAGE_PATHaI  `RunConfig({})` is deprecated. This configuration was a Ray Tune API that did not support Ray Train usage well, so we are dropping support going forward. If you heavily rely on these configurations, you can run Ray Train as a single Ray Tune trial. See this issue for more context: https://github.com/ray-project/ray/issues/49454)r@   rA   rB   rC   rD   zray_train_run-RayTrainCallbackc                 3   s    | ]}t | V  qd S r'   )
isinstance).0cbrF   r#   r$   	<genexpr>   s    z*RunConfig.__post_init__.<locals>.<genexpr>zAll callbacks must be instances of `ray.train.UserCallback`. Passing in a Ray Tune callback is no longer supported. See this issue for more context: https://github.com/ray-project/ray/issues/49454z!Invalid `CheckpointConfig` type: z|. Use `ray.train.CheckpointConfig` instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454zInvalid `FailureConfig` type: zy. Use `ray.train.FailureConfig` instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454)Zray.train.constantsrE   r:   r<   r
   r=   r	   rH   r   as_posixgetattrr   r   formatr9   r   r>   r?   Zray.train.v2.api.callbackrG   allr   r"   )r    rE   Zrun_config_deprecation_messageZunsupported_paramsparamr#   rF   r$   r      sD   

zRunConfig.__post_init__returnc                 C   s   t | j| j| jdS )N)r:   Zexperiment_dir_namer;   )r   r:   r9   r;   r   r#   r#   r$   storage_context  s
   zRunConfig.storage_context)!r*   r+   r,   r-   r9   r   r1   r/   r:   r;   pyarrowfsZ
FileSystemr<   r
   r=   r	   r>   r   r?   r   r.   r   r   r@   rA   rB   rC   rD   r   r   r   rR   r#   r#   r#   r$   r8      s"   
 Er8   )'loggingdataclassesr   	functoolsr   pathlibr   typingr   r   r   r   Z
pyarrow.fsrS   Zray.air.configr	   r
   ZFailureConfigV1r   ZScalingConfigV1Zray.runtime_envr   Z ray.train.v2._internal.constantsr   Z(ray.train.v2._internal.execution.storager   Z&ray.train.v2._internal.migration_utilsr   r   Zray.train.v2._internal.utilr   Zray.util.annotationsr   Z	ray.trainr   	getLoggerr*   r   r8   r#   r#   r#   r$   <module>   s.    
h