o
    1 iN                  3   @   s  d dl mZmZmZmZmZmZmZmZ d dl	Z
d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z) e! \Z*Z+e  \Z,Z+edddddddddddddddddddddde-de-deeeeee egee(ee( f f  deeg e)f  deeeegee-e(f f  deeeeeeeef  ee gef  deeeee-e(f ee( eegee-e(f f  deeede(gee-e(f f  deeegee-e(f f  deeee)gdf  deeee
j.e
j.e)gdf  deeee
j.e
j.e)gdf  deeee
j/j.e
j/j.e)gdf  deeee
j.e
j.e)gdf  d eeee
j/j.e
j/j.e)gdf  d!eee(ee( gee(e(f f  d"eeeee(e(e(gee(e0ee( f f  d#eeee
j/j.e
j/j.e)gef  d$eeee
j/j.e
j/j.e)geeee f f  d%eeeegee'e1f f  d&eeedgdf  d'eee0  d(eeege2f  d)ee f0d*d+Z3dS ),    )AnyCallableDictListOptionalTupleTypeUnionN)ModelCatalog)ModelV2)TorchDistributionWrapper)TorchModelV2)Policy)SampleBatch)TorchPolicy)
add_mixinsNullContextManager)OldAPIStackoverride)try_import_torchtry_import_jax)LEARNER_STATS_KEY)convert_to_numpy)ModelGradients
TensorTypeAlgorithmConfigDict)get_default_configstats_fnpostprocess_fnextra_action_out_fnextra_grad_process_fnextra_learn_fetches_fnoptimizer_fnvalidate_spacesbefore_initbefore_loss_init
after_init_after_loss_initaction_sampler_fnaction_distribution_fn
make_modelmake_model_and_action_distcompute_gradients_fnapply_gradients_fnmixinsget_batch_divisibility_reqname	frameworkloss_fnr   r   r   r   r    ztorch.optim.Optimizerr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   returnc                   sz   t   tt|}G  	
fddd|}fdd}t||_| |_| |_|S )a  Helper function for creating a new Policy class at runtime.

    Supports frameworks JAX and PyTorch.

    Args:
        name: name of the policy (e.g., "PPOTorchPolicy")
        framework: Either "jax" or "torch".
            loss_fn (Optional[Callable[[Policy, ModelV2,
                Type[TorchDistributionWrapper], SampleBatch], Union[TensorType,
                List[TensorType]]]]): Callable that returns a loss tensor.
            get_default_config (Optional[Callable[[None], AlgorithmConfigDict]]):
                Optional callable that returns the default config to merge with any
                overrides. If None, uses only(!) the user-provided
                PartialAlgorithmConfigDict as dict for this Policy.
            postprocess_fn (Optional[Callable[[Policy, SampleBatch,
                Optional[Dict[Any, SampleBatch]], Optional[Any]],
                SampleBatch]]): Optional callable for post-processing experience
                batches (called after the super's `postprocess_trajectory` method).
            stats_fn (Optional[Callable[[Policy, SampleBatch],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                values given the policy and training batch. If None,
                will use `TorchPolicy.extra_grad_info()` instead. The stats dict is
                used for logging (e.g. in TensorBoard).
            extra_action_out_fn (Optional[Callable[[Policy, Dict[str, TensorType],
                List[TensorType], ModelV2, TorchDistributionWrapper]], Dict[str,
                TensorType]]]): Optional callable that returns a dict of extra
                values to include in experiences. If None, no extra computations
                will be performed.
            extra_grad_process_fn (Optional[Callable[[Policy,
                "torch.optim.Optimizer", TensorType], Dict[str, TensorType]]]):
                Optional callable that is called after gradients are computed and
                returns a processing info dict. If None, will call the
                `TorchPolicy.extra_grad_process()` method instead.
            # TODO: (sven) dissolve naming mismatch between "learn" and "compute.."
            extra_learn_fetches_fn (Optional[Callable[[Policy],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                extra tensors from the policy after loss evaluation. If None,
                will call the `TorchPolicy.extra_compute_grad_fetches()` method
                instead.
            optimizer_fn (Optional[Callable[[Policy, AlgorithmConfigDict],
                "torch.optim.Optimizer"]]): Optional callable that returns a
                torch optimizer given the policy and config. If None, will call
                the `TorchPolicy.optimizer()` method instead (which returns a
                torch Adam optimizer).
            validate_spaces (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable that takes the
                Policy, observation_space, action_space, and config to check for
                correctness. If None, no spaces checking will be done.
            before_init (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable to run at the
                beginning of `Policy.__init__` that takes the same arguments as
                the Policy constructor. If None, this step will be skipped.
            before_loss_init (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], None]]): Optional callable to
                run prior to loss init. If None, this step will be skipped.
            after_init (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): DEPRECATED: Use `before_loss_init`
                instead.
            _after_loss_init (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], None]]): Optional callable to
                run after the loss init. If None, this step will be skipped.
                This will be deprecated at some point and renamed into `after_init`
                to match `build_tf_policy()` behavior.
            action_sampler_fn (Optional[Callable[[TensorType, List[TensorType]],
                Tuple[TensorType, TensorType]]]): Optional callable returning a
                sampled action and its log-likelihood given some (obs and state)
                inputs. If None, will either use `action_distribution_fn` or
                compute actions by calling self.model, then sampling from the
                so parameterized action distribution.
            action_distribution_fn (Optional[Callable[[Policy, ModelV2, TensorType,
                TensorType, TensorType], Tuple[TensorType,
                Type[TorchDistributionWrapper], List[TensorType]]]]): A callable
                that takes the Policy, Model, the observation batch, an
                explore-flag, a timestep, and an is_training flag and returns a
                tuple of a) distribution inputs (parameters), b) a dist-class to
                generate an action distribution object from, and c) internal-state
                outputs (empty list if not applicable). If None, will either use
                `action_sampler_fn` or compute actions by calling self.model,
                then sampling from the parameterized action distribution.
            make_model (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], ModelV2]]): Optional callable
                that takes the same arguments as Policy.__init__ and returns a
                model instance. The distribution class will be determined
                automatically. Note: Only one of `make_model` or
                `make_model_and_action_dist` should be provided. If both are None,
                a default Model will be created.
            make_model_and_action_dist (Optional[Callable[[Policy,
                gym.spaces.Space, gym.spaces.Space, AlgorithmConfigDict],
                Tuple[ModelV2, Type[TorchDistributionWrapper]]]]): Optional
                callable that takes the same arguments as Policy.__init__ and
                returns a tuple of model instance and torch action distribution
                class.
                Note: Only one of `make_model` or `make_model_and_action_dist`
                should be provided. If both are None, a default Model will be
                created.
            compute_gradients_fn (Optional[Callable[
                [Policy, SampleBatch], Tuple[ModelGradients, dict]]]): Optional
                callable that the sampled batch an computes the gradients w.r.
                to the loss function.
                If None, will call the `TorchPolicy.compute_gradients()` method
                instead.
            apply_gradients_fn (Optional[Callable[[Policy,
                "torch.optim.Optimizer"], None]]): Optional callable that
                takes a grads list and applies these to the Model's parameters.
                If None, will call the `TorchPolicy.apply_gradients()` method
                instead.
            mixins (Optional[List[type]]): Optional list of any class mixins for
                the returned policy class. These mixins will be applied in order
                and will have higher precedence than the TorchPolicy class.
            get_batch_divisibility_req (Optional[Callable[[Policy], int]]):
                Optional callable that returns the divisibility requirement for
                sample batches. If None, will assume a value of 1.

    Returns:
        Type[TorchPolicy]: TorchPolicy child class constructed from the
            specified args.
    c                       s   e Zd ZfddZee	d fdd	Ze
fddZefdd	Zefd
dZ	efddZ
e	fddZefddZefddZdd Zdd Z  ZS )z&build_policy_class.<locals>.policy_clsc           	         s  || _  | _| j d< r| ||| j  r| ||| j  	r?
d u s)J d	| |||| _tj|| j d d\}}n(
rL
| |||\| _}ntj|| j d d\}}tj|||| j d d| _t}t| j|ssJ d| _| jj	| |||| j| j d rd n||d d d	 | j
| jj
 p}|r|| | j| j| | jd
| j d rd nd  r | ||| d| _d S )Nr1   zAEither `make_model` or `make_model_and_action_dist` must be None!model)r1   )	obs_spaceaction_spaceZnum_outputsZmodel_configr1   z5ERROR: Generated Model must be a TorchModelV2 object!Zin_evaluationmax_seq_len)
observation_spacer6   configr4   lossZaction_distribution_classr(   r)   r7   r/   T)Zauto_remove_unneeded_view_reqsr   r   )r9   r1   r4   r
   Zget_action_distZget_model_v2r   
isinstance
parent_cls__init__Zview_requirementsupdater8   r6   Z!_initialize_loss_from_dummy_batchZglobal_timestep)	selfr5   r6   r9   Z
dist_class_Z	logit_dimZ	model_clsZ_before_loss_init)r'   r)   r(   r&   r$   r%   r1   r/   r2   r*   r+   r<   r   r#    l/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/ray/rllib/policy/policy_template.pyr=      sz   

	

z/build_policy_class.<locals>.policy_cls.__init__Nc                    s`   |   " t |||}r| |||W  d    S |W  d    S 1 s)w   Y  d S N)_no_grad_contextsuperpostprocess_trajectory)r?   Zsample_batchZother_agent_batchesZepisode)	__class__r   rA   rB   rF   L  s   

$z=build_policy_class.<locals>.policy_cls.postprocess_trajectoryc                    s    r | ||S  | ||S )zCalled after optimizer.zero_grad() and loss.backward() calls.

            Allows for gradient processing before optimizer.step() is called.
            E.g. for gradient clipping.
            )extra_grad_process)r?   	optimizerr:   )r    r<   rA   rB   rH   _  s   z9build_policy_class.<locals>.policy_cls.extra_grad_processc                    s.    rt  | }tti ifi |S | S rC   )r   dictr   extra_compute_grad_fetches)r?   Zfetches)r!   r<   rA   rB   rK   k  s   
zAbuild_policy_class.<locals>.policy_cls.extra_compute_grad_fetchesc                    s    r | |S  | |S rC   )compute_gradients)r?   batch)r,   r<   rA   rB   rL   t  s   
z8build_policy_class.<locals>.policy_cls.compute_gradientsc                    s"    r	 | | d S  | | d S rC   )apply_gradients)r?   Z	gradients)r-   r<   rA   rB   rN   {  s   z6build_policy_class.<locals>.policy_cls.apply_gradientsc                    s^   |   !  r | ||||}n	| ||||}| |W  d    S 1 s(w   Y  d S rC   )rD   extra_action_out_convert_to_numpy)r?   Z
input_dictZstate_batchesr4   Zaction_dist
stats_dict)r   r<   rA   rB   rO     s   


$z7build_policy_class.<locals>.policy_cls.extra_action_outc                    s"    r
 | | j }|S | }|S rC   )r9   rI   )r?   Z
optimizers)r"   r<   rA   rB   rI     s
   
z0build_policy_class.<locals>.policy_cls.optimizerc                    sT   |     r | |}n| j| |}| |W  d    S 1 s#w   Y  d S rC   )rD   r<   extra_grad_inforP   )r?   Ztrain_batchrQ   )r   rA   rB   rR     s   
$z6build_policy_class.<locals>.policy_cls.extra_grad_infoc                 S   s   | j dkr	t S t S Ntorch)r1   rT   Zno_gradr   )r?   rA   rA   rB   rD     s   
z7build_policy_class.<locals>.policy_cls._no_grad_contextc                 S   s   | j dkr	t|S |S rS   )r1   r   )r?   datarA   rA   rB   rP     s   
z8build_policy_class.<locals>.policy_cls._convert_to_numpy)NN)__name__
__module____qualname__r=   r   r   rF   rH   rK   rL   rN   rO   rI   rR   rD   rP   __classcell__rA   )r'   r)   r(   r&   r-   r$   r%   r,   r   r    r!   r1   r/   r2   r*   r+   r"   r<   r   r   r#   )rG   rB   
policy_cls   s*    &UrZ   c                     s   t di t fi | S )a  Creates a Torch|JAXPolicy cls based on settings of another one.

        Keyword Args:
            **overrides: The settings (passed into `build_torch_policy`) that
                should be different from the class that this method is called
                on.

        Returns:
            type: A new Torch|JAXPolicy sub-class.

        Examples:
        >> MySpecialDQNPolicyClass = DQNTorchPolicy.with_updates(
        ..    name="MySpecialDQNPolicyClass",
        ..    loss_function=[some_new_loss_function],
        .. )
        NrA   )build_policy_classrJ   )Z	overrides)original_kwargsrA   rB   with_updates  s   z(build_policy_class.<locals>.with_updates)localscopyr   r   staticmethodr]   rV   rX   )r0   r1   r2   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   baserZ   r]   rA   )r'   r)   r(   r&   r-   r$   r%   r,   r   r    r!   r1   r/   r2   r*   r+   r"   r\   r<   r   r   r#   rB   r[       s    
S
< 5
r[   )4typingr   r   r   r   r   r   r   r	   Z	gymnasiumZgymZray.rllib.models.catalogr
   Zray.rllib.models.modelv2r   Z(ray.rllib.models.torch.torch_action_distr   Z$ray.rllib.models.torch.torch_modelv2r   Zray.rllib.policy.policyr   Zray.rllib.policy.sample_batchr   Zray.rllib.policy.torch_policyr   Zray.rllib.utilsr   r   Zray.rllib.utils.annotationsr   r   Zray.rllib.utils.frameworkr   r   Z$ray.rllib.utils.metrics.learner_infor   Zray.rllib.utils.numpyr   Zray.rllib.utils.typingr   r   r   Zjaxr@   rT   strZSpacespacestyperJ   intr[   rA   rA   rA   rB   <module>   s4  ( 




#'(+.169>AGLRU
XYZ