o
    1 i+                     @   s0  d Z ddlZddlmZ ddlZddlZddlZddlmZ ddl	m
Z
 ddlmZmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 e" \Z1Z2Z3e# \Z4Z5dZ6dZ7e8 Z9e9j:dg dddd e9j:dddd e9j:d e;d!d"d# e9j:d$e;d%d&d# e9j:d'e<d(d)d# G d*d+ d+Z=	dOd,d-Z>d.d/ Z?d0d1 Z@d2d3 ZAeAeZBeAeZCG d4d5 d5e=eZDG d6d7 d7eZEeFd8krejGd9d: e9H ZIeJd;eIjKdkrene e jLd<d<d=MeKeIjKjNd>dd?jOd@d;idAjPdedBejQejReIjKdCfdedBejQejReIjKdCfdDdEdF dGjSe;ejTUdHdIdJZVe
eIjWe'eIjXe% dKe& eIjYiZZej[eEeV\ ej]eZdLdMdNZ^e^_ Z`eIjare+e`eIjY dS dS dS )Pah  An example of customizing PPO to leverage a centralized critic.

Here the model and policy are hard-coded to implement a centralized critic
for TwoStepGame, but you can adapt this for your own use cases.

Compared to simply running `rllib/examples/two_step_game.py --run=PPO`,
this centralized critic version reaches vf_explained_variance=1.0 more stably
since it takes into account the opponent actions as well as the policy's.
Note that this is also using two independent policies instead of weight-sharing
with one.

See also: centralized_critic_2.py for a simpler approach that instead
modifies the environment.
    N)Discrete)tune)TRAINING_ITERATION)PPO	PPOConfig)PPOTF1PolicyPPOTF2Policy)PPOTorchPolicy)compute_advantagesPostprocessing)TwoStepGame)CentralizedCriticModelTorchCentralizedCriticModel)ModelCatalog)SampleBatch)override)try_import_tftry_import_torch)ENV_RUNNER_RESULTSEPISODE_RETURN_MEANNUM_ENV_STEPS_SAMPLED_LIFETIME)convert_to_numpy)check_learning_achieved)explained_variancemake_tf_callable)convert_to_torch_tensorZopponent_obsZopponent_actionz--framework)tfZtf2torchr   zThe DL framework specifier.)choicesdefaulthelpz	--as-test
store_truezuWhether this script should be run as a test: --stop-reward must be achieved within --stop-timesteps AND --stop-iters.)actionr    z--stop-itersd   zNumber of iterations to train.)typer   r    z--stop-timestepsi zNumber of timesteps to train.z--stop-rewardg(\@z!Reward at which we stop training.c                   @   s   e Zd ZdZdd ZdS )CentralizedValueMixinzAAdd method to evaluate the central value function from the model.c                 C   s6   | j d dkrt|  | jj| _d S | jj| _d S )N	frameworkr   )configr   Zget_sessionmodelcentral_value_functioncompute_central_vf)self r,   q/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/ray/rllib/examples/centralized_critic.py__init__Z   s
   

zCentralizedValueMixin.__init__N)__name__
__module____qualname____doc__r.   r,   r,   r,   r-   r%   W   s    r%   c           
      C   sp  | j d dk}|rt| ds|so|  ro|d usJ t| \\}}}|tj |t< |tj |t	< t
jdkr[| t|tj | jt|t | jt|t	 | j   |tj< n6t| |tj |t |t	 |tj< n"t|tj |t< t|tj |t	< tj|tj tjd|tj< |tj d }|rd}n|tj d }t||| j d | j d | j d	 d
}	|	S )Nr&   r   r*   )Zdtypeg        gammalambdause_gae)r6   )r'   hasattrZloss_initializedlistvaluesr   CUR_OBSOPPONENT_OBSACTIONSOPPONENT_ACTIONargsr&   r*   r   ZdevicecpudetachnumpyZVF_PREDSr   npZ
zeros_likeZREWARDSZfloat32ZTERMINATEDSr
   )
policysample_batchother_agent_batchesepisodeZpytorch_Zopponent_batch	completedZlast_rtrain_batchr,   r,   r-   !centralized_critic_postprocessinge   s`   

	
	rJ   c                    s8   |j } fdd|_ |   _|||}||_ |S )Nc                      s    j tj t t S N)r(   r)   r   r:   r;   r=   r,   rC   rI   r,   r-   <lambda>   s
    z*loss_with_central_critic.<locals>.<lambda>)Zvalue_function_central_value_outloss)rC   Zbase_policyr(   
dist_classrI   Zvf_savedrO   r,   rL   r-   loss_with_central_critic   s   
rQ   c                 C   s   dt |tj | jiS )NZvf_explained_var)r   r   ZVALUE_TARGETSrN   rL   r,   r,   r-   central_vf_stats   s   rR   c                    s   G  fdddt  }|S )Nc                       s^   e Zd ZfddZe fddZe	dddZedef fd	d
Z  Z	S )z'get_ccppo_policy.<locals>.CCPPOTFPolicyc                    s     | ||| t |  d S rK   )r.   r%   r+   Zobservation_spaceaction_spacer'   baser,   r-   r.         z0get_ccppo_policy.<locals>.CCPPOTFPolicy.__init__c                       t | t |||S rK   rQ   superr+   r(   rP   rI   	__class__r,   r-   rO      s   z,get_ccppo_policy.<locals>.CCPPOTFPolicy.lossNc                 S      t | |||S rK   rJ   r+   rD   rE   rF   r,   r,   r-   postprocess_trajectory      z>get_ccppo_policy.<locals>.CCPPOTFPolicy.postprocess_trajectoryrI   c                    s    t  |}|t| | |S rK   )rZ   stats_fnupdaterR   )r+   rI   statsr\   r,   r-   rc      s   z0get_ccppo_policy.<locals>.CCPPOTFPolicy.stats_fnNN)
r/   r0   r1   r.   r   rO   ra   r   rc   __classcell__r,   rU   r\   r-   CCPPOTFPolicy   s    	rh   )r%   )rV   rh   r,   rU   r-   get_ccppo_policy   s   ri   c                       s@   e Zd Zdd Zee fddZee	dddZ  ZS )	CCPPOTorchPolicyc                 C   s   t | ||| t|  d S rK   )r	   r.   r%   rS   r,   r,   r-   r.      rW   zCCPPOTorchPolicy.__init__c                    rX   rK   rY   r[   r\   r,   r-   rO      s   zCCPPOTorchPolicy.lossNc                 C   r^   rK   r_   r`   r,   r,   r-   ra      rb   z'CCPPOTorchPolicy.postprocess_trajectoryrf   )	r/   r0   r1   r.   r   r	   rO   ra   rg   r,   r,   r\   r-   rj      s    rj   c                   @   s    e Zd Zeeedd ZdS )CentralizedCriticc                 C   s$   |d dkrt S |d dkrtS tS )Nr&   r   r   )rj   CCPPOStaticGraphTFPolicyCCPPOEagerTFPolicy)clsr'   r,   r,   r-   get_default_policy_class   s
   z*CentralizedCritic.get_default_policy_classN)r/   r0   r1   classmethodr   r   ro   r,   r,   r,   r-   rk      s    rk   __main__T)Z
local_modeZcc_modelF)Z"enable_env_runner_and_connector_v2Zenable_rl_module_and_learnerZcomplete_episodes)Z
batch_modeZnum_env_runnersZcustom_model)r(      )Zframework_str)pol1pol2c                 K   s   | dkrdS dS )Nr   rs   rt   r,   )Zagent_idrF   Zworkerkwargsr,   r,   r-   rM   )  s   rM   )ZpoliciesZpolicy_mapping_fnZRLLIB_NUM_GPUS0)Znum_gpus/   )stopverbose)Zparam_spaceZ
run_configrf   )br2   argparseZgymnasium.spacesr   rA   rB   osZrayr   Zray.tune.resultr   Zray.rllib.algorithms.ppo.ppor   r   Z&ray.rllib.algorithms.ppo.ppo_tf_policyr   r   Z)ray.rllib.algorithms.ppo.ppo_torch_policyr	   Z#ray.rllib.evaluation.postprocessingr
   r   Z9ray.rllib.examples.envs.classes.multi_agent.two_step_gamer   ZBray.rllib.examples._old_api_stack.models.centralized_critic_modelsr   r   Zray.rllib.modelsr   Zray.rllib.policy.sample_batchr   Zray.rllib.utils.annotationsr   Zray.rllib.utils.frameworkr   r   Zray.rllib.utils.metricsr   r   r   Zray.rllib.utils.numpyr   Zray.rllib.utils.test_utilsr   Zray.rllib.utils.tf_utilsr   r   Zray.rllib.utils.torch_utilsr   Ztf1r   Ztfvr   nnr;   r=   ArgumentParserparseradd_argumentintfloatr%   rJ   rQ   rR   ri   rl   rm   rj   rk   r/   init
parse_argsr>   Zregister_custom_modelr&   Z	api_stackenvironmentZenv_runnersZtrainingZmulti_agentrT   Z	overrides	resourcesenvirongetr'   Z
stop_itersZstop_timestepsZstop_rewardry   ZTunerto_dictZ	RunConfigZtunerfitresultsZas_testr,   r,   r,   r-   <module>   s   

?	!
	$;