o
    1 ie:                     @   s   d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZ erbd dl m!Z! d dl"m#Z# eG dd dZ$dS )    N)defaultdict)TYPE_CHECKINGAnyCallableDictListOptionalTuple)_DUMMY_AGENT_ID)AgentCollector)_PolicyCollector_PolicyCollectorGroup)	PolicyMap)SampleBatch)OldAPIStack)AgentIDEnvIDEnvInfoDictPolicyID
TensorType)RLlibCallback)RolloutWorkerc                   @   s  e Zd ZdZddddededeed dgef de	d d	e	d
 f
ddZ
edfdededefddZdee fddZdedefddZd7ddZdddededeeef deddf
ddZded eeef ddfd!d"Z		d8d#ed$ed%eddfd&d'Zd9dedefd(d)Zdedefd*d+Zdedefd,d-Zdedefd.d/Zded0efd1d2Zefdede	e fd3d4Z e!d5d6 Z"dS ):	EpisodeV2z=Tracks the current state of a (possibly multi-agent) episode.N)worker	callbacksenv_idpoliciespolicy_mapping_fnr   r   r   r   c                C   s   t td| _|| _d| _d| _d| _d| _d| _	i | _
i | _i | _i | _|| _|| _|| _|| _i | _i | _d| _i | _tt| _tt| _i | _i | _i | _i | _dS )a  Initializes an Episode instance.

        Args:
            env_id: The environment's ID in which this episode runs.
            policies: The PolicyMap object (mapping PolicyIDs to Policy
                objects) to use for determining, which policy is used for
                which agent.
            policy_mapping_fn: The mapping function mapping AgentIDs to
                PolicyIDs.
            worker: The RolloutWorker instance, in which this episode runs.
        g NgmCg        r   N)random	randrangeint
episode_idr   total_rewardactive_env_stepstotal_env_stepsactive_agent_stepstotal_agent_stepsZcustom_metrics	user_dataZ	hist_dataZmediar   r   
policy_mapr   _agent_to_policy_agent_collectors_next_agent_index_agent_to_indexr   floatagent_rewardslist_agent_reward_history_has_init_obs_last_terminateds_last_truncateds_last_infos)selfr   r   r   r   r    r7   k/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/ray/rllib/evaluation/episode_v2.py__init__   s4   


zEpisodeV2.__init__Fagent_idrefreshreturnc                 C   sT   || j vs|r| j|| | jd }| j |< n| j | }|| jvr(td| d|S )a  Returns and stores the policy ID for the specified agent.

        If the agent is new, the policy mapping fn will be called to bind the
        agent to a policy for the duration of the entire episode (even if the
        policy_mapping_fn is changed in the meantime!).

        Args:
            agent_id: The agent ID to lookup the policy ID for.

        Returns:
            The policy ID for the specified agent.
        )r   z.policy_mapping_fn returned invalid policy id 'z'!)r*   r   r   r)   KeyError)r6   r:   r;   	policy_idr7   r7   r8   
policy_fore   s   


zEpisodeV2.policy_forc                 C   s   t | j S )zReturns list of agent IDs that have appeared in this episode.

        Returns:
            The list of all agent IDs that have appeared so far in this
            episode.
        )r0   r-   keysr6   r7   r7   r8   
get_agents   s   zEpisodeV2.get_agentsc                 C   s.   || j vr| j| j |< |  jd7  _| j | S )zGet the index of an agent among its environment.

        A new index will be created if an agent is seen for the first time.

        Args:
            agent_id: ID of an agent.

        Returns:
            The index of this agent.
           )r-   r,   r6   r:   r7   r7   r8   agent_index   s   

zEpisodeV2.agent_indexc                 C   s    |  j d7  _ |  jd7  _dS )z(Advance the episode forward by one step.rC   N)r$   r%   rA   r7   r7   r8   step   s   zEpisodeV2.stepr   )tinit_obs
init_infosrG   c                C   s   | j | | }|| jvsJ t|j|jd d |jdd| | dd| j|< | j| j	| j
| || j|||d d| j|< dS )	zAdd initial env obs at the start of a new episode

        Args:
            agent_id: Agent ID.
            init_obs: Initial observations.
            init_infos: Initial infos dicts.
            t: timestamp.
        modelmax_seq_lenZ_disable_action_flatteningF)rK   Zdisable_action_flatteningZis_policy_recurrentZintial_statesZ_enable_new_api_stack)r"   rE   r   rH   rI   rG   TN)r)   r?   r+   r   view_requirementsconfiggetZis_recurrentZget_initial_stateadd_init_obsr"   rE   r   r2   )r6   r:   rH   rI   rG   policyr7   r7   r8   rO      s*   

	zEpisodeV2.add_init_obsvaluesc                 C   s   || j v sJ |  jd7  _|  jd7  _|tkr||d< | j | | |tj }|  j|7  _| j|| 	|f  |7  < | j
| | tj|v rT|tj | j|< tj|v ra|tj | j|< tj|v rq| ||tj  dS dS )zAdd action, reward, info, and next_obs as a new step.

        Args:
            agent_id: Agent ID.
            values: Dict of action, reward, info, and next_obs.
        rC   r:   N)r+   r&   r'   r
   Zadd_action_reward_next_obsr   ZREWARDSr#   r/   r?   r1   appendZTERMINATEDSr3   Z
TRUNCATEDSr4   ZINFOSset_last_info)r6   r:   rQ   Zrewardr7   r7   r8   add_action_reward_done_next_obs   s"   



z)EpisodeV2.add_action_reward_done_next_obsbatch_builderis_donecheck_donesc              
   C   s  i }| j  D ]!\}}|jdkrq| |}| j| }||j}	|||	f||< q| D ]\}\}}}	|rK|rK|	 sKtd	| j
|| |d | j|i ddsWq-|	 rgtt|	tj dkrltd|	t|dkrz| }
|
|= ni }
|	}t|dd	d	ur|j|||  |d	 |||
| }dd
lm} | jj| | ||| j||d ||jvrt||j|< |j| ||j q-| j| j 7  _| j!| j"7  _!d| _ d| _"d	S )a  Build and return currently collected training samples by policies.

        Clear agent collector states if this episode is done.

        Args:
            batch_builder: _PolicyCollectorGroup for saving the collected per-agent
                sample batches.
            is_done: If this episode is done (terminated or truncated).
            check_dones: Whether to make sure per-agent trajectories are actually done.
        r   zkEpisode {} terminated for all agents, but we still don't have a last observation for agent {} (policy {}). zkPlease ensure that you include the last observations of all live agents when setting done[__all__] to True.Ztraining_enabledTrC   zPBatches sent to postprocessing must only contain steps from a single trajectory.explorationN)get_global_worker)r   Zepisoder:   r>   r   Zpostprocessed_batchZoriginal_batches)#r+   itemsZagent_stepsr?   r)   Zbuild_for_trainingrL   Zis_terminated_or_truncated
ValueErrorformatr"   r5   rN   Zis_single_trajectorylennpuniquer   ZEPS_IDcopygetattrrX   Zpostprocess_trajectoryZget_sessionZset_get_interceptor#ray.rllib.evaluation.rollout_workerrY   r   Zon_postprocess_trajectoryZpolicy_collectorsr   Z$add_postprocessed_batch_for_trainingr&   Z	env_stepsr$   )r6   rU   rV   rW   Zpre_batchesr:   	collectorpidrP   Z	pre_batchZother_batchesZ
post_batchrY   r7   r7   r8   postprocess_episode   sn   







zEpisodeV2.postprocess_episodec                 C   s.   |dur|| j v o| j | S tt| j  S )zReturns whether this episode has initial obs for an agent.

        If agent_id is None, return whether we have received any initial obs,
        in other words, whether this episode is completely fresh.
        N)r2   anyr0   rQ   rD   r7   r7   r8   has_init_obs]  s   zEpisodeV2.has_init_obsc                 C   s   |  |p	| |S N)is_terminatedis_truncatedrD   r7   r7   r8   rV   h  s   zEpisodeV2.is_donec                 C      | j |dS NF)r3   rN   rD   r7   r7   r8   ri   k     zEpisodeV2.is_terminatedc                 C   rk   rl   )r4   rN   rD   r7   r7   r8   rj   n  rm   zEpisodeV2.is_truncatedinfoc                 C   s   || j |< d S rh   )r5   )r6   r:   rn   r7   r7   r8   rS   q  rm   zEpisodeV2.set_last_infoc                 C   s   | j |S rh   )r5   rN   rD   r7   r7   r8   last_info_fort  s   zEpisodeV2.last_info_forc                 C   s   | j S rh   )r%   rA   r7   r7   r8   lengthy  s   zEpisodeV2.length)r<   N)FFrh   )#__name__
__module____qualname____doc__r   r   r   r   r   r   r9   r
   boolr?   r   rB   r!   rE   rF   r   r   strrO   rT   r   re   rg   rV   ri   rj   rS   r   ro   propertyrp   r7   r7   r7   r8   r      s    
K
#	


*

,
d
r   )%r   collectionsr   typingr   r   r   r   r   r   r	   numpyr^   Zray.rllib.env.base_envr
   Z/ray.rllib.evaluation.collectors.agent_collectorr   Z5ray.rllib.evaluation.collectors.simple_list_collectorr   r   Zray.rllib.policy.policy_mapr   Zray.rllib.policy.sample_batchr   Zray.rllib.utils.annotationsr   Zray.rllib.utils.typingr   r   r   r   r   Zray.rllib.callbacks.callbacksr   rb   r   r   r7   r7   r7   r8   <module>   s     $