diff --git a/rllib/BUILD b/rllib/BUILD index 8d8fc3f73244e..977b56be70ac5 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -683,6 +683,16 @@ py_test( args = ["--dir=tuned_examples/ppo"] ) +py_test( + name = "test_memory_leak_ppo_new_stack", + tags = ["team:rllib", "memory_leak_tests"], + main = "utils/tests/run_memory_leak_tests.py", + size = "large", + srcs = ["utils/tests/run_memory_leak_tests.py"], + data = ["tuned_examples/ppo/memory-leak-test-ppo-new-stack.py"], + args = ["--dir=tuned_examples/ppo", "--to-check=rollout_worker"] +) + py_test( name = "test_memory_leak_sac", tags = ["team:rllib", "memory_leak_tests"], @@ -772,12 +782,12 @@ py_test( srcs = ["env/tests/test_multi_agent_env.py"] ) -py_test( - name = "env/tests/test_multi_agent_episode", - tags = ["team:rllib", "env"], - size = "medium", - srcs = ["env/tests/test_multi_agent_episode.py"] -) +# py_test( +# name = "env/tests/test_multi_agent_episode", +# tags = ["team:rllib", "env"], +# size = "medium", +# srcs = ["env/tests/test_multi_agent_episode.py"] +# ) sh_test( name = "env/tests/test_remote_inference_cartpole", @@ -818,19 +828,26 @@ sh_test( # ) py_test( - name = "env/tests/test_single_agent_gym_env_runner", + name = "env/tests/test_single_agent_env_runner", tags = ["team:rllib", "env"], size = "medium", - srcs = ["env/tests/test_single_agent_gym_env_runner.py"] + srcs = ["env/tests/test_single_agent_env_runner.py"] ) py_test( name = "env/tests/test_single_agent_episode", tags = ["team:rllib", "env"], - size = "medium", + size = "small", srcs = ["env/tests/test_single_agent_episode.py"] ) +py_test( + name = "env/tests/test_lookback_buffer", + tags = ["team:rllib", "env"], + size = "small", + srcs = ["env/tests/test_lookback_buffer.py"] +) + py_test( name = "env/wrappers/tests/test_exception_wrapper", tags = ["team:rllib", "env"], @@ -1332,7 +1349,6 @@ py_test( # Tag: utils # -------------------------------------------------------------------- -# Checkpoint Utils py_test( name = "test_checkpoint_utils", tags = ["team:rllib", "utils"], @@ -2947,6 +2963,7 @@ py_test( py_test_module_list( files = [ "env/wrappers/tests/test_kaggle_wrapper.py", + "env/tests/test_multi_agent_episode.py", "examples/env/tests/test_cliff_walking_wall_env.py", "examples/env/tests/test_coin_game_non_vectorized_env.py", "examples/env/tests/test_coin_game_vectorized_env.py", diff --git a/rllib/algorithms/dreamerv3/utils/env_runner.py b/rllib/algorithms/dreamerv3/utils/env_runner.py index 259a27e4f7dfe..0fcf9a746235e 100644 --- a/rllib/algorithms/dreamerv3/utils/env_runner.py +++ b/rllib/algorithms/dreamerv3/utils/env_runner.py @@ -32,6 +32,9 @@ _, tf, _ = try_import_tf() +# TODO (sven): Use SingleAgentEnvRunner instead of this as soon as we have the new +# ConnectorV2 example classes to make Atari work properly with these (w/o requiring the +# classes at the bottom of this file here, e.g. `ActionClip`). class DreamerV3EnvRunner(EnvRunner): """An environment runner to collect data from vectorized gymnasium environments.""" @@ -144,6 +147,7 @@ def __init__( self._needs_initial_reset = True self._episodes = [None for _ in range(self.num_envs)] + self._states = [None for _ in range(self.num_envs)] # TODO (sven): Move metrics temp storage and collection out of EnvRunner # and RolloutWorkers. These classes should not continue tracking some data @@ -254,10 +258,8 @@ def _sample_timesteps( # Set initial obs and states in the episodes. for i in range(self.num_envs): - self._episodes[i].add_initial_observation( - initial_observation=obs[i], - initial_state={k: s[i] for k, s in states.items()}, - ) + self._episodes[i].add_env_reset(observation=obs[i]) + self._states[i] = {k: s[i] for k, s in states.items()} # Don't reset existing envs; continue in already started episodes. else: # Pick up stored observations and states from previous timesteps. @@ -268,7 +270,9 @@ def _sample_timesteps( states = { k: np.stack( [ - initial_states[k][i] if eps.states is None else eps.states[k] + initial_states[k][i] + if self._states[i] is None + else self._states[i][k] for i, eps in enumerate(self._episodes) ] ) @@ -278,7 +282,7 @@ def _sample_timesteps( # to 1.0, otherwise 0.0. is_first = np.zeros((self.num_envs,)) for i, eps in enumerate(self._episodes): - if eps.states is None: + if len(eps) == 0: is_first[i] = 1.0 # Loop through env for n timesteps. @@ -319,14 +323,14 @@ def _sample_timesteps( if terminateds[i] or truncateds[i]: # Finish the episode with the actual terminal observation stored in # the info dict. - self._episodes[i].add_timestep( - infos["final_observation"][i], - actions[i], - rewards[i], - state=s, - is_terminated=terminateds[i], - is_truncated=truncateds[i], + self._episodes[i].add_env_step( + observation=infos["final_observation"][i], + action=actions[i], + reward=rewards[i], + terminated=terminateds[i], + truncated=truncateds[i], ) + self._states[i] = s # Reset h-states to the model's initial ones b/c we are starting a # new episode. for k, v in self.module.get_initial_state().items(): @@ -334,22 +338,24 @@ def _sample_timesteps( is_first[i] = True done_episodes_to_return.append(self._episodes[i]) # Create a new episode object. - self._episodes[i] = SingleAgentEpisode( - observations=[obs[i]], states=s - ) + self._episodes[i] = SingleAgentEpisode(observations=[obs[i]]) else: - self._episodes[i].add_timestep( - obs[i], actions[i], rewards[i], state=s + self._episodes[i].add_env_step( + observation=obs[i], + action=actions[i], + reward=rewards[i], ) is_first[i] = False + self._states[i] = s + # Return done episodes ... self._done_episodes_for_metrics.extend(done_episodes_to_return) # ... and all ongoing episode chunks. Also, make sure, we return # a copy and start new chunks so that callers of this function # don't alter our ongoing and returned Episode objects. ongoing_episodes = self._episodes - self._episodes = [eps.create_successor() for eps in self._episodes] + self._episodes = [eps.cut() for eps in self._episodes] for eps in ongoing_episodes: self._ongoing_episodes_for_metrics[eps.id_].append(eps) @@ -385,10 +391,9 @@ def _sample_episodes( render_images = [e.render() for e in self.env.envs] for i in range(self.num_envs): - episodes[i].add_initial_observation( - initial_observation=obs[i], - initial_state={k: s[i] for k, s in states.items()}, - initial_render_image=render_images[i], + episodes[i].add_env_reset( + observation=obs[i], + render_image=render_images[i], ) eps = 0 @@ -419,19 +424,17 @@ def _sample_episodes( render_images = [e.render() for e in self.env.envs] for i in range(self.num_envs): - s = {k: s[i] for k, s in states.items()} # The last entry in self.observations[i] is already the reset # obs of the new episode. if terminateds[i] or truncateds[i]: eps += 1 - episodes[i].add_timestep( - infos["final_observation"][i], - actions[i], - rewards[i], - state=s, - is_terminated=terminateds[i], - is_truncated=truncateds[i], + episodes[i].add_env_step( + observation=infos["final_observation"][i], + action=actions[i], + reward=rewards[i], + terminated=terminateds[i], + truncated=truncateds[i], ) done_episodes_to_return.append(episodes[i]) @@ -448,15 +451,15 @@ def _sample_episodes( episodes[i] = SingleAgentEpisode( observations=[obs[i]], - states=s, - render_images=[render_images[i]], + render_images=( + [render_images[i]] if with_render_data else None + ), ) else: - episodes[i].add_timestep( - obs[i], - actions[i], - rewards[i], - state=s, + episodes[i].add_env_step( + observation=obs[i], + action=actions[i], + reward=rewards[i], render_image=render_images[i], ) is_first[i] = False diff --git a/rllib/env/multi_agent_episode.py b/rllib/env/multi_agent_episode.py index 873a9db1c26a1..50add0b4aaad6 100644 --- a/rllib/env/multi_agent_episode.py +++ b/rllib/env/multi_agent_episode.py @@ -1,8 +1,9 @@ -import numpy as np -import uuid - +from collections import defaultdict from queue import Queue from typing import Any, Dict, List, Optional, Set, Union +import uuid + +import numpy as np from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.policy.sample_batch import MultiAgentBatch @@ -29,14 +30,14 @@ def __init__( agent_episode_ids: Optional[Dict[str, str]] = None, *, observations: Optional[List[MultiAgentDict]] = None, + infos: Optional[List[MultiAgentDict]] = None, actions: Optional[List[MultiAgentDict]] = None, rewards: Optional[List[MultiAgentDict]] = None, - infos: Optional[List[MultiAgentDict]] = None, - t_started: Optional[int] = None, - is_terminated: Union[List[MultiAgentDict], bool] = False, - is_truncated: Union[List[MultiAgentDict], bool] = False, + terminateds: Union[MultiAgentDict, bool] = False, + truncateds: Union[MultiAgentDict, bool] = False, render_images: Optional[List[np.ndarray]] = None, extra_model_outputs: Optional[List[MultiAgentDict]] = None, + t_started: Optional[int] = None, ) -> "MultiAgentEpisode": """Initializes a `MultiAgentEpisode`. @@ -45,43 +46,49 @@ def __init__( If None, a hexadecimal id is created. In case of providing a string, make sure that it is unique, as episodes get concatenated via this string. - agent_ids: Obligatory. A list of strings containing the agent ids. + agent_ids: A list of strings containing the agent ids. These have to be provided at initialization. - agent_episode_ids: Optional. Either a dictionary mapping agent ids + agent_episode_ids: Either a dictionary mapping agent ids corresponding `SingleAgentEpisode` or None. If None, each `SingleAgentEpisode` in `MultiAgentEpisode.agent_episodes` will generate a hexadecimal code. If a dictionary is provided make sure that ids are unique as agents' `SingleAgentEpisode`s get concatenated or recreated by it. - observations: A dictionary mapping from agent ids to observations. - Can be None. If provided, it should be provided together with - all other episode data (actions, rewards, etc.) - actions: A dictionary mapping from agent ids to corresponding - actions. Can be None. If provided, it should be provided - together with all other episode data (observations, rewards, - etc.). - rewards: A dictionary mapping from agent ids to corresponding rewards. - Can be None. If provided, it should be provided together with - all other episode data (observations, rewards, etc.). - infos: A dictionary mapping from agent ids to corresponding infos. - Can be None. If provided, it should be provided together with - all other episode data (observations, rewards, etc.). - t_started: Optional. An unsigned int that defines the starting point - of the episode. This is only different from zero, if an ongoing - episode is created. - is_terminazted: Optional. A boolean defining, if an environment has - terminated. The default is `False`, i.e. the episode is ongoing. - is_truncated: Optional. A boolean, defining, if an environment is - truncated. The default is `False`, i.e. the episode is ongoing. - render_images: Optional. A list of RGB uint8 images from rendering - the environment. - extra_model_outputs: Optional. A dictionary mapping agent ids to their - corresponding extra model outputs. Each of the latter is a list of - dictionaries containing specific model outputs for the algorithm - used (e.g. `vf_preds` and `action_logp` for PPO) from a rollout. - If data is provided it should be complete (i.e. observations, - actions, rewards, is_terminated, is_truncated, and all necessary - `extra_model_outputs`). + observations: A list of dictionaries mapping agent IDs to observations. + Can be None. If provided, should match all other episode data + (actions, rewards, etc.) in terms of list lengths and agent IDs. + infos: A list of dictionaries mapping agent IDs to info dicts. + Can be None. If provided, should match all other episode data + (observations, rewards, etc.) in terms of list lengths and agent IDs. + actions: A list of dictionaries mapping agent IDs to actions. + Can be None. If provided, should match all other episode data + (observations, rewards, etc.) in terms of list lengths and agent IDs. + rewards: A list of dictionaries mapping agent IDs to rewards. + Can be None. If provided, should match all other episode data + (actions, rewards, etc.) in terms of list lengths and agent IDs. + terminateds: A boolean defining if an environment has + terminated OR a MultiAgentDict mapping individual agent ids + to boolean flags indicating whether individual agents have terminated. + A special __all__ key in these dicts indicates, whether the episode + is terminated for all agents. + The default is `False`, i.e. the episode has not been terminated. + truncateds: A boolean defining if the environment has been + truncated OR a MultiAgentDict mapping individual agent ids + to boolean flags indicating whether individual agents have been + truncated. A special __all__ key in these dicts indicates, whether the + episode is truncated for all agents. + The default is `False`, i.e. the episode has not been truncated. + render_images: A list of RGB uint8 images from rendering + the multi-agent environment. + extra_model_outputs: A list of dictionaries mapping agent IDs to their + corresponding extra model outputs. Each of these "outputs" is a dict + mapping keys (str) to model output values, for example for + `key=STATE_OUT`, the values would be the internal state outputs for + that agent. + t_started: The timestep (int) that defines the starting point + of the episode. This is only larger zero, if an ongoing episode is + created, for example by slicing an ongoing episode or by calling + the `cut()` method on an ongoing episode. """ self.id_: str = id_ or uuid.uuid4().hex @@ -147,16 +154,16 @@ def __init__( # If this is an ongoing episode than the last `__all__` should be `False` self.is_terminated: bool = ( - is_terminated - if isinstance(is_terminated, bool) - else is_terminated[-1]["__all__"] + terminateds + if isinstance(terminateds, bool) + else terminateds.get("__all__", False) ) # If this is an ongoing episode than the last `__all__` should be `False` self.is_truncated: bool = ( - is_truncated - if isinstance(is_truncated, bool) - else is_truncated[-1]["__all__"] + truncateds + if isinstance(truncateds, bool) + else truncateds.get("__all__", False) ) # Note that all attributes will be recorded along the global timestep @@ -169,8 +176,8 @@ def __init__( actions, rewards, infos, - is_terminated, - is_truncated, + terminateds, + truncateds, extra_model_outputs, ) for agent_id in self._agent_ids @@ -773,30 +780,27 @@ def get_truncateds(self) -> MultiAgentDict: truncateds.update({"__all__": self.is_terminated}) return truncateds - def add_initial_observation( + def add_env_reset( self, *, - initial_observation: MultiAgentDict, - initial_info: Optional[MultiAgentDict] = None, - initial_render_image: Optional[np.ndarray] = None, + observations: MultiAgentDict, + infos: Optional[MultiAgentDict] = None, + render_image: Optional[np.ndarray] = None, ) -> None: """Stores initial observation. Args: - initial_observation: A dictionary mapping agent ids - to initial observations. Note that some agents may not have an initial - observation. - initial_info: A dictionary mapping agent ids to initial - infos. Note that some agents may not have an initial - info dict. - initial_render_image: An RGB uint8 image from rendering the - environment. + observations: A dictionary mapping agent ids to initial observations. + Note that some agents may not have an initial observation. + infos: A dictionary mapping agent ids to initial info dicts. + Note that some agents may not have an initial info dict. + render_image: An RGB uint8 image from rendering the environment. """ assert not self.is_done # Assume that this episode is completely empty and has not stepped yet. # Leave self.t (and self.t_started) at 0. assert self.t == self.t_started == 0 - initial_info = initial_info or {} + infos = infos or {} # TODO (simon): After clearing with sven for initialization of timesteps # this might be removed. @@ -807,66 +811,65 @@ def add_initial_observation( # Note that we store the render images into the `MultiAgentEpisode` # instead into each `SingleAgentEpisode`. - if initial_render_image is not None: - self.render_images.append(initial_render_image) + if render_image is not None: + self.render_images.append(render_image) # Note, all agents will have an initial observation. - for agent_id in initial_observation.keys(): + for agent_id in observations.keys(): # Add initial timestep for each agent to the timestep mapping. self.global_t_to_local_t[agent_id].append(self.t) # Add initial observations to the agent's episode. - self.agent_episodes[agent_id].add_initial_observation( + self.agent_episodes[agent_id].add_env_reset( # Note, initial observation has to be provided. - initial_observation=initial_observation[agent_id], - initial_info=initial_info.get(agent_id), - initial_state=None, + observation=observations[agent_id], + infos=infos.get(agent_id), ) - def add_timestep( + def add_env_step( self, - observation: MultiAgentDict, - action: MultiAgentDict, - reward: MultiAgentDict, + observations: MultiAgentDict, + actions: MultiAgentDict, + rewards: MultiAgentDict, *, - info: Optional[MultiAgentDict] = None, - is_terminated: Optional[MultiAgentDict] = None, - is_truncated: Optional[MultiAgentDict] = None, + infos: Optional[MultiAgentDict] = None, + terminateds: Optional[MultiAgentDict] = None, + truncateds: Optional[MultiAgentDict] = None, render_image: Optional[np.ndarray] = None, - extra_model_output: Optional[MultiAgentDict] = None, + extra_model_outputs: Optional[MultiAgentDict] = None, ) -> None: """Adds a timestep to the episode. Args: - observation: A dictionary mapping agent ids to their corresponding + observations: A dictionary mapping agent ids to their corresponding observations. Note that some agents may not have stepped at this timestep. - action: Mandatory. A dictionary mapping agent ids to their + actions: Mandatory. A dictionary mapping agent ids to their corresponding actions. Note that some agents may not have stepped at this timestep. - reward: Mandatory. A dictionary mapping agent ids to their + rewards: Mandatory. A dictionary mapping agent ids to their corresponding observations. Note that some agents may not have stepped at this timestep. - info: A dictionary mapping agent ids to their + infos: A dictionary mapping agent ids to their corresponding info. Note that some agents may not have stepped at this timestep. - is_terminated: A dictionary mapping agent ids to their `terminated` flags, + terminateds: A dictionary mapping agent ids to their `terminated` flags, indicating, whether the environment has been terminated for them. A special `__all__` key indicates that the episode is terminated for all agent ids. - is_terminated: A dictionary mapping agent ids to their `truncated` flags, + terminateds: A dictionary mapping agent ids to their `truncated` flags, indicating, whether the environment has been truncated for them. A special `__all__` key indicates that the episode is `truncated` for all agent ids. render_image: An RGB uint8 image from rendering the environment. - extra_model_output: Optional. A dictionary mapping agent ids to their + extra_model_outputs: Optional. A dictionary mapping agent ids to their corresponding specific model outputs (also in a dictionary; e.g. `vf_preds` for PPO). """ # Cannot add data to an already done episode. assert not self.is_done - is_terminated = is_terminated or {} - is_truncated = is_truncated or {} + terminateds = terminateds or {} + truncateds = truncateds or {} # Environment step. self.t += 1 @@ -875,8 +878,8 @@ def add_timestep( # terminated or truncated? # TODO (simon): Maybe allow user to not provide this and then `__all__` is # False? - self.is_terminated = is_terminated.get("__all__", False) - self.is_truncated = is_truncated.get("__all__", False) + self.is_terminated = terminateds.get("__all__", False) + self.is_truncated = truncateds.get("__all__", False) # Note that we store the render images into the `MultiAgentEpisode` # instead of storing them into each `SingleAgentEpisode`. @@ -889,15 +892,13 @@ def add_timestep( if self.agent_episodes[agent_id].is_done: continue - agent_is_terminated = ( - is_terminated.get(agent_id, False) or self.is_terminated - ) - agent_is_truncated = is_truncated.get(agent_id, False) or self.is_truncated + agent_is_terminated = terminateds.get(agent_id, False) or self.is_terminated + agent_is_truncated = truncateds.get(agent_id, False) or self.is_truncated # CASE 1: observation, no action. # If we have an observation, but no action, we might have a buffered action, # or an initial agent observation. - if agent_id in observation and agent_id not in action: + if agent_id in observations and agent_id not in actions: # We have a buffered action. if self.agent_buffers[agent_id]["actions"].full(): # Get the action from the buffer. @@ -913,10 +914,10 @@ def add_timestep( # default of zero reward. agent_reward = self.agent_buffers[agent_id]["rewards"].get_nowait() # We might also got some reward in this episode. - if agent_id in reward: - agent_reward += reward[agent_id] + if agent_id in rewards: + agent_reward += rewards[agent_id] # Also add to the global reward list. - self.partial_rewards[agent_id].append(reward[agent_id]) + self.partial_rewards[agent_id].append(rewards[agent_id]) # And add to the global reward timestep mapping. self.partial_rewards_t[agent_id].append(self.t) @@ -935,14 +936,14 @@ def add_timestep( # mapping. self.global_t_to_local_t[agent_id].append(self.t) # Add data to `SingleAgentEpisode. - self.agent_episodes[agent_id].add_timestep( - observation=observation[agent_id], + self.agent_episodes[agent_id].add_env_step( + observation=observations[agent_id], action=agent_action, reward=agent_reward, - info=info.get(agent_id), - is_terminated=agent_is_terminated, - is_truncated=agent_is_truncated, - extra_model_output=agent_extra_model_output, + infos=infos.get(agent_id), + terminated=agent_is_terminated, + truncated=agent_is_truncated, + extra_model_outputs=agent_extra_model_output, ) # We have no buffered action. else: @@ -969,24 +970,24 @@ def add_timestep( self.global_t_to_local_t[agent_id].append(self.t) # The agent might have got a reward. # TODO (simon): Refactor to a function `record_rewards`. - if agent_id in reward: + if agent_id in rewards: # Add the reward to the one in the buffer. self.agent_buffers[agent_id]["rewards"].put_nowait( self.agent_buffers[agent_id]["rewards"].get_nowait() - + reward[agent_id] + + rewards[agent_id] ) # Add the reward to the partial rewards of this agent. - self.partial_rewards[agent_id].append(reward[agent_id]) + self.partial_rewards[agent_id].append(rewards[agent_id]) self.partial_rewards_t[agent_id].append(self.t) - self.agent_episodes[agent_id].add_initial_observation( - initial_observation=observation[agent_id], - initial_info=info.get(agent_id), + self.agent_episodes[agent_id].add_env_reset( + observation=observations[agent_id], + infos=infos.get(agent_id), ) # CASE 2: No observation, but action. # We have no observation, but we have an action. This must be an orphane # action and we need to buffer it. - elif agent_id not in observation and agent_id in action: + elif agent_id not in observations and agent_id in actions: # Maybe the agent got terminated. if agent_is_terminated or agent_is_truncated: # If this was indeed the agent's last step, we need to record it @@ -999,15 +1000,16 @@ def add_timestep( # agent_id list? # If the agent was terminated and no observation is provided, # take the last one. - self.agent_episodes[agent_id].add_timestep( + self.agent_episodes[agent_id].add_env_step( observation=self.agent_episodes[agent_id].observations[-1], - action=action[agent_id], - reward=0.0 if agent_id not in reward else reward[agent_id], - is_terminated=agent_is_terminated, - is_truncated=agent_is_truncated, - extra_model_output=None - if agent_id not in extra_model_output - else extra_model_output[agent_id], + action=actions[agent_id], + reward=0.0 if agent_id not in rewards else rewards[agent_id], + infos=self.agent_episodes[agent_id].infos[-1], + terminated=agent_is_terminated, + truncated=agent_is_truncated, + extra_model_outputs=None + if agent_id not in extra_model_outputs + else extra_model_outputs[agent_id], ) # Agent is still alive. else: @@ -1015,36 +1017,38 @@ def add_timestep( # original action timestep (global one). Right now the # `global_reward_t` might serve here. # Buffer the action. - self.agent_buffers[agent_id]["actions"].put_nowait(action[agent_id]) + self.agent_buffers[agent_id]["actions"].put_nowait( + actions[agent_id] + ) # Record the timestep for the action. self.global_actions_t[agent_id].append(self.t) # If available, buffer also reward. Note, if the agent is terminated # or truncated, we finish the `SingleAgentEpisode`. - if agent_id in reward: + if agent_id in rewards: # Add the reward to the existing one in the buffer. Note, the # default value is zero. # TODO (simon): Refactor to `record_rewards()`. self.agent_buffers[agent_id]["rewards"].put_nowait( self.agent_buffers[agent_id]["rewards"].get_nowait() - + reward[agent_id] + + rewards[agent_id] ) # Add to the global reward list. - self.partial_rewards[agent_id].append(reward[agent_id]) + self.partial_rewards[agent_id].append(rewards[agent_id]) # Add also to the global reward timestep mapping. self.partial_rewards_t[agent_id].append(self.t) # If the agent got any extra model outputs, buffer them, too. - if extra_model_output and agent_id in extra_model_output: + if extra_model_outputs and agent_id in extra_model_outputs: # Flush the default `None` from buffer. self.agent_buffers[agent_id]["extra_model_outputs"].get_nowait() # STore the extra model outputs into the buffer. self.agent_buffers[agent_id]["extra_model_outputs"].put_nowait( - extra_model_output[agent_id] + extra_model_outputs[agent_id] ) # CASE 3: No observation and no action. # We have neither observation nor action. Then, we could have `reward`, # `is_terminated` or `is_truncated` and should record it. - elif agent_id not in observation and agent_id not in action: + elif agent_id not in observations and agent_id not in actions: # The agent could be is_terminated if agent_is_terminated or agent_is_truncated: # If the agent has never stepped, we treat it as not being @@ -1076,11 +1080,11 @@ def add_timestep( # as it is initialized as a zero reward. agent_reward = self.agent_buffers[agent_id]["rewards"].get_nowait() # If a reward is received at this timestep record it. - if agent_id in reward: + if agent_id in rewards: # TODO (simon): Refactor to `record_rewards()`. - agent_reward += reward[agent_id] + agent_reward += rewards[agent_id] # Add to the global reward list. - self.partial_rewards[agent_id].append(reward[agent_id]) + self.partial_rewards[agent_id].append(rewards[agent_id]) # Add also to the global reward timestep mapping. self.partial_rewards_t[agent_id].append(self.t) @@ -1088,27 +1092,27 @@ def add_timestep( # it in the timestep mapping. self.global_t_to_local_t[agent_id].append(self.t) # Finish the agent's episode. - self.agent_episodes[agent_id].add_timestep( + self.agent_episodes[agent_id].add_env_step( observation=self.agent_episodes[agent_id].observations[-1], action=agent_action, reward=agent_reward, - info=info.get(agent_id), - is_terminated=agent_is_terminated, - is_truncated=agent_is_truncated, - extra_model_output=agent_extra_model_output, + infos=infos.get(agent_id), + terminated=agent_is_terminated, + truncated=agent_is_truncated, + extra_model_outputs=agent_extra_model_output, ) # The agent is still alive. else: # If the agent received an reward (triggered by actions of # other agents) we collect it and add it to the one in the # buffer. - if agent_id in reward: + if agent_id in rewards: self.agent_buffers[agent_id]["rewards"].put_nowait( self.agent_buffers[agent_id]["rewards"].get_nowait() - + reward[agent_id] + + rewards[agent_id] ) # Add to the global reward list. - self.partial_rewards[agent_id].append(reward[agent_id]) + self.partial_rewards[agent_id].append(rewards[agent_id]) # Add also to the global reward timestep mapping. self.partial_rewards_t[agent_id].append(self.t) # CASE 4: Observation and action. @@ -1128,31 +1132,31 @@ def add_timestep( self.global_t_to_local_t[agent_id].append(self.t) # Record the action to the global action timestep mapping. self.global_actions_t[agent_id].append(self.t) - if agent_id in reward: + if agent_id in rewards: # Also add to the global reward list. - self.partial_rewards[agent_id].append(reward[agent_id]) + self.partial_rewards[agent_id].append(rewards[agent_id]) # And add to the global reward timestep mapping. self.partial_rewards_t[agent_id].append(self.t) # Add timestep to `SingleAgentEpisode`. - self.agent_episodes[agent_id].add_timestep( - observation=observation[agent_id], - action=action[agent_id], - reward=0.0 if agent_id not in reward else reward[agent_id], - info=info.get(agent_id), - is_terminated=agent_is_terminated, - is_truncated=agent_is_truncated, - extra_model_output=None - if extra_model_output is None - else extra_model_output[agent_id], + self.agent_episodes[agent_id].add_env_step( + observation=observations[agent_id], + action=actions[agent_id], + reward=0.0 if agent_id not in rewards else rewards[agent_id], + infos=infos.get(agent_id), + terminated=agent_is_terminated, + truncated=agent_is_truncated, + extra_model_outputs=None + if extra_model_outputs is None + else extra_model_outputs[agent_id], ) @property def is_done(self): """Whether the episode is actually done (terminated or truncated). - A done episode cannot be continued via `self.add_timestep()` or being + A done episode cannot be continued via `self.add_env_step()` or being concatenated on its right-side with another episode chunk or being - succeeded via `self.create_successor()`. + succeeded via `self.cut()`. Note that in a multi-agent environment this does not necessarily correspond to single agents having terminated or being truncated. @@ -1178,23 +1182,15 @@ def is_done(self): # TODO (sven, simon): We are taking over dead agents to the successor # is this intended or should we better check during concatenation, if # the union of agents from both episodes is included? Next issue. - def create_successor(self) -> "MultiAgentEpisode": - """Restarts an ongoing episode from its last observation. - - Note, this method is used so far specifically for the case of - `batch_mode="truncated_episodes"` to ensure that episodes are - immutable inside the `EnvRunner` when truncated and passed over - to postprocessing. - - The newly created `MultiAgentEpisode` contains the same id, and - starts at the timestep where it's predecessor stopped in the last - rollout. Last observations, infos, rewards, etc. are carried over - from the predecessor. This also helps to not carry stale data that - had been collected in the last rollout when rolling out the new - policy in the next iteration (rollout). - - Returns: A MultiAgentEpisode starting at the timepoint where - its predecessor stopped. + def cut(self) -> "MultiAgentEpisode": + """Returns a successor episode chunk (of len=0) continuing from this Episode. + + The successor will have the same ID as `self` and starts at the timestep where + it's predecessor `self` left off. The current observations and infos + are carried over from the predecessor as initial observations/infos. + + Returns: A MultiAgentEpisode starting at the timestep where its predecessor + stopped. """ assert not self.is_done @@ -1205,8 +1201,8 @@ def create_successor(self) -> "MultiAgentEpisode": agent_id: agent_eps.id_ for agent_id, agent_eps in self.agent_episodes.items() }, - is_terminated=self.is_terminated, - is_truncated=self.is_truncated, + terminateds=self.is_terminated, + truncateds=self.is_truncated, t_started=self.t, ) @@ -1215,7 +1211,7 @@ def create_successor(self) -> "MultiAgentEpisode": # all agents that are still alive. if not agent_eps.is_done and agent_eps.observations: # Build a successor for each agent that is not done, yet. - successor.agent_episodes[agent_id] = agent_eps.create_successor() + successor.agent_episodes[agent_id] = agent_eps.cut() # Record the initial observation in the global timestep mapping. successor.global_t_to_local_t[agent_id] = _IndexMapping( [self.global_t_to_local_t[agent_id][-1]] @@ -1225,8 +1221,8 @@ def create_successor(self) -> "MultiAgentEpisode": else: successor.agent_episodes[agent_id] = SingleAgentEpisode( id_=agent_eps.id_, - is_terminated=agent_eps.is_terminated, - is_truncated=agent_eps.is_truncated, + terminated=agent_eps.is_terminated, + truncated=agent_eps.is_truncated, ) successor.global_t_to_local_t[agent_id] = _IndexMapping() @@ -1324,7 +1320,7 @@ def to_sample_batch(self) -> MultiAgentBatch: # Note, only agents that have stepped are included into the batch. return MultiAgentBatch( policy_batches={ - agent_id: agent_eps.to_sample_batch() + agent_id: agent_eps.get_sample_batch() for agent_id, agent_eps in self.agent_episodes.items() if agent_eps.t - agent_eps.t_started > 0 }, @@ -1347,7 +1343,7 @@ def get_return(self, consider_buffer=False) -> float: sum(len(agent_map) for agent_map in self.global_t_to_local_t.values()) > 0 ), ( "ERROR: Cannot determine return of episode that hasn't started, yet!" - "Call `MultiAgentEpisode.add_initial_observation(initial_observation=)` " + "Call `MultiAgentEpisode.add_env_reset(observations=)` " "first (after which `get_return(MultiAgentEpisode)` will be 0)." ) env_return = sum( @@ -1489,47 +1485,14 @@ def _generate_single_agent_episode( actions: Optional[List[MultiAgentDict]] = None, rewards: Optional[List[MultiAgentDict]] = None, infos: Optional[List[MultiAgentDict]] = None, - is_terminateds: Union[MultiAgentDict, bool] = False, - is_truncateds: Union[MultiAgentDict, bool] = False, - extra_model_outputs: Optional[MultiAgentDict] = None, + terminateds: Union[MultiAgentDict, bool] = False, + truncateds: Union[MultiAgentDict, bool] = False, + extra_model_outputs: Optional[List[MultiAgentDict]] = None, ) -> SingleAgentEpisode: - """Generates a `SingleAgentEpisode` from multi-agent data. + """Generates a SingleAgentEpisode from multi-agent data. Note, if no data is provided an empty `SingleAgentEpiosde` - will be returned that starts at `SIngleAgentEpisode.t_started=0`. - - Args: - agent_id: String, idnetifying the agent for which the data should - be extracted. - agent_episode_ids: Optional. A dictionary mapping agents to - corresponding episode ids. If `None` the `SingleAgentEpisode` - creates a hexadecimal code. - observations: Optional. A list of dictionaries, each mapping - from agent ids to observations. When data is provided - it should be complete, i.e. observations, actions, rewards, - etc. should be provided. - actions: Optional. A list of dictionaries, each mapping - from agent ids to actions. When data is provided - it should be complete, i.e. observations, actions, rewards, - etc. should be provided. - rewards: Optional. A list of dictionaries, each mapping - from agent ids to rewards. When data is provided - it should be complete, i.e. observations, actions, rewards, - etc. should be provided. - infos: Optional. A list of dictionaries, each mapping - from agent ids to infos. When data is provided - it should be complete, i.e. observations, actions, rewards, - etc. should be provided. - extra_model_outputs: Optional. A list of agent mappings for every - timestep. Each of these dictionaries maps an agent to its - corresponding `extra_model_outputs`, which a re specific model - outputs needed by the algorithm used (e.g. `vf_preds` and - `action_logp` for PPO). f data is provided it should be complete - (i.e. observations, actions, rewards, is_terminated, is_truncated, - and all necessary `extra_model_outputs`). - - Returns: An instance of `SingleAgentEpisode` containing the agent's - extracted episode data. + will be returned that starts at `SingleAgentEpisode.t_started=0`. """ # If an episode id for an agent episode was provided assign it. @@ -1569,7 +1532,7 @@ def _generate_single_agent_episode( ) # Like observations, infos start at timestep `t=0`, so we do not need to # shift or start later when using the global timestep mapping. But we - # need to use tha timestep carriage in case the starting timestep is + # need to use the timestep carriage in case the starting timestep is # different from the length of observations-after-initialization. agent_infos = ( None @@ -1579,7 +1542,7 @@ def _generate_single_agent_episode( ) ) - agent_extra_model_outputs = ( + _agent_extra_model_outputs = ( None if extra_model_outputs is None else self._get_single_agent_data( @@ -1588,35 +1551,15 @@ def _generate_single_agent_episode( use_global_t_to_local_t=False, ) ) + # Convert `extra_model_outputs` for this agent from list of dicts to dict + # of lists. + agent_extra_model_outputs = defaultdict(list) + for _model_out in _agent_extra_model_outputs: + for key, val in _model_out.items(): + agent_extra_model_outputs[key].append(val) - agent_is_terminated = ( - [False] - if is_terminateds is None - else self._get_single_agent_data( - agent_id, is_terminateds, use_global_t_to_local_t=False - ) - # else self._get_single_agent_data( - # agent_id, is_terminateds, start_index=1, shift=-1 - # ) - ) - # If a list the list could be empty, if the agent never stepped. - agent_is_terminated = ( - False if not agent_is_terminated else agent_is_terminated[-1] - ) - - agent_is_truncated = ( - [False] - if is_truncateds is None - else self._get_single_agent_data( - agent_id, - is_truncateds, - use_global_t_to_local_t=False, - ) - ) - # If a list the list could be empty, if the agent never stepped. - agent_is_truncated = ( - False if not agent_is_truncated else agent_is_truncated[-1] - ) + agent_is_terminated = terminateds.get(agent_id, False) + agent_is_truncated = truncateds.get(agent_id, False) # If there are as many actions as observations we have to buffer. if ( @@ -1626,14 +1569,18 @@ def _generate_single_agent_episode( ): # Assert then that the other data is in order. if agent_extra_model_outputs: - assert len(agent_extra_model_outputs) == len( - agent_actions - ), f"Agent {agent_id} has not as many extra model outputs as " - "actions." + assert all( + len(v) == len(agent_actions) + for v in agent_extra_model_outputs.values() + ), ( + f"Agent {agent_id} doesn't have the same number of " + "`extra_model_outputs` as it has actions " + f"({len(agent_actions)})." + ) # Put the last extra model outputs into the buffer. self.agent_buffers[agent_id]["extra_model_outputs"].get_nowait() self.agent_buffers[agent_id]["extra_model_outputs"].put_nowait( - agent_extra_model_outputs.pop() + {k: v.pop() for k, v in agent_extra_model_outputs.items()} ) # Put the last action into the buffer. @@ -1643,8 +1590,7 @@ def _generate_single_agent_episode( # `_generate_partial_rewards` method and can be done where # the global timestep and global action timestep # mappings are created (__init__). - # We have to take care of partial rewards when generating the - # agent rewards: + # We have to take care of partial rewards when generating the agent rewards: # 1. Rewards between different observations -> added up and # assigned to next observation. # 2. Rewards after the last observation -> get buffered and added up @@ -1671,7 +1617,6 @@ def _generate_single_agent_episode( if (t + 1) in self.global_t_to_local_t[agent_id][1:]: agent_rewards.append(agent_reward) agent_reward = 0.0 - continue # If the agent reward is not zero, we must have rewards that came # after the last observation. Then we buffer this reward. @@ -1688,8 +1633,8 @@ def _generate_single_agent_episode( actions=agent_actions, rewards=agent_rewards, infos=agent_infos, - is_terminated=agent_is_terminated, - is_truncated=agent_is_truncated, + terminated=agent_is_terminated, + truncated=agent_is_truncated, extra_model_outputs=agent_extra_model_outputs, ) # Otherwise return empty `SingleAgentEpisode`. @@ -1952,7 +1897,7 @@ def __len__(self): sum(len(agent_map) for agent_map in self.global_t_to_local_t.values()) > 0 ), ( "ERROR: Cannot determine length of episode that hasn't started, yet!" - "Call `MultiAgentEpisode.add_initial_observation(initial_observation=)` " + "Call `MultiAgentEpisode.add_env_reset(observations=)` " "first (after which `len(MultiAgentEpisode)` will be 0)." ) return self.t - self.t_started diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index bfee8ba7482b6..b2c3f137701cb 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -9,6 +9,7 @@ from ray.rllib.core.models.base import STATE_IN, STATE_OUT from ray.rllib.core.rl_module.rl_module import RLModule, SingleAgentRLModuleSpec from ray.rllib.env.env_runner import EnvRunner +from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.env.utils import _gym_env_creator from ray.rllib.evaluation.metrics import RolloutMetrics from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch @@ -22,9 +23,6 @@ if TYPE_CHECKING: from ray.rllib.algorithms.algorithm_config import AlgorithmConfig - # TODO (sven): This gives a tricky circular import that goes - # deep into the library. We have to see, where to dissolve it. - from ray.rllib.env.single_agent_episode import SingleAgentEpisode _, tf, _ = try_import_tf() torch, nn = try_import_torch() @@ -74,15 +72,20 @@ def __init__(self, config: "AlgorithmConfig", **kwargs): # Create our own instance of the (single-agent) `RLModule` (which # the needs to be weight-synched) each iteration. - module_spec: SingleAgentRLModuleSpec = self.config.get_default_rl_module_spec() - module_spec.observation_space = self.env.envs[0].observation_space - # TODO (simon): The `gym.Wrapper` for `gym.vector.VectorEnv` should - # actually hold the spaces for a single env, but for boxes the - # shape is (1, 1) which brings a problem with the action dists. - # shape=(1,) is expected. - module_spec.action_space = self.env.envs[0].action_space - module_spec.model_config_dict = self.config.model - self.module: RLModule = module_spec.build() + try: + module_spec: SingleAgentRLModuleSpec = ( + self.config.get_default_rl_module_spec() + ) + module_spec.observation_space = self.env.envs[0].observation_space + # TODO (simon): The `gym.Wrapper` for `gym.vector.VectorEnv` should + # actually hold the spaces for a single env, but for boxes the + # shape is (1, 1) which brings a problem with the action dists. + # shape=(1,) is expected. + module_spec.action_space = self.env.envs[0].action_space + module_spec.model_config_dict = self.config.model + self.module: RLModule = module_spec.build() + except NotImplementedError: + self.module = None # This should be the default. self._needs_initial_reset: bool = True @@ -95,6 +98,11 @@ def __init__(self, config: "AlgorithmConfig", **kwargs): self._ts_since_last_metrics: int = 0 self._weights_seq_no: int = 0 + # TODO (sven): This is a temporary solution. STATE_OUTs + # will be resolved entirely as `extra_model_outputs` and + # not be stored separately inside Episodes. + self._states = [None for _ in range(self.num_envs)] + @override(EnvRunner) def sample( self, @@ -106,6 +114,7 @@ def sample( with_render_data: bool = False, ) -> List["SingleAgentEpisode"]: """Runs and returns a sample (n timesteps or m episodes) on the env(s).""" + assert not (num_timesteps is not None and num_episodes is not None) # If not execution details are provided, use the config. if num_timesteps is None and num_episodes is None: @@ -176,13 +185,11 @@ def _sample_timesteps( for i in range(self.num_envs): # TODO (sven): Maybe move this into connector pipeline # (even if automated). - self._episodes[i].add_initial_observation( - initial_observation=obs[i], - initial_info=infos[i], - # TODO (simon): Check, if this works for the default - # stateful encoders. - initial_state={k: s[i] for k, s in states.items()}, + self._episodes[i].add_env_reset( + observation=obs[i], + infos=infos[i], ) + self._states[i] = {k: s[i] for k, s in states.items()} # Do not reset envs, but instead continue in already started episodes. else: # Pick up stored observations and states from previous timesteps. @@ -193,8 +200,8 @@ def _sample_timesteps( states = { k: np.stack( [ - initial_states[k][i] if eps.states is None else eps.states[k] - for i, eps in enumerate(self._episodes) + initial_states[k][i] if state is None else state[k] + for i, state in enumerate(self._states) ] ) for k in initial_states.keys() @@ -207,7 +214,8 @@ def _sample_timesteps( # Act randomly. if random_actions: actions = self.env.action_space.sample() - # TODO (simon): Add action_logp for smapled actions. + action_logp = np.zeros(shape=(actions.shape[0],)) + fwd_out = {} # Compute an action using the RLModule. else: # Note, RLModule `forward()` methods expect `NestedDict`s. @@ -228,6 +236,8 @@ def _sample_timesteps( else: fwd_out = self.module.forward_inference(batch) + # TODO (sven): Will be completely replaced by connector logic in + # upcoming PR. actions, action_logp = self._sample_actions_if_necessary( fwd_out, explore ) @@ -260,54 +270,61 @@ def _sample_timesteps( if terminateds[i] or truncateds[i]: # Finish the episode with the actual terminal observation stored in # the info dict. - self._episodes[i].add_timestep( + self._episodes[i].add_env_step( # Gym vector env provides the `"final_observation"`. infos[i]["final_observation"], actions[i], rewards[i], - info=infos[i]["final_info"], - state=s, - is_terminated=terminateds[i], - is_truncated=truncateds[i], - extra_model_output=extra_model_output, + infos=infos[i]["final_info"], + terminated=terminateds[i], + truncated=truncateds[i], + extra_model_outputs=extra_model_output, ) + self._states[i] = s # Reset h-states to nthe model's intiial ones b/c we are starting a # new episode. - for k, v in self.module.get_initial_state().items(): - states[k][i] = convert_to_numpy(v) + if hasattr(self.module, "get_initial_state"): + for k, v in self.module.get_initial_state().items(): + states[k][i] = convert_to_numpy(v) - done_episodes_to_return.append(self._episodes[i]) - # Create a new episode object. + done_episodes_to_return.append(self._episodes[i].finalize()) + # Create a new episode object with already the reset data in it. self._episodes[i] = SingleAgentEpisode( - observations=[obs[i]], infos=[infos[i]], states=s + observations=[obs[i]], infos=[infos[i]] ) + self._states[i] = s else: - self._episodes[i].add_timestep( + self._episodes[i].add_env_step( obs[i], actions[i], rewards[i], - info=infos[i], - state=s, - extra_model_output=extra_model_output, + infos=infos[i], + extra_model_outputs=extra_model_output, ) + self._states[i] = s # Return done episodes ... self._done_episodes_for_metrics.extend(done_episodes_to_return) + # Also, make sure, we return a copy and start new chunks so that callers + # of this function do not alter the ongoing and returned Episode objects. + new_episodes = [eps.cut() for eps in self._episodes] + # ... and all ongoing episode chunks. # Initialized episodes do not have recorded any step and lack # `extra_model_outputs`. - ongoing_episodes = [episode for episode in self._episodes if episode.t > 0] - # Also, make sure, we return a copy and start new chunks so that callers - # of this function do not alter the ongoing and returned Episode objects. - self._episodes = [eps.create_successor() for eps in self._episodes] - for eps in ongoing_episodes: + ongoing_episodes_to_return = [ + episode.finalize() for episode in self._episodes if episode.t > 0 + ] + for eps in ongoing_episodes_to_return: self._ongoing_episodes_for_metrics[eps.id_].append(eps) # Record last metrics collection. self._ts_since_last_metrics += ts - return done_episodes_to_return + ongoing_episodes + self._episodes = new_episodes + + return done_episodes_to_return + ongoing_episodes_to_return def _sample_episodes( self, @@ -320,42 +337,50 @@ def _sample_episodes( See docstring of `self.sample()` for more details. """ - # TODO (sven): This gives a tricky circular import that goes # deep into the library. We have to see, where to dissolve it. from ray.rllib.env.single_agent_episode import SingleAgentEpisode + # If user calls sample(num_timesteps=..) after this, we must reset again + # at the beginning. + self._needs_initial_reset = True + done_episodes_to_return: List["SingleAgentEpisode"] = [] obs, infos = self.env.reset() episodes = [SingleAgentEpisode() for _ in range(self.num_envs)] - # Multiply states n times according to our vector env batch size (num_envs). - states = tree.map_structure( - lambda s: np.repeat(s, self.num_envs, axis=0), - self.module.get_initial_state(), - ) + # Get initial states for all 'batch_size_B` rows in the forward batch, + # i.e. for all vector sub_envs. + if hasattr(self.module, "get_initial_state"): + states = tree.map_structure( + lambda s: np.repeat(s, self.num_envs, axis=0), + self.module.get_initial_state(), + ) + else: + states = {} render_images = [None] * self.num_envs if with_render_data: render_images = [e.render() for e in self.env.envs] for i in range(self.num_envs): - # TODO (sven): Maybe move this into connector pipeline - # (even if automated). - episodes[i].add_initial_observation( - initial_observation=obs[i], - initial_info=infos[i], - initial_state={k: s[i] for k, s in states.items()}, - initial_render_image=render_images[i], + episodes[i].add_env_reset( + observation=obs[i], + infos=infos[i], + render_image=render_images[i], ) eps = 0 while eps < num_episodes: if random_actions: actions = self.env.action_space.sample() + action_logp = np.zeros(shape=(actions.shape[0])) + fwd_out = {} else: batch = { + # TODO (sven): This will move entirely into connector logic in + # upcoming PR. STATE_IN: tree.map_structure( lambda s: self._convert_from_numpy(s), states ), @@ -368,17 +393,18 @@ def _sample_episodes( else: fwd_out = self.module.forward_inference(batch) + # TODO (sven): This will move entirely into connector logic in upcoming + # PR. actions, action_logp = self._sample_actions_if_necessary( fwd_out, explore ) fwd_out = convert_to_numpy(fwd_out) + # TODO (sven): This will move entirely into connector logic in upcoming + # PR. if STATE_OUT in fwd_out: states = convert_to_numpy(fwd_out[STATE_OUT]) - # states = tree.map_structure( - # lambda s: s.numpy(), fwd_out[STATE_OUT] - # ) obs, rewards, terminateds, truncateds, infos = self.env.step(actions) if with_render_data: @@ -387,28 +413,27 @@ def _sample_episodes( for i in range(self.num_envs): # Extract info and state for vector sub_env. # info = {k: v[i] for k, v in infos.items()} - s = {k: s[i] for k, s in states.items()} # The last entry in self.observations[i] is already the reset # obs of the new episode. extra_model_output = {} for k, v in fwd_out.items(): if SampleBatch.ACTIONS not in k: extra_model_output[k] = v[i] - # TODO (simon, sven): Some algos do not have logps. + # TODO (sven): This will move entirely into connector logic in upcoming + # PR. extra_model_output[SampleBatch.ACTION_LOGP] = action_logp[i] if terminateds[i] or truncateds[i]: eps += 1 - episodes[i].add_timestep( + episodes[i].add_env_step( infos[i]["final_observation"], actions[i], rewards[i], - info=infos[i]["final_info"], - state=s, - is_terminated=terminateds[i], - is_truncated=truncateds[i], - extra_model_output=extra_model_output, + infos=infos[i]["final_info"], + terminated=terminateds[i], + truncated=truncateds[i], + extra_model_outputs=extra_model_output, ) done_episodes_to_return.append(episodes[i]) @@ -418,38 +443,33 @@ def _sample_episodes( if eps == num_episodes: break - # Reset h-states to the model's initial ones b/c we are starting - # a new episode. - for k, v in self.module.get_initial_state().items(): - states[k][i] = (convert_to_numpy(v),) + # TODO (sven): This will move entirely into connector logic in + # upcoming PR. + if hasattr(self.module, "get_initial_state"): + for k, v in self.module.get_initial_state().items(): + states[k][i] = (convert_to_numpy(v),) # Create a new episode object. episodes[i] = SingleAgentEpisode( observations=[obs[i]], infos=[infos[i]], - states=s, render_images=None if render_images[i] is None else [render_images[i]], ) else: - episodes[i].add_timestep( + episodes[i].add_env_step( obs[i], actions[i], rewards[i], - info=infos[i], - state=s, + infos=infos[i], render_image=render_images[i], - extra_model_output=extra_model_output, + extra_model_outputs=extra_model_output, ) self._done_episodes_for_metrics.extend(done_episodes_to_return) self._ts_since_last_metrics += sum(len(eps) for eps in done_episodes_to_return) - # If user calls sample(num_timesteps=..) after this, we must reset again - # at the beginning. - self._needs_initial_reset = True - # Initialized episodes have to be removed as they lack `extra_model_outputs`. return [episode for episode in done_episodes_to_return if episode.t > 0] diff --git a/rllib/env/single_agent_episode.py b/rllib/env/single_agent_episode.py index 25c7a02bb0465..8071f0d335201 100644 --- a/rllib/env/single_agent_episode.py +++ b/rllib/env/single_agent_episode.py @@ -1,127 +1,301 @@ +import functools +from collections import defaultdict import numpy as np import uuid +import gymnasium as gym from gymnasium.core import ActType, ObsType -from typing import Any, Dict, List, Optional, SupportsFloat +from typing import Any, Dict, List, Optional, SupportsFloat, Union from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.env.utils import BufferWithInfiniteLookback class SingleAgentEpisode: + """A class representing RL environment episodes for individual agents. + + SingleAgentEpisode stores observations, info dicts, actions, rewards, and all + module outputs (e.g. state outs, action logp, etc..) for an individual agent within + some single-agent or multi-agent environment. + The two main APIs to add data to an ongoing episode are the `add_env_reset()` + and `add_env_step()` methods, which should be called passing the outputs of the + respective gym.Env API calls: `env.reset()` and `env.step()`. + + A SingleAgentEpisode might also only represent a chunk of an episode, which is + useful for cases, in which partial (non-complete episode) sampling is performed + and collected episode data has to be returned before the actual gym.Env episode has + finished (see `SingleAgentEpisode.cut()`). In order to still maintain visibility + onto past experiences within such a "cut" episode, SingleAgentEpisode instances + can have a "lookback buffer" of n timesteps at their beginning (left side), which + solely exists for the purpose of compiling extra data (e.g. "prev. reward"), but + is not considered part of the finished/packaged episode (b/c the data in the + lookback buffer is already part of a previous episode chunk). + + Powerful getter methods, such as `get_observations()` help collect different types + of data from the episode at individual time indices or time ranges, including the + "lookback buffer" range described above. For example, to extract the last 4 rewards + of an ongoing episode, one can call `self.get_rewards(slice(-4, None))` or + `self.rewards[-4:]`. This would work, even if the ongoing SingleAgentEpisode is + a continuation chunk from a much earlier started episode, as long as it has a + lookback buffer size of sufficient size. + + Examples: + + .. testcode:: + + import gymnasium as gym + + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + + # Construct a new episode (without any data in it yet). + episode = SingleAgentEpisode() + assert len(episode) == 0 + + # Fill the episode with some data (10 timesteps). + env = gym.make("CartPole-v1") + obs, infos = env.reset() + episode.add_env_reset(obs, infos) + + # Even with the initial obs/infos, the episode is still considered len=0. + assert len(episode) == 0 + for _ in range(10): + action = env.action_space.sample() + obs, reward, term, trunc, infos = env.step(action) + episode.add_env_step( + observation=obs, + action=action, + reward=reward, + terminated=term, + truncated=trunc, + infos=infos, + ) + assert len(episode) == 10 + + # We can now access information from the episode via the getters. + + # Get the last 3 rewards (in a batch of size 3). + episode.get_rewards(slice(-3, None)) # same as `episode.rewards[-3:]` + + # Get the most recent action (single item, not batched). + # This works regardless of the action space or whether the episode has + # been finalized or not (see below). + episode.get_actions(-1) # same as episode.actions[-1] + + # Looking back from ts=1, get the previous 4 rewards AND fill with 0.0 + # in case we go over the beginning (ts=0). So we would expect + # [0.0, 0.0, 0.0, r0] to be returned here, where r0 is the very first received + # reward in the episode: + episode.get_rewards(slice(-4, 0), neg_indices_left_of_zero=True, fill=0.0) + + # Note the use of fill=0.0 here (fill everything that's out of range with this + # value) AND the argument `neg_indices_left_of_zero=True`, which interprets + # negative indices as being left of ts=0 (e.g. -1 being the timestep before + # ts=0). + + # Assuming we had a complex action space (nested gym.spaces.Dict) with one or + # more elements being Discrete or MultiDiscrete spaces: + # 1) The `fill=...` argument would still work, filling all spaces (Boxes, + # Discrete) with that provided value. + # 2) Setting the flag `one_hot_discrete=True` would convert those discrete + # sub-components automatically into one-hot (or multi-one-hot) tensors. + # This simplifies the task of having to provide the previous 4 (nested and + # partially discrete/multi-discrete) actions for each timestep within a training + # batch, thereby filling timesteps before the episode started with 0.0s and + # one-hot'ing the discrete/multi-discrete components in these actions: + episode = SingleAgentEpisode(action_space=gym.spaces.Dict({ + "a": gym.spaces.Discrete(3), + "b": gym.spaces.MultiDiscrete([2, 3]), + "c": gym.spaces.Box(-1.0, 1.0, (2,)), + })) + + # ... fill episode with data ... + episode.add_env_reset(observation=0) + # ... from a few steps. + episode.add_env_step( + observation=1, + action={"a":0, "b":np.array([1, 2]), "c":np.array([.5, -.5], np.float32)}, + reward=1.0, + ) + + # In your connector + prev_4_a = [] + # Note here that len(episode) does NOT include the lookback buffer. + for ts in range(len(episode)): + prev_4_a.append( + episode.get_actions( + indices=slice(ts - 4, ts), + # Make sure negative indices are interpreted as + # "into lookback buffer" + neg_indices_left_of_zero=True, + # Zero-out everything even further before the lookback buffer. + fill=0.0, + # Take care of discrete components (get ready as NN input). + one_hot_discrete=True, + ) + ) + + # Finally, convert from list of batch items to a struct (same as action space) + # of batched (numpy) arrays, in which all leafs have B==len(prev_4_a). + from ray.rllib.utils.spaces.space_utils import batch + + prev_4_actions_col = batch(prev_4_a) + """ + def __init__( self, id_: Optional[str] = None, *, observations: List[ObsType] = None, + observation_space: Optional[gym.Space] = None, actions: List[ActType] = None, + action_space: Optional[gym.Space] = None, rewards: List[SupportsFloat] = None, infos: List[Dict] = None, - states=None, - t_started: Optional[int] = None, - is_terminated: bool = False, - is_truncated: bool = False, - render_images: Optional[List[np.ndarray]] = None, + terminated: bool = False, + truncated: bool = False, extra_model_outputs: Optional[Dict[str, Any]] = None, + render_images: Optional[List[np.ndarray]] = None, + t_started: Optional[int] = None, + len_lookback_buffer: Optional[int] = 0, ) -> "SingleAgentEpisode": - """Initializes a `SingleAgentEpisode` instance. + """Initializes a SingleAgentEpisode instance. - This constructor can be called with or without sampled data. Note - that if data is provided the episode will start at timestep - `t_started = len(observations) - 1` (the initial observation is not - counted). If the episode should start at `t_started = 0` (e.g. - because the instance should simply store episode data) this has to - be provided in the `t_started` parameter of the constructor. + This constructor can be called with or without already sampled data, part of + which might then go into the lookback buffer. Args: id_: Optional. Unique identifier for this episode. If no id is provided the constructor generates a hexadecimal code for the id. observations: Optional. A list of observations from a rollout. If data is provided it should be complete (i.e. observations, actions, - rewards, is_terminated, is_truncated, and all necessary + rewards, terminated, truncated, and all necessary `extra_model_outputs`). The length of the `observations` defines the default starting value. See the parameter `t_started`. actions: Optional. A list of actions from a rollout. If data is provided it should be complete (i.e. observations, actions, - rewards, is_terminated, is_truncated, and all necessary + rewards, terminated, truncated, and all necessary `extra_model_outputs`). rewards: Optional. A list of rewards from a rollout. If data is provided it should be complete (i.e. observations, actions, - rewards, is_terminated, is_truncated, and all necessary + rewards, terminated, truncated, and all necessary `extra_model_outputs`). infos: Optional. A list of infos from a rollout. If data is provided it should be complete (i.e. observations, actions, - rewards, is_terminated, is_truncated, and all necessary + rewards, terminated, truncated, and all necessary `extra_model_outputs`). states: Optional. The hidden model states from a rollout. If data is provided it should be complete (i.e. observations, actions, - rewards, is_terminated, is_truncated, and all necessary + rewards, terminated, truncated, and all necessary `extra_model_outputs`). States are only avasilable if a stateful model (`RLModule`) is used. - t_started: Optional. The starting timestep of the episode. The default - is zero. If data is provided, the starting point is from the last - observation onwards (i.e. `t_started = len(observations) - 1). If - this parameter is provided the episode starts at the provided value. - is_terminated: Optional. A boolean indicating, if the episode is already + terminated: Optional. A boolean indicating, if the episode is already terminated. Note, this parameter is only needed, if episode data is provided in the constructor. The default is `False`. - is_truncated: Optional. A boolean indicating, if the episode was + truncated: Optional. A boolean indicating, if the episode was truncated. Note, this parameter is only needed, if episode data is provided in the constructor. The default is `False`. - render_images: Optional. A list of RGB uint8 images from rendering - the environment. extra_model_outputs: Optional. A list of dictionaries containing specific model outputs for the algorithm used (e.g. `vf_preds` and `action_logp` for PPO) from a rollout. If data is provided it should be complete - (i.e. observations, actions, rewards, is_terminated, is_truncated, + (i.e. observations, actions, rewards, terminated, truncated, and all necessary `extra_model_outputs`). + render_images: Optional. A list of RGB uint8 images from rendering + the environment. + t_started: Optional. The starting timestep of the episode. The default + is zero. If data is provided, the starting point is from the last + observation onwards (i.e. `t_started = len(observations) - 1). If + this parameter is provided the episode starts at the provided value. + len_lookback_buffer: The size of an optional lookback buffer to keep in + front of this Episode. If >0, will interpret the first + `len_lookback_buffer` items in each data as NOT part of this actual + episode chunk, but instead serve as historic data that may be viewed. + If None, will interpret all provided data in constructor as part of the + lookback buffer. """ self.id_ = id_ or uuid.uuid4().hex + + # Lookback buffer length is provided. + if len_lookback_buffer is not None: + self._len_lookback_buffer = len_lookback_buffer + # Lookback buffer length is not provided. Interpret already given data as + # lookback buffer. + else: + self._len_lookback_buffer = len(rewards or []) + + infos = infos or [{} for _ in range(len(observations or []))] + # Observations: t0 (initial obs) to T. - self.observations = [] if observations is None else observations + self.observation_space = observation_space + self.observations = BufferWithInfiniteLookback( + data=observations, + lookback=self._len_lookback_buffer, + space=observation_space, + ) # Actions: t1 to T. - self.actions = [] if actions is None else actions + self.action_space = action_space + self.actions = BufferWithInfiniteLookback( + data=actions, + lookback=self._len_lookback_buffer, + space=action_space, + ) # Rewards: t1 to T. - self.rewards = [] if rewards is None else rewards + self.rewards = BufferWithInfiniteLookback( + data=rewards, + lookback=self._len_lookback_buffer, + space=gym.spaces.Box(float("-inf"), float("inf"), (), np.float32), + ) # Infos: t0 (initial info) to T. - if infos is None: - self.infos = [{} for _ in range(len(self.observations))] - else: - self.infos = infos - # h-states: t0 (in case this episode is a continuation chunk, we need to know - # about the initial h) to T. - # TODO (simon): Create as list not as singleton. - self.states = states - # The global last timestep of the episode and the timesteps when this chunk - # started. - # TODO (simon): Check again what are the consequences of this decision for - # the methods of this class. For example the `validate()` method or - # `create_successor`. Write a test. - # Note, the case `t_started > len(observations) - 1` can occur, if a user - # wants to have an episode that is ongoing but does not want to carry the - # stale data from the last rollout in it. - self.t = self.t_started = ( - t_started if t_started is not None else max(len(self.observations) - 1, 0) + self.infos = BufferWithInfiniteLookback( + data=infos, + lookback=self._len_lookback_buffer, ) - if self.t_started < len(self.observations) - 1: - self.t = len(self.observations) - 1 # obs[-1] is the final observation in the episode. - self.is_terminated = is_terminated + self.is_terminated = terminated # obs[-1] is the last obs in a truncated-by-the-env episode (there will no more # observations in following chunks for this episode). - self.is_truncated = is_truncated + self.is_truncated = truncated + # Extra model outputs, e.g. `action_dist_input` needed in the batch. + self.extra_model_outputs = defaultdict( + functools.partial( + BufferWithInfiniteLookback, + lookback=self._len_lookback_buffer, + ), + ) + for k, v in (extra_model_outputs or {}).items(): + if isinstance(v, BufferWithInfiniteLookback): + self.extra_model_outputs[k] = v + else: + self.extra_model_outputs[k].data = v + # RGB uint8 images from rendering the env; the images include the corresponding # rewards. assert render_images is None or observations is not None - self.render_images = [] if render_images is None else render_images - # Extra model outputs, e.g. `action_dist_input` needed in the batch. - self.extra_model_outputs = ( - {} if extra_model_outputs is None else extra_model_outputs + self.render_images = render_images or [] + + # The global last timestep of the episode and the timesteps when this chunk + # started (excluding a possible lookback buffer). + self.t_started = t_started or 0 + + self.t = ( + (len(rewards) if rewards is not None else 0) + - self._len_lookback_buffer + + self.t_started ) + # Validate the episode data thus far. + self.validate() + def concat_episode(self, episode_chunk: "SingleAgentEpisode"): """Adds the given `episode_chunk` to the right side of self. + In order for this to work, both chunks (`self` and `episode_chunk`) must fit + together. This is checked by the IDs (must be identical), the time step counters + (`self.t` must be the same as `episode_chunk.t_started`), as well as the + observations/infos at the concatenation boundaries (`self.observations[-1]` + must match `episode_chunk.observations[0]`). Also, `self.is_done` must not be + True, meaning `self.is_terminated` and `self.is_truncated` are both False. + Args: episode_chunk: Another `SingleAgentEpisode` to be concatenated. @@ -129,7 +303,7 @@ def concat_episode(self, episode_chunk: "SingleAgentEpisode"): from both episodes. """ assert episode_chunk.id_ == self.id_ - assert not self.is_done + assert not self.is_done and not self.is_finalized # Make sure the timesteps match. assert self.t == episode_chunk.t_started @@ -144,41 +318,42 @@ def concat_episode(self, episode_chunk: "SingleAgentEpisode"): # Extend ourselves. In case, episode_chunk is already terminated (and numpyfied) # we need to convert to lists (as we are ourselves still filling up lists). - self.observations.extend(list(episode_chunk.observations)) - self.actions.extend(list(episode_chunk.actions)) - self.rewards.extend(list(episode_chunk.rewards)) - self.infos.extend(list(episode_chunk.infos)) + self.observations.extend(episode_chunk.get_observations()) + self.actions.extend(episode_chunk.get_actions()) + self.rewards.extend(episode_chunk.get_rewards()) + self.infos.extend(episode_chunk.get_infos()) self.t = episode_chunk.t - self.states = episode_chunk.states if episode_chunk.is_terminated: self.is_terminated = True elif episode_chunk.is_truncated: self.is_truncated = True - for k, v in episode_chunk.extra_model_outputs.items(): - self.extra_model_outputs[k].extend(list(v)) + for model_out_key in episode_chunk.extra_model_outputs.keys(): + self.extra_model_outputs[model_out_key].extend( + episode_chunk.get_extra_model_outputs(model_out_key) + ) # Validate. self.validate() - def add_initial_observation( + def add_env_reset( self, + observation: ObsType, + infos: Optional[Dict] = None, *, - initial_observation: ObsType, - initial_info: Optional[Dict] = None, - initial_state=None, - initial_render_image: Optional[np.ndarray] = None, + render_image: Optional[np.ndarray] = None, ) -> None: - """Adds the initial data to the episode. + """Adds the initial data (after an `env.reset()`) to the episode. + + This data consists of initial observations and initial infos, as well as + - optionally - a render image. Args: - initial_observation: Obligatory. The initial observation. - initial_info: Optional. The initial info. - initial_state: Optional. The initial hidden state of a - model (`RLModule`) if the latter is stateful. - initial_render_image: Optional. An RGB uint8 image from rendering - the environment. + observation: The initial observation returned by `env.reset()`. + infos: An (optional) info dict returned by `env.reset()`. + render_image: Optional. An RGB uint8 image from rendering + the environment right after the reset. """ assert not self.is_done assert len(self.observations) == 0 @@ -186,48 +361,55 @@ def add_initial_observation( # Leave self.t (and self.t_started) at 0. assert self.t == self.t_started == 0 - initial_info = initial_info or {} + infos = infos or {} + + if self.observation_space is not None: + assert self.observation_space.contains(observation), ( + f"`observation` {observation} does NOT fit SingleAgentEpisode's " + f"observation_space: {self.observation_space}!" + ) + + self.observations.append(observation) + self.infos.append(infos) + if render_image is not None: + self.render_images.append(render_image) - self.observations.append(initial_observation) - self.states = initial_state - self.infos.append(initial_info) - if initial_render_image is not None: - self.render_images.append(initial_render_image) - # TODO (sven): Do we have to call validate here? It is our own function - # that manipulates the object. + # Validate our data. self.validate() - def add_timestep( + def add_env_step( self, observation: ObsType, action: ActType, reward: SupportsFloat, + infos: Optional[Dict[str, Any]] = None, *, - info: Optional[Dict[str, Any]] = None, - state=None, - is_terminated: bool = False, - is_truncated: bool = False, + terminated: bool = False, + truncated: bool = False, render_image: Optional[np.ndarray] = None, - extra_model_output: Optional[Dict[str, Any]] = None, + extra_model_outputs: Optional[Dict[str, Any]] = None, ) -> None: - """Adds a timestep to the episode. + """Adds results of an `env.step()` call (including the action) to this episode. + + This data consists of an observation and info dict, an action, a reward, + terminated/truncated flags, extra model outputs (e.g. action probabilities or + RNN internal state outputs), and - optionally - a render image. Args: - observation: The observation received from the - environment. - action: The last action used by the agent. - reward: The last reward received by the agent. - info: The last info recevied from the environment. - state: Optional. The last hidden state of the model (`RLModule` ). - This is only available, if the model is stateful. - is_terminated: A boolean indicating, if the environment has been - terminated. - is_truncated: A boolean indicating, if the environment has been - truncated. + observation: The observation received from the environment after taking + `action`. + action: The last action used by the agent during the call to `env.step()`. + reward: The last reward received by the agent after taking `action`. + infos: The last info received from the environment after taking `action`. + terminated: A boolean indicating, if the environment has been + terminated (after taking `action`). + truncated: A boolean indicating, if the environment has been + truncated (after taking `action`). render_image: Optional. An RGB uint8 image from rendering - the environment. - extra_model_output: The last timestep's specific model outputs - (e.g. `vf_preds` for PPO). + the environment (after taking `action`). + extra_model_outputs: The last timestep's specific model outputs. + These are normally outputs of an RLModule that were computed along with + `action`, e.g. `action_logp` or `action_dist_inputs`. """ # Cannot add data to an already done episode. assert ( @@ -237,55 +419,65 @@ def add_timestep( self.observations.append(observation) self.actions.append(action) self.rewards.append(reward) - info = info or {} - self.infos.append(info) - self.states = state + infos = infos or {} + self.infos.append(infos) self.t += 1 if render_image is not None: self.render_images.append(render_image) - if extra_model_output is not None: - for k, v in extra_model_output.items(): - if k not in self.extra_model_outputs: - self.extra_model_outputs[k] = [v] - else: - self.extra_model_outputs[k].append(v) - self.is_terminated = is_terminated - self.is_truncated = is_truncated + if extra_model_outputs is not None: + for k, v in extra_model_outputs.items(): + self.extra_model_outputs[k].append(v) + self.is_terminated = terminated + self.is_truncated = truncated + + # Validate our data. self.validate() + # Only check spaces every n timesteps. + if self.t % 50: + if self.observation_space is not None: + assert self.observation_space.contains(observation), ( + f"`observation` {observation} does NOT fit SingleAgentEpisode's " + f"observation_space: {self.observation_space}!" + ) + if self.action_space is not None: + assert self.action_space.contains(action), ( + f"`action` {action} does NOT fit SingleAgentEpisode's " + f"action_space: {self.action_space}!" + ) def validate(self) -> None: - """Validates the episode. + """Validates the episode's data. This function ensures that the data stored to a `SingleAgentEpisode` is in order (e.g. that the correct number of observations, actions, rewards are there). """ + assert len(self.observations) == len(self.infos) + if len(self.observations) == 0: + assert len(self.infos) == len(self.rewards) == len(self.actions) == 0 + for k, v in self.extra_model_outputs.items(): + assert len(v) == 0 # Make sure we always have one more obs stored than rewards (and actions) # due to the reset and last-obs logic of an MDP. - assert ( - len(self.observations) - == len(self.infos) - == len(self.rewards) + 1 - == len(self.actions) + 1 - ) - # TODO (sven): This is unclear to me. It makes sense - # to start at a point after the length of experiences - # provided at initialization, but when we test then here - # it will imo always error out. - # Example: we initialize the class by providing 101 observations, - # 100 actions and rewards. - # self.t = self.t_started = len(observations) - 1. Then - # we add a single timestep. self.t += 1 and - # self.t - self.t_started is 1, but len(rewards) is 100. - # assert len(self.rewards) == (self.t - self.t_started) - - if len(self.extra_model_outputs) > 0: + else: + assert ( + len(self.observations) + == len(self.infos) + == len(self.rewards) + 1 + == len(self.actions) + 1 + ) for k, v in self.extra_model_outputs.items(): assert len(v) == len(self.observations) - 1 - # Convert all lists to numpy arrays, if we are terminated. - if self.is_done: - self.convert_lists_to_numpy() + # Make sure, length of pre-buffer and len(self) make sense. + assert self._len_lookback_buffer + len(self) == len(self.rewards.data) + + @property + def is_finalized(self) -> bool: + """True, if the data in this episode is already stored as numpy arrays.""" + # If rewards are still a list, return False. + # Otherwise, rewards should already be a (1D) numpy array. + return self.rewards.finalized @property def is_done(self) -> bool: @@ -297,238 +489,727 @@ def is_done(self) -> bool: """ return self.is_terminated or self.is_truncated - def convert_lists_to_numpy(self) -> None: - """Converts list attributes to numpy arrays. + def finalize(self) -> "SingleAgentEpisode": + """Converts this Episode's list attributes to numpy arrays. + + This means in particular that this episodes' lists of (possibly complex) + data (e.g. if we have a dict obs space) will be converted to (possibly complex) + structs, whose leafs are now numpy arrays. Each of these leaf numpy arrays will + have the same length (batch dimension) as the length of the original lists. - When an episode is terminated or truncated (`self.is_done`) the data - will be not anymore touched and instead converted to numpy for later - use in postprocessing. This function converts all the data stored - into numpy arrays. + Note that SampleBatch.INFOS are NEVER numpy'ized and will remain a list + (normally, a list of the original, env-returned dicts). This is due to the + herterogenous nature of INFOS returned by envs, which would make it unwieldy to + convert this information to numpy arrays. + + After calling this method, no further data may be added to this episode via + the `self.add_env_step()` method. + + Examples: + + .. testcode:: + + import numpy as np + + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + + episode = SingleAgentEpisode( + observations=[0, 1, 2, 3], + actions=[1, 2, 3], + rewards=[1, 2, 3], + # Note: terminated/truncated have nothing to do with an episode + # being `finalized` or not (via the `self.finalize()` method)! + terminated=False, + ) + # Episode has not been finalized (numpy'ized) yet. + assert not episode.is_finalized + # We are still operating on lists. + assert episode.get_observations([1]) == [1] + assert episode.get_observations(slice(None, 2)) == [0, 1] + # We can still add data (and even add the terminated=True flag). + episode.add_env_step( + observation=4, + action=4, + reward=4, + terminated=True, + ) + # Still NOT finalized. + assert not episode.is_finalized + + # Let's finalize the episode. + episode.finalize() + assert episode.is_finalized + + # We cannot add data anymore. The following would crash. + # episode.add_env_step(observation=5, action=5, reward=5) + + # Everything is now numpy arrays (with 0-axis of size + # B=[len of requested slice]). + assert isinstance(episode.get_observations([1]), np.ndarray) # B=1 + assert isinstance(episode.actions[0:2], np.ndarray) # B=2 + assert isinstance(episode.rewards[1:4], np.ndarray) # B=3 + + Returns: + This `SingleAgentEpisode` object with the converted numpy data. """ - self.observations = np.array(self.observations) - self.actions = np.array(self.actions) - self.rewards = np.array(self.rewards) - self.infos = np.array(self.infos) + self.observations.finalize() + self.actions.finalize() + self.rewards.finalize() self.render_images = np.array(self.render_images, dtype=np.uint8) for k, v in self.extra_model_outputs.items(): - self.extra_model_outputs[k] = np.array(v) + self.extra_model_outputs[k].finalize() - def create_successor(self) -> "SingleAgentEpisode": - """Returns a successor episode chunk (of len=0) continuing with this one. + return self - The successor will have the same ID and state as self and its only observation - will be the last observation in self. Its length will therefore be 0 (no - steps taken yet). + def cut(self, len_lookback_buffer: int = 0) -> "SingleAgentEpisode": + """Returns a successor episode chunk (of len=0) continuing from this Episode. + + The successor will have the same ID as `self`. + If no lookback buffer is requested (len_lookback_buffer=0), the successor's + observations will be the last observation(s) of `self` and its length will + therefore be 0 (no further steps taken yet). If `len_lookback_buffer` > 0, + the returned successor will have `len_lookback_buffer` observations (and + actions, rewards, etc..) taken from the right side (end) of `self`. For example + if `len_lookback_buffer=2`, the returned successor's lookback buffer actions + will be identical to `self.actions[-2:]`. This method is useful if you would like to discontinue building an episode chunk (b/c you have to return it from somewhere), but would like to have a new - episode (chunk) instance to continue building the actual env episode at a later - time. + episode instance to continue building the actual gym.Env episode at a later + time. Vie the `len_lookback_buffer` argument, the continuing chunk (successor) + will still be able to "look back" into this predecessor episode's data (at + least to some extend, depending on the value of `len_lookback_buffer`). + + Args: + len_lookback_buffer: The number of timesteps to take along into the new + chunk as "lookback buffer". A lookback buffer is additional data on + the left side of the actual episode data for visibility purposes + (but without actually being part of the new chunk). For example, if + `self` ends in actions 5, 6, 7, and 8, and we call + `self.cut(len_lookback_buffer=2)`, the returned chunk will have + actions 7 and 8 already in it, but still `t_started`==t==8 (not 7!) and + a length of 0. If there is not enough data in `self` yet to fulfil + the `len_lookback_buffer` request, the value of `len_lookback_buffer` + is automatically adjusted (lowered). Returns: The successor Episode chunk of this one with the same ID and state and the only observation being the last observation in self. """ - assert not self.is_done + assert not self.is_done and len_lookback_buffer >= 0 + + # Initialize this chunk with the most recent obs and infos (even if lookback is + # 0). Similar to an initial `env.reset()`. + indices_obs_and_infos = slice(-len_lookback_buffer - 1, None) + indices_rest = ( + slice(-len_lookback_buffer, None) + if len_lookback_buffer > 0 + else slice(None, 0) + ) return SingleAgentEpisode( # Same ID. id_=self.id_, - # First (and only) observation of successor is this episode's last obs. - observations=[self.observations[-1]], - # First (and only) info of successor is this episode's last info. - infos=[self.infos[-1]], - # Same state. - states=self.states, + observations=self.get_observations(indices=indices_obs_and_infos), + infos=self.get_infos(indices=indices_obs_and_infos), + actions=self.get_actions(indices=indices_rest), + rewards=self.get_rewards(indices=indices_rest), + extra_model_outputs={ + k: self.get_extra_model_outputs(k, indices_rest) + for k in self.extra_model_outputs.keys() + }, # Continue with self's current timestep. t_started=self.t, ) - def to_sample_batch(self) -> SampleBatch: - """Converts a `SingleAgentEpisode` into a `SampleBatch`. + def get_observations( + self, + indices: Optional[Union[int, List[int], slice]] = None, + *, + neg_indices_left_of_zero: bool = False, + fill: Optional[float] = None, + one_hot_discrete: bool = False, + ) -> Any: + """Returns individual observations or batched ranges thereof from this episode. + + Args: + indices: A single int is interpreted as an index, from which to return the + individual observation stored at this index. + A list of ints is interpreted as a list of indices from which to gather + individual observations in a batch of size len(indices). + A slice object is interpreted as a range of observations to be returned. + Thereby, negative indices by default are interpreted as "before the end" + unless the `neg_indices_left_of_zero=True` option is used, in which case + negative indices are interpreted as "before ts=0", meaning going back + into the lookback buffer. + neg_indices_left_of_zero: If True, negative values in `indices` are + interpreted as "before ts=0", meaning going back into the lookback + buffer. For example, an episode with observations [4, 5, 6, 7, 8, 9], + where [4, 5, 6] is the lookback buffer range (ts=0 item is 7), will + respond to `get_observations(-1, neg_indices_left_of_zero=True)` + with `6` and to + `get_observations(slice(-2, 1), neg_indices_left_of_zero=True)` with + `[5, 6, 7]`. + fill: An optional float value to use for filling up the returned results at + the boundaries. This filling only happens if the requested index range's + start/stop boundaries exceed the episode's boundaries (including the + lookback buffer on the left side). This comes in very handy, if users + don't want to worry about reaching such boundaries and want to zero-pad. + For example, an episode with observations [10, 11, 12, 13, 14] and + lookback buffer size of 2 (meaning observations `10` and `11` are part + of the lookback buffer) will respond to + `get_observations(slice(-7, -2), fill=0.0)` with + `[0.0, 0.0, 10, 11, 12]`. + one_hot_discrete: If True, will return one-hot vectors (instead of + int-values) for those sub-components of a (possibly complex) observation + space that are Discrete or MultiDiscrete. Note that if `fill=0` and the + requested `indices` are out of the range of our data, the returned + one-hot vectors will actually be zero-hot (all slots zero). - Note that `RLlib` is relying in training on the `SampleBatch` class and - therefore episodes have to be converted to this format before training can - start. + Examples: + + .. testcode:: + + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + + episode = SingleAgentEpisode( + # Discrete(4) observations (ints between 0 and 4 (excl.)) + observation_space=gym.spaces.Discrete(4), + observations=[0, 1, 2, 3], + actions=[1, 2, 3], rewards=[1, 2, 3], # <- not relevant for this demo + ) + # Plain usage (`indices` arg only). + episode.get_observations(-1) # 3 + episode.get_observations(0) # 0 + episode.get_observations([0, 2]) # [0, 2] + episode.get_observations([-1, 0]) # [3, 0] + episode.get_observations(slice(None, 2)) # [0, 1] + episode.get_observations(slice(-2, None)) # [2, 3] + # Using `fill=...` (requesting slices beyond the boundaries). + episode.get_observations(slice(-6, -2), fill=-9) # [-9, -9, 0, 1] + episode.get_observations(slice(2, 5), fill=-7) # [2, 3, -7] + # Using `one_hot_discrete=True`. + episode.get_observations(2, one_hot_discrete=True) # [0 0 1 0] + episode.get_observations(3, one_hot_discrete=True) # [0 0 0 1] + episode.get_observations( + slice(0, 3), + one_hot_discrete=True, + ) # [[1 0 0 0], [0 1 0 0], [0 0 1 0]] + # Special case: Using `fill=0.0` AND `one_hot_discrete=True`. + episode.get_observations( + -1, + neg_indices_left_of_zero=True, # -1 means one left of ts=0 + fill=0.0, + one_hot_discrete=True, + ) # [0 0 0 0] <- all 0s one-hot tensor (note difference to [1 0 0 0]!) Returns: - An `ray.rLlib.policy.sample_batch.SampleBatch` instance containing this - episode's data. + The collected observations. + As a 0-axis batch, if there are several `indices` or a list of exactly one + index provided OR `indices` is a slice object. + As single item (B=0 -> no additional 0-axis) if `indices` is a single int. """ - return SampleBatch( - { - SampleBatch.EPS_ID: np.array([self.id_] * len(self)), - SampleBatch.OBS: self.observations[:-1], - SampleBatch.NEXT_OBS: self.observations[1:], - SampleBatch.ACTIONS: self.actions, - SampleBatch.REWARDS: self.rewards, - SampleBatch.T: list(range(self.t_started, self.t)), - SampleBatch.TERMINATEDS: np.array( - [False] * (len(self) - 1) + [self.is_terminated] - ), - SampleBatch.TRUNCATEDS: np.array( - [False] * (len(self) - 1) + [self.is_truncated] - ), - # Return the infos after stepping the environment. - SampleBatch.INFOS: self.infos[1:], - **self.extra_model_outputs, - } + return self.observations.get( + indices=indices, + neg_indices_left_of_zero=neg_indices_left_of_zero, + fill=fill, + one_hot_discrete=one_hot_discrete, ) - @staticmethod - def from_sample_batch(batch: SampleBatch) -> "SingleAgentEpisode": - """Converts a `SampleBatch` instance into a `SingleAegntEpisode`. + def get_infos( + self, + indices: Optional[Union[int, List[int], slice]] = None, + *, + neg_indices_left_of_zero: bool = False, + fill: Optional[Any] = None, + ) -> Any: + """Returns individual info dicts or batched ranges thereof from this episode. + + Args: + indices: A single int is interpreted as an index, from which to return the + individual info dict stored at this index. + A list of ints is interpreted as a list of indices from which to gather + individual info dicts in a list of size len(indices). + A slice object is interpreted as a range of info dicts to be returned. + Thereby, negative indices by default are interpreted as "before the end" + unless the `neg_indices_left_of_zero=True` option is used, in which case + negative indices are interpreted as "before ts=0", meaning going back + into the lookback buffer. + neg_indices_left_of_zero: If True, negative values in `indices` are + interpreted as "before ts=0", meaning going back into the lookback + buffer. For example, an episode with infos + [{"l":4}, {"l":5}, {"l":6}, {"a":7}, {"b":8}, {"c":9}], where the + first 3 items are the lookback buffer (ts=0 item is {"a": 7}), will + respond to `get_infos(-1, neg_indices_left_of_zero=True)` with + `{"l":6}` and to + `get_infos(slice(-2, 1), neg_indices_left_of_zero=True)` with + `[{"l":5}, {"l":6}, {"a":7}]`. + fill: An optional value to use for filling up the returned results at + the boundaries. This filling only happens if the requested index range's + start/stop boundaries exceed the episode's boundaries (including the + lookback buffer on the left side). This comes in very handy, if users + don't want to worry about reaching such boundaries and want to + auto-fill. For example, an episode with infos + [{"l":10}, {"l":11}, {"a":12}, {"b":13}, {"c":14}] and lookback buffer + size of 2 (meaning infos {"l":10}, {"l":11} are part of the lookback + buffer) will respond to `get_infos(slice(-7, -2), fill={"o": 0.0})` + with `[{"o":0.0}, {"o":0.0}, {"l":10}, {"l":11}, {"a":12}]`. + TODO (sven): This would require a space being provided. Maybe we can + skip this check for infos, which don't have a space anyways. + + Examples: + + .. testcode:: + + from ray.rllib.env.single_agent_episode import SingleAgentEpisode - The `ray.rllib.policy.sample_batch.SampleBatch` class is used in `RLlib` - for training an agent's modules (`RLModule`), converting from or to - `SampleBatch` can be performed by this function and its counterpart - `to_sample_batch()`. + episode = SingleAgentEpisode( + infos=[{"a":0}, {"b":1}, {"c":2}, {"d":3}], + # The following is needed, but not relevant for this demo. + observations=[0, 1, 2, 3], actions=[1, 2, 3], rewards=[1, 2, 3], + ) + # Plain usage (`indices` arg only). + episode.get_infos(-1) # {"d":3} + episode.get_infos(0) # {"a":0} + episode.get_infos([0, 2]) # [{"a":0},{"c":2}] + episode.get_infos([-1, 0]) # [{"d":3},{"a":0}] + episode.get_infos(slice(None, 2)) # [{"a":0},{"b":1}] + episode.get_infos(slice(-2, None)) # [{"c":2},{"d":3}] + # Using `fill=...` (requesting slices beyond the boundaries). + # TODO (sven): This would require a space being provided. Maybe we can + # skip this check for infos, which don't have a space anyways. + # episode.get_infos(slice(-5, -3), fill={"o":-1}) # [{"o":-1},{"a":0}] + # episode.get_infos(slice(3, 5), fill={"o":-2}) # [{"d":3},{"o":-2}] + + Returns: + The collected info dicts. + As a 0-axis batch, if there are several `indices` or a list of exactly one + index provided OR `indices` is a slice object. + As single item (B=0 -> no additional 0-axis) if `indices` is a single int. + """ + return self.infos.get( + indices=indices, + neg_indices_left_of_zero=neg_indices_left_of_zero, + fill=fill, + ) + + def get_actions( + self, + indices: Optional[Union[int, List[int], slice]] = None, + *, + neg_indices_left_of_zero: bool = False, + fill: Optional[float] = None, + one_hot_discrete: bool = False, + ) -> Any: + """Returns individual actions or batched ranges thereof from this episode. Args: - batch: A `SampleBatch` instance. It should contain only a single episode. + indices: A single int is interpreted as an index, from which to return the + individual action stored at this index. + A list of ints is interpreted as a list of indices from which to gather + individual actions in a batch of size len(indices). + A slice object is interpreted as a range of actions to be returned. + Thereby, negative indices by default are interpreted as "before the end" + unless the `neg_indices_left_of_zero=True` option is used, in which case + negative indices are interpreted as "before ts=0", meaning going back + into the lookback buffer. + neg_indices_left_of_zero: If True, negative values in `indices` are + interpreted as "before ts=0", meaning going back into the lookback + buffer. For example, an episode with actions [4, 5, 6, 7, 8, 9], where + [4, 5, 6] is the lookback buffer range (ts=0 item is 7), will respond + to `get_actions(-1, neg_indices_left_of_zero=True)` with `6` and + to `get_actions(slice(-2, 1), neg_indices_left_of_zero=True)` with + `[5, 6, 7]`. + fill: An optional float value to use for filling up the returned results at + the boundaries. This filling only happens if the requested index range's + start/stop boundaries exceed the episode's boundaries (including the + lookback buffer on the left side). This comes in very handy, if users + don't want to worry about reaching such boundaries and want to zero-pad. + For example, an episode with actions [10, 11, 12, 13, 14] and + lookback buffer size of 2 (meaning actions `10` and `11` are part + of the lookback buffer) will respond to + `get_actions(slice(-7, -2), fill=0.0)` with `[0.0, 0.0, 10, 11, 12]`. + one_hot_discrete: If True, will return one-hot vectors (instead of + int-values) for those sub-components of a (possibly complex) action + space that are Discrete or MultiDiscrete. Note that if `fill=0` and the + requested `indices` are out of the range of our data, the returned + one-hot vectors will actually be zero-hot (all slots zero). + + Examples: + + .. testcode:: + + import gymnasium as gym + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + + episode = SingleAgentEpisode( + # Discrete(4) actions (ints between 0 and 4 (excl.)) + action_space=gym.spaces.Discrete(4), + actions=[1, 2, 3], + observations=[0, 1, 2, 3], rewards=[1, 2, 3], # <- not relevant here + ) + # Plain usage (`indices` arg only). + episode.get_actions(-1) # 3 + episode.get_actions(0) # 1 + episode.get_actions([0, 2]) # [1, 3] + episode.get_actions([-1, 0]) # [3, 1] + episode.get_actions(slice(None, 2)) # [1, 2] + episode.get_actions(slice(-2, None)) # [2, 3] + # Using `fill=...` (requesting slices beyond the boundaries). + episode.get_actions(slice(-5, -2), fill=-9) # [-9, -9, 1, 2] + episode.get_actions(slice(1, 5), fill=-7) # [2, 3, -7, -7] + # Using `one_hot_discrete=True`. + episode.get_actions(1, one_hot_discrete=True) # [0 0 1 0] (action=2) + episode.get_actions(2, one_hot_discrete=True) # [0 0 0 1] (action=3) + episode.get_actions( + slice(0, 2), + one_hot_discrete=True, + ) # [[0 1 0 0], [0 0 0 1]] (actions=1 and 3) + # Special case: Using `fill=0.0` AND `one_hot_discrete=True`. + episode.get_actions( + -1, + neg_indices_left_of_zero=True, # -1 means one left of ts=0 + fill=0.0, + one_hot_discrete=True, + ) # [0 0 0 0] <- all 0s one-hot tensor (note difference to [1 0 0 0]!) Returns: - An `SingleAegntEpisode` instance containing the data from `batch`. + The collected actions. + As a 0-axis batch, if there are several `indices` or a list of exactly one + index provided OR `indices` is a slice object. + As single item (B=0 -> no additional 0-axis) if `indices` is a single int. """ - is_done = ( - batch[SampleBatch.TERMINATEDS][-1] or batch[SampleBatch.TRUNCATEDS][-1] + return self.actions.get( + indices=indices, + neg_indices_left_of_zero=neg_indices_left_of_zero, + fill=fill, + one_hot_discrete=one_hot_discrete, ) - observations = np.concatenate( - [batch[SampleBatch.OBS], batch[SampleBatch.NEXT_OBS][None, -1]] + + def get_rewards( + self, + indices: Optional[Union[int, List[int], slice]] = None, + *, + neg_indices_left_of_zero: bool = False, + fill: Optional[Any] = None, + ) -> Any: + """Returns individual rewards or batched ranges thereof from this episode. + + Args: + indices: A single int is interpreted as an index, from which to return the + individual reward stored at this index. + A list of ints is interpreted as a list of indices from which to gather + individual rewards in a batch of size len(indices). + A slice object is interpreted as a range of rewards to be returned. + Thereby, negative indices by default are interpreted as "before the end" + unless the `neg_indices_left_of_zero=True` option is used, in which case + negative indices are interpreted as "before ts=0", meaning going back + into the lookback buffer. + neg_indices_left_of_zero: Negative values in `indices` are interpreted as + as "before ts=0", meaning going back into the lookback buffer. + For example, an episode with rewards [4, 5, 6, 7, 8, 9], where + [4, 5, 6] is the lookback buffer range (ts=0 item is 7), will respond + to `get_rewards(-1, neg_indices_left_of_zero=True)` with `6` and + to `get_rewards(slice(-2, 1), neg_indices_left_of_zero=True)` with + `[5, 6, 7]`. + fill: An optional float value to use for filling up the returned results at + the boundaries. This filling only happens if the requested index range's + start/stop boundaries exceed the episode's boundaries (including the + lookback buffer on the left side). This comes in very handy, if users + don't want to worry about reaching such boundaries and want to zero-pad. + For example, an episode with rewards [10, 11, 12, 13, 14] and + lookback buffer size of 2 (meaning rewards `10` and `11` are part + of the lookback buffer) will respond to + `get_rewards(slice(-7, -2), fill=0.0)` with `[0.0, 0.0, 10, 11, 12]`. + + Examples: + + .. testcode:: + + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + + episode = SingleAgentEpisode( + rewards=[1.0, 2.0, 3.0], + observations=[0, 1, 2, 3], actions=[1, 2, 3], # <- not relevant here + ) + # Plain usage (`indices` arg only). + episode.get_rewards(-1) # 3.0 + episode.get_rewards(0) # 1.0 + episode.get_rewards([0, 2]) # [1.0, 3.0] + episode.get_rewards([-1, 0]) # [3.0, 1.0] + episode.get_rewards(slice(None, 2)) # [1.0, 2.0] + episode.get_rewards(slice(-2, None)) # [2.0, 3.0] + # Using `fill=...` (requesting slices beyond the boundaries). + episode.get_rewards(slice(-5, -2), fill=0.0) # [0.0, 0.0, 1.0, 2.0] + episode.get_rewards(slice(1, 5), fill=0.0) # [2.0, 3.0, 0.0, 0.0] + + Returns: + The collected rewards. + As a 0-axis batch, if there are several `indices` or a list of exactly one + index provided OR `indices` is a slice object. + As single item (B=0 -> no additional 0-axis) if `indices` is a single int. + """ + return self.rewards.get( + indices=indices, + neg_indices_left_of_zero=neg_indices_left_of_zero, + fill=fill, ) - actions = batch[SampleBatch.ACTIONS] - rewards = batch[SampleBatch.REWARDS] - # These are the infos after stepping the environment, i.e. without the - # initial info. - infos = batch[SampleBatch.INFOS] - # Concatenate an intiial empty info. - infos = np.concatenate([np.array([{}]), infos]) - - # TODO (simon): This is very ugly, but right now - # we can only do it according to the exclusion principle. - extra_model_output_keys = [] - for k in batch.keys(): - if k not in [ - SampleBatch.EPS_ID, - SampleBatch.AGENT_INDEX, - SampleBatch.ENV_ID, - SampleBatch.AGENT_INDEX, - SampleBatch.T, - SampleBatch.SEQ_LENS, - SampleBatch.OBS, - SampleBatch.INFOS, - SampleBatch.NEXT_OBS, - SampleBatch.ACTIONS, - SampleBatch.PREV_ACTIONS, - SampleBatch.REWARDS, - SampleBatch.PREV_REWARDS, - SampleBatch.TERMINATEDS, - SampleBatch.TRUNCATEDS, - SampleBatch.UNROLL_ID, - SampleBatch.DONES, - SampleBatch.CUR_OBS, - ]: - extra_model_output_keys.append(k) - return SingleAgentEpisode( - id_=batch[SampleBatch.EPS_ID][0], - observations=observations if is_done else observations.tolist(), - actions=actions if is_done else actions.tolist(), - rewards=rewards if is_done else rewards.tolist(), - t_started=batch[SampleBatch.T][0], - is_terminated=batch[SampleBatch.TERMINATEDS][-1], - is_truncated=batch[SampleBatch.TRUNCATEDS][-1], - infos=infos if is_done else infos.tolist(), - extra_model_outputs={ - k: (batch[k] if is_done else batch[k].tolist()) - for k in extra_model_output_keys - }, + def get_extra_model_outputs( + self, + key: str, + indices: Optional[Union[int, List[int], slice]] = None, + *, + neg_indices_left_of_zero: bool = False, + fill: Optional[Any] = None, + ) -> Any: + """Returns extra model outputs (under given key) from this episode. + + Args: + key: The `key` within `self.extra_model_outputs` to extract data for. + indices: A single int is interpreted as an index, from which to return an + individual extra model output stored under `key` at index. + A list of ints is interpreted as a list of indices from which to gather + individual actions in a batch of size len(indices). + A slice object is interpreted as a range of extra model outputs to be + returned. Thereby, negative indices by default are interpreted as + "before the end" unless the `neg_indices_left_of_zero=True` option is + used, in which case negative indices are interpreted as "before ts=0", + meaning going back into the lookback buffer. + neg_indices_left_of_zero: If True, negative values in `indices` are + interpreted as "before ts=0", meaning going back into the lookback + buffer. For example, an episode with + extra_model_outputs['a'] = [4, 5, 6, 7, 8, 9], where [4, 5, 6] is the + lookback buffer range (ts=0 item is 7), will respond to + `get_extra_model_outputs("a", -1, neg_indices_left_of_zero=True)` with + `6` and to `get_extra_model_outputs("a", slice(-2, 1), + neg_indices_left_of_zero=True)` with `[5, 6, 7]`. + fill: An optional float value to use for filling up the returned results at + the boundaries. This filling only happens if the requested index range's + start/stop boundaries exceed the episode's boundaries (including the + lookback buffer on the left side). This comes in very handy, if users + don't want to worry about reaching such boundaries and want to zero-pad. + For example, an episode with + extra_model_outputs["b"] = [10, 11, 12, 13, 14] and lookback buffer + size of 2 (meaning `10` and `11` are part of the lookback buffer) will + respond to + `get_extra_model_outputs("b", slice(-7, -2), fill=0.0)` with + `[0.0, 0.0, 10, 11, 12]`. + TODO (sven): This would require a space being provided. Maybe we can + automatically infer the space from existing data? + + Examples: + + .. testcode:: + + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + + episode = SingleAgentEpisode( + extra_model_outputs={"mo": [1, 2, 3]}, + # The following is needed, but not relevant for this demo. + observations=[0, 1, 2, 3], actions=[1, 2, 3], rewards=[1, 2, 3], + ) + + # Plain usage (`indices` arg only). + episode.get_extra_model_outputs("mo", -1) # 3 + episode.get_extra_model_outputs("mo", 1) # 0 + episode.get_extra_model_outputs("mo", [0, 2]) # [1, 3] + episode.get_extra_model_outputs("mo", [-1, 0]) # [3, 1] + episode.get_extra_model_outputs("mo", slice(None, 2)) # [1, 2] + episode.get_extra_model_outputs("mo", slice(-2, None)) # [2, 3] + # Using `fill=...` (requesting slices beyond the boundaries). + # TODO (sven): This would require a space being provided. Maybe we can + # automatically infer the space from existing data? + # episode.get_extra_model_outputs("mo", slice(-5, -2), fill=0) # [0, 0, 1] + # episode.get_extra_model_outputs("mo", slice(2, 5), fill=-1) # [3, -1, -1] + + Returns: + The collected extra_model_outputs[`key`]. + As a 0-axis batch, if there are several `indices` or a list of exactly one + index provided OR `indices` is a slice object. + As single item (B=0 -> no additional 0-axis) if `indices` is a single int. + """ + value = self.extra_model_outputs[key] + # The expected case is: `value` is a `BufferWithInfiniteLookback`. + if isinstance(value, BufferWithInfiniteLookback): + return value.get( + indices=indices, + neg_indices_left_of_zero=neg_indices_left_of_zero, + fill=fill, + ) + # It might be that the user has added new key/value pairs in their custom + # postprocessing/connector logic. The values are then most likely numpy + # arrays. We convert them automatically to buffers and get the requested + # indices (with the given options) from there. + return BufferWithInfiniteLookback(value).get( + indices, fill=fill, neg_indices_left_of_zero=neg_indices_left_of_zero ) - def get_return(self) -> float: - """Calculates an episode's return. + def slice(self, slice_: slice) -> "SingleAgentEpisode": + """Returns a slice of this episode with the given slice object. - The return is computed by a simple sum, neglecting the discount factor. - This is used predominantly for metrics. + For example, if `self` contains o0 (the reset observation), o1, o2, o3, and o4 + and the actions a1, a2, a3, and a4 (len of `self` is 4), then a call to + `self.slice(slice(1, 3))` would return a new SingleAgentEpisode with + observations o1, o2, and o3, and actions a2 and a3. Note here that there is + always one observation more in an episode than there are actions (and rewards + and extra model outputs) due to the initial observation received after an env + reset. + + Note that in any case, the lookback buffer will remain (if possible) at the same + size as it has been previously set to (`self._len_lookback_buffer`) and the + given slice object will NOT have to provide for this extra offset at the + beginning. + + Args: + slice_: The slice object to use for slicing. This should exclude the + lookback buffer, which will be prepended automatically to the returned + slice. Returns: - The sum of rewards collected during this episode. + The new SingleAgentEpisode representing the requested slice. """ - return sum(self.rewards) + # Figure out, whether slicing stops at the very end of this episode to know + # whether `self.is_terminated/is_truncated` should be kept as-is. + keep_done = slice_.stop is None or slice_.stop == len(self) + start = slice_.start or 0 + t_started = self.t_started + start + (0 if start >= 0 else len(self)) + + neg_indices_left_of_zero = (slice_.start or 0) >= 0 + slice_ = slice( + # Make sure that the lookback buffer is part of the new slice as well. + (slice_.start or 0) - self._len_lookback_buffer, + slice_.stop, + slice_.step, + ) + slice_obs_infos = slice( + slice_.start, + # Obs and infos need one more step at the end. + ((slice_.stop if slice_.stop != -1 else (len(self) - 1)) or len(self)) + 1, + slice_.step, + ) + return SingleAgentEpisode( + id_=self.id_, + observations=self.get_observations( + slice_obs_infos, + neg_indices_left_of_zero=neg_indices_left_of_zero, + ), + infos=self.get_infos( + slice_obs_infos, + neg_indices_left_of_zero=neg_indices_left_of_zero, + ), + actions=self.get_actions( + slice_, + neg_indices_left_of_zero=neg_indices_left_of_zero, + ), + rewards=self.get_rewards( + slice_, + neg_indices_left_of_zero=neg_indices_left_of_zero, + ), + extra_model_outputs={ + k: self.get_extra_model_outputs( + k, + slice_, + neg_indices_left_of_zero=neg_indices_left_of_zero, + ) + for k in self.extra_model_outputs + }, + terminated=(self.is_terminated if keep_done else False), + truncated=(self.is_truncated if keep_done else False), + # Provide correct timestep- and pre-buffer information. + t_started=t_started, + len_lookback_buffer=self._len_lookback_buffer, + ) - def get_state(self) -> Dict[str, Any]: - """Returns the pickable state of an episode. + def get_data_dict(self): + """Converts a SingleAgentEpisode into a data dict mapping str keys to data. - The data in the episode is stored into a dictionary. Note that episodes - can also be generated from states (see `self.from_state()`). + The keys used are: + SampleBatch.EPS_ID, T, OBS, INFOS, ACTIONS, REWARDS, TERMINATEDS, TRUNCATEDS, + and those in `self.extra_model_outputs`. Returns: - A dictionary containing all the data from the episode. + A data dict mapping str keys to data records. """ - return list( + t = list(range(self.t_started, self.t)) + terminateds = [False] * (len(self) - 1) + [self.is_terminated] + truncateds = [False] * (len(self) - 1) + [self.is_truncated] + eps_id = [self.id_] * len(self) + + if self.is_finalized: + t = np.array(t) + terminateds = np.array(terminateds) + truncateds = np.array(truncateds) + eps_id = np.array(eps_id) + + return dict( { - "id_": self.id_, - "observations": self.observations, - "actions": self.actions, - "rewards": self.rewards, - "infos": self.infos, - "states": self.states, - "t_started": self.t_started, - "t": self.t, - "is_terminated": self.is_terminated, - "is_truncated": self.is_truncated, - **self.extra_model_outputs, - }.items() + # Trivial 1D data (compiled above). + SampleBatch.TERMINATEDS: terminateds, + SampleBatch.TRUNCATEDS: truncateds, + SampleBatch.T: t, + SampleBatch.EPS_ID: eps_id, + # Retrieve obs, infos, actions, rewards using our get_... APIs, + # which return all relevant timesteps (excluding the lookback + # buffer!). + SampleBatch.OBS: self.get_observations(slice(None, -1)), + SampleBatch.INFOS: self.get_infos(), + SampleBatch.ACTIONS: self.get_actions(), + SampleBatch.REWARDS: self.get_rewards(), + }, + # All `extra_model_outs`: Same as obs: Use get_... API. + **{ + k: self.get_extra_model_outputs(k) + for k in self.extra_model_outputs.keys() + }, ) - @staticmethod - def from_state(state: Dict[str, Any]) -> "SingleAgentEpisode": - """Generates a `SingleAegntEpisode` from a pickable state. + def get_sample_batch(self) -> SampleBatch: + """Converts this `SingleAgentEpisode` into a `SampleBatch`. - The data in the state has to be complete. This is always the case when the state - was created by a `SingleAgentEpisode` itself calling `self.get_state()`. + Returns: + A SampleBatch containing all of this episode's data. + """ + return SampleBatch(self.get_data_dict()) - Args: - state: A dictionary containing all episode data. + def get_return(self) -> float: + """Calculates an episode's return, excluding the lookback buffer's rewards. + + The return is computed by a simple sum, neglecting the discount factor. Returns: - A `SingleAgentEpisode` instance holding all the data provided by `state`. + The sum of rewards collected during this episode, excluding possible data + inside the lookback buffer. """ - eps = SingleAgentEpisode(id_=state[0][1]) - eps.observations = state[1][1] - eps.actions = state[2][1] - eps.rewards = state[3][1] - eps.infos = state[4][1] - eps.states = state[5][1] - eps.t_started = state[6][1] - eps.t = state[7][1] - eps.is_terminated = state[8][1] - eps.is_truncated = state[9][1] - eps.extra_model_outputs = {k: v for k, v in state[10:]} - # Validate the episode to ensure complete data. - eps.validate() - return eps + return sum(self.get_rewards()) def __len__(self) -> int: """Returning the length of an episode. - The length of an episode is defined by the length of its data. This is the - number of timesteps an agent has stepped through an environment so far. - The length is undefined in case of a just started episode. + The length of an episode is defined by the length of its data, excluding + the lookback buffer data. The length is the number of timesteps an agent has + stepped through an environment thus far. + + The length is 0 in case of an episode whose env has NOT been reset yet, but + also 0 right after the `env.reset()` data has been added via + `self.add_env_reset()`. Only after the first call to `env.step()` (and + `self.add_env_step()`, the length will be 1. Returns: An integer, defining the length of an episode. - - Raises: - AssertionError: If episode has never been stepped so far. """ - assert len(self.observations) > 0, ( - "ERROR: Cannot determine length of episode that hasn't started yet! Call " - "`SingleAgentEpisode.add_initial_observation(initial_observation=...)` " - "first (after which `len(SingleAgentEpisode)` will be 0)." - ) - return len(self.observations) - 1 + return self.t - self.t_started def __repr__(self): return f"SAEps({self.id_} len={len(self)})" + + def __getitem__(self, item: slice) -> "SingleAgentEpisode": + """Enable squared bracket indexing- and slicing syntax, e.g. episode[-4:].""" + if isinstance(item, slice): + return self.slice(slice_=item) + else: + raise NotImplementedError( + f"SingleAgentEpisode does not support getting item '{item}'! " + "Only slice objects allowed with the syntax: `episode[a:b]`." + ) diff --git a/rllib/env/testing/__init__.py b/rllib/env/testing/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib/env/testing/single_agent_gym_env_runner.py b/rllib/env/testing/single_agent_gym_env_runner.py deleted file mode 100644 index 2bf885b94c54a..0000000000000 --- a/rllib/env/testing/single_agent_gym_env_runner.py +++ /dev/null @@ -1,243 +0,0 @@ -from typing import List, Optional, Tuple - -import gymnasium as gym - -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.env.env_runner import EnvRunner -from ray.rllib.env.single_agent_episode import SingleAgentEpisode -from ray.rllib.utils.annotations import override - - -class SingleAgentGymEnvRunner(EnvRunner): - """A simple single-agent EnvRunner subclass for testing purposes. - - Uses a gym.vector.Env environment and random actions. - """ - - def __init__(self, *, config: AlgorithmConfig, **kwargs): - """Initializes a SingleAgentGymEnvRunner instance. - - Args: - config: The config to use to setup this EnvRunner. - """ - super().__init__(config=config, **kwargs) - - # Create the gym.vector.Env object. - self.env = gym.vector.make( - self.config.env, - num_envs=self.config.num_envs_per_worker, - asynchronous=self.config.remote_worker_envs, - **dict(self.config.env_config, **{"render_mode": "rgb_array"}), - ) - self.num_envs = self.env.num_envs - - self._needs_initial_reset = True - self._episodes = [None for _ in range(self.num_envs)] - - @override(EnvRunner) - def sample( - self, - *, - num_timesteps: Optional[int] = None, - num_episodes: Optional[int] = None, - force_reset: bool = False, - **kwargs, - ) -> Tuple[List[SingleAgentEpisode], List[SingleAgentEpisode]]: - """Returns a tuple (list of completed episodes, list of ongoing episodes). - - Args: - num_timesteps: If provided, will step exactly this number of timesteps - through the environment. Note that only one or none of `num_timesteps` - and `num_episodes` may be provided, but never both. If both - `num_timesteps` and `num_episodes` are None, will determine how to - sample via `self.config`. - num_episodes: If provided, will step through the env(s) until exactly this - many episodes have been completed. Note that only one or none of - `num_timesteps` and `num_episodes` may be provided, but never both. - If both `num_timesteps` and `num_episodes` are None, will determine how - to sample via `self.config`. - force_reset: If True, will force-reset the env at the very beginning and - thus begin sampling from freshly started episodes. - **kwargs: Forward compatibility kwargs. - - Returns: - A tuple consisting of: A list of SingleAgentEpisode instances that are - already done (either terminated or truncated, hence their `is_done` property - is True), a list of SingleAgentEpisode instances that are still ongoing - (their `is_done` property is False). - """ - assert not (num_timesteps is not None and num_episodes is not None) - - # If no counters are provided, use our configured default settings. - if num_timesteps is None and num_episodes is None: - # Truncate episodes -> num_timesteps = rollout fragment * num_envs. - if self.config.batch_mode == "truncate_episodes": - num_timesteps = self.config.rollout_fragment_length * self.num_envs - # Complete episodes -> each env runs one episode. - else: - num_episodes = self.num_envs - - if num_timesteps is not None: - return self._sample_timesteps( - num_timesteps=num_timesteps, - force_reset=force_reset, - ) - else: - return self._sample_episodes(num_episodes=num_episodes) - - def _sample_timesteps( - self, - num_timesteps: int, - force_reset: bool = False, - ) -> Tuple[List[SingleAgentEpisode], List[SingleAgentEpisode]]: - """Runs n timesteps on the environment(s) and returns experiences. - - Timesteps are counted in total (across all vectorized sub-environments). For - example, if self.num_envs=2 and num_timesteps=10, each sub-environment - will be sampled for 5 steps. - """ - done_episodes_to_return = [] - - # Have to reset the env (on all vector sub-envs). - if force_reset or self._needs_initial_reset: - obs, _ = self.env.reset() - # Start new episodes. - # Set initial observations of the new episodes. - self._episodes = [ - SingleAgentEpisode(observations=[o]) for o in self._split_by_env(obs) - ] - self._needs_initial_reset = False - - # Loop for as long as we have not reached `num_timesteps` timesteps. - ts = 0 - while ts < num_timesteps: - # Act randomly. - actions = self.env.action_space.sample() - obs, rewards, terminateds, truncateds, infos = self.env.step(actions) - # Count timesteps across all environments. - ts += self.num_envs - - # Process env-returned data. - for i, (o, a, r, term, trunc) in enumerate( - zip( - self._split_by_env(obs), - self._split_by_env(actions), - self._split_by_env(rewards), - self._split_by_env(terminateds), - self._split_by_env(truncateds), - ) - ): - # Episode is done (terminated or truncated). - # Note that gym.vector.Env reset done sub-environments automatically - # (and start a new episode). The step-returned observation is then - # the new episode's reset observation and the final observation of - # the old episode can be found in the info dict. - if term or trunc: - # Finish the episode object with the actual terminal observation - # stored in the info dict (`o` is already the reset obs of the new - # episode). - self._episodes[i].add_timestep( - infos["final_observation"][i], - a, - r, - is_terminated=term, - is_truncated=trunc, - ) - # Add this finished episode to the list of completed ones. - done_episodes_to_return.append(self._episodes[i]) - # Start a new episode and set its initial observation to `o`. - self._episodes[i] = SingleAgentEpisode(observations=[o]) - # Episode is ongoing -> Add a timestep. - else: - self._episodes[i].add_timestep(o, a, r) - - # Return done episodes and all ongoing episode chunks, then start new episode - # chunks for those episodes that are still ongoing. - ongoing_episodes = self._episodes - # Create new chunks (using the same IDs and latest observations). - self._episodes = [ - SingleAgentEpisode(id_=eps.id_, observations=[eps.observations[-1]]) - for eps in self._episodes - ] - # Return tuple: done episodes, ongoing ones. - return done_episodes_to_return, ongoing_episodes - - def _sample_episodes( - self, - num_episodes: int, - ): - """Runs n episodes (reset first) on the environment(s) and returns experiences. - - Episodes are counted in total (across all vectorized sub-environments). For - example, if self.num_envs=2 and num_episodes=10, each sub-environment - will run 5 episodes. - """ - done_episodes_to_return = [] - - obs, _ = self.env.reset() - episodes = [ - SingleAgentEpisode(observations=[o]) for o in self._split_by_env(obs) - ] - - eps = 0 - while eps < num_episodes: - actions = self.env.action_space.sample() - obs, rewards, terminateds, truncateds, infos = self.env.step(actions) - - for i, (o, a, r, term, trunc) in enumerate( - zip( - self._split_by_env(obs), - self._split_by_env(actions), - self._split_by_env(rewards), - self._split_by_env(terminateds), - self._split_by_env(truncateds), - ) - ): - # Episode is done (terminated or truncated). - # Note that gym.vector.Env reset done sub-environments automatically - # (and start a new episode). The step-returned observation is then - # the new episode's reset observation and the final observation of - # the old episode can be found in the info dict. - if term or trunc: - eps += 1 - # Finish the episode object with the actual terminal observation - # stored in the info dict. - episodes[i].add_timestep( - infos["final_observation"][i], - a, - r, - is_terminated=term, - is_truncated=trunc, - ) - # Add this finished episode to the list of completed ones. - done_episodes_to_return.append(episodes[i]) - - # Also early-out if we reach the number of episodes within this - # for-loop. - if eps == num_episodes: - break - - # Start a new episode and set its initial observation to `o`. - episodes[i] = SingleAgentEpisode(observations=[o]) - else: - episodes[i].add_timestep(o, a, r) - - # If user calls sample(num_timesteps=..) after this, we must reset again - # at the beginning. - self._needs_initial_reset = True - - # Return 2 lists: finished and ongoing episodes. - return done_episodes_to_return, [] - - @override(EnvRunner) - def assert_healthy(self): - # Make sure, we have built our gym.vector.Env properly. - assert self.env - - @override(EnvRunner) - def stop(self): - # Close our env object via gymnasium's API. - self.env.close() - - def _split_by_env(self, inputs): - return [inputs[i] for i in range(self.num_envs)] diff --git a/rllib/env/tests/test_lookback_buffer.py b/rllib/env/tests/test_lookback_buffer.py new file mode 100644 index 0000000000000..702585e29104c --- /dev/null +++ b/rllib/env/tests/test_lookback_buffer.py @@ -0,0 +1,491 @@ +import unittest + +import gymnasium as gym +import numpy as np + +from ray.rllib.env.utils import BufferWithInfiniteLookback +from ray.rllib.utils.spaces.space_utils import batch, get_dummy_batch_for_space +from ray.rllib.utils.test_utils import check + + +class TestBufferWithInfiniteLookback(unittest.TestCase): + space = gym.spaces.Dict( + { + "a": gym.spaces.Discrete(4), + "b": gym.spaces.Box(-1.0, 1.0, (2, 3)), + "c": gym.spaces.Tuple( + [gym.spaces.MultiDiscrete([2, 3]), gym.spaces.Box(-1.0, 1.0, (1,))] + ), + } + ) + + def test_append_and_pop(self): + buffer = BufferWithInfiniteLookback(data=[0, 1, 2, 3]) + self.assertTrue(len(buffer), 4) + buffer.append(4) + self.assertTrue(len(buffer), 5) + buffer.append(5) + self.assertTrue(len(buffer), 6) + buffer.pop() + self.assertTrue(len(buffer), 5) + buffer.pop() + self.assertTrue(len(buffer), 4) + buffer.append(10) + self.assertTrue(len(buffer), 5) + check(buffer.data, [0, 1, 2, 3, 10]) + buffer.finalize() + self.assertRaises(RuntimeError, lambda: buffer.append("something")) + self.assertRaises(RuntimeError, lambda: buffer.extend(["something"])) + self.assertRaises(RuntimeError, lambda: buffer.pop()) + + def test_complex_structs(self): + buffer = BufferWithInfiniteLookback( + data=[self.space.sample() for _ in range(4)] + ) + self.assertTrue(len(buffer), 4) + buffer.append(self.space.sample()) + self.assertTrue(len(buffer), 5) + buffer.append(self.space.sample()) + self.assertTrue(len(buffer), 6) + + buffer.finalize() + self.assertRaises(RuntimeError, lambda: buffer.append("something")) + + self.assertTrue(isinstance(buffer.data, dict)) + self.assertTrue(isinstance(buffer.data["a"], np.ndarray)) + self.assertTrue(isinstance(buffer.data["b"], np.ndarray)) + self.assertTrue(isinstance(buffer.data["c"], tuple)) + self.assertTrue(isinstance(buffer.data["c"][0], np.ndarray)) + self.assertTrue(isinstance(buffer.data["c"][1], np.ndarray)) + + def test_lookback(self): + buffer = BufferWithInfiniteLookback(data=[0, 1, 2, 3], lookback=2) + self.assertTrue(len(buffer), 2) + data_no_lookback = buffer.get() + check(data_no_lookback, [2, 3]) + buffer.append(4) + self.assertTrue(len(buffer), 3) + buffer.append(5) + self.assertTrue(len(buffer), 4) + data_no_lookback = buffer.get() + check(data_no_lookback, [2, 3, 4, 5]) + buffer.pop() + self.assertTrue(len(buffer), 3) + data_no_lookback = buffer.get() + check(data_no_lookback, [2, 3, 4]) + + def test_get_with_lookback(self): + """Tests `get` and `getitem` functionalities with a lookback range > 0.""" + buffer = BufferWithInfiniteLookback(data=[0, 1, 2, 3, 4], lookback=2) + + # Test on ongoing and finalized buffer. + for finalized in [False, True]: + if finalized: + buffer.finalize() + + self.assertTrue(len(buffer), 3) + # No args: Expect all contents excluding lookback buffer. + check(buffer.get(), [2, 3, 4]) + check(buffer[:], [2, 3, 4]) + # Individual negative indices (include lookback buffer). + check(buffer.get(-1), 4) + check(buffer[-1], 4) + check(buffer.get(-2), 3) + check(buffer[-2], 3) + check(buffer.get(-4), 1) + check(buffer[-4], 1) + check(buffer.get([-4]), [1]) + check(buffer[-4:-3], [1]) + self.assertRaises(IndexError, lambda: buffer.get(-6)) + self.assertRaises(IndexError, lambda: buffer[-6]) + self.assertRaises(IndexError, lambda: buffer.get(-1000)) + self.assertRaises(IndexError, lambda: buffer[-1000]) + # Individual positive indices (do NOT include lookback buffer). + check(buffer.get(0), 2) + check(buffer[0], 2) + check(buffer.get(1), 3) + check(buffer[1], 3) + check(buffer.get(2), 4) + check(buffer[2], 4) + check(buffer.get([2]), [4]) + check(buffer[2:3], [4]) + self.assertRaises(IndexError, lambda: buffer.get(3)) + self.assertRaises(IndexError, lambda: buffer[3]) + self.assertRaises(IndexError, lambda: buffer.get(1000)) + self.assertRaises(IndexError, lambda: buffer[1000]) + # List of negative indices (include lookback buffer). + check(buffer.get([-4, -5]), [1, 0]) + check(buffer.get([-1]), [4]) + check(buffer[-1:], [4]) + check(buffer.get([-5]), [0]) + check(buffer[-5:-4], [0]) + self.assertRaises(IndexError, lambda: buffer.get([-6])) + self.assertRaises(IndexError, lambda: buffer.get([-1, -6])) + self.assertRaises(IndexError, lambda: buffer.get([-1000])) + # List of positive indices (do NOT include lookback buffer). + check(buffer.get([1, 0, 2]), [3, 2, 4]) + check(buffer.get([0, 2, 1]), [2, 4, 3]) + self.assertRaises(IndexError, lambda: buffer.get([6])) + self.assertRaises(IndexError, lambda: buffer.get([1, 6])) + self.assertRaises(IndexError, lambda: buffer.get([1000])) + # List of positive and negative indices (do NOT include lookback buffer). + check(buffer.get([1, 0, -2]), [3, 2, 3]) + check(buffer.get([-3, 1, -1]), [2, 3, 4]) + # Slices. + # Type: [None:...] + check(buffer.get(slice(None, None)), [2, 3, 4]) + check(buffer.get(slice(None, 2)), [2, 3]) + check(buffer[:2], [2, 3]) + check(buffer.get(slice(3)), [2, 3, 4]) + check(buffer[:3], [2, 3, 4]) + check(buffer.get(slice(None, -1)), [2, 3]) + check(buffer[:-1], [2, 3]) + check(buffer.get(slice(None, -2)), [2]) + check(buffer[:-2], [2]) + # Type: [...:None] + check(buffer.get(slice(2, None)), [4]) + check(buffer[2:], [4]) + check(buffer.get(slice(2, 5)), [4]) + check(buffer[2:5], [4]) + check(buffer.get(slice(1, None)), [3, 4]) + check(buffer[1:], [3, 4]) + check(buffer.get(slice(1, 5)), [3, 4]) + check(buffer[1:5], [3, 4]) + check(buffer.get(slice(-1, None)), [4]) + check(buffer[-1:], [4]) + check(buffer.get(slice(-1, 5)), [4]) + check(buffer[-1:5], [4]) + check(buffer.get(slice(-4, None)), [1, 2, 3, 4]) + check(buffer[-4:], [1, 2, 3, 4]) + check(buffer.get(slice(-4, 5)), [1, 2, 3, 4]) + check(buffer[-4:5], [1, 2, 3, 4]) + # Type: [-n:-m] + check(buffer.get(slice(-2, -1)), [3]) + check(buffer[-2:-1], [3]) + check(buffer.get(slice(-3, -1)), [2, 3]) + check(buffer[-3:-1], [2, 3]) + check(buffer.get(slice(-4, -2)), [1, 2]) + check(buffer[-4:-2], [1, 2]) + check(buffer.get(slice(-4, -1)), [1, 2, 3]) + check(buffer[-4:-1], [1, 2, 3]) + check(buffer.get(slice(-5, -1)), [0, 1, 2, 3]) + check(buffer[-5:-1], [0, 1, 2, 3]) + check(buffer.get(slice(-6, -2)), [0, 1, 2]) + check(buffer[-6:-2], [0, 1, 2]) + # Type: [+n:+m] + check(buffer.get(slice(0, 1)), [2]) + check(buffer[0:1], [2]) + check(buffer.get(slice(0, 2)), [2, 3]) + check(buffer[0:2], [2, 3]) + check(buffer.get(slice(0, 3)), [2, 3, 4]) + check(buffer[0:3], [2, 3, 4]) + check(buffer.get(slice(1, 2)), [3]) + check(buffer[1:2], [3]) + check(buffer.get(slice(1, 3)), [3, 4]) + check(buffer[1:3], [3, 4]) + check(buffer.get(slice(2, 3)), [4]) + check(buffer[2:3], [4]) + # Type: [+n:-m] + check(buffer.get(slice(0, -1)), [2, 3]) + check(buffer[0:-1], [2, 3]) + check(buffer.get(slice(0, -2)), [2]) + check(buffer[0:-2], [2]) + check(buffer.get(slice(1, -1)), [3]) + check(buffer[1:-1], [3]) + + # Check the type on the finalized buffer (numpy arrays). + data = buffer.get([1, 0, 2]) + self.assertTrue(isinstance(data, np.ndarray)) + check(data, [3, 2, 4]) + + def test_get_with_lookback_and_fill(self): + """Tests the `fill` argument of `get` with a lookback range >0.""" + buffer = BufferWithInfiniteLookback( + data=[0, 1, 2, 3, 4, 5], + lookback=3, + # Specify a space, so we can fill and one-hot discrete data properly. + space=gym.spaces.Discrete(6), + ) + + # Test on ongoing and finalized buffer. + for finalized in [False, True]: + if finalized: + buffer.finalize() + + self.assertTrue(len(buffer), 3) + + # Individual indices with fill. + check(buffer.get(-10, fill=10), 10) + check(buffer.get(-3, fill=10), 3) + check(buffer.get(-2, fill=10), 4) + check(buffer.get(-1, fill=10), 5) + check(buffer.get(0, fill=10), 3) + check(buffer.get(2, fill=10), 5) + check(buffer.get(100, fill=10), 10) + + # Left fill. + check(buffer.get(slice(-8, None), fill=10), [10, 10, 0, 1, 2, 3, 4, 5]) + check(buffer.get(slice(-9, None), fill=10), [10, 10, 10, 0, 1, 2, 3, 4, 5]) + check( + buffer.get(slice(-10, None), fill=11), + [11, 11, 11, 11, 0, 1, 2, 3, 4, 5], + ) + check(buffer.get(slice(-10, -4), fill=11), [11, 11, 11, 11, 0, 1]) + # Both start stop on left side. + check(buffer.get(slice(-10, -9), fill=0), [0]) + check(buffer.get(slice(-20, -15), fill=0), [0, 0, 0, 0, 0]) + check(buffer.get(slice(-1001, -1000), fill=6), [6]) + # Both start stop on right side. + check(buffer.get(slice(10, 15), fill=0), [0, 0, 0, 0, 0]) + check(buffer.get(slice(15, 17), fill=0), [0, 0]) + check(buffer.get(slice(1000, 1001), fill=6), [6]) + # Right fill. + check(buffer.get(slice(2, 8), fill=12), [5, 12, 12, 12, 12, 12]) + check(buffer.get(slice(1, 7), fill=13), [4, 5, 13, 13, 13, 13]) + check(buffer.get(slice(1, 5), fill=-14), [4, 5, -14, -14]) + # No fill necessary (even though requested). + check(buffer.get(slice(-5, None), fill=999), [1, 2, 3, 4, 5]) + check(buffer.get(slice(-6, -1), fill=999), [0, 1, 2, 3, 4]) + check(buffer.get(slice(0, 2), fill=999), [3, 4]) + check(buffer.get(slice(1, None), fill=999), [4, 5]) + check(buffer.get(slice(None, 3), fill=999), [3, 4, 5]) + + # Check the type on the finalized buffer (numpy arrays). + data = buffer.get(slice(15, 17), fill=0) + self.assertTrue(isinstance(data, np.ndarray)) + check(data, [0, 0]) + + def test_get_with_fill_and_neg_indices_into_lookback_buffer(self): + """Tests the `fill` argument of `get` with a lookback range >0.""" + buffer = BufferWithInfiniteLookback( + data=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + lookback=4, + # Specify a space, so we can fill and one-hot discrete data properly. + space=gym.spaces.Discrete(11), + ) + + # Test on ongoing and finalized buffer. + for finalized in [False, True]: + if finalized: + buffer.finalize() + + self.assertTrue(len(buffer), 7) + + # Lokback buffer is [0, 1, 2, 3] + # Individual indices with negative indices into lookback buffer. + check(buffer.get(-1, neg_indices_left_of_zero=True), 3) + check(buffer.get(-2, neg_indices_left_of_zero=True), 2) + check(buffer.get(-3, neg_indices_left_of_zero=True), 1) + check(buffer.get(-4, neg_indices_left_of_zero=True), 0) + check(buffer.get([-1, -3], neg_indices_left_of_zero=True), [3, 1]) + # Slices with negative indices into lookback buffer. + check(buffer.get(slice(-2, -1), neg_indices_left_of_zero=True), [2]) + check(buffer.get(slice(-2, 0), neg_indices_left_of_zero=True), [2, 3]) + check( + buffer.get(slice(-2, 4), neg_indices_left_of_zero=True), + [2, 3, 4, 5, 6, 7], + ) + check( + buffer.get(slice(-2, None), neg_indices_left_of_zero=True), + [2, 3, 4, 5, 6, 7, 8, 9, 10], + ) + # With left fill. + check(buffer.get(-8, fill=10, neg_indices_left_of_zero=True), 10) + check(buffer.get(-800, fill=10, neg_indices_left_of_zero=True), 10) + check(buffer.get([-8, -1], fill=9, neg_indices_left_of_zero=True), [9, 3]) + check( + buffer.get(slice(-8, 0), fill=10, neg_indices_left_of_zero=True), + [10, 10, 10, 10, 0, 1, 2, 3], + ) + check( + buffer.get(slice(-7, 1), fill=10, neg_indices_left_of_zero=True), + [10, 10, 10, 0, 1, 2, 3, 4], + ) + check( + buffer.get(slice(-6, None), fill=11, neg_indices_left_of_zero=True), + [11, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ) + check( + buffer.get(slice(-10, -4), fill=11, neg_indices_left_of_zero=True), + [11, 11, 11, 11, 11, 11], + ) + # Both start stop on left side. + check( + buffer.get(slice(-10, -9), fill=0, neg_indices_left_of_zero=True), [0] + ) + check( + buffer.get(slice(-20, -15), fill=0, neg_indices_left_of_zero=True), + [0, 0, 0, 0, 0], + ) + check( + buffer.get(slice(-1001, -1000), fill=6, neg_indices_left_of_zero=True), + [6], + ) + # Both start stop on right side. + check( + buffer.get(slice(10, 15), fill=0, neg_indices_left_of_zero=True), + [0, 0, 0, 0, 0], + ) + check( + buffer.get(slice(15, 17), fill=0, neg_indices_left_of_zero=True), [0, 0] + ) + check( + buffer.get(slice(1000, 1001), fill=6, neg_indices_left_of_zero=True), + [6], + ) + # Right fill. + check(buffer.get(8, fill=10, neg_indices_left_of_zero=True), 10) + check(buffer.get(800, fill=10, neg_indices_left_of_zero=True), 10) + check(buffer.get([8, 1], fill=9, neg_indices_left_of_zero=True), [9, 5]) + check( + buffer.get(slice(-2, 8), fill=12, neg_indices_left_of_zero=True), + [2, 3, 4, 5, 6, 7, 8, 9, 10, 12], + ) + check( + buffer.get(slice(-1, 9), fill=13, neg_indices_left_of_zero=True), + [3, 4, 5, 6, 7, 8, 9, 10, 13, 13], + ) + check( + buffer.get(slice(-1, 5), fill=-14, neg_indices_left_of_zero=True), + [3, 4, 5, 6, 7, 8], + ) + # No fill necessary (even though requested). + check( + buffer.get(slice(-1, None), fill=999, neg_indices_left_of_zero=True), + [3, 4, 5, 6, 7, 8, 9, 10], + ) + check( + buffer.get(slice(-4, -1), fill=999, neg_indices_left_of_zero=True), + [0, 1, 2], + ) + check( + buffer.get(slice(-1, 2), fill=999, neg_indices_left_of_zero=True), + [3, 4, 5], + ) + + # Check the type on the finalized buffer (numpy arrays). + data = buffer.get(slice(-17, -15), fill=0, neg_indices_left_of_zero=True) + self.assertTrue(isinstance(data, np.ndarray)) + check(data, [0, 0]) + data = buffer.get([-3, -1], fill=0, neg_indices_left_of_zero=True) + self.assertTrue(isinstance(data, np.ndarray)) + check(data, [1, 3]) + + def test_get_with_fill_0_and_zero_hot(self): + """Tests, whether zero-hot is properly done when fill=0.""" + buffer = BufferWithInfiniteLookback( + data=[0, 1, 0, 1], + # Specify a space, so we can fill and one-hot discrete data properly. + space=gym.spaces.Discrete(2), + ) + + # Test on ongoing and finalized buffer. + for finalized in [False, True]: + if finalized: + buffer.finalize() + + self.assertTrue(len(buffer), 4) + + # Right side fill 0. Should be zero-hot. + check(buffer.get(4, fill=0, one_hot_discrete=True), [0, 0]) + check( + buffer.get( + -1, + neg_indices_left_of_zero=True, + fill=0, + one_hot_discrete=True, + ), + [0, 0], + ) + + def test_get_with_complex_space(self): + """Tests, whether zero-hot is properly done when fill=0.""" + buffer = BufferWithInfiniteLookback( + data=[ + get_dummy_batch_for_space( + space=self.space, + batch_size=0, + fill_value=float(i), + ) + for i in range(4) + ], + lookback=2, + # Specify a space, so we can fill and one-hot discrete data properly. + space=self.space, + ) + + buffer_0 = { + "a": 0, + "b": np.array([[0, 0, 0], [0, 0, 0]]), + "c": (np.array([0, 0]), np.array([0])), + } + buffer_0_one_hot = { + "a": np.array([0.0, 0.0, 0.0, 0.0]), + "b": np.array([[0, 0, 0], [0, 0, 0]]), + "c": (np.array([0, 0, 0, 0, 0]), np.array([0])), + } + buffer_1 = { + "a": 1, + "b": np.array([[1, 1, 1], [1, 1, 1]]), + "c": (np.array([1, 1]), np.array([1])), + } + buffer_2 = { + "a": 2, + "b": np.array([[2, 2, 2], [2, 2, 2]]), + "c": (np.array([2, 2]), np.array([2])), + } + buffer_3 = { + "a": 3, + "b": np.array([[3, 3, 3], [3, 3, 3]]), + "c": (np.array([3, 3]), np.array([3])), + } + + # Test on ongoing and finalized buffer. + for finalized in [False, True]: + if finalized: + buffer.finalize() + + def batch_(s): + return batch(s) + + else: + + def batch_(s): + return s + + self.assertTrue(len(buffer), 2) + + check(buffer.get(-1), buffer_3) + check(buffer.get(-2), buffer_2) + check(buffer.get(-3), buffer_1) + check(buffer.get(-4), buffer_0) + check(buffer.get(-5, fill=0.0), buffer_0) + check(buffer.get([-5, 5], fill=0.0), batch_([buffer_0, buffer_0])) + check(buffer.get([-5, 1], fill=0.0), batch_([buffer_0, buffer_3])) + check(buffer.get([1, -10], fill=0.0), batch_([buffer_3, buffer_0])) + check( + buffer.get([-10], fill=0.0, one_hot_discrete=True), + batch_([buffer_0_one_hot]), + ) + check(buffer.get(slice(0, 1), fill=0.0), batch_([buffer_2])) + check(buffer.get(slice(1, 3), fill=0.0), batch_([buffer_3, buffer_0])) + check(buffer.get(slice(-10, -12), fill=0.0), batch_([buffer_0, buffer_0])) + check( + buffer.get(slice(-10, -12), fill=0.0, neg_indices_left_of_zero=True), + batch_([buffer_0, buffer_0]), + ) + check( + buffer.get(slice(100, 98), fill=0.0, neg_indices_left_of_zero=True), + batch_([buffer_0, buffer_0]), + ) + check( + buffer.get(slice(100, 98), fill=0.0), + batch_([buffer_0, buffer_0]), + ) + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/env/tests/test_multi_agent_episode.py b/rllib/env/tests/test_multi_agent_episode.py index 8405f454faee4..a53f3ffa16e6c 100644 --- a/rllib/env/tests/test_multi_agent_episode.py +++ b/rllib/env/tests/test_multi_agent_episode.py @@ -150,8 +150,8 @@ def test_init(self): rewards = [] actions = [] infos = [] - is_terminateds = [] - is_truncateds = [] + terminateds = {} + truncateds = {} extra_model_outputs = [] # Initialize observation and info. obs, info = env.reset(seed=0) @@ -159,33 +159,31 @@ def test_init(self): infos.append(info) # Run 100 samples. for i in range(100): - agents_stepped = list(obs.keys()) - action = { - agent_id: i + 1 - for agent_id in agents_stepped - if agent_id in env._agents_alive - } + agents_to_step_next = [ + aid for aid in obs.keys() if aid in env._agents_alive + ] + action = {agent_id: i + 1 for agent_id in agents_to_step_next} # action = env.action_space_sample(agents_stepped) - obs, reward, is_terminated, is_truncated, info = env.step(action) + obs, reward, terminated, truncated, info = env.step(action) observations.append(obs) actions.append(action) rewards.append(reward) infos.append(info) - is_terminateds.append(is_terminated) - is_truncateds.append(is_truncated) + terminateds.update(terminated) + truncateds.update(truncated) extra_model_outputs.append( - {agent_id: {"extra_1": 10.5} for agent_id in agents_stepped} + {agent_id: {"extra_1": 10.5} for agent_id in agents_to_step_next} ) # Now create the episode from the recorded data. episode = MultiAgentEpisode( - agent_ids=env.get_agent_ids(), + agent_ids=list(env.get_agent_ids()), observations=observations, actions=actions, rewards=rewards, infos=infos, - is_terminated=is_terminateds, - is_truncated=is_truncateds, + terminateds=terminateds, + truncateds=truncateds, extra_model_outputs=extra_model_outputs, ) @@ -214,8 +212,8 @@ def test_init(self): rewards=rewards[-10:], infos=infos[-11:], t_started=100, - is_terminated=is_terminateds[-10:], - is_truncated=is_truncateds[-10:], + terminateds=terminateds, + truncateds=truncateds, extra_model_outputs=extra_model_outputs[-10:], ) @@ -239,8 +237,8 @@ def test_init(self): observations, actions, rewards, - is_terminated, - is_truncated, + terminateds, + truncateds, infos, ) = self._mock_multi_agent_records() @@ -251,8 +249,8 @@ def test_init(self): actions=actions, rewards=rewards, infos=infos, - is_terminated=is_terminated, - is_truncated=is_truncated, + terminateds=terminateds, + truncateds=truncateds, ) # Assert that the length of `SingleAgentEpisode`s are all correct. @@ -276,9 +274,9 @@ def test_add_initial_observation(self): # Generate initial observations and infos and add them to the episode. obs, infos = env.reset(seed=0) - episode.add_initial_observation( - initial_observation=obs, - initial_info=infos, + episode.add_env_reset( + observations=obs, + infos=infos, ) # Assert that timestep is at zero. @@ -295,15 +293,15 @@ def test_add_initial_observation(self): # TODO (simon): Test the buffers and reward storage. - def test_add_timestep(self): + def test_add_env_step(self): # Create an environment and add the initial observations, infos, and states. env = MultiAgentTestEnv() episode = MultiAgentEpisode(agent_ids=env.get_agent_ids()) obs, infos = env.reset(seed=0) - episode.add_initial_observation( - initial_observation=obs, - initial_info=infos, + episode.add_env_reset( + observations=obs, + infos=infos, ) # Sample 100 timesteps and add them to the episode. @@ -311,16 +309,16 @@ def test_add_timestep(self): action = { agent_id: i + 1 for agent_id in obs if agent_id in env._agents_alive } - obs, reward, is_terminated, is_truncated, info = env.step(action) - - episode.add_timestep( - observation=obs, - action=action, - reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_output={agent_id: {"extra": 10.5} for agent_id in action}, + obs, reward, terminated, truncated, info = env.step(action) + + episode.add_env_step( + observations=obs, + actions=action, + rewards=reward, + infos=info, + terminateds=terminated, + truncateds=truncated, + extra_model_outputs={agent_id: {"extra": 10.5} for agent_id in action}, ) # Assert that the timestep is at 100. @@ -349,19 +347,19 @@ def test_add_timestep(self): action = { agent_id: i + 1 for agent_id in obs if agent_id in env._agents_alive } - obs, reward, is_terminated, is_truncated, info = env.step(action) - episode.add_timestep( - observation=obs, - action=action, - reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_output={agent_id: {"extra": 10.5} for agent_id in action}, + obs, reward, terminated, truncated, info = env.step(action) + episode.add_env_step( + observations=obs, + actions=action, + rewards=reward, + infos=info, + terminateds=terminated, + truncateds=truncated, + extra_model_outputs={agent_id: {"extra": 10.5} for agent_id in action}, ) # Assert that the environment is done. - self.assertTrue(is_truncated["__all__"]) + self.assertTrue(truncated["__all__"]) # Assert that each agent is done. for agent_id in episode._agent_ids: self.assertTrue(episode.agent_episodes[agent_id].is_done) @@ -378,8 +376,8 @@ def test_add_timestep(self): observations, actions, rewards, - is_terminated, - is_truncated, + terminated, + truncated, infos, ) = self._mock_multi_agent_records() @@ -389,25 +387,25 @@ def test_add_timestep(self): actions=actions, rewards=rewards, infos=infos, - is_terminated=is_terminated, - is_truncated=is_truncated, + terminateds=terminated, + truncateds=truncated, ) # Now test that intermediate rewards will get recorded and actions buffered. action = {"agent_2": 3, "agent_4": 3} observation = {"agent_1": 3, "agent_2": 3} reward = {"agent_1": 1.0, "agent_2": 1.0, "agent_3": 1.0, "agent_5": 1.0} infos = {"agent_1": {}, "agent_2": {}} - is_terminated = {k: False for k in observation.keys()} - is_terminated.update({"__all__": False}) - is_truncated = {k: False for k in observation.keys()} - is_truncated.update({"__all__": False}) - episode.add_timestep( - observation=observation, - action=action, - reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, + terminated = {k: False for k in observation.keys()} + terminated.update({"__all__": False}) + truncated = {k: False for k in observation.keys()} + truncated.update({"__all__": False}) + episode.add_env_step( + observations=observation, + actions=action, + rewards=reward, + infos=infos, + terminateds=terminated, + truncateds=truncated, ) # Assert that the action buffer for agent 4 is full. # Note, agent 4 acts, but receives no observation. @@ -420,7 +418,7 @@ def test_add_timestep(self): episode.agent_buffers["agent_3"]["rewards"].put_nowait(1.0) episode.agent_buffers["agent_5"]["rewards"].put_nowait(1.0) - def test_create_successor(self): + def test_cut(self): # Create an environment. episode_1, _ = self._mock_multi_agent_records_from_env(size=100) @@ -428,7 +426,7 @@ def test_create_successor(self): self.assertEqual(episode_1.t, 100) # Create a successor. - episode_2 = episode_1.create_successor() + episode_2 = episode_1.cut() # Assert that it has the same id. self.assertEqual(episode_1.id_, episode_2.id_) # Assert that all `SingleAgentEpisode`s have identical ids. @@ -575,8 +573,8 @@ def test_create_successor(self): observations, actions, rewards, - is_terminated, - is_truncated, + terminateds, + truncateds, infos, ) = self._mock_multi_agent_records() @@ -587,8 +585,8 @@ def test_create_successor(self): actions=actions, rewards=rewards, infos=infos, - is_terminated=is_terminated, - is_truncated=is_truncated, + terminateds=terminateds, + truncateds=truncateds, ) # Assert that agents 1 and 3's buffers are indeed full. @@ -611,17 +609,17 @@ def test_create_successor(self): # add this to the buffer and to the global reward history. reward = {"agent_1": 2.0, "agent_2": 2.0, "agent_3": 2.0, "agent_5": 2.0} info = {"agent_1": {}, "agent_2": {}} - is_terminated = {k: False for k in observation.keys()} - is_terminated.update({"__all__": False}) - is_truncated = {k: False for k in observation.keys()} - is_truncated.update({"__all__": False}) - episode_1.add_timestep( - observation=observation, - action=action, - reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, + terminateds = {k: False for k in observation.keys()} + terminateds.update({"__all__": False}) + truncateds = {k: False for k in observation.keys()} + truncateds.update({"__all__": False}) + episode_1.add_env_step( + observations=observation, + actions=action, + rewards=reward, + infos=info, + terminateds=terminateds, + truncateds=truncateds, ) # Check that the partial reward history is correct. @@ -647,7 +645,7 @@ def test_create_successor(self): self.assertEqual(episode_1.partial_rewards[agent_id][-1], 2.0) # Now create the successor. - episode_2 = episode_1.create_successor() + episode_2 = episode_1.cut() for agent_id, agent_eps in episode_2.agent_episodes.items(): if len(agent_eps.observations) > 0: @@ -704,8 +702,8 @@ def test_getters(self): actions=actions, rewards=rewards, infos=infos, - is_terminated=is_terminateds, - is_truncated=is_truncateds, + terminateds=is_terminateds, + truncateds=is_truncateds, extra_model_outputs=extra_model_outputs, ) @@ -791,9 +789,9 @@ def test_getters(self): # Test with initial observations only. episode_init_only = MultiAgentEpisode(agent_ids=agent_ids) - episode_init_only.add_initial_observation( - initial_observation=observations[0], - initial_info=infos[0], + episode_init_only.add_env_reset( + observation=observations[0], + infos=infos[0], ) # Get the last observation for agents and assert that its correct. last_observation = episode_init_only.get_observations() @@ -985,22 +983,22 @@ def test_getters(self): # Generate initial observation and info. obs, info = env.reset(seed=42) - episode_1.add_initial_observation( - initial_observation=obs, - initial_info=info, + episode_1.add_env_reset( + observations=obs, + infos=info, ) # Now, generate 100 samples. for i in range(100): action = {agent_id: i for agent_id in obs} - obs, reward, is_terminated, is_truncated, info = env.step(action) - episode_1.add_timestep( - observation=obs, - action=action, - reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_output={agent_id: {"extra": 10} for agent_id in action}, + obs, reward, terminated, truncated, info = env.step(action) + episode_1.add_env_step( + observations=obs, + actions=action, + rewards=reward, + infos=info, + terminateds=terminated, + truncateds=truncated, + extra_model_outputs={agent_id: {"extra": 10} for agent_id in action}, ) # First, receive the last rewards without considering buffered values. @@ -1644,7 +1642,7 @@ def test_concat_episode(self): size=100, truncate=False ) # Now, create a successor episode. - episode_2 = episode_1.create_successor() + episode_2 = episode_1.cut() # Generate 100 more samples from the environment and store it in the episode. episode_2, env = self._mock_multi_agent_records_from_env( size=100, episode=episode_2, env=env, init=False @@ -1900,7 +1898,7 @@ def test_len(self): # Assert that the length is indeed 100. self.assertEqual(len(episode), 100) # Now, build a successor. - successor = episode.create_successor() + successor = episode.cut() # Sample another 100 timesteps. successor, env = self._mock_multi_agent_records_from_env( episode=successor, env=env, init=False @@ -1935,7 +1933,7 @@ def test_to_sample_batch(self): # Now test that when creating a successor its sample batch will # contain the correct values. - successor = episode.create_successor() + successor = episode.cut() # Run 100 more timesteps for the successor. successor, env = self._mock_multi_agent_records_from_env( episode=successor, env=env, init=False @@ -2007,7 +2005,7 @@ def _mock_multi_agent_records_from_env( # We initialize the episode, if requested. if init: obs, info = env.reset(seed=seed) - episode.add_initial_observation(initial_observation=obs, initial_info=info) + episode.add_env_reset(observations=obs, infos=info) # In the other case wer need at least the last observations for the next # actions. else: @@ -2020,15 +2018,15 @@ def _mock_multi_agent_records_from_env( # Sample `size` many records. for i in range(env.t, env.t + size): action = {agent_id: i + 1 for agent_id in obs} - obs, reward, is_terminated, is_truncated, info = env.step(action) - episode.add_timestep( - observation=obs, - action=action, - reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_output={agent_id: {"extra": 10} for agent_id in action}, + obs, reward, terminated, truncated, info = env.step(action) + episode.add_env_step( + observations=obs, + actions=action, + rewards=reward, + infos=info, + terminateds=terminated, + truncateds=truncated, + extra_model_outputs={agent_id: {"extra": 10} for agent_id in action}, ) # Return both, epsiode and environment. @@ -2066,17 +2064,21 @@ def _mock_multi_agent_records(self): {"agent_2": {}, "agent_4": {}}, ] # Let no agent terminate or being truncated. - is_terminated = [ - {"__all__": False, "agent_1": False, "agent_3": False, "agent_4": False}, - {"__all__": False, "agent_2": False, "agent_4": False}, - ] - is_truncated = [ - {"__all__": False, "agent_1": False, "agent_3": False, "agent_4": False}, - {"__all__": False, "agent_2": False, "agent_4": False}, - ] + terminateds = { + "__all__": False, + "agent_1": False, + "agent_3": False, + "agent_4": False, + } + truncateds = { + "__all__": False, + "agent_1": False, + "agent_3": False, + "agent_4": False, + } # Return all observations. - return observations, actions, rewards, is_terminated, is_truncated, infos + return observations, actions, rewards, terminateds, truncateds, infos if __name__ == "__main__": diff --git a/rllib/env/tests/test_single_agent_gym_env_runner.py b/rllib/env/tests/test_single_agent_env_runner.py similarity index 54% rename from rllib/env/tests/test_single_agent_gym_env_runner.py rename to rllib/env/tests/test_single_agent_env_runner.py index 015c3837ff0f5..743e885291e37 100644 --- a/rllib/env/tests/test_single_agent_gym_env_runner.py +++ b/rllib/env/tests/test_single_agent_env_runner.py @@ -2,10 +2,10 @@ import ray from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.env.testing.single_agent_gym_env_runner import SingleAgentGymEnvRunner +from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner -class TestSingleAgentGymEnvRunner(unittest.TestCase): +class TestSingleAgentEnvRunner(unittest.TestCase): @classmethod def setUpClass(cls) -> None: ray.init() @@ -20,43 +20,41 @@ def test_sample(self): # Vectorize x2 and by default, rollout 64 timesteps per individual env. .rollouts(num_envs_per_worker=2, rollout_fragment_length=64) ) - env_runner = SingleAgentGymEnvRunner(config=config) + env_runner = SingleAgentEnvRunner(config=config) # Expect error if both num_timesteps and num_episodes given. self.assertRaises( - AssertionError, lambda: env_runner.sample(num_timesteps=10, num_episodes=10) + AssertionError, + lambda: env_runner.sample( + num_timesteps=10, num_episodes=10, random_actions=True + ), ) # Sample 10 episodes (5 per env) 100 times. for _ in range(100): - done_episodes, ongoing_episodes = env_runner.sample(num_episodes=10) - self.assertTrue(len(done_episodes + ongoing_episodes) == 10) + episodes = env_runner.sample(num_episodes=10, random_actions=True) + self.assertTrue(len(episodes) == 10) # Since we sampled complete episodes, there should be no ongoing episodes # being returned. - assert len(ongoing_episodes) == 0 - # Check, whether all done_episodes returned are indeed terminated. - self.assertTrue(all(e.is_done for e in done_episodes)) + self.assertTrue(all(e.is_done for e in episodes)) # Sample 10 timesteps (5 per env) 100 times. for _ in range(100): - done_episodes, ongoing_episodes = env_runner.sample(num_timesteps=10) - # Check, whether all done_episodes returned are indeed terminated. - self.assertTrue(all(e.is_done for e in done_episodes)) - # Check, whether all done_episodes returned are indeed terminated. - self.assertTrue(not any(e.is_done for e in ongoing_episodes)) + episodes = env_runner.sample(num_timesteps=10, random_actions=True) + # Check, whether the sum of lengths of all episodes returned is 20 + self.assertTrue(sum(len(e) for e in episodes) == 10) # Sample (by default setting: rollout_fragment_length=64) 10 times. for _ in range(100): - done_episodes, ongoing_episodes = env_runner.sample() - # Check, whether all done_episodes returned are indeed terminated. - self.assertTrue(all(e.is_done for e in done_episodes)) - # Check, whether all done_episodes returned are indeed terminated. - self.assertTrue(not any(e.is_done for e in ongoing_episodes)) + episodes = env_runner.sample(random_actions=True) + # Check, whether the sum of lengths of all episodes returned is 128 + # 2 (num_env_per_worker) * 64 (rollout_fragment_length). + self.assertTrue(sum(len(e) for e in episodes) == 128) def test_distributed_env_runner(self): """Tests, whether SingleAgentGymEnvRunner can be distributed.""" - remote_class = ray.remote(num_cpus=1, num_gpus=0)(SingleAgentGymEnvRunner) + remote_class = ray.remote(num_cpus=1, num_gpus=0)(SingleAgentEnvRunner) # Test with both parallelized sub-envs and w/o. remote_worker_envs = [False, True] @@ -78,19 +76,13 @@ def test_distributed_env_runner(self): for _ in range(config.num_rollout_workers) ] # Sample in parallel. - results = [a.sample.remote() for a in array] + results = [a.sample.remote(random_actions=True) for a in array] results = ray.get(results) # Loop over individual EnvRunner Actor's results and inspect each. - for result in results: - # SingleAgentGymEnvRunners return tuples: (completed eps, ongoing eps). - completed, ongoing = result - # Make sure all completed Episodes are indeed done. - self.assertTrue(all(e.is_done for e in completed)) - # Same for ongoing ones (make sure they are not done). - self.assertTrue(not any(e.is_done for e in ongoing)) + for episodes in results: # Assert length of all fragments is `rollout_fragment_length`. self.assertEqual( - sum(len(e) for e in completed + ongoing), + sum(len(e) for e in episodes), config.num_envs_per_worker * config.rollout_fragment_length, ) diff --git a/rllib/env/tests/test_single_agent_episode.py b/rllib/env/tests/test_single_agent_episode.py index 03389407a9e3e..7ada5e61cc944 100644 --- a/rllib/env/tests/test_single_agent_episode.py +++ b/rllib/env/tests/test_single_agent_episode.py @@ -1,12 +1,13 @@ -import gymnasium as gym -import numpy as np +from collections import defaultdict +from typing import Any, Dict, Optional, SupportsFloat, Tuple import unittest +import gymnasium as gym from gymnasium.core import ActType, ObsType -from typing import Any, Dict, Optional, SupportsFloat, Tuple +import numpy as np -import ray from ray.rllib.env.single_agent_episode import SingleAgentEpisode +from ray.rllib.utils.test_utils import check # TODO (simon): Add to the tests `info` and `extra_model_outputs` # as soon as #39732 is merged. @@ -37,14 +38,6 @@ def step( class TestSingelAgentEpisode(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - def test_init(self): """Tests initialization of `SingleAgentEpisode`. @@ -64,45 +57,35 @@ def test_init(self): episode = SingleAgentEpisode(t_started=10) self.assertTrue(episode.t == episode.t_started == 10) - # Sample 100 values and initialize episode with observations and infos. - env = gym.make("CartPole-v1") - # Initialize containers. - observations = [] - rewards = [] - actions = [] - infos = [] - extra_model_outputs = [] - states = np.random.random(10) - - # Initialize observation and info. - init_obs, init_info = env.reset() - observations.append(init_obs) - infos.append(init_info) - # Run 100 samples. - for _ in range(100): - action = env.action_space.sample() - obs, reward, is_terminated, is_truncated, info = env.step(action) - observations.append(obs) - actions.append(action) - rewards.append(reward) - infos.append(info) - extra_model_outputs.append({"extra_1": np.random.random()}) - - # Build the episode. - episode = SingleAgentEpisode( - observations=observations, - actions=actions, - rewards=rewards, - infos=infos, - states=states, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_outputs=extra_model_outputs, - ) + episode = self._create_episode(num_data=100) # The starting point and count should now be at `len(observations) - 1`. - self.assertTrue(episode.t == episode.t_started == (len(observations) - 1)) + self.assertTrue(len(episode) == 100) + self.assertTrue(episode.t == 100) + self.assertTrue(episode.t_started == 0) - def test_add_initial_observation(self): + # Build the same episode, but with a 10 ts lookback buffer. + episode = self._create_episode(num_data=100, len_lookback_buffer=10) + # The lookback buffer now takes 10 ts and the length of the episode is only 90. + self.assertTrue(len(episode) == 90) + # `t_started` is 0 by default. + self.assertTrue(episode.t_started == 0) + self.assertTrue(episode.t == 90) + self.assertTrue(len(episode.rewards) == 90) + self.assertTrue(len(episode.rewards.data) == 100) + + # Build the same episode, but with a 10 ts lookback buffer AND a specific + # `t_started`. + episode = self._create_episode( + num_data=100, len_lookback_buffer=10, t_started=50 + ) + # The lookback buffer now takes 10 ts and the length of the episode is only 90. + self.assertTrue(len(episode) == 90) + self.assertTrue(episode.t_started == 50) + self.assertTrue(episode.t == 140) + self.assertTrue(len(episode.rewards) == 90) + self.assertTrue(len(episode.rewards.data) == 100) + + def test_add_env_reset(self): """Tests adding initial observations and infos. This test ensures that when initial observation and info are provided @@ -116,7 +99,7 @@ def test_add_initial_observation(self): # Add initial observations. obs, info = env.reset() - episode.add_initial_observation(initial_observation=obs, initial_info=info) + episode.add_env_reset(observation=obs, infos=info) # Assert that the observations are added to their list. self.assertTrue(len(episode.observations) == 1) @@ -125,7 +108,7 @@ def test_add_initial_observation(self): # Assert that the timesteps are still at zero as we have not stepped, yet. self.assertTrue(episode.t == episode.t_started == 0) - def test_add_timestep(self): + def test_add_env_step(self): """Tests if adding timestep data to a `SingleAgentEpisode` works. Adding timestep data is the central part of collecting episode @@ -138,22 +121,23 @@ def test_add_timestep(self): # Set the random seed (otherwise the episode will terminate at # different points in each test run). obs, info = env.reset(seed=0) - episode.add_initial_observation(initial_observation=obs, initial_info=info) + episode.add_env_reset(observation=obs, infos=info) # Sample 100 timesteps and add them to the episode. + terminated = truncated = False for i in range(100): action = env.action_space.sample() - obs, reward, is_terminated, is_truncated, info = env.step(action) - episode.add_timestep( + obs, reward, terminated, truncated, info = env.step(action) + episode.add_env_step( observation=obs, action=action, reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_output={"extra": np.random.random(1)}, + infos=info, + terminated=terminated, + truncated=truncated, + extra_model_outputs={"extra": np.random.random(1)}, ) - if is_terminated or is_truncated: + if terminated or truncated: break # Assert that the episode timestep is at 100. @@ -169,11 +153,11 @@ def test_add_timestep(self): == i + 1 ) # Assert that the flags are set correctly. - self.assertTrue(episode.is_terminated == is_terminated) - self.assertTrue(episode.is_truncated == is_truncated) - self.assertTrue(episode.is_done == is_terminated or is_truncated) + self.assertTrue(episode.is_terminated == terminated) + self.assertTrue(episode.is_truncated == truncated) + self.assertTrue(episode.is_done == terminated or truncated) - def test_create_successor(self): + def test_cut(self): """Tests creation of a scucessor of a `SingleAgentEpisode`. This test makes sure that when creating a successor the successor's @@ -188,28 +172,26 @@ def test_create_successor(self): env = TestEnv() # Add initial observation. init_obs, init_info = env.reset() - episode_1.add_initial_observation( - initial_observation=init_obs, initial_info=init_info - ) + episode_1.add_env_reset(observation=init_obs, infos=init_info) # Sample 100 steps. for i in range(100): action = i - obs, reward, is_terminated, is_truncated, info = env.step(action) - episode_1.add_timestep( + obs, reward, terminated, truncated, info = env.step(action) + episode_1.add_env_step( observation=obs, action=action, reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_output={"extra": np.random.random(1)}, + infos=info, + terminated=terminated, + truncated=truncated, + extra_model_outputs={"extra": np.random.random(1)}, ) # Assert that the episode has indeed 100 timesteps. self.assertTrue(episode_1.t == 100) # Create a successor. - episode_2 = episode_1.create_successor() + episode_2 = episode_1.cut() # Assert that it has the same id. self.assertEqual(episode_1.id_, episode_2.id_) # Assert that the timestep starts at the end of the last episode. @@ -222,19 +204,126 @@ def test_create_successor(self): # Test immutability. action = 100 - obs, reward, is_terminated, is_truncated, info = env.step(action) - episode_2.add_timestep( + obs, reward, terminated, truncated, info = env.step(action) + episode_2.add_env_step( observation=obs, action=action, reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_output={"extra": np.random.random(1)}, + infos=info, + terminated=terminated, + truncated=truncated, + extra_model_outputs={"extra": np.random.random(1)}, ) # Assert that this does not change also the predecessor's data. self.assertFalse(len(episode_1.observations) == len(episode_2.observations)) + def test_slices(self): + # TEST #1: even split (50/50) + episode = self._create_episode(100) + self.assertTrue(episode.t == 100 and episode.t_started == 0) + + # Convert to numpy before splitting. + episode.finalize() + + # Create two 50/50 episode chunks. + e1 = episode[:50] + self.assertTrue(e1.is_finalized) + e2 = episode.slice(slice(50, None)) + self.assertTrue(e2.is_finalized) + + # Make sure, `e1` and `e2` make sense. + self.assertTrue(len(e1) == 50) + self.assertTrue(len(e2) == 50) + self.assertTrue(e1.id_ == e2.id_) + self.assertTrue(e1.t_started == 0) + self.assertTrue(e1.t == 50) + self.assertTrue(e2.t_started == 50) + self.assertTrue(e2.t == 100) + # Make sure the chunks are not identical, but last obs of `e1` matches + # last obs of `e2`. + check(e1.get_observations(-1), e2.get_observations(0)) + check(e1.observations[4], e2.observations[4], false=True) + check(e1.observations[10], e2.observations[10], false=True) + + # TEST #2: Uneven split (33/66). + episode = self._create_episode(99) + self.assertTrue(episode.t == 99 and episode.t_started == 0) + + # Convert to numpy before splitting. + episode.finalize() + + # Create two 50/50 episode chunks. + e1 = episode.slice(slice(None, 33)) + self.assertTrue(e1.is_finalized) + e2 = episode[33:] + self.assertTrue(e2.is_finalized) + + # Make sure, `e1` and `e2` chunk make sense. + self.assertTrue(len(e1) == 33) + self.assertTrue(len(e2) == 66) + self.assertTrue(e1.id_ == e2.id_) + self.assertTrue(e1.t_started == 0) + self.assertTrue(e1.t == 33) + self.assertTrue(e2.t_started == 33) + self.assertTrue(e2.t == 99) + # Make sure the chunks are not identical, but last obs of `e1` matches + # last obs of `e2`. + check(e1.get_observations(-1), e2.get_observations(0)) + check(e1.observations[4], e2.observations[4], false=True) + check(e1.observations[10], e2.observations[10], false=True) + + # TEST #3: Split with lookback buffer (buffer=10, split=20/30). + episode = self._create_episode( + num_data=60, t_started=15, len_lookback_buffer=10 + ) + self.assertTrue(episode.t == 65 and episode.t_started == 15) + + # Convert to numpy before splitting. + episode.finalize() + + # Create two 20/30 episode chunks. + e1 = episode.slice(slice(None, 20)) + self.assertTrue(e1.is_finalized) + e2 = episode[20:] + self.assertTrue(e2.is_finalized) + + # Make sure, `e1` and `e2` make sense. + self.assertTrue(len(e1) == 20) + self.assertTrue(len(e2) == 30) + self.assertTrue(e1.id_ == e2.id_) + self.assertTrue(e1.t_started == 15) + self.assertTrue(e1.t == 35) + self.assertTrue(e2.t_started == 35) + self.assertTrue(e2.t == 65) + # Make sure the chunks are not identical, but last obs of `e1` matches + # last obs of `e2`. + check(e1.get_observations(-1), e2.get_observations(0)) + check(e1.observations[5], e2.observations[5], false=True) + check(e1.observations[11], e2.observations[11], false=True) + # Make sure the lookback buffers of both chunks are still working. + check( + e1.get_observations(-1, neg_indices_left_of_zero=True), + episode.observations.data[episode._len_lookback_buffer - 1], + ) + check( + e1.get_actions(-1, neg_indices_left_of_zero=True), + episode.actions.data[episode._len_lookback_buffer - 1], + ) + check( + e2.get_observations([-5, -2], neg_indices_left_of_zero=True), + [ + episode.observations.data[20 + episode._len_lookback_buffer - 5], + episode.observations.data[20 + episode._len_lookback_buffer - 2], + ], + ) + check( + e2.get_rewards([-5, -2], neg_indices_left_of_zero=True), + [ + episode.rewards.data[20 + episode._len_lookback_buffer - 5], + episode.rewards.data[20 + episode._len_lookback_buffer - 2], + ], + ) + def test_concat_episode(self): """Tests if concatenation of two `SingleAgentEpisode`s works. @@ -248,38 +337,36 @@ def test_concat_episode(self): env = TestEnv() init_obs, init_info = env.reset() episode_1 = SingleAgentEpisode() - episode_1.add_initial_observation( - initial_observation=init_obs, initial_info=init_info - ) + episode_1.add_env_reset(observation=init_obs, infos=init_info) # Sample 100 timesteps. for i in range(100): action = i - obs, reward, is_terminated, is_truncated, info = env.step(action) - episode_1.add_timestep( + obs, reward, terminated, truncated, info = env.step(action) + episode_1.add_env_step( observation=obs, action=action, reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_output={"extra": np.random.random(1)}, + infos=info, + terminated=terminated, + truncated=truncated, + extra_model_outputs={"extra": np.random.random(1)}, ) # Create a successor. - episode_2 = episode_1.create_successor() + episode_2 = episode_1.cut() # Now, sample 100 more timesteps. for i in range(100, 200): action = i - obs, reward, is_terminated, is_truncated, info = env.step(action) - episode_2.add_timestep( + obs, reward, terminated, truncated, info = env.step(action) + episode_2.add_env_step( observation=obs, action=action, reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_output={"extra": np.random.random(1)}, + infos=info, + terminated=terminated, + truncated=truncated, + extra_model_outputs={"extra": np.random.random(1)}, ) # Assert that the second episode's `t_started` is at the first episode's @@ -309,7 +396,7 @@ def test_concat_episode(self): # Reset `is_terminated`. episode_1.is_terminated = False - # Concate the episodes. + # Concatenate the episodes. episode_1.concat_episode(episode_2) # Assert that the concatenated episode start at `t_started=0` @@ -331,118 +418,39 @@ def test_concat_episode(self): # self.assertNotEqual(id(episode_2.observations[5]), # id(episode_1.observations[105])) - def test_get_and_from_state(self): - """Tests, if a `SingleAgentEpisode` can be reconstructed form state. + def _create_episode(self, num_data, t_started=None, len_lookback_buffer=0): + # Sample 100 values and initialize episode with observations and infos. + env = gym.make("CartPole-v1") + # Initialize containers. + observations = [] + rewards = [] + actions = [] + infos = [] + extra_model_outputs = defaultdict(list) - This test constructs an episode, stores it to its dictionary state and - recreates a new episode form this state. Thereby it ensures that all - atttributes are indeed identical to the primer episode and the data is - complete. - """ - # Create an empty episode. - episode = SingleAgentEpisode() - # Create an environment. - env = TestEnv() - # Add initial observation. + # Initialize observation and info. init_obs, init_info = env.reset() - episode.add_initial_observation( - initial_observation=init_obs, initial_info=init_info - ) - # Sample 100 steps. - for i in range(100): - action = i - obs, reward, is_terminated, is_truncated, info = env.step(action) - episode.add_timestep( - observation=obs, - action=action, - reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_output={"extra": np.random.random(1)}, - ) - - # Get the state and reproduce it from state. - state = episode.get_state() - episode_reproduced = SingleAgentEpisode.from_state(state) - - # Assert that the data is complete. - self.assertEqual(episode.id_, episode_reproduced.id_) - self.assertEqual(episode.t, episode_reproduced.t) - self.assertEqual(episode.t_started, episode_reproduced.t_started) - self.assertEqual(episode.is_terminated, episode_reproduced.is_terminated) - self.assertEqual(episode.is_truncated, episode_reproduced.is_truncated) - self.assertListEqual(episode.observations, episode_reproduced.observations) - self.assertListEqual(episode.actions, episode_reproduced.actions) - self.assertListEqual(episode.rewards, episode_reproduced.rewards) - self.assertListEqual(episode.infos, episode_reproduced.infos) - self.assertEqual(episode.is_terminated, episode_reproduced.is_terminated) - self.assertEqual(episode.is_truncated, episode_reproduced.is_truncated) - self.assertEqual(episode.states, episode_reproduced.states) - self.assertListEqual(episode.render_images, episode_reproduced.render_images) - self.assertDictEqual( - episode.extra_model_outputs, episode_reproduced.extra_model_outputs - ) - - # Assert that reconstruction breaks, if the data is not complete. - state[1][1].pop() - with self.assertRaises(AssertionError): - episode_reproduced = SingleAgentEpisode.from_state(state) - - def test_to_and_from_sample_batch(self): - """Tests if a `SingelAgentEpisode` can be reconstructed from a `SampleBatch`. - - This tests converst an episode to a `SampleBatch` and reconstructs the - episode then from this sample batch. It is then tested, if all data is - complete. - Note that `extra_model_outputs` are defined by the user and as the format - in the episode from which a `SampleBatch` was created is unknown this - reconstruction would only work, if the user does take care of it (as a - counter example just rempve the index [0] from the `extra_model_output`). - """ - # Create an empty episode. - episode = SingleAgentEpisode() - # Create an environment. - env = TestEnv() - # Add initial observation. - init_obs, init_obs = env.reset() - episode.add_initial_observation( - initial_observation=init_obs, initial_info=init_obs - ) - # Sample 100 steps. - for i in range(100): - action = i - obs, reward, is_terminated, is_truncated, info = env.step(action) - episode.add_timestep( - observation=obs, - action=action, - reward=reward, - info=info, - is_terminated=is_terminated, - is_truncated=is_truncated, - extra_model_output={"extra": np.random.random(1)[0]}, - ) + observations.append(init_obs) + infos.append(init_info) + # Run n samples. + for _ in range(num_data): + action = env.action_space.sample() + obs, reward, _, _, info = env.step(action) + observations.append(obs) + actions.append(action) + rewards.append(reward) + infos.append(info) + extra_model_outputs["extra_1"].append(np.random.random()) + extra_model_outputs["state_out"].append(np.random.random()) - # Create `SampleBatch`. - batch = episode.to_sample_batch() - # Reproduce form `SampleBatch`. - episode_reproduced = SingleAgentEpisode.from_sample_batch(batch) - # Assert that the data is complete. - self.assertEqual(episode.id_, episode_reproduced.id_) - self.assertEqual(episode.t, episode_reproduced.t) - self.assertEqual(episode.t_started, episode_reproduced.t_started) - self.assertEqual(episode.is_terminated, episode_reproduced.is_terminated) - self.assertEqual(episode.is_truncated, episode_reproduced.is_truncated) - self.assertListEqual(episode.observations, episode_reproduced.observations) - self.assertListEqual(episode.actions, episode_reproduced.actions) - self.assertListEqual(episode.rewards, episode_reproduced.rewards) - self.assertEqual(episode.infos, episode_reproduced.infos) - self.assertEqual(episode.is_terminated, episode_reproduced.is_terminated) - self.assertEqual(episode.is_truncated, episode_reproduced.is_truncated) - self.assertEqual(episode.states, episode_reproduced.states) - self.assertListEqual(episode.render_images, episode_reproduced.render_images) - self.assertDictEqual( - episode.extra_model_outputs, episode_reproduced.extra_model_outputs + return SingleAgentEpisode( + observations=observations, + infos=infos, + actions=actions, + rewards=rewards, + extra_model_outputs=extra_model_outputs, + t_started=t_started, + len_lookback_buffer=len_lookback_buffer, ) diff --git a/rllib/env/utils.py b/rllib/env/utils.py index a9deba95548db..30642462bbc10 100644 --- a/rllib/env/utils.py +++ b/rllib/env/utils.py @@ -1,7 +1,9 @@ import logging -from typing import Type, Union +from typing import List, Optional, Type, Union import gymnasium as gym +import numpy as np +import tree # pip install dm_tree from ray.rllib.env.env_context import EnvContext from ray.rllib.env.multi_agent_env import MultiAgentEnv @@ -14,6 +16,12 @@ EnvError, ) from ray.rllib.utils.gym import check_old_gym_env +from ray.rllib.utils.numpy import one_hot, one_hot_multidiscrete +from ray.rllib.utils.spaces.space_utils import ( + batch, + get_dummy_batch_for_space, + get_base_struct_from_space, +) from ray.util import log_once from ray.util.annotations import PublicAPI @@ -169,3 +177,328 @@ def _gym_env_creator( raise EnvError(ERR_MSG_INVALID_ENV_DESCRIPTOR.format(env_descriptor)) return env + + +class BufferWithInfiniteLookback: + def __init__( + self, + data: Optional[Union[List, np.ndarray]] = None, + lookback: int = 0, + space: Optional[gym.Space] = None, + ): + self.data = data if data is not None else [] + self.lookback = lookback + self.finalized = not isinstance(self.data, list) + self._final_len = None + self.space = space + self.space_struct = get_base_struct_from_space(self.space) + + def append(self, item) -> None: + """Appends the given item to the end of this buffer.""" + if self.finalized: + raise RuntimeError(f"Cannot `append` to a finalized {type(self).__name__}.") + self.data.append(item) + + def extend(self, items): + """Appends all items in `items` to the end of this buffer.""" + if self.finalized: + raise RuntimeError(f"Cannot `extend` a finalized {type(self).__name__}.") + for item in items: + self.append(item) + + def pop(self, index: int = -1): + """Removes the item at `index` from this buffer.""" + if self.finalized: + raise RuntimeError(f"Cannot `pop` from a finalized {type(self).__name__}.") + return self.data.pop(index) + + def finalize(self): + """Finalizes this buffer by converting internal data lists into numpy arrays. + + Thereby, if the individual items in the list are complex (nested 2) + """ + if not self.finalized: + self._final_len = len(self.data) - self.lookback + self.data = batch(self.data) + self.finalized = True + + def get( + self, + indices: Optional[Union[int, slice, List[int]]] = None, + neg_indices_left_of_zero: bool = False, + fill: Optional[float] = None, + one_hot_discrete: bool = False, + ): + """Returns data, based on the given args, from this buffer. + + Args: + indices: A single int is interpreted as an index, from which to return the + individual data stored at this index. + A list of ints is interpreted as a list of indices from which to gather + individual data in a batch of size len(indices). + A slice object is interpreted as a range of data to be returned. + Thereby, negative indices by default are interpreted as "before the end" + unless the `neg_indices_left_of_zero=True` option is used, in which case + negative indices are interpreted as "before ts=0", meaning going back + into the lookback buffer. + neg_indices_left_of_zero: If True, negative values in `indices` are + interpreted as "before ts=0", meaning going back into the lookback + buffer. For example, an buffer with data [4, 5, 6, 7, 8, 9], + where [4, 5, 6] is the lookback buffer range (ts=0 item is 7), will + respond to `get(-1, neg_indices_left_of_zero=True)` with `6` and to + `get(slice(-2, 1), neg_indices_left_of_zero=True)` with `[5, 6, 7]`. + fill: An optional float value to use for filling up the returned results at + the boundaries. This filling only happens if the requested index range's + start/stop boundaries exceed the buffer's boundaries (including the + lookback buffer on the left side). This comes in very handy, if users + don't want to worry about reaching such boundaries and want to zero-pad. + For example, a buffer with data [10, 11, 12, 13, 14] and lookback + buffer size of 2 (meaning `10` and `11` are part of the lookback buffer) + will respond to `get(slice(-7, -2), fill=0.0)` + with `[0.0, 0.0, 10, 11, 12]`. + one_hot_discrete: If True, will return one-hot vectors (instead of + int-values) for those sub-components of a (possibly complex) space + that are Discrete or MultiDiscrete. Note that if `fill=0` and the + requested `indices` are out of the range of our data, the returned + one-hot vectors will actually be zero-hot (all slots zero). + """ + if fill is not None and self.space is None: + raise ValueError( + f"Cannot use `fill` argument in `{type(self).__name__}.get()` if a " + "gym.Space was NOT provided during construction!" + ) + + if indices is None: + data = self._get_all_data(one_hot_discrete=one_hot_discrete) + elif isinstance(indices, slice): + data = self._get_slice( + indices, + fill=fill, + neg_indices_left_of_zero=neg_indices_left_of_zero, + one_hot_discrete=one_hot_discrete, + ) + elif isinstance(indices, list): + data = [ + self._get_int_index( + idx, + fill=fill, + neg_indices_left_of_zero=neg_indices_left_of_zero, + one_hot_discrete=one_hot_discrete, + ) + for idx in indices + ] + if self.finalized: + data = batch(data) + else: + assert isinstance(indices, int) + data = self._get_int_index( + indices, + fill=fill, + neg_indices_left_of_zero=neg_indices_left_of_zero, + one_hot_discrete=one_hot_discrete, + ) + + return data + + def __getitem__(self, item): + """Support squared bracket syntax, e.g. buffer[:5].""" + return self.get(item) + + def __len__(self): + """Return the length of our data, excluding the lookback buffer.""" + if self._final_len is not None: + assert self.finalized + return self._final_len + return len(self.data) - self.lookback + + def _get_all_data(self, one_hot_discrete=False): + data = self[:] + if one_hot_discrete: + data = self._one_hot(data, space_struct=self.space_struct) + return data + + def _get_slice( + self, + slice_, + fill=None, + neg_indices_left_of_zero=False, + one_hot_discrete=False, + ): + len_self_plus_lookback = len(self) + self.lookback + fill_left_count = fill_right_count = 0 + + # Re-interpret slice bounds as absolute positions (>=0) within our + # internal data. + start = slice_.start + stop = slice_.stop + + # Start is None -> Exclude lookback buffer. + if start is None: + start = self.lookback + # Start is negative. + elif start < 0: + # `neg_indices_left_of_zero=True` -> User wants to index into the lookback + # range. + if neg_indices_left_of_zero: + start = self.lookback + start + # Interpret index as counting "from end". + else: + start = len_self_plus_lookback + start + # Start is 0 or positive -> timestep right after lookback is interpreted as 0. + else: + start = self.lookback + start + + # Stop is None -> Set stop to very last index + 1 of our internal data. + if stop is None: + stop = len_self_plus_lookback + # Stop is negative. + elif stop < 0: + # `neg_indices_left_of_zero=True` -> User wants to index into the lookback + # range. Set to 0 (beginning of lookback buffer) if result is a negative + # index. + if neg_indices_left_of_zero: + stop = self.lookback + stop + # Interpret index as counting "from end". Set to 0 (beginning of actual + # episode) if result is a negative index. + else: + stop = len_self_plus_lookback + stop + # Stop is positive -> Add lookback range to it. + else: + stop = self.lookback + stop + + # Both start and stop are on left side. + if start < 0 and stop < 0: + fill_left_count = abs(start - stop) + fill_right_count = 0 + start = stop = 0 + # Both start and stop are on right side. + elif start >= len_self_plus_lookback and stop >= len_self_plus_lookback: + fill_right_count = abs(start - stop) + fill_left_count = 0 + start = stop = len_self_plus_lookback + # Set to 0 (beginning of actual episode) if result is a negative index. + elif start < 0: + fill_left_count = -start + start = 0 + elif stop >= len_self_plus_lookback: + fill_right_count = stop - len_self_plus_lookback + stop = len_self_plus_lookback + + assert start >= 0 and stop >= 0, (start, stop) + assert start <= len_self_plus_lookback and stop <= len_self_plus_lookback, ( + start, + stop, + ) + slice_ = slice(start, stop, slice_.step) + + # Perform the actual slice. + if self.finalized: + data_slice = tree.map_structure(lambda s: s[slice_], self.data) + else: + data_slice = self.data[slice_] + + if one_hot_discrete: + data_slice = self._one_hot(data_slice, space_struct=self.space_struct) + + # Data is shorter than the range requested -> Fill the rest with `fill` data. + if fill is not None and (fill_right_count > 0 or fill_left_count > 0): + if self.finalized: + if fill_left_count: + fill_batch = get_dummy_batch_for_space( + self.space, + fill_value=fill, + batch_size=fill_left_count, + one_hot_discrete=one_hot_discrete, + ) + data_slice = tree.map_structure( + lambda s0, s: np.concatenate([s0, s]), fill_batch, data_slice + ) + if fill_right_count: + fill_batch = get_dummy_batch_for_space( + self.space, + fill_value=fill, + batch_size=fill_right_count, + one_hot_discrete=one_hot_discrete, + ) + data_slice = tree.map_structure( + lambda s0, s: np.concatenate([s, s0]), fill_batch, data_slice + ) + + else: + fill_batch = [ + get_dummy_batch_for_space( + self.space, + fill_value=fill, + batch_size=0, + one_hot_discrete=one_hot_discrete, + ) + ] + data_slice = ( + fill_batch * fill_left_count + + data_slice + + fill_batch * fill_right_count + ) + + return data_slice + + def _get_int_index( + self, + idx: int, + fill=None, + neg_indices_left_of_zero=False, + one_hot_discrete=False, + ): + # If index >= 0 -> Ignore lookback buffer. + # Otherwise, include lookback buffer. + if idx >= 0 or neg_indices_left_of_zero: + idx = self.lookback + idx + # Negative indices mean: Go to left into lookback buffer starting from idx=0. + # But if we pass the lookback buffer, the index should be invalid and we will + # have to fill, if required. Invalidate the index by setting it to one larger + # than max. + if neg_indices_left_of_zero and idx < 0: + idx = len(self) + self.lookback + + try: + if self.finalized: + data = tree.map_structure(lambda s: s[idx], self.data) + else: + data = self.data[idx] + # Out of range index -> If `fill`, use a fill dummy (B=0), if not, error out. + except IndexError as e: + if fill is not None: + return get_dummy_batch_for_space( + self.space, + fill_value=fill, + batch_size=0, + one_hot_discrete=one_hot_discrete, + ) + else: + raise e + + # Convert discrete/multi-discrete components to one-hot vectors, if required. + if one_hot_discrete: + data = self._one_hot(data, self.space_struct) + return data + + def _one_hot(self, data, space_struct): + if space_struct is None: + raise ValueError( + f"Cannot `one_hot` data in `{type(self).__name__}` if a " + "gym.Space was NOT provided during construction!" + ) + + def _convert(dat_, space): + if isinstance(space, gym.spaces.Discrete): + return one_hot(dat_, depth=space.n) + elif isinstance(space, gym.spaces.MultiDiscrete): + return one_hot_multidiscrete(dat_, depths=space.nvec) + return dat_ + + if isinstance(data, list): + data = [ + tree.map_structure(_convert, dslice, space_struct) for dslice in data + ] + else: + data = tree.map_structure(_convert, data, space_struct) + return data diff --git a/rllib/evaluation/postprocessing_v2.py b/rllib/evaluation/postprocessing_v2.py index 9fae6c1ce3254..a50da0718e5d7 100644 --- a/rllib/evaluation/postprocessing_v2.py +++ b/rllib/evaluation/postprocessing_v2.py @@ -6,13 +6,13 @@ from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.core.models.base import STATE_IN from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.evaluation.postprocessing import discount_cumsum from ray.rllib.policy.sample_batch import concat_samples, SampleBatch from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.utils.torch_utils import convert_to_torch_tensor from ray.rllib.utils.typing import TensorType @@ -43,10 +43,10 @@ def postprocess_episodes_to_sample_batch( # a list. if isinstance(episode_or_list, list): for episode in episode_or_list: - batches.append(episode.to_sample_batch()) + batches.append(episode.get_sample_batch()) # During exploration we have an episode. else: - batches.append(episode_or_list.to_sample_batch()) + batches.append(episode_or_list.get_sample_batch()) batch = concat_samples(batches) # TODO (sven): During evalaution we do not have infos at all. diff --git a/rllib/tuned_examples/ppo/memory-leak-test-ppo-new-stack.py b/rllib/tuned_examples/ppo/memory-leak-test-ppo-new-stack.py new file mode 100644 index 0000000000000..f049692edfaf2 --- /dev/null +++ b/rllib/tuned_examples/ppo/memory-leak-test-ppo-new-stack.py @@ -0,0 +1,17 @@ +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner +from ray.rllib.examples.env.random_env import RandomLargeObsSpaceEnv + + +config = ( + PPOConfig() + .experimental(_enable_new_api_stack=True) + # Switch off np.random, which is known to have memory leaks. + .environment(RandomLargeObsSpaceEnv, env_config={"static_samples": True}) + .rollouts( + env_runner_cls=SingleAgentEnvRunner, + num_rollout_workers=4, + num_envs_per_worker=5, + ) + .training(train_batch_size=500, sgd_minibatch_size=256, num_sgd_iter=5) +) diff --git a/rllib/utils/numpy.py b/rllib/utils/numpy.py index 371885f3744b9..9f040c8a0c286 100644 --- a/rllib/utils/numpy.py +++ b/rllib/utils/numpy.py @@ -235,7 +235,8 @@ def flatten_inputs_to_1d_tensor( Args: inputs: The inputs to be flattened. - spaces_struct: The structure of the spaces that behind the input + spaces_struct: The (possibly nested) structure of the spaces that `inputs` + belongs to. time_axis: Whether all inputs have a time-axis (after the batch axis). If True, will keep not only the batch axis (0th), but the time axis (1st) as-is and flatten everything from the 2nd axis up. @@ -501,10 +502,7 @@ def one_hot( ) shape = x.shape - # Python 2.7 compatibility, (*shape, depth) is not allowed. - shape_list = list(shape[:]) - shape_list.append(depth) - out = np.ones(shape_list) * off_value + out = np.ones(shape=(*shape, depth)) * off_value indices = [] for i in range(x.ndim): tiles = [1] * x.ndim @@ -520,6 +518,22 @@ def one_hot( return out.astype(dtype) +@PublicAPI +def one_hot_multidiscrete(x, depths=List[int]): + # Handle torch arrays properly. + if torch and isinstance(x, torch.Tensor): + x = x.numpy() + + shape = x.shape + return np.concatenate( + [ + one_hot(x[i] if len(shape) == 1 else x[:, i], depth=n).astype(np.float32) + for i, n in enumerate(depths) + ], + axis=-1, + ) + + @PublicAPI def relu(x: np.ndarray, alpha: float = 0.0) -> np.ndarray: """Implementation of the leaky ReLU function. diff --git a/rllib/utils/replay_buffers/tests/test_episode_replay_buffer.py b/rllib/utils/replay_buffers/tests/test_episode_replay_buffer.py index 95662eb14cbfa..2f9dd1b20e10e 100644 --- a/rllib/utils/replay_buffers/tests/test_episode_replay_buffer.py +++ b/rllib/utils/replay_buffers/tests/test_episode_replay_buffer.py @@ -13,11 +13,11 @@ def _get_episode(episode_len=None, id_=None): eps = SingleAgentEpisode(id_=id_, observations=[0.0], infos=[{}]) ts = np.random.randint(1, 200) if episode_len is None else episode_len for t in range(ts): - eps.add_timestep( + eps.add_env_step( observation=float(t + 1), action=int(t), reward=0.1 * (t + 1), - info={}, + infos={}, ) eps.is_terminated = np.random.random() > 0.5 eps.is_truncated = False if eps.is_terminated else np.random.random() > 0.8 diff --git a/rllib/utils/spaces/space_utils.py b/rllib/utils/spaces/space_utils.py index c3d621189d8f8..426abb13c0121 100644 --- a/rllib/utils/spaces/space_utils.py +++ b/rllib/utils/spaces/space_utils.py @@ -100,11 +100,13 @@ def get_dummy_batch_for_space( fill_value: Union[float, int, str] = 0.0, time_size: Optional[int] = None, time_major: bool = False, + one_hot_discrete: bool = False, ) -> np.ndarray: """Returns batched dummy data (using `batch_size`) for the given `space`. Note: The returned batch will not pass a `space.contains(batch)` test - as an additional batch dimension has to be added as dim=0. + as an additional batch dimension has to be added at axis 0, unless `batch_size` is + set to 0. Args: space: The space to get a dummy batch for. @@ -114,23 +116,45 @@ def get_dummy_batch_for_space( fill_value: The value to fill the batch with or "random" for random values. time_size: If not None, add an optional time axis - of `time_size` size to the returned batch. + of `time_size` size to the returned batch. This time axis might either + be inserted at axis=1 (default) or axis=0, if `time_major` is True. time_major: If True AND `time_size` is not None, return batch as shape [T x B x ...], otherwise as [B x T x ...]. If `time_size` if None, ignore this setting and return [B x ...]. + one_hot_discrete: If True, will return one-hot vectors (instead of + int-values) for those sub-components of a (possibly complex) `space` + that are Discrete or MultiDiscrete. Note that in case `fill_value` is 0.0, + this will result in zero-hot vectors (where all slots have a value of 0.0). Returns: The dummy batch of size `bqtch_size` matching the given space. """ # Complex spaces. Perform recursive calls of this function. - if isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple)): + if isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple, dict, tuple)): + base_struct = space + if isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple)): + base_struct = get_base_struct_from_space(space) return tree.map_structure( - lambda s: get_dummy_batch_for_space(s, batch_size, fill_value), - get_base_struct_from_space(space), + lambda s: get_dummy_batch_for_space( + space=s, + batch_size=batch_size, + fill_value=fill_value, + time_size=time_size, + time_major=time_major, + one_hot_discrete=one_hot_discrete, + ), + base_struct, ) + + if one_hot_discrete: + if isinstance(space, gym.spaces.Discrete): + space = gym.spaces.Box(0.0, 1.0, (space.n,), np.float32) + elif isinstance(space, gym.spaces.MultiDiscrete): + space = gym.spaces.Box(0.0, 1.0, (np.sum(space.nvec),), np.float32) + # Primivite spaces: Box, Discrete, MultiDiscrete. # Random values: Use gym's sample() method. - elif fill_value == "random": + if fill_value == "random": if time_size is not None: assert batch_size > 0 and time_size > 0 if time_major: diff --git a/rllib/utils/spaces/tests/test_space_utils.py b/rllib/utils/spaces/tests/test_space_utils.py index c0200b6870ddb..69d007ed12d02 100644 --- a/rllib/utils/spaces/tests/test_space_utils.py +++ b/rllib/utils/spaces/tests/test_space_utils.py @@ -76,6 +76,12 @@ def test_unsquash_action(self): def test_batch_and_unbatch(self): """Tests the two utility functions `batch` and `unbatch`.""" + # Test, whether simple structs are batch/unbatch'able as well. + # B=8 + simple_struct = [0, 1, 2, 3, 4, 5, 6, 7] + simple_struct_batched = batch(simple_struct) + check(unbatch(simple_struct_batched), simple_struct) + # Create a complex struct of individual batches (B=2). complex_struct = { "a": ( diff --git a/rllib/utils/tests/run_memory_leak_tests.py b/rllib/utils/tests/run_memory_leak_tests.py index 5b770a0884106..4fc509fd7c880 100644 --- a/rllib/utils/tests/run_memory_leak_tests.py +++ b/rllib/utils/tests/run_memory_leak_tests.py @@ -93,6 +93,8 @@ # For python files, need to make sure, we only deliver the module name into the # `load_experiments_from_file` function (everything from "/ray/rllib" on). if file.endswith(".py"): + if file.endswith("__init__.py"): # weird CI learning test (BAZEL) case + continue experiments = load_experiments_from_file(file, SupportedFileType.python) else: experiments = load_experiments_from_file(file, SupportedFileType.yaml) @@ -122,11 +124,12 @@ leaking = True try: ray.init(num_cpus=5, local_mode=args.local_mode) - algo = get_trainable_cls(experiment["run"])(experiment["config"]) - results = check_memory_leaks( - algo, - to_check=set(args.to_check), - ) + if isinstance(experiment["run"], str): + algo_cls = get_trainable_cls(experiment["run"]) + else: + algo_cls = get_trainable_cls(experiment["run"].__name__) + algo = algo_cls(experiment["config"]) + results = check_memory_leaks(algo, to_check=set(args.to_check)) if not results: leaking = False finally: