From 2b483c47c19987a5786cbb674de870fc12bab636 Mon Sep 17 00:00:00 2001 From: lucaw Date: Mon, 11 Apr 2022 18:12:03 -0700 Subject: [PATCH 01/11] HM3D and increasing rollouts. --- ...ctnav_habitat_rgb_clipresnet50gru_ddppo.py | 34 ++--- ...lipresnet50gru_ddppo_increasingrollouts.py | 106 ++++++++++++++++ .../habitat/objectnav_habitat_base.py | 117 ++++++++++++------ 3 files changed, 204 insertions(+), 53 deletions(-) create mode 100644 projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo_increasingrollouts.py diff --git a/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py b/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py index c974254ec..ba068e9b2 100644 --- a/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py +++ b/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py @@ -1,6 +1,7 @@ from typing import Sequence, Union import torch.nn as nn +from torch.distributions.utils import lazy_property from allenact.base_abstractions.preprocessor import Preprocessor from allenact.utils.experiment_utils import Builder, TrainingPipeline @@ -25,19 +26,6 @@ class ObjectNavHabitatRGBClipResNet50GRUDDPPOExperimentConfig( CLIP_MODEL_TYPE = "RN50" - SENSORS = [ - RGBSensorHabitat( - height=ObjectNavHabitatBaseConfig.SCREEN_SIZE, - width=ObjectNavHabitatBaseConfig.SCREEN_SIZE, - use_resnet_normalization=True, - mean=ClipResNetPreprocessor.CLIP_RGB_MEANS, - stdev=ClipResNetPreprocessor.CLIP_RGB_STDS, - ), - TargetObjectSensorHabitat( - len(ObjectNavHabitatBaseConfig.DEFAULT_OBJECT_CATEGORIES_TO_IND) - ), - ] - def __init__(self, lr: float, **kwargs): super().__init__(**kwargs) @@ -50,6 +38,21 @@ def __init__(self, lr: float, **kwargs): goal_sensor_type=TargetObjectSensorHabitat, ) + @lazy_property + def SENSORS(self): + return [ + RGBSensorHabitat( + height=ObjectNavHabitatBaseConfig.SCREEN_SIZE, + width=ObjectNavHabitatBaseConfig.SCREEN_SIZE, + use_resnet_normalization=True, + mean=ClipResNetPreprocessor.CLIP_RGB_MEANS, + stdev=ClipResNetPreprocessor.CLIP_RGB_STDS, + ), + TargetObjectSensorHabitat( + len(self.DEFAULT_OBJECT_CATEGORIES_TO_IND) + ), + ] + def training_pipeline(self, **kwargs) -> TrainingPipeline: return ObjectNavPPOMixin.training_pipeline( lr=self.lr, @@ -67,4 +70,7 @@ def create_model(self, **kwargs) -> nn.Module: ) def tag(self): - return f"ObjectNav-Habitat-RGB-ClipResNet50GRU-DDPPO-lr{self.lr}" + return ( + f"{super(ObjectNavHabitatRGBClipResNet50GRUDDPPOExperimentConfig, self).tag()}" + f"-RGB-ClipResNet50GRU-DDPPO-lr{self.lr}" + ) diff --git a/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo_increasingrollouts.py b/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo_increasingrollouts.py new file mode 100644 index 000000000..4792f1a55 --- /dev/null +++ b/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo_increasingrollouts.py @@ -0,0 +1,106 @@ +import torch +import torch.optim as optim + +from allenact.algorithms.onpolicy_sync.losses import PPO +from allenact.algorithms.onpolicy_sync.losses.ppo import PPOConfig +from allenact.utils.experiment_utils import ( + Builder, + TrainingPipeline, + PipelineStage, + TrainingSettings, +) +from projects.objectnav_baselines.experiments.habitat.clip.objectnav_habitat_rgb_clipresnet50gru_ddppo import ( + ObjectNavHabitatRGBClipResNet50GRUDDPPOExperimentConfig, +) +from projects.objectnav_baselines.mixins import update_with_auxiliary_losses + + +class ObjectNavHabitatRGBClipResNet50GRUDDPPOIncreasingLengthExpConfig( + ObjectNavHabitatRGBClipResNet50GRUDDPPOExperimentConfig +): + def __init__(self, lr=1e-4, **kwargs): + super().__init__(lr, **kwargs) + self.lr = lr + + def training_pipeline(self, **kwargs) -> TrainingPipeline: + auxiliary_uuids = [] + multiple_beliefs = False + normalize_advantage = False + advance_scene_rollout_period = self.ADVANCE_SCENE_ROLLOUT_PERIOD + log_interval_small = ( + self.num_train_processes * 32 * 10 if torch.cuda.is_available() else 1 + ) + log_interval_med = ( + self.num_train_processes * 64 * 5 if torch.cuda.is_available() else 1 + ) + log_interval_large = ( + self.num_train_processes * 128 * 5 if torch.cuda.is_available() else 1 + ) + + batch_steps_0 = int(10e6) + batch_steps_1 = int(10e6) + batch_steps_2 = int(1e9) - batch_steps_0 - batch_steps_1 + + lr = self.lr + num_mini_batch = 1 + update_repeats = 4 + save_interval = 5000000 + gamma = 0.99 + use_gae = True + gae_lambda = 0.95 + max_grad_norm = 0.5 + + named_losses = { + "ppo_loss": (PPO(**PPOConfig, normalize_advantage=normalize_advantage), 1.0) + } + named_losses = update_with_auxiliary_losses( + named_losses=named_losses, + auxiliary_uuids=auxiliary_uuids, + multiple_beliefs=multiple_beliefs, + ) + + return TrainingPipeline( + save_interval=save_interval, + optimizer_builder=Builder(optim.Adam, dict(lr=lr)), + num_mini_batch=num_mini_batch, + update_repeats=update_repeats, + max_grad_norm=max_grad_norm, + named_losses={key: val[0] for key, val in named_losses.items()}, + gamma=gamma, + use_gae=use_gae, + gae_lambda=gae_lambda, + advance_scene_rollout_period=advance_scene_rollout_period, + pipeline_stages=[ + PipelineStage( + loss_names=["ppo_loss"], + max_stage_steps=batch_steps_0, + training_settings=TrainingSettings( + num_steps=32, metric_accumulate_interval=log_interval_small + ), + ), + PipelineStage( + loss_names=["ppo_loss"], + max_stage_steps=batch_steps_1, + training_settings=TrainingSettings( + num_steps=64, metric_accumulate_interval=log_interval_med, + ), + ), + PipelineStage( + loss_names=["ppo_loss"], + max_stage_steps=batch_steps_2, + training_settings=TrainingSettings( + num_steps=128, metric_accumulate_interval=log_interval_large, + ), + ), + ], + lr_scheduler_builder=None, + ) + + def tag(self): + return ( + super( + ObjectNavHabitatRGBClipResNet50GRUDDPPOIncreasingLengthExpConfig, self + ) + .tag() + .replace("-DDPPO-lr", "-DDPPO-IncRollouts-lr") + ) diff --git a/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py b/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py index 20ab4af2c..7b5a03777 100644 --- a/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py +++ b/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py @@ -1,3 +1,4 @@ +import glob import math import os from abc import ABC @@ -124,13 +125,6 @@ class ObjectNavHabitatBaseConfig(ObjectNavBaseConfig, ABC): FAILED_END_REWARD = -1.0 - TASK_DATA_DIR_TEMPLATE = os.path.join( - HABITAT_DATASETS_DIR, "objectnav/mp3d/v1/{}/{}.json.gz" - ) - BASE_CONFIG_YAML_PATH = os.path.join( - HABITAT_CONFIGS_DIR, "tasks/objectnav_mp3d.yaml" - ) - ACTION_SPACE = gym.spaces.Discrete(len(ObjectNavTask.class_action_names())) DEFAULT_NUM_TRAIN_PROCESSES = ( @@ -142,32 +136,9 @@ class ObjectNavHabitatBaseConfig(ObjectNavBaseConfig, ABC): DEFAULT_VALID_GPU_IDS = [torch.cuda.device_count() - 1] DEFAULT_TEST_GPU_IDS = tuple(range(torch.cuda.device_count())) - DEFAULT_OBJECT_CATEGORIES_TO_IND = { - "chair": 0, - "table": 1, - "picture": 2, - "cabinet": 3, - "cushion": 4, - "sofa": 5, - "bed": 6, - "chest_of_drawers": 7, - "plant": 8, - "sink": 9, - "toilet": 10, - "stool": 11, - "towel": 12, - "tv_monitor": 13, - "shower": 14, - "bathtub": 15, - "counter": 16, - "fireplace": 17, - "gym_equipment": 18, - "seating": 19, - "clothes": 20, - } - def __init__( self, + scene_dataset: str, # Should be "mp3d" or "hm3d" debug: bool = False, num_train_processes: Optional[int] = None, num_test_processes: Optional[int] = None, @@ -179,6 +150,8 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + + self.scene_dataset = scene_dataset self.debug = debug def v_or_default(v, default): @@ -224,6 +197,56 @@ def _create_config( num_episode_sample=num_episode_sample, ) + @lazy_property + def DEFAULT_OBJECT_CATEGORIES_TO_IND(self): + if self.scene_dataset == "mp3d": + return { + "chair": 0, + "table": 1, + "picture": 2, + "cabinet": 3, + "cushion": 4, + "sofa": 5, + "bed": 6, + "chest_of_drawers": 7, + "plant": 8, + "sink": 9, + "toilet": 10, + "stool": 11, + "towel": 12, + "tv_monitor": 13, + "shower": 14, + "bathtub": 15, + "counter": 16, + "fireplace": 17, + "gym_equipment": 18, + "seating": 19, + "clothes": 20, + } + elif self.scene_dataset == "hm3d": + return { + "chair": 0, + "bed": 1, + "plant": 2, + "toilet": 3, + "tv_monitor": 4, + "sofa": 5, + } + else: + raise NotImplementedError + + @lazy_property + def TASK_DATA_DIR_TEMPLATE(self): + return os.path.join( + HABITAT_DATASETS_DIR, f"objectnav/{self.scene_dataset}/v1/{{}}/{{}}.json.gz" + ) + + @lazy_property + def BASE_CONFIG_YAML_PATH(self): + return os.path.join( + HABITAT_CONFIGS_DIR, f"tasks/objectnav_{self.scene_dataset}.yaml" + ) + @lazy_property def TRAIN_CONFIG(self): return self._create_config( @@ -265,10 +288,20 @@ def TRAIN_CONFIGS_PER_PROCESS(self): for config in configs: assert len(config.DATASET.CONTENT_SCENES) == 1 scene_name = config.DATASET.CONTENT_SCENES[0] - glb_path = os.path.join( - scenes_dir, "mp3d", scene_name, f"{scene_name}.glb" + + paths = glob.glob( + os.path.join( + scenes_dir, self.scene_dataset, "**", f"{scene_name}.*" + ), + recursive=True, ) - memory_use_per_config.append(os.path.getsize(glb_path)) + + if self.scene_dataset == "mp3d": + assert len(paths) == 4 + else: + assert len(paths) == 2 + + memory_use_per_config.append(sum(os.path.getsize(p) for p in paths)) max_configs_per_device = math.ceil(len(configs) / len(self.train_gpu_ids)) mem_per_device = np.array([0.0 for _ in range(len(self.train_gpu_ids))]) @@ -290,10 +323,17 @@ def TRAIN_CONFIGS_PER_PROCESS(self): configs = sum(configs_per_device, []) if self.debug: - get_logger().warning("IN DEBUG MODE, WILL ONLY USE `1LXtFkjw3qL` SCENE!!!") + get_logger().warning( + "IN DEBUG MODE, WILL ONLY USE `1LXtFkjw3qL` SCENE IN MP3D OR `1S7LAXRdDqK` scene in HM3D!!!" + ) for config in configs: config.defrost() - config.DATASET.CONTENT_SCENES = ["1LXtFkjw3qL"] + if self.scene_dataset == "mp3d": + config.DATASET.CONTENT_SCENES = ["1LXtFkjw3qL"] + elif self.scene_dataset == "hm3d": + config.DATASET.CONTENT_SCENES = ["1S7LAXRdDqK"] + else: + raise NotImplementedError config.freeze() return configs @@ -312,9 +352,8 @@ def test_scenes_path(self): return self.TASK_DATA_DIR_TEMPLATE.format(*(["val"] * 2)) # return self.TASK_DATA_DIR_TEMPLATE.format(*(["test"] * 2)) - @classmethod - def tag(cls): - return "ObjectNav" + def tag(self): + return f"ObjectNav-Habitat-{self.scene_dataset.upper()}" def preprocessors(self) -> Sequence[Union[Preprocessor, Builder[Preprocessor]]]: return tuple() From bb300987cb0eb2cd1a7002c3a733be5095606eba Mon Sep 17 00:00:00 2001 From: lucaw Date: Tue, 19 Apr 2022 11:41:26 -0700 Subject: [PATCH 02/11] Minor inference agent / vision sensor / habitat experiment improvements. --- .../onpolicy_sync/vector_sampled_tasks.py | 85 +++++++++++++------ allenact/embodiedai/sensors/vision_sensors.py | 27 ++++-- allenact/utils/inference.py | 4 + ...ctnav_habitat_rgb_clipresnet50gru_ddppo.py | 4 +- .../habitat/objectnav_habitat_base.py | 8 +- 5 files changed, 87 insertions(+), 41 deletions(-) diff --git a/allenact/algorithms/onpolicy_sync/vector_sampled_tasks.py b/allenact/algorithms/onpolicy_sync/vector_sampled_tasks.py index f4045f2fc..981a0fff0 100644 --- a/allenact/algorithms/onpolicy_sync/vector_sampled_tasks.py +++ b/allenact/algorithms/onpolicy_sync/vector_sampled_tasks.py @@ -144,6 +144,7 @@ class VectorSampledTasks: _mp_ctx: BaseContext _connection_read_fns: List[Callable[[], Any]] _connection_write_fns: List[Callable[[Any], None]] + _read_timeout: Optional[float] def __init__( self, @@ -154,12 +155,16 @@ def __init__( mp_ctx: Optional[BaseContext] = None, should_log: bool = True, max_processes: Optional[int] = None, + read_timeout: Optional[ + float + ] = 60, # Seconds to wait for a task to return a response before timing out ) -> None: self._is_waiting = False self._is_closed = True self.should_log = should_log self.max_processes = max_processes + self.read_timeout = read_timeout assert ( sampler_fn_args is not None and len(sampler_fn_args) > 0 @@ -195,7 +200,8 @@ def __init__( for args in sampler_fn_args: args["mp_ctx"] = self._mp_ctx ( - self._connection_read_fns, + connection_poll_fns, + connection_read_fns, self._connection_write_fns, ) = self._spawn_workers( # noqa make_sampler_fn=make_sampler_fn, @@ -204,6 +210,13 @@ def __init__( ], ) + self._connection_read_fns = [ + self._create_read_function_with_timeout( + read_fn=read_fn, poll_fn=poll_fn, timeout=self.read_timeout + ) + for read_fn, poll_fn in zip(connection_read_fns, connection_poll_fns) + ] + self._is_closed = False for write_fn in self._connection_write_fns: @@ -234,6 +247,25 @@ def __init__( space for read_fn in self._connection_read_fns for space in read_fn() ] + @staticmethod + def _create_read_function_with_timeout( + *, + read_fn: Callable[[], Any], + poll_fn: Callable[[float], bool], + timeout: Optional[float], + ) -> Callable[[], Any]: + def read_with_timeout(timeout_to_use: Optional[float] = timeout): + if timeout_to_use is not None: + # noinspection PyArgumentList + if not poll_fn(timeout=timeout_to_use): + raise TimeoutError( + f"Did not recieve output from `VectorSampledTask` worker for {timeout_to_use} seconds." + ) + + return read_fn() + + return read_with_timeout + def _reset_sampler_index_to_process_ind_and_subprocess_ind(self): self.sampler_index_to_process_ind_and_subprocess_ind = [ [i, j] @@ -297,7 +329,7 @@ def _task_sampling_loop_worker( """process worker for creating and interacting with the Tasks/TaskSampler.""" - ptitle("VectorSampledTask: {}".format(worker_id)) + ptitle(f"VectorSampledTask: {worker_id}") sp_vector_sampled_tasks = SingleProcessVectorSampledTasks( make_sampler_fn=make_sampler_fn, @@ -307,7 +339,7 @@ def _task_sampling_loop_worker( ) if parent_pipe is not None: - parent_pipe.close() + parent_pipe.close() # Means this pipe will close when the calling process closes it try: while True: read_input = connection_read_fn() @@ -368,7 +400,9 @@ def _task_sampling_loop_worker( if should_log: get_logger().info(f"Worker {worker_id} KeyboardInterrupt") except Exception as e: - get_logger().error(traceback.format_exc()) + get_logger().error( + f"Worker {worker_id} encountered an exception:\n{traceback.format_exc()}" + ) raise e finally: if child_pipe is not None: @@ -380,52 +414,50 @@ def _spawn_workers( self, make_sampler_fn: Callable[..., TaskSampler], sampler_fn_args_list: Sequence[Sequence[Dict[str, Any]]], - ) -> Tuple[List[Callable[[], Any]], List[Callable[[Any], None]]]: + ) -> Tuple[ + List[Callable[[], bool]], List[Callable[[], Any]], List[Callable[[Any], None]] + ]: parent_connections, worker_connections = zip( *[self._mp_ctx.Pipe(duplex=True) for _ in range(self._num_processes)] ) self._workers = [] k = 0 id: Union[int, str] - for id, stuff in enumerate( + for id, (worker_conn, parent_conn, current_sampler_fn_args_list) in enumerate( zip(worker_connections, parent_connections, sampler_fn_args_list) ): - worker_conn, parent_conn, current_sampler_fn_args_list = stuff # type: ignore - if len(current_sampler_fn_args_list) != 1: - id = "{}({}-{})".format( - id, k, k + len(current_sampler_fn_args_list) - 1 - ) + id = f"{id}({k}-{k + len(current_sampler_fn_args_list) - 1})" k += len(current_sampler_fn_args_list) if self.should_log: get_logger().info( - "Starting {}-th VectorSampledTask worker with args {}".format( - id, current_sampler_fn_args_list - ) + f"Starting {id}-th VectorSampledTask worker with args {current_sampler_fn_args_list}" ) + ps = self._mp_ctx.Process( # type: ignore target=self._task_sampling_loop_worker, - args=( - id, - worker_conn.recv, - worker_conn.send, - make_sampler_fn, - current_sampler_fn_args_list, - self._auto_resample_when_done, - self.should_log, - worker_conn, - parent_conn, + kwargs=dict( + worker_id=id, + connection_read_fn=worker_conn.recv, + connection_write_fn=worker_conn.send, + make_sampler_fn=make_sampler_fn, + sampler_fn_args_list=current_sampler_fn_args_list, + auto_resample_when_done=self._auto_resample_when_done, + should_log=self.should_log, + child_pipe=worker_conn, + parent_pipe=parent_conn, ), ) self._workers.append(ps) ps.daemon = True ps.start() - worker_conn.close() + worker_conn.close() # Means this pipe will close when the child process closes it time.sleep( 0.1 ) # Useful to ensure things don't lock up when spawning many envs return ( + [p.poll for p in parent_connections], [p.recv for p in parent_connections], [p.send for p in parent_connections], ) @@ -593,7 +625,8 @@ def close(self) -> None: if self._is_waiting: for read_fn in self._connection_read_fns: try: - read_fn() + # noinspection PyArgumentList + read_fn(0) # Time out immediately except Exception: pass diff --git a/allenact/embodiedai/sensors/vision_sensors.py b/allenact/embodiedai/sensors/vision_sensors.py index 33a7b2fd5..3ca2a3cb5 100644 --- a/allenact/embodiedai/sensors/vision_sensors.py +++ b/allenact/embodiedai/sensors/vision_sensors.py @@ -162,28 +162,37 @@ def frame_from_env(self, env: EnvType, task: Optional[SubTaskType]) -> np.ndarra def process_img(self, img: np.ndarray): assert ( - img.dtype == np.float32 and (len(img.shape) == 2 or img.shape[-1] == 1) - ) or (img.shape[-1] == 3 and img.dtype == np.uint8), ( + np.issubdtype(img.dtype, np.float32) + and (len(img.shape) == 2 or img.shape[-1] == 1) + ) or (img.shape[-1] == 3 and np.issubdtype(img.dtype, np.uint8)), ( "Input frame must either have 3 channels and be of" " type np.uint8 or have one channel and be of type np.float32" ) - if self._scale_first: - if self.scaler is not None and img.shape[:2] != (self._height, self._width): - img = np.array(self.scaler(self.to_pil(img)), dtype=img.dtype) # hwc + if ( + self._scale_first + and self.scaler is not None + and img.shape[:2] != (self._height, self._width) + ): + img = np.array(self.scaler(self.to_pil(img)), dtype=img.dtype) # hwc + elif np.issubdtype(img.dtype, np.float32): + img = img.copy() assert img.dtype in [np.uint8, np.float32] - if img.dtype == np.uint8: + if np.issubdtype(img.dtype, np.uint8): img = img.astype(np.float32) / 255.0 if self._should_normalize: img -= self._norm_means img /= self._norm_sds - if not self._scale_first: - if self.scaler is not None and img.shape[:2] != (self._height, self._width): - img = np.array(self.scaler(self.to_pil(img)), dtype=np.float32) # hwc + if ( + (not self._scale_first) + and self.scaler is not None + and img.shape[:2] != (self._height, self._width) + ): + img = np.array(self.scaler(self.to_pil(img)), dtype=np.float32) # hwc return img diff --git a/allenact/utils/inference.py b/allenact/utils/inference.py index 0cef51bea..9fef87086 100644 --- a/allenact/utils/inference.py +++ b/allenact/utils/inference.py @@ -27,6 +27,7 @@ class InferenceAgent: memory: Optional[Memory] = attr.ib(default=None) steps_taken_in_task: int = attr.ib(default=0) last_action_flat: Optional = attr.ib(default=None) + has_initialized: Optional = attr.ib(default=False) def __attrs_post_init__(self): self.actor_critic.eval() @@ -74,6 +75,8 @@ def from_experiment_config( ) def reset(self): + if self.has_initialized: + self.rollout_storage.after_updates() self.steps_taken_in_task = 0 self.memory = None @@ -84,6 +87,7 @@ def act(self, observations: ObservationType): obs_batch = self.sensor_preprocessor_graph.get_observations(obs_batch) if self.steps_taken_in_task == 0: + self.has_initialized = True self.rollout_storage.initialize( observations=obs_batch, num_samplers=1, diff --git a/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py b/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py index ba068e9b2..3826f75d8 100644 --- a/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py +++ b/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py @@ -48,9 +48,7 @@ def SENSORS(self): mean=ClipResNetPreprocessor.CLIP_RGB_MEANS, stdev=ClipResNetPreprocessor.CLIP_RGB_STDS, ), - TargetObjectSensorHabitat( - len(self.DEFAULT_OBJECT_CATEGORIES_TO_IND) - ), + TargetObjectSensorHabitat(len(self.DEFAULT_OBJECT_CATEGORIES_TO_IND)), ] def training_pipeline(self, **kwargs) -> TrainingPipeline: diff --git a/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py b/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py index 7b5a03777..9b6f0405a 100644 --- a/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py +++ b/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py @@ -1,16 +1,18 @@ import glob import math import os +import warnings from abc import ABC from typing import Dict, Any, List, Optional, Sequence, Union import gym + +# noinspection PyUnresolvedReferences +import habitat import numpy as np import torch from torch.distributions.utils import lazy_property -# noinspection PyUnresolvedReferences -import habitat from allenact.base_abstractions.experiment_config import MachineParams from allenact.base_abstractions.preprocessor import ( SensorPreprocessorGraph, @@ -323,7 +325,7 @@ def TRAIN_CONFIGS_PER_PROCESS(self): configs = sum(configs_per_device, []) if self.debug: - get_logger().warning( + warnings.warning( "IN DEBUG MODE, WILL ONLY USE `1LXtFkjw3qL` SCENE IN MP3D OR `1S7LAXRdDqK` scene in HM3D!!!" ) for config in configs: From caa61ade7cbaf618a7e6a463b4e42aded486ba82 Mon Sep 17 00:00:00 2001 From: lucaw Date: Tue, 19 Apr 2022 11:41:47 -0700 Subject: [PATCH 03/11] First attempt at introducing fault tolerance to training. --- allenact/algorithms/onpolicy_sync/engine.py | 50 +++++++++++++++++--- allenact/algorithms/onpolicy_sync/runner.py | 13 ++++- allenact/algorithms/onpolicy_sync/storage.py | 9 ++-- 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/allenact/algorithms/onpolicy_sync/engine.py b/allenact/algorithms/onpolicy_sync/engine.py index a458cf9c9..24501687a 100644 --- a/allenact/algorithms/onpolicy_sync/engine.py +++ b/allenact/algorithms/onpolicy_sync/engine.py @@ -113,6 +113,7 @@ def __init__( deterministic_agents: bool = False, max_sampler_processes_per_worker: Optional[int] = None, initial_model_state_dict: Optional[Union[Dict[str, Any], int]] = None, + try_restart_after_task_timeout: bool = False, **kwargs, ): """Initializer. @@ -140,6 +141,7 @@ def __init__( self.device = torch.device("cpu") if device == -1 else torch.device(device) # type: ignore self.distributed_ip = distributed_ip self.distributed_port = distributed_port + self.try_restart_after_task_timeout = try_restart_after_task_timeout self.mode = mode.lower().strip() assert self.mode in [ @@ -1414,11 +1416,47 @@ def run_pipeline(self): for k, v in self.training_pipeline.current_stage_storage.items() } - for step in range(cur_stage_training_settings.num_steps): - num_paused = self.collect_step_across_all_task_samplers( - rollout_storage_uuid=self.training_pipeline.rollout_storage_uuid, - uuid_to_storage=uuid_to_storage, - ) + vector_tasks_already_restarted = False + step = -1 + while step < cur_stage_training_settings.num_steps - 1: + step += 1 + + try: + num_paused = self.collect_step_across_all_task_samplers( + rollout_storage_uuid=self.training_pipeline.rollout_storage_uuid, + uuid_to_storage=uuid_to_storage, + ) + except TimeoutError: + if ( + not self.try_restart_after_task_timeout + ) or self.mode != TRAIN_MODE_STR: + # Apparently you can just call `raise` here and doing so will just raise the exception as though + # it was not caught (so the stacktrace isn't messed up) + raise + elif vector_tasks_already_restarted: + raise RuntimeError( + f"[{self.mode} worker {self.worker_id}] `vector_tasks` has timed out twice in the same" + f" rollout. This suggests that this error was not recoverable. Timeout exception:\n{traceback.format_exc()}" + ) + else: + get_logger().warning( + f"[{self.mode} worker {self.worker_id}] `vector_tasks` appears to have crashed during" + f" training as it has timed out. You have set `try_restart_after_task_timeout` to `True` so" + f" we will attempt to restart these tasks from the beginning. USE THIS FEATURE AT YOUR OWN" + f" RISK. Timeout exception:\n{traceback.format_exc()}." + ) + self.vector_tasks.close() + self._vector_tasks = None + + vector_tasks_already_restarted = True + for ( + storage + ) in self.training_pipeline.current_stage_storage.values(): + storage.after_updates() + self.initialize_storage_and_viz( + storage_to_initialize=list(uuid_to_storage.values()) + ) + num_paused = 0 # A more informative error message should already have been thrown in be given in # `collect_step_across_all_task_samplers` if `num_paused != 0` here but this serves @@ -1595,7 +1633,7 @@ def train( get_logger().error( f"[{self.mode} worker {self.worker_id}] Encountered {type(e).__name__}, exiting." ) - get_logger().exception(traceback.format_exc()) + get_logger().error(traceback.format_exc()) finally: if training_completed_successfully: if self.worker_id == 0: diff --git a/allenact/algorithms/onpolicy_sync/runner.py b/allenact/algorithms/onpolicy_sync/runner.py index 872d09ebf..7a3120d72 100644 --- a/allenact/algorithms/onpolicy_sync/runner.py +++ b/allenact/algorithms/onpolicy_sync/runner.py @@ -281,9 +281,9 @@ def handler(_signo, _frame): except Exception: get_logger().error( f"Error occurred when closing the RL engine used by work {mode}-{id}." - f" We cannot recover from this and will simply exit. The exception:" + f" We cannot recover from this and will simply exit. The exception:\n" + f"{traceback.format_exc()}" ) - get_logger().exception(traceback.format_exc()) sys.exit(1) sys.exit(0) else: @@ -429,6 +429,15 @@ def start_train( distributed_port = 0 if num_workers == 1 else self.get_port() + if ( + num_workers > 1 + and "NCCL_ASYNC_ERROR_HANDLING" not in os.environ + and "NCCL_BLOCKING_WAIT" not in os.environ + ): + # This ensures the NCCL distributed backend will throw errors + # if we timeout at a call to `barrier()` + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" + worker_ids = self.local_worker_ids(TRAIN_MODE_STR) model_hash = None diff --git a/allenact/algorithms/onpolicy_sync/storage.py b/allenact/algorithms/onpolicy_sync/storage.py index 504b29a9f..c6b8fc14d 100644 --- a/allenact/algorithms/onpolicy_sync/storage.py +++ b/allenact/algorithms/onpolicy_sync/storage.py @@ -199,7 +199,7 @@ def initialize( self.full_size + 1, num_samplers, action_flat_dim, device=self.device ) - assert self.step == 0, "Must call `after_update` before calling `initialize`" + assert self.step == 0, "Must call `after_updates` before calling `initialize`" self.insert_observations(observations=observations, time_step=0) self.prev_actions[0].zero_() # Have to zero previous actions self.masks[0].zero_() # Have to zero masks @@ -529,8 +529,11 @@ def after_updates(self, **kwargs): for key in storage: storage[key][0][0].copy_(storage[key][0][-1]) - self.masks[0].copy_(self.masks[-1]) - self.prev_actions[0].copy_(self.prev_actions[-1]) + if self._masks_full is not None: + self.masks[0].copy_(self.masks[-1]) + + if self._prev_actions_full is not None: + self.prev_actions[0].copy_(self.prev_actions[-1]) self._before_update_called = False self._advantages = None From b8f4664cbe028ca826497cbb2695500e9a2021d7 Mon Sep 17 00:00:00 2001 From: lucaw Date: Wed, 20 Apr 2022 08:50:05 -0700 Subject: [PATCH 04/11] Fixing pointnav experiments. --- .../habitat/pointnav_habitat_depth_simpleconvgru_ddppo.py | 2 +- .../habitat/pointnav_habitat_rgb_simpleconvgru_ddppo.py | 2 +- .../habitat/pointnav_habitat_rgbd_simpleconvgru_ddppo.py | 2 +- .../ithor/pointnav_ithor_depth_simpleconvgru_ddppo.py | 2 +- .../ithor/pointnav_ithor_rgb_simpleconvgru_ddppo.py | 2 +- .../ithor/pointnav_ithor_rgbd_simpleconvgru_ddppo.py | 2 +- .../pointnav_baselines/experiments/pointnav_thor_base.py | 5 ++++- .../robothor/pointnav_robothor_depth_simpleconvgru_ddppo.py | 2 +- .../robothor/pointnav_robothor_rgb_simpleconvgru_ddppo.py | 2 +- .../robothor/pointnav_robothor_rgbd_simpleconvgru_ddppo.py | 2 +- 10 files changed, 13 insertions(+), 10 deletions(-) diff --git a/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_depth_simpleconvgru_ddppo.py b/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_depth_simpleconvgru_ddppo.py index a201ddf16..d0076105d 100644 --- a/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_depth_simpleconvgru_ddppo.py +++ b/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_depth_simpleconvgru_ddppo.py @@ -31,7 +31,7 @@ def __init__(self): super().__init__() self.model_creation_handler = PointNavUnfrozenResNetWithGRUActorCriticMixin( - backbone="simpleconv", + backbone="simple_cnn", sensors=self.SENSORS, auxiliary_uuids=[], add_prev_actions=True, diff --git a/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_rgb_simpleconvgru_ddppo.py b/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_rgb_simpleconvgru_ddppo.py index f43e001e7..bf07f6dff 100644 --- a/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_rgb_simpleconvgru_ddppo.py +++ b/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_rgb_simpleconvgru_ddppo.py @@ -31,7 +31,7 @@ def __init__(self): super().__init__() self.model_creation_handler = PointNavUnfrozenResNetWithGRUActorCriticMixin( - backbone="simpleconv", + backbone="simple_cnn", sensors=self.SENSORS, auxiliary_uuids=[], add_prev_actions=True, diff --git a/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_rgbd_simpleconvgru_ddppo.py b/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_rgbd_simpleconvgru_ddppo.py index 3da80b140..b9d79a1e7 100644 --- a/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_rgbd_simpleconvgru_ddppo.py +++ b/projects/pointnav_baselines/experiments/habitat/pointnav_habitat_rgbd_simpleconvgru_ddppo.py @@ -37,7 +37,7 @@ def __init__(self): super().__init__() self.model_creation_handler = PointNavUnfrozenResNetWithGRUActorCriticMixin( - backbone="simpleconv", + backbone="simple_cnn", sensors=self.SENSORS, auxiliary_uuids=[], add_prev_actions=True, diff --git a/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_depth_simpleconvgru_ddppo.py b/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_depth_simpleconvgru_ddppo.py index 7dd5af097..df7ee39f0 100644 --- a/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_depth_simpleconvgru_ddppo.py +++ b/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_depth_simpleconvgru_ddppo.py @@ -30,7 +30,7 @@ def __init__(self): super().__init__() self.model_creation_handler = PointNavUnfrozenResNetWithGRUActorCriticMixin( - backbone="simpleconv", + backbone="simple_cnn", sensors=self.SENSORS, auxiliary_uuids=[], add_prev_actions=True, diff --git a/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_rgb_simpleconvgru_ddppo.py b/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_rgb_simpleconvgru_ddppo.py index 4bd711c5c..e6af23663 100644 --- a/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_rgb_simpleconvgru_ddppo.py +++ b/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_rgb_simpleconvgru_ddppo.py @@ -27,7 +27,7 @@ def __init__(self): super().__init__() self.model_creation_handler = PointNavUnfrozenResNetWithGRUActorCriticMixin( - backbone="simpleconv", + backbone="simple_cnn", sensors=self.SENSORS, auxiliary_uuids=[], add_prev_actions=True, diff --git a/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_rgbd_simpleconvgru_ddppo.py b/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_rgbd_simpleconvgru_ddppo.py index fe28eb004..dcfa647d3 100644 --- a/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_rgbd_simpleconvgru_ddppo.py +++ b/projects/pointnav_baselines/experiments/ithor/pointnav_ithor_rgbd_simpleconvgru_ddppo.py @@ -35,7 +35,7 @@ def __init__(self): super().__init__() self.model_creation_handler = PointNavUnfrozenResNetWithGRUActorCriticMixin( - backbone="simpleconv", + backbone="simple_cnn", sensors=self.SENSORS, auxiliary_uuids=[], add_prev_actions=True, diff --git a/projects/pointnav_baselines/experiments/pointnav_thor_base.py b/projects/pointnav_baselines/experiments/pointnav_thor_base.py index d98d0e7a5..945761f30 100644 --- a/projects/pointnav_baselines/experiments/pointnav_thor_base.py +++ b/projects/pointnav_baselines/experiments/pointnav_thor_base.py @@ -64,6 +64,9 @@ def __init__(self): renderDepthImage=any(isinstance(s, DepthSensorThor) for s in self.SENSORS), ) + def preprocessors(self): + return tuple() + def machine_params(self, mode="train", **kwargs): sampler_devices: Sequence[int] = [] if mode == "train": @@ -91,7 +94,7 @@ def machine_params(self, mode="train", **kwargs): sensor_preprocessor_graph = ( SensorPreprocessorGraph( source_observation_spaces=SensorSuite(self.SENSORS).observation_spaces, - preprocessors=self.PREPROCESSORS, + preprocessors=self.preprocessors(), ) if mode == "train" or ( diff --git a/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_depth_simpleconvgru_ddppo.py b/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_depth_simpleconvgru_ddppo.py index db41055aa..a767b2fd4 100644 --- a/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_depth_simpleconvgru_ddppo.py +++ b/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_depth_simpleconvgru_ddppo.py @@ -32,7 +32,7 @@ def __init__(self): super().__init__() self.model_creation_handler = PointNavUnfrozenResNetWithGRUActorCriticMixin( - backbone="simpleconv", + backbone="simple_cnn", sensors=self.SENSORS, auxiliary_uuids=[], add_prev_actions=True, diff --git a/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_rgb_simpleconvgru_ddppo.py b/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_rgb_simpleconvgru_ddppo.py index 74d4007f6..27e935318 100644 --- a/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_rgb_simpleconvgru_ddppo.py +++ b/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_rgb_simpleconvgru_ddppo.py @@ -28,7 +28,7 @@ def __init__(self): super().__init__() self.model_creation_handler = PointNavUnfrozenResNetWithGRUActorCriticMixin( - backbone="simpleconv", + backbone="simple_cnn", sensors=self.SENSORS, auxiliary_uuids=[], add_prev_actions=True, diff --git a/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_rgbd_simpleconvgru_ddppo.py b/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_rgbd_simpleconvgru_ddppo.py index a900a59b3..ab236fbe6 100644 --- a/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_rgbd_simpleconvgru_ddppo.py +++ b/projects/pointnav_baselines/experiments/robothor/pointnav_robothor_rgbd_simpleconvgru_ddppo.py @@ -36,7 +36,7 @@ def __init__(self): super().__init__() self.model_creation_handler = PointNavUnfrozenResNetWithGRUActorCriticMixin( - backbone="simpleconv", + backbone="simple_cnn", sensors=self.SENSORS, auxiliary_uuids=[], add_prev_actions=True, From 04b1a2c1a98c382a782b7f7a5bfb0ba1f8f9b303 Mon Sep 17 00:00:00 2001 From: lucaw Date: Wed, 20 Apr 2022 13:00:20 -0700 Subject: [PATCH 05/11] Habitat experiment warning fix and making timeouts not happen when debugging. --- allenact/algorithms/onpolicy_sync/engine.py | 13 ++++++++++++- .../experiments/habitat/objectnav_habitat_base.py | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/allenact/algorithms/onpolicy_sync/engine.py b/allenact/algorithms/onpolicy_sync/engine.py index 24501687a..a3b5699cd 100644 --- a/allenact/algorithms/onpolicy_sync/engine.py +++ b/allenact/algorithms/onpolicy_sync/engine.py @@ -77,6 +77,16 @@ ) from allenact.utils.viz_utils import VizSuite +try: + # When debugging we don't want to timeout in the VectorSampledTasks + + # noinspection PyPackageRequirements + import pydevd + + DEBUGGING = True +except ImportError: + DEBUGGING = False + TRAIN_MODE_STR = "train" VALID_MODE_STR = "valid" TEST_MODE_STR = "test" @@ -248,7 +258,7 @@ def __init__( # During testing, we sometimes found that default timeout was too short # resulting in the run terminating surprisingly, we increase it here. timeout=datetime.timedelta(minutes=3000) - if self.mode == TEST_MODE_STR + if (self.mode == TEST_MODE_STR or DEBUGGING) else dist.default_pg_timeout, ) self.is_distributed = True @@ -302,6 +312,7 @@ def vector_tasks( else None, mp_ctx=self.mp_ctx, max_processes=self.max_sampler_processes_per_worker, + read_timeout=None if DEBUGGING else 60 ) return self._vector_tasks diff --git a/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py b/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py index 9b6f0405a..2ae1b4f52 100644 --- a/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py +++ b/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py @@ -325,7 +325,7 @@ def TRAIN_CONFIGS_PER_PROCESS(self): configs = sum(configs_per_device, []) if self.debug: - warnings.warning( + warnings.warn( "IN DEBUG MODE, WILL ONLY USE `1LXtFkjw3qL` SCENE IN MP3D OR `1S7LAXRdDqK` scene in HM3D!!!" ) for config in configs: From 21b663a6007c31122fa84b1b8542b24e9e7878ef Mon Sep 17 00:00:00 2001 From: lucaw Date: Tue, 26 Apr 2022 15:55:47 -0700 Subject: [PATCH 06/11] add_prev_action option and fix to inference agent. --- allenact/utils/inference.py | 4 ++-- .../objectnav_baselines/experiments/clip/mixins.py | 3 ++- .../objectnav_habitat_rgb_clipresnet50gru_ddppo.py | 2 +- .../experiments/habitat/objectnav_habitat_base.py | 13 ++++++++----- .../objectnav_robothor_rgb_clipresnet50gru_ddppo.py | 5 +++-- ...jectnav_robothor_rgb_clipresnet50x16gru_ddppo.py | 5 +++-- .../pointnav_habitat_rgb_clipresnet50gru_ddppo.py | 7 +++++-- 7 files changed, 24 insertions(+), 15 deletions(-) diff --git a/allenact/utils/inference.py b/allenact/utils/inference.py index 9fef87086..4549ef936 100644 --- a/allenact/utils/inference.py +++ b/allenact/utils/inference.py @@ -13,8 +13,8 @@ DistributionType, ) from allenact.base_abstractions.preprocessor import SensorPreprocessorGraph -from allenact.utils.tensor_utils import batch_observations from allenact.utils import spaces_utils as su +from allenact.utils.tensor_utils import batch_observations @attr.s(kw_only=True) @@ -121,7 +121,7 @@ def act(self, observations: ObservationType): self.steps_taken_in_task += 1 - if self.steps_taken_in_task % self.steps_before_rollout_refresh: + if self.steps_taken_in_task % self.steps_before_rollout_refresh == 0: self.rollout_storage.after_updates() return su.action_list(self.actor_critic.action_space, self.last_action_flat)[0] diff --git a/projects/objectnav_baselines/experiments/clip/mixins.py b/projects/objectnav_baselines/experiments/clip/mixins.py index 8ef3b59fa..09e7f3b84 100644 --- a/projects/objectnav_baselines/experiments/clip/mixins.py +++ b/projects/objectnav_baselines/experiments/clip/mixins.py @@ -67,7 +67,7 @@ def preprocessors(self) -> Sequence[Union[Preprocessor, Builder[Preprocessor]]]: return preprocessors - def create_model(self, num_actions: int, **kwargs) -> nn.Module: + def create_model(self, num_actions: int, add_prev_actions: bool, **kwargs) -> nn.Module: has_rgb = any(isinstance(s, RGBSensor) for s in self.sensors) has_depth = any(isinstance(s, DepthSensor) for s in self.sensors) @@ -84,4 +84,5 @@ def create_model(self, num_actions: int, **kwargs) -> nn.Module: depth_resnet_preprocessor_uuid="depth_clip_resnet" if has_depth else None, hidden_size=512, goal_dims=32, + add_prev_actions=add_prev_actions ) diff --git a/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py b/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py index 3826f75d8..7d255d8fc 100644 --- a/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py +++ b/projects/objectnav_baselines/experiments/habitat/clip/objectnav_habitat_rgb_clipresnet50gru_ddppo.py @@ -64,7 +64,7 @@ def preprocessors(self) -> Sequence[Union[Preprocessor, Builder[Preprocessor]]]: def create_model(self, **kwargs) -> nn.Module: return self.preprocessing_and_model.create_model( - num_actions=self.ACTION_SPACE.n, **kwargs + num_actions=self.ACTION_SPACE.n, add_prev_actions=self.add_prev_actions, **kwargs ) def tag(self): diff --git a/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py b/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py index 2ae1b4f52..7e23353d4 100644 --- a/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py +++ b/projects/objectnav_baselines/experiments/habitat/objectnav_habitat_base.py @@ -6,13 +6,12 @@ from typing import Dict, Any, List, Optional, Sequence, Union import gym - -# noinspection PyUnresolvedReferences -import habitat import numpy as np import torch from torch.distributions.utils import lazy_property +# noinspection PyUnresolvedReferences +import habitat from allenact.base_abstractions.experiment_config import MachineParams from allenact.base_abstractions.preprocessor import ( SensorPreprocessorGraph, @@ -116,7 +115,6 @@ class ObjectNavHabitatBaseConfig(ObjectNavBaseConfig, ABC): # CPCA8Loss.UUID, # CPCA16Loss.UUID, ] - ADD_PREV_ACTIONS = False MULTIPLE_BELIEFS = False BELIEF_FUSION = ( # choose one None @@ -149,6 +147,7 @@ def __init__( train_gpu_ids: Optional[Sequence[int]] = None, val_gpu_ids: Optional[Sequence[int]] = None, test_gpu_ids: Optional[Sequence[int]] = None, + add_prev_actions: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -172,6 +171,7 @@ def v_or_default(v, default): val_gpu_ids, self.DEFAULT_VALID_GPU_IDS if run_valid else [] ) self.test_gpu_ids = v_or_default(test_gpu_ids, self.DEFAULT_TEST_GPU_IDS) + self.add_prev_actions = add_prev_actions def _create_config( self, @@ -355,7 +355,10 @@ def test_scenes_path(self): # return self.TASK_DATA_DIR_TEMPLATE.format(*(["test"] * 2)) def tag(self): - return f"ObjectNav-Habitat-{self.scene_dataset.upper()}" + t = f"ObjectNav-Habitat-{self.scene_dataset.upper()}" + if not self.add_prev_actions: + return t + return f"{t}-PrevActions" def preprocessors(self) -> Sequence[Union[Preprocessor, Builder[Preprocessor]]]: return tuple() diff --git a/projects/objectnav_baselines/experiments/robothor/clip/objectnav_robothor_rgb_clipresnet50gru_ddppo.py b/projects/objectnav_baselines/experiments/robothor/clip/objectnav_robothor_rgb_clipresnet50gru_ddppo.py index e9db7aae0..b027e27b9 100644 --- a/projects/objectnav_baselines/experiments/robothor/clip/objectnav_robothor_rgb_clipresnet50gru_ddppo.py +++ b/projects/objectnav_baselines/experiments/robothor/clip/objectnav_robothor_rgb_clipresnet50gru_ddppo.py @@ -38,7 +38,7 @@ class ObjectNavRoboThorClipRGBPPOExperimentConfig(ObjectNavRoboThorBaseConfig): ), ] - def __init__(self, **kwargs): + def __init__(self, add_prev_actions: bool = False, **kwargs): super().__init__(**kwargs) self.preprocessing_and_model = ClipResNetPreprocessGRUActorCriticMixin( @@ -47,6 +47,7 @@ def __init__(self, **kwargs): screen_size=self.SCREEN_SIZE, goal_sensor_type=GoalObjectTypeThorSensor, ) + self.add_prev_actions = add_prev_actions def training_pipeline(self, **kwargs) -> TrainingPipeline: return ObjectNavPPOMixin.training_pipeline( @@ -60,7 +61,7 @@ def preprocessors(self) -> Sequence[Union[Preprocessor, Builder[Preprocessor]]]: def create_model(self, **kwargs) -> nn.Module: return self.preprocessing_and_model.create_model( - num_actions=self.ACTION_SPACE.n, **kwargs + num_actions=self.ACTION_SPACE.n, add_prev_actions=self.add_prev_actions, **kwargs ) @classmethod diff --git a/projects/objectnav_baselines/experiments/robothor/clip/objectnav_robothor_rgb_clipresnet50x16gru_ddppo.py b/projects/objectnav_baselines/experiments/robothor/clip/objectnav_robothor_rgb_clipresnet50x16gru_ddppo.py index 80220d4aa..e772ae9f5 100644 --- a/projects/objectnav_baselines/experiments/robothor/clip/objectnav_robothor_rgb_clipresnet50x16gru_ddppo.py +++ b/projects/objectnav_baselines/experiments/robothor/clip/objectnav_robothor_rgb_clipresnet50x16gru_ddppo.py @@ -38,7 +38,7 @@ class ObjectNavRoboThorRGBPPOExperimentConfig(ObjectNavRoboThorBaseConfig): ), ] - def __init__(self, **kwargs): + def __init__(self, add_prev_actions: bool = False, **kwargs): super().__init__(**kwargs) self.preprocessing_and_model = ClipResNetPreprocessGRUActorCriticMixin( @@ -47,6 +47,7 @@ def __init__(self, **kwargs): screen_size=self.SCREEN_SIZE, goal_sensor_type=GoalObjectTypeThorSensor, ) + self.add_prev_actions = add_prev_actions def training_pipeline(self, **kwargs) -> TrainingPipeline: return ObjectNavPPOMixin.training_pipeline( @@ -60,7 +61,7 @@ def preprocessors(self) -> Sequence[Union[Preprocessor, Builder[Preprocessor]]]: def create_model(self, **kwargs) -> nn.Module: return self.preprocessing_and_model.create_model( - num_actions=self.ACTION_SPACE.n, **kwargs + num_actions=self.ACTION_SPACE.n, add_prev_actions=self.add_prev_actions, **kwargs ) @classmethod diff --git a/projects/pointnav_baselines/experiments/habitat/clip/pointnav_habitat_rgb_clipresnet50gru_ddppo.py b/projects/pointnav_baselines/experiments/habitat/clip/pointnav_habitat_rgb_clipresnet50gru_ddppo.py index 3375b8c7a..c3271d6d8 100644 --- a/projects/pointnav_baselines/experiments/habitat/clip/pointnav_habitat_rgb_clipresnet50gru_ddppo.py +++ b/projects/pointnav_baselines/experiments/habitat/clip/pointnav_habitat_rgb_clipresnet50gru_ddppo.py @@ -35,7 +35,7 @@ class PointNavHabitatRGBClipResNet50GRUDDPPOExperimentConfig(PointNavHabitatBase TargetCoordinatesSensorHabitat(coordinate_dims=2), ] - def __init__(self, **kwargs): + def __init__(self, add_prev_actions: bool = False, **kwargs): super().__init__(**kwargs) self.preprocessing_and_model = ClipResNetPreprocessGRUActorCriticMixin( @@ -44,6 +44,7 @@ def __init__(self, **kwargs): screen_size=self.SCREEN_SIZE, goal_sensor_type=TargetCoordinatesSensorHabitat, ) + self.add_prev_actions = add_prev_actions def training_pipeline(self, **kwargs) -> TrainingPipeline: return PointNavPPOMixin.training_pipeline( @@ -58,7 +59,9 @@ def preprocessors(self) -> Sequence[Union[Preprocessor, Builder[Preprocessor]]]: def create_model(self, **kwargs) -> nn.Module: return self.preprocessing_and_model.create_model( - num_actions=self.ACTION_SPACE.n, **kwargs + num_actions=self.ACTION_SPACE.n, + add_prev_actions=self.add_prev_actions, + **kwargs, ) @classmethod From 481c44980fcf6e59ca2693646805e99258aaf2d1 Mon Sep 17 00:00:00 2001 From: lucaw Date: Thu, 28 Apr 2022 12:46:33 -0700 Subject: [PATCH 07/11] Additional inference agent parameter. --- allenact/utils/inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/allenact/utils/inference.py b/allenact/utils/inference.py index 4549ef936..744c45d9e 100644 --- a/allenact/utils/inference.py +++ b/allenact/utils/inference.py @@ -46,10 +46,11 @@ def from_experiment_config( exp_config: ExperimentConfig, device: torch.device, checkpoint_path: Optional[str] = None, + mode: str = "test", ): rollout_storage = exp_config.training_pipeline().rollout_storage - machine_params = exp_config.machine_params("test") + machine_params = exp_config.machine_params(mode) if not isinstance(machine_params, MachineParams): machine_params = MachineParams(**machine_params) From 6aaeb1b4efe5edae0099dbdd09e6333ed03e063a Mon Sep 17 00:00:00 2001 From: lucaw Date: Thu, 28 Apr 2022 21:44:32 -0700 Subject: [PATCH 08/11] Longer task timeouts. --- allenact/algorithms/onpolicy_sync/engine.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/allenact/algorithms/onpolicy_sync/engine.py b/allenact/algorithms/onpolicy_sync/engine.py index a3b5699cd..612a508e5 100644 --- a/allenact/algorithms/onpolicy_sync/engine.py +++ b/allenact/algorithms/onpolicy_sync/engine.py @@ -24,7 +24,6 @@ import torch.multiprocessing as mp # type: ignore import torch.nn as nn import torch.optim as optim - # noinspection PyProtectedMember from torch._C._distributed_c10d import ReduceOp @@ -312,7 +311,7 @@ def vector_tasks( else None, mp_ctx=self.mp_ctx, max_processes=self.max_sampler_processes_per_worker, - read_timeout=None if DEBUGGING else 60 + read_timeout=None if DEBUGGING else 5 * 60 ) return self._vector_tasks From 18fd558d54451aeaa8fdf529b423b8aed68068a0 Mon Sep 17 00:00:00 2001 From: lucaw Date: Tue, 10 May 2022 10:29:16 -0700 Subject: [PATCH 09/11] Improvements to the `get_open_x_displays` function. --- allenact_plugins/ithor_plugin/ithor_util.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/allenact_plugins/ithor_plugin/ithor_util.py b/allenact_plugins/ithor_plugin/ithor_util.py index 16627c1d5..0446bc75d 100644 --- a/allenact_plugins/ithor_plugin/ithor_util.py +++ b/allenact_plugins/ithor_plugin/ithor_util.py @@ -2,6 +2,8 @@ import math import os import platform +import traceback +import warnings from contextlib import contextmanager from typing import Sequence @@ -70,11 +72,14 @@ def get_open_x_displays(throw_error_if_empty: bool = False) -> Sequence[str]: for open_display_str in sorted(open_display_strs): try: open_display_str = str(int(open_display_str)) + display = Xlib.display.Display(f":{open_display_str}") except Exception: + warnings.warn( + f"Encountered error when attempting to open display :{open_display_str}," + f" error message:\n{traceback.format_exc()}" + ) continue - display = Xlib.display.Display(":{}".format(open_display_str)) - displays.extend( [f"{open_display_str}.{i}" for i in range(display.screen_count())] ) From 7f3da1076c07ba2b671cb0761a7c964d0fe25be4 Mon Sep 17 00:00:00 2001 From: lucaw Date: Thu, 12 May 2022 16:16:40 -0700 Subject: [PATCH 10/11] Fix to experimental timeout code. --- allenact/algorithms/onpolicy_sync/engine.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/allenact/algorithms/onpolicy_sync/engine.py b/allenact/algorithms/onpolicy_sync/engine.py index 612a508e5..c6bf903d7 100644 --- a/allenact/algorithms/onpolicy_sync/engine.py +++ b/allenact/algorithms/onpolicy_sync/engine.py @@ -86,6 +86,10 @@ except ImportError: DEBUGGING = False +DEBUG_VST_TIMEOUT: Optional[int] = (lambda x: int(x) if x is not None else x)( + os.getenv("ALLENACT_DEBUG_VST_TIMEOUT", None) +) + TRAIN_MODE_STR = "train" VALID_MODE_STR = "valid" TEST_MODE_STR = "test" @@ -311,7 +315,7 @@ def vector_tasks( else None, mp_ctx=self.mp_ctx, max_processes=self.max_sampler_processes_per_worker, - read_timeout=None if DEBUGGING else 5 * 60 + read_timeout=DEBUG_VST_TIMEOUT if DEBUGGING else 5 * 60, ) return self._vector_tasks @@ -1466,7 +1470,8 @@ def run_pipeline(self): self.initialize_storage_and_viz( storage_to_initialize=list(uuid_to_storage.values()) ) - num_paused = 0 + step = -1 + continue # A more informative error message should already have been thrown in be given in # `collect_step_across_all_task_samplers` if `num_paused != 0` here but this serves From 1d3c64ae1f4d9dc44763d7e9c449e01add0464af Mon Sep 17 00:00:00 2001 From: lucaw Date: Thu, 2 Jun 2022 10:25:14 -0700 Subject: [PATCH 11/11] Updating pytest.yml --- .github/workflows/pytest.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 4a291ca6b..822f8800f 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -28,6 +28,7 @@ jobs: python -m pip install --editable="./allenact_plugins[all]" python -m pip install pip install -r allenact_plugins/babyai_plugin/extra_requirements.txt # Required as babyai is not on PyPI python -m pip install compress_pickle # Needed for some mapping tests + python -m pip install -U protobuf==3.20.1 # Required until tensorboardX is fixed: https://github.com/lanpa/tensorboardX/issues/663 pip list - name: Test with pytest