From 40f338f6ef4c6c267c16eb5006b374f0c4dfb083 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Wed, 24 Jan 2024 22:47:43 +0100 Subject: [PATCH 01/19] Adds imitation learning example --- examples/sb3_imitation.py | 201 ++++++++++++++++++++ godot_rl/wrappers/sbg_single_obs_wrapper.py | 13 +- 2 files changed, 209 insertions(+), 5 deletions(-) create mode 100644 examples/sb3_imitation.py diff --git a/examples/sb3_imitation.py b/examples/sb3_imitation.py new file mode 100644 index 00000000..5796c7a3 --- /dev/null +++ b/examples/sb3_imitation.py @@ -0,0 +1,201 @@ +import argparse +import json +import os +import pathlib +from math import log + +import imitation.data +import numpy as np +from imitation.algorithms.adversarial.gail import GAIL +from imitation.rewards.reward_nets import BasicRewardNet +from imitation.util.networks import RunningNorm +from stable_baselines3 import PPO +from stable_baselines3.common.evaluation import evaluate_policy +from stable_baselines3.common.vec_env.vec_monitor import VecMonitor + +from godot_rl.wrappers.onnx.stable_baselines_export import export_ppo_model_as_onnx +from godot_rl.wrappers.sbg_single_obs_wrapper import SBGSingleObsEnv + +parser = argparse.ArgumentParser(allow_abbrev=False) +parser.add_argument( + "--env_path", + default=None, + type=str, + help="The Godot binary to use, do not include for in editor training", +) +parser.add_argument( + "--demo_files", + nargs="+", + type=str, + help="""One or more files with recoded expert demos, with a space in between, e.g. "demo1.json", demo2.json""", +) +parser.add_argument("--seed", type=int, default=0, help="seed of the experiment") +parser.add_argument( + "--save_model_path", + default=None, + type=str, + help="The path to use for saving the trained sb3 model after training is complete. Saved model can be used later " + "to resume training. Extension will be set to .zip", +) +parser.add_argument( + "--onnx_export_path", + default=None, + type=str, + help="The Godot binary to use, do not include for in editor training", +) +parser.add_argument( + "--inference", + default=False, + action="store_true", + help="Instead of training, it will run inference on a loaded model for --timesteps steps. " + "Requires --resume_model_path to be set.", +) +parser.add_argument( + "--viz", + action="store_true", + help="If set, the simulation will be displayed in a window during training. Otherwise " + "training will run without rendering the simulation. This setting does not apply to in-editor training.", + default=False, +) +parser.add_argument( + "--run_eval_after_training", + action="store_true", + help="Evaluate policy in an env after training in a single env. 
Will always visualize.", + default=False, +) +parser.add_argument("--speedup", default=1, type=int, help="Whether to speed up the physics in the env") +parser.add_argument( + "--n_parallel", + default=1, + type=int, + help="How many instances of the environment executable to " "launch - requires --env_path to be set if > 1.", +) +parser.add_argument( + "--il_timesteps", + default=100_000, + type=int, + help="How many timesteps to train for using imitation learning.", +) +parser.add_argument( + "--rl_timesteps", + default=0, + type=int, + help="[Optional] Additional timesteps to train for using RL after IL.", +) +parser.add_argument( + "--eval_episode_count", + default=0, + type=int, + help="[Optional] How many episodes to evaluate for after training.", +) + + +args, extras = parser.parse_known_args() + + +def handle_onnx_export(): + # Enforce the extension of onnx and zip when saving model to avoid potential conflicts in case of same name + # and extension used for both + if args.onnx_export_path is not None: + path_onnx = pathlib.Path(args.onnx_export_path).with_suffix(".onnx") + print("Exporting onnx to: " + os.path.abspath(path_onnx)) + export_ppo_model_as_onnx(learner, str(path_onnx)) + + +def handle_model_save(): + if args.save_model_path is not None: + zip_save_path = pathlib.Path(args.save_model_path).with_suffix(".zip") + print("Saving model to: " + os.path.abspath(zip_save_path)) + learner.save(zip_save_path) + + +def close_env(): + try: + print("closing env") + env.close() + except Exception as e: + print("Exception while closing env: ", e) + + +trajectories = [] + +for file_path in args.demo_files: + with open(file_path, "r") as file: + data = json.load(file) + + for i in range(0, len(data)): + trajectories.append( + imitation.data.rollout.types.Trajectory( + obs=np.array(data[i][0]), + acts=np.array(data[i][1]), + infos=None, + terminal=True, + ) + ) + + +env = SBGSingleObsEnv( + env_path=args.env_path, + show_window=args.viz, + seed=args.seed, + n_parallel=args.n_parallel, + speedup=args.speedup, + obs_key="obs", +) + +env = VecMonitor(env) + + +policy_kwargs = dict(log_std_init=log(1.0)) + +learner = PPO( + batch_size=64, + env=env, + policy="MlpPolicy", + learning_rate=0.0003, + clip_range=0.3, + n_epochs=20, + n_steps=64, + ent_coef=0.0001, + target_kl=0.025, + policy_kwargs=policy_kwargs, +) + +reward_net = BasicRewardNet( + observation_space=env.observation_space, + action_space=env.action_space, + normalize_input_layer=RunningNorm, +) + +gail_trainer = GAIL( + demonstrations=trajectories, + demo_batch_size=64, + gen_replay_buffer_capacity=64, + n_disc_updates_per_round=24, + venv=env, + gen_algo=learner, + reward_net=reward_net, + allow_variable_horizon=True, +) + +print("Starting Imitation Learning Training using GAIL:") +gail_trainer.train(args.il_timesteps) + +if args.rl_timesteps: + print("Starting RL Training:") + learner.learn(args.rl_timesteps, progress_bar=True) + +close_env() + +if args.eval_episode_count: + print("Evaluating:") + env = SBGSingleObsEnv(env_path=args.env_path, show_window=True, seed=args.seed, n_parallel=1, speedup=args.speedup) + env = VecMonitor(env) + from stable_baselines3.common.evaluation import evaluate_policy + + mean_reward, _ = evaluate_policy(learner, env, n_eval_episodes=args.eval_episode_count) + print(f"Mean reward after evaluation: {mean_reward}") + +close_env() +handle_onnx_export() +handle_model_save() diff --git a/godot_rl/wrappers/sbg_single_obs_wrapper.py b/godot_rl/wrappers/sbg_single_obs_wrapper.py index d4136430..2f261c19 
100644 --- a/godot_rl/wrappers/sbg_single_obs_wrapper.py +++ b/godot_rl/wrappers/sbg_single_obs_wrapper.py @@ -5,11 +5,8 @@ from godot_rl.wrappers.stable_baselines_wrapper import StableBaselinesGodotEnv -# A variant of the Stable Baselines Godot Env that only supports a single -# obs space from the dictionary - obs["obs"] by default. - -# This provides some basic support for using envs that have a single obs -# space with policies other than MultiInputPolicy. +# A variant of the Stable Baselines Godot Env that only supports a single obs space from the dictionary - obs["obs"] by default. +# This provides some basic support for using envs that have a single obs space with policies other than MultiInputPolicy. class SBGSingleObsEnv(StableBaselinesGodotEnv): @@ -19,6 +16,12 @@ def __init__(self, obs_key="obs", *args, **kwargs) -> None: def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, List[Dict[str, Any]]]: obs, rewards, term, info = super().step(action) + + # Terminal obs info is needed for imitation learning + for idx, done in enumerate(term): + if done: + info[idx]["terminal_observation"] = obs[self.obs_key][idx] + return obs[self.obs_key], rewards, term, info def reset(self) -> np.ndarray: From f233169fe93859120d3883c6838420ec2d59ded4 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Wed, 24 Jan 2024 23:13:19 +0100 Subject: [PATCH 02/19] Remove duplicated import in sb3_imitation.py --- examples/sb3_imitation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/sb3_imitation.py b/examples/sb3_imitation.py index 5796c7a3..16498323 100644 --- a/examples/sb3_imitation.py +++ b/examples/sb3_imitation.py @@ -191,8 +191,6 @@ def close_env(): print("Evaluating:") env = SBGSingleObsEnv(env_path=args.env_path, show_window=True, seed=args.seed, n_parallel=1, speedup=args.speedup) env = VecMonitor(env) - from stable_baselines3.common.evaluation import evaluate_policy - mean_reward, _ = evaluate_policy(learner, env, n_eval_episodes=args.eval_episode_count) print(f"Mean reward after evaluation: {mean_reward}") From 4a373db9989d3fc5d3d027cd6865de19a113978f Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Thu, 25 Jan 2024 18:43:25 +0100 Subject: [PATCH 03/19] Update to sb3 imitation support - Adds a draft of the doc page for imitation learning - Adds support for exporting onnx when using `SBGSingleObsEnv` --- docs/IMITATION_LEARNING.md | 7 +++ examples/sb3_imitation.py | 10 +++- .../wrappers/onnx/stable_baselines_export.py | 56 ++++++++++++++----- 3 files changed, 56 insertions(+), 17 deletions(-) create mode 100644 docs/IMITATION_LEARNING.md diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md new file mode 100644 index 00000000..1f896bac --- /dev/null +++ b/docs/IMITATION_LEARNING.md @@ -0,0 +1,7 @@ +# Imitation Learning + +For imitation learning, we use the imitation library: https://github.com/HumanCompatibleAI/imitation + +From the docs: +> Imitation provides clean implementations of imitation and reward learning algorithms, under a unified and user-friendly API. Currently, we have implementations of Behavioral Cloning, DAgger (with synthetic examples), density-based reward modeling, Maximum Causal Entropy Inverse Reinforcement Learning, Adversarial Inverse Reinforcement Learning, Generative Adversarial Imitation Learning, and Deep RL from Human Preferences. 
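The bundled example (`examples/sb3_imitation.py`) uses the library's GAIL implementation, but the same recorded trajectories can also be fed to other algorithms from the list above. As a rough illustration, here is a minimal sketch of Behavioral Cloning; it is not part of the example script, it assumes `env` (the `VecMonitor`-wrapped `SBGSingleObsEnv`) and `trajectories` (a list of `imitation.data.types.Trajectory`) have been built exactly as in that script, and the exact `bc.BC` arguments may differ slightly between imitation versions:

```python
# Minimal Behavioral Cloning sketch (not part of the example script).
# Assumes `env` and `trajectories` were created as in examples/sb3_imitation.py.
import numpy as np
from imitation.algorithms import bc

bc_trainer = bc.BC(
    observation_space=env.observation_space,
    action_space=env.action_space,
    demonstrations=trajectories,   # trajectories are flattened into transitions internally
    rng=np.random.default_rng(0),
)
bc_trainer.train(n_epochs=50)      # plain supervised learning on the recorded demos
# bc_trainer.policy can then be evaluated with evaluate_policy(), like the PPO learner.
```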
+ diff --git a/examples/sb3_imitation.py b/examples/sb3_imitation.py index 16498323..8e5d301b 100644 --- a/examples/sb3_imitation.py +++ b/examples/sb3_imitation.py @@ -99,7 +99,7 @@ def handle_onnx_export(): if args.onnx_export_path is not None: path_onnx = pathlib.Path(args.onnx_export_path).with_suffix(".onnx") print("Exporting onnx to: " + os.path.abspath(path_onnx)) - export_ppo_model_as_onnx(learner, str(path_onnx)) + export_ppo_model_as_onnx(learner, str(path_onnx), use_obs_array=True) def handle_model_save(): @@ -189,7 +189,13 @@ def close_env(): if args.eval_episode_count: print("Evaluating:") - env = SBGSingleObsEnv(env_path=args.env_path, show_window=True, seed=args.seed, n_parallel=1, speedup=args.speedup) + env = SBGSingleObsEnv( + env_path=args.env_path, + show_window=True, + seed=args.seed, + n_parallel=1, + speedup=args.speedup, + ) env = VecMonitor(env) mean_reward, _ = evaluate_policy(learner, env, n_eval_episodes=args.eval_episode_count) print(f"Mean reward after evaluation: {mean_reward}") diff --git a/godot_rl/wrappers/onnx/stable_baselines_export.py b/godot_rl/wrappers/onnx/stable_baselines_export.py index 19f679f9..67803ce3 100644 --- a/godot_rl/wrappers/onnx/stable_baselines_export.py +++ b/godot_rl/wrappers/onnx/stable_baselines_export.py @@ -3,24 +3,39 @@ class OnnxableMultiInputPolicy(torch.nn.Module): - def __init__(self, obs_keys, features_extractor, mlp_extractor, action_net, value_net): + def __init__( + self, + obs_keys, + features_extractor, + mlp_extractor, + action_net, + value_net, + use_obs_array, + ): super().__init__() self.obs_keys = obs_keys self.features_extractor = features_extractor self.mlp_extractor = mlp_extractor self.action_net = action_net self.value_net = value_net + self.use_obs_array = use_obs_array def forward(self, obs, state_ins): - obs_dict = {k: v for k, v in zip(self.obs_keys, obs)} # NOTE: You may have to process (normalize) observation in the correct # way before using this. 
See `common.preprocessing.preprocess_obs` - features = self.features_extractor(obs_dict) + features = None + + if self.use_obs_array: + features = self.features_extractor(obs) + else: + obs_dict = {k: v for k, v in zip(self.obs_keys, obs)} + features = self.features_extractor(obs_dict) + action_hidden, value_hidden = self.mlp_extractor(features) return self.action_net(action_hidden), state_ins -def export_ppo_model_as_onnx(ppo: PPO, onnx_model_path: str): +def export_ppo_model_as_onnx(ppo: PPO, onnx_model_path: str, use_obs_array: bool = False): ppo_policy = ppo.policy.to("cpu") onnxable_model = OnnxableMultiInputPolicy( ["obs"], @@ -28,12 +43,17 @@ def export_ppo_model_as_onnx(ppo: PPO, onnx_model_path: str): ppo_policy.mlp_extractor, ppo_policy.action_net, ppo_policy.value_net, + use_obs_array, ) - dummy_input = dict(ppo.observation_space.sample()) - for k, v in dummy_input.items(): - dummy_input[k] = torch.from_numpy(v).unsqueeze(0) - dummy_input = [v for v in dummy_input.values()] + if use_obs_array: + dummy_input = torch.unsqueeze(torch.tensor(ppo.observation_space.sample()), 0) + else: + dummy_input = dict(ppo.observation_space.sample()) + for k, v in dummy_input.items(): + dummy_input[k] = torch.from_numpy(v).unsqueeze(0) + dummy_input = [v for v in dummy_input.values()] + torch.onnx.export( onnxable_model, args=(dummy_input, torch.zeros(1).float()), @@ -48,10 +68,10 @@ def export_ppo_model_as_onnx(ppo: PPO, onnx_model_path: str): "state_outs": {0: "batch_size"}, }, ) - verify_onnx_export(ppo, onnx_model_path) + verify_onnx_export(ppo, onnx_model_path, use_obs_array=use_obs_array) -def verify_onnx_export(ppo: PPO, onnx_model_path: str, num_tests=10): +def verify_onnx_export(ppo: PPO, onnx_model_path: str, num_tests=10, use_obs_array: bool = False): import numpy as np import onnx import onnxruntime as ort @@ -63,16 +83,22 @@ def verify_onnx_export(ppo: PPO, onnx_model_path: str, num_tests=10): ort_sess = ort.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"]) for i in range(num_tests): - obs = dict(ppo.observation_space.sample()) + obs = None + obs2 = None - obs2 = {} - for k, v in obs.items(): - obs2[k] = torch.from_numpy(v).unsqueeze(0) + if use_obs_array: + obs = np.expand_dims(ppo.observation_space.sample(), axis=0) + obs2 = torch.tensor(obs) + else: + obs = dict(ppo.observation_space.sample()) + obs2 = {} + for k, v in obs.items(): + obs2[k] = torch.from_numpy(v).unsqueeze(0) + obs = [v for v in obs.values()] with torch.no_grad(): action_sb3, _, _ = sb3_model(obs2, deterministic=True) - obs = [v for v in obs.values()] action_onnx, state_outs = ort_sess.run(None, {"obs": obs, "state_ins": np.array([0.0], dtype=np.float32)}) assert np.allclose(action_sb3, action_onnx, atol=1e-5), "Mismatch in action output" assert np.allclose(state_outs, np.array([0.0]), atol=1e-5), "Mismatch in state_outs output" From 3ba5e521cec2deaa6f9f7f206d813db8e9403cd9 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Thu, 25 Jan 2024 19:41:32 +0100 Subject: [PATCH 04/19] Update IMITATION_LEARNING.md Started the tutorial. 
--- docs/IMITATION_LEARNING.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md index 1f896bac..048e6d1f 100644 --- a/docs/IMITATION_LEARNING.md +++ b/docs/IMITATION_LEARNING.md @@ -5,3 +5,13 @@ For imitation learning, we use the imitation library: https://github.com/HumanCo From the docs: > Imitation provides clean implementations of imitation and reward learning algorithms, under a unified and user-friendly API. Currently, we have implementations of Behavioral Cloning, DAgger (with synthetic examples), density-based reward modeling, Maximum Causal Entropy Inverse Reinforcement Learning, Adversarial Inverse Reinforcement Learning, Generative Adversarial Imitation Learning, and Deep RL from Human Preferences. +### Installation: +In the conda env or Python venv where you have Godot-RL installed, use: +`pip install imitation`. + +Then you can use it by using and or modifying [this example](/examples/sb3_imitation.py). + +### Tutorial +For a quick tutorial on how to use Imitation Learning, we'll modify one of the example environments to use imitation learning. This tutorial assumes you have Godot, Godot RL Agents, Imitation, and Blender installed, and have completed the quick-start guide from the readme of this repository and potentially the [custom env tutorial](https://github.com/edbeeching/godot_rl_agents/blob/main/docs/CUSTOM_ENV.md) as well. + +#### Download all of the examples from here: https://github.com/edbeeching/godot_rl_agents_examples/tree/main (either clone or click on `Code` > `Download ZIP`) From 640928c0cd0c0d57f19a13fdd29cceb43ccb3572 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Thu, 25 Jan 2024 21:36:03 +0100 Subject: [PATCH 05/19] Update IMITATION_LEARNING.md Update to the tutorial --- docs/IMITATION_LEARNING.md | 128 ++++++++++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 1 deletion(-) diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md index 048e6d1f..0896d186 100644 --- a/docs/IMITATION_LEARNING.md +++ b/docs/IMITATION_LEARNING.md @@ -14,4 +14,130 @@ Then you can use it by using and or modifying [this example](/examples/sb3_imita ### Tutorial For a quick tutorial on how to use Imitation Learning, we'll modify one of the example environments to use imitation learning. This tutorial assumes you have Godot, Godot RL Agents, Imitation, and Blender installed, and have completed the quick-start guide from the readme of this repository and potentially the [custom env tutorial](https://github.com/edbeeching/godot_rl_agents/blob/main/docs/CUSTOM_ENV.md) as well. -#### Download all of the examples from here: https://github.com/edbeeching/godot_rl_agents_examples/tree/main (either clone or click on `Code` > `Download ZIP`) +##### Download all of the examples: +https://github.com/edbeeching/godot_rl_agents_examples/tree/main (either clone or click on `Code` > `Download ZIP`) + +##### Update plugin: +At the time of writing this tutorials, envs don't currently have the plugin version that includes the demo recorder. We'll use the MultiLevelRobotEnv example. 
First download the [latest plugin from Github](https://github.com/edbeeching/godot_rl_agents_plugin) and then copy the `addons` folder from the plugin folder to the previously downloaded example folder: +`godot_rl_agents_examples-main\examples\MultiLevelRobot\addons` (replace all of the files or remove the addons folder in the game example before pasting the one from the plugin). + +##### In Godot editor, import the MultiLevelRobotEnv example. + +##### Open the testing scene: +![testing scene](https://github.com/edbeeching/godot_rl_agents/assets/61947090/212ae90b-9077-472b-81b9-4f1a10fff1a1) + +We'll use this scene to record the demonstrations. First, we have to modify the AIController settings to use the demo recorder mode. + +##### Right click on `GameScene`, then click on `Editable Children`: + +![make game scene editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/c899ec23-45fd-41fa-a1f7-d9a4b836283e) + +##### Right click on `Robot`, then click on `Editable Children`: + +![make robot editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/16d6819f-77e9-491b-be45-900172b36a8e) + +##### Set Control Mode to `Record Expert Demos` and write a file path to save the demos to: + +![image](https://github.com/edbeeching/godot_rl_agents/assets/61947090/be531a1e-14e9-4fb3-8055-698d3f99a1e5) + +##### Add an `InputEventKey` to `Remove Last Episode Key`: +![image](https://github.com/edbeeching/godot_rl_agents/assets/61947090/ff94ffb6-23e8-4c3e-8dc3-ca8d07cbfc45) + +##### Set a key of your choice, then click on `OK`: +![image](https://github.com/edbeeching/godot_rl_agents/assets/61947090/1d10a016-2944-411d-a4ab-cb00409fda04) + +This key will be used to remove the last episode during recording. We can use this during demo recording if we recorded an episode with a non-optimal outcome (e.g. if the robot fell or hit an enemy robot, the episode timed out, etc.). + +##### Set action_repeat to 10: +This will produce some input lag while recording demos, but this is what is set for training/inference as well. What it means is that the currently set action will repeat for 10 frames before the next action is set. Also, only once every 10 frames, the obs/action will be read and saved to the demo file. You can optionally set a lower value here, in that case you may also want to lower it in the `sync` node in `training_scene.tscn` and `testing_scene.tscn`. + +![action repeat 10](https://github.com/edbeeching/godot_rl_agents/assets/61947090/50dd4ca3-1386-4435-a229-2becd71c42a1) + + +##### Open `RobotAIController.gd`: +(You can search for it in `Filter Files` box in the `FileSystem` if it's not showing up) + +![image](https://github.com/edbeeching/godot_rl_agents/assets/61947090/49651e15-e1e9-4307-936d-7e6fbf434637) + +We need to change some things in the AIController to allow for demo recording to work properly. + +##### Modify `set_action` and implement `get_action` + +Find the set_action method, and replace the code with this: + +```gdscript +## Returns the action that is currently applied to the robot. +func get_action(): + return [robot.requested_movement.x, robot.requested_movement.z] + +## Sets the action to the robot. 
+func set_action(action = null) -> void: + if action: + robot.requested_movement = Vector3( + clampf(action.movement[0], -1.0, 1.0), + 0.0, + clampf(action.movement[1], -1.0, 1.0)).limit_length(1.0) + else: + robot.requested_movement = Vector3( + int(Input.is_action_pressed("ui_down")) - int(Input.is_action_pressed("ui_up")), + 0.0, + int(Input.is_action_pressed("ui_left")) - int(Input.is_action_pressed("ui_right")) + ).limit_length(1.0) +``` + +The way this works is that if we are running training or inference with a RL agent, the set_action method will be called with action provided. However, during demo recording, set_action will be called without any action provided, so we need to manually set the values. + +`set_action()` will be called just before `get_action()` so the demo recorder will record the currently applied action for the current state/observations. + +Now we can simplify the heuristic handling code (for when "human control" mode is used) in robot.gd. + +##### Open `robot.gd` + +Change the `handle_movement` method to the following code: +```gdscript +func handle_movement(delta): + var movement := Vector3() + + if ai_controller.heuristic == "human": + ai_controller.set_action() + + movement = requested_movement + + apply_acceleration(movement, delta) + apply_gravity(delta) + apply_friction(delta) + apply_conveyor_belt_velocity(delta) + limit_horizontal_speed() + + move_and_slide() + + rotate_toward_movement(delta) + update_wheels_and_visual_rotation(delta) +``` + +Let's also set the game to only use the last level. This will simplify the demo recording and training for this tutorial. + +Find the `reset()` method and change it to: + +```gdscript +func reset(): + current_level = 7 + velocity = Vector3.ZERO + global_position = level_manager.get_spawn_position(current_level) + current_goal_transform = level_manager.randomize_goal(current_level) + previous_distance_to_goal = global_position.distance_to(current_goal_transform.origin) +``` + +##### Record some demos +To record demos, press `F6` or click on `Run Current Scene`. + +Record some demos of successfully completing the level. You can use the key previously set for removing the last episode if the robot hits an enemy or falls down during recording. + +Here's a highly sped-up video of recording 18 episodes: + +https://github.com/edbeeching/godot_rl_agents/assets/61947090/7bdc19ba-6e88-431d-b87b-7ec3e0ce1a7c + +Note: I found it difficult to control the robot with action repeat 10, and I removed a few episodes where the robot hit an enemy robot during recording so that they don't end up in the recorded demos file. I would recommend setting action repeat to a lower value like 6-8 (both in AIController and sync node in the two scenes mentioned previously). It's also possible to change the `speed up` property of the `sync` node while recording demos to make the process easier, as it will slow down or speed up the game according to the setting. + +Once you are done recording, click on `x` to close the game window (do not use the `Stop` button in the editor as that will not save the file), and you will see `demo.json` in the filesystem. 
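Before training, it can be worth sanity-checking the recorded file from Python. Below is a small optional snippet; it assumes the layout that `examples/sb3_imitation.py` reads (a JSON list of episodes, each stored as an `[observations, actions]` pair), and `demo.json` stands for whatever path you chose above:

```python
# Optional: inspect a recorded demo file before training with it.
import json

import numpy as np

with open("demo.json") as f:
    episodes = json.load(f)

print(f"Episodes recorded: {len(episodes)}")
for i, episode in enumerate(episodes):
    obs, acts = np.array(episode[0]), np.array(episode[1])
    print(f"  episode {i}: obs {obs.shape}, actions {acts.shape}")
```

If an episode looks wrong (for example one you forgot to remove with the key set earlier), it is easier to re-record now than to feed it to the trainer.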
+ From bb65f7c4556d57a7bf0472b311f30642f382c78c Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Thu, 25 Jan 2024 22:50:06 +0100 Subject: [PATCH 06/19] Update IMITATION_LEARNING.md Updates the tutorial --- docs/IMITATION_LEARNING.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md index 0896d186..09af09cb 100644 --- a/docs/IMITATION_LEARNING.md +++ b/docs/IMITATION_LEARNING.md @@ -141,3 +141,40 @@ Note: I found it difficult to control the robot with action repeat 10, and I rem Once you are done recording, click on `x` to close the game window (do not use the `Stop` button in the editor as that will not save the file), and you will see `demo.json` in the filesystem. +##### Export the game +Click on `Project` > `Export` and export the game for your current OS. We will use the exported game for training. + +##### Open conda terminal or venv terminal: +I use conda in this example, but you can use the corresponding commands with venv or the system that you are using for managing Python environments. + +##### Type `conda activate gdrl_il` +(replace gdrl_il with the name you're using for the virtual env, it should have godot rl agents and imitation library installed) + +##### Move to the folder with the Imitation Learning Example +Download the [sb3 imitation example](/examples/sb3_imitation.py) and `cd` into the folder where the file is. + +##### Set the arguments and start the training +E.g. on Windows: +```` +python sb3_imitation.py --env_path="PATH_TO_EXPORTED_GAME_EXE_FILE_HERE" --il_timesteps=250_000 --demo_files="PATH_TO_THE_RECORDED_demo.json_FILE_HERE" --eval_episode_count=20 --n_parallel=5 --speedup=15 +```` + +Training should begin. As we set a small amount of timesteps, the results won't be perfect but it shouldn't take too long (may still take a while, you can reduce the timesteps if you wish to run a quick test). Beside increasing timesteps, you can open the script and modify the hyperaparameters to get better results. Having more high quality recorded demos can help too. You can load multiple files by adding them to the `--demo_files` argument, e.g. `--demo_files="file1_path" "file2_path" "file3_path"`. +After the training is done, an evaluation environment should open and you will see the trained agent solving the env for 20 episodes. + +In my case, I got: +```Mean reward after evaluation: 5.906429767608643``` +The exact results you get may be different for various reasons, including the possibility that the hyperparameters and/or other variables may have changed since then. + +For comparison, when training just with `--rl_timesteps=250_000` I got a reward of: +```Mean reward after evaluation: 9.194426536560059``` + +The imitation-learned reward could be improved by tweaking hyperaparameters (the parameters provided in the script are not optimized), recording more higher quality demos, doing some RL timesteps after it, etc. +As this environment was designed and tested with PPO RL, in this case the environment is simple enough that PPO alone can learn it quickly from the reward function and imitation learning isn't necessary. +However, in more complex environments where it might be difficult to define a good dense reward function, learning from demonstrations and/or combining it with RL learning from sparse rewards could be helpful. 
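For reference, a single run that chains both phases and saves the results could be launched like this (the paths are placeholders; all flags used here are defined in the example script):

````
python sb3_imitation.py --env_path="PATH_TO_EXPORTED_GAME" --demo_files="PATH_TO_demo.json" --il_timesteps=250_000 --rl_timesteps=250_000 --eval_episode_count=20 --n_parallel=5 --speedup=15 --save_model_path="model.zip" --onnx_export_path="model.onnx"
````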
+ +There are a couple of other options to mention: + +After imitation learning, you can continue model training with PPO using the environment rewards to further improve the results. This is done by adding an argument to the script, e.g. `--rl_timesteps=250_000`. + +You can set the script to export the trained model to onnx by adding e.g. `--onnx_export_path="model.onnx"`. That model can be then be copied to the game folder, and set in sync node in testing_scene to be used for inference without the Python server. From 4b8e88bdeea99de3b41a0015dde26a1b70902cba Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Thu, 25 Jan 2024 22:52:49 +0100 Subject: [PATCH 07/19] Update IMITATION_LEARNING.md Some formatting improvements --- docs/IMITATION_LEARNING.md | 139 ++++++++++++++++++++++++++----------- 1 file changed, 97 insertions(+), 42 deletions(-) diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md index 09af09cb..c7b608fb 100644 --- a/docs/IMITATION_LEARNING.md +++ b/docs/IMITATION_LEARNING.md @@ -3,65 +3,88 @@ For imitation learning, we use the imitation library: https://github.com/HumanCompatibleAI/imitation From the docs: -> Imitation provides clean implementations of imitation and reward learning algorithms, under a unified and user-friendly API. Currently, we have implementations of Behavioral Cloning, DAgger (with synthetic examples), density-based reward modeling, Maximum Causal Entropy Inverse Reinforcement Learning, Adversarial Inverse Reinforcement Learning, Generative Adversarial Imitation Learning, and Deep RL from Human Preferences. +> Imitation provides clean implementations of imitation and reward learning algorithms, under a unified and +> user-friendly API. Currently, we have implementations of Behavioral Cloning, DAgger (with synthetic examples), +> density-based reward modeling, Maximum Causal Entropy Inverse Reinforcement Learning, Adversarial Inverse Reinforcement +> Learning, Generative Adversarial Imitation Learning, and Deep RL from Human Preferences. ### Installation: + In the conda env or Python venv where you have Godot-RL installed, use: `pip install imitation`. Then you can use it by using and or modifying [this example](/examples/sb3_imitation.py). ### Tutorial -For a quick tutorial on how to use Imitation Learning, we'll modify one of the example environments to use imitation learning. This tutorial assumes you have Godot, Godot RL Agents, Imitation, and Blender installed, and have completed the quick-start guide from the readme of this repository and potentially the [custom env tutorial](https://github.com/edbeeching/godot_rl_agents/blob/main/docs/CUSTOM_ENV.md) as well. -##### Download all of the examples: +For a quick tutorial on how to use Imitation Learning, we'll modify one of the example environments to use imitation +learning. This tutorial assumes you have Godot, Godot RL Agents, Imitation, and Blender installed, and have completed +the quick-start guide from the readme of this repository and potentially +the [custom env tutorial](https://github.com/edbeeching/godot_rl_agents/blob/main/docs/CUSTOM_ENV.md) as well. + +#### Download all of the examples: + https://github.com/edbeeching/godot_rl_agents_examples/tree/main (either clone or click on `Code` > `Download ZIP`) -##### Update plugin: -At the time of writing this tutorials, envs don't currently have the plugin version that includes the demo recorder. We'll use the MultiLevelRobotEnv example. 
First download the [latest plugin from Github](https://github.com/edbeeching/godot_rl_agents_plugin) and then copy the `addons` folder from the plugin folder to the previously downloaded example folder: -`godot_rl_agents_examples-main\examples\MultiLevelRobot\addons` (replace all of the files or remove the addons folder in the game example before pasting the one from the plugin). +#### Update plugin: + +At the time of writing this tutorials, envs don't currently have the plugin version that includes the demo recorder. +We'll use the MultiLevelRobotEnv example. First download +the [latest plugin from Github](https://github.com/edbeeching/godot_rl_agents_plugin) and then copy the `addons` folder +from the plugin folder to the previously downloaded example folder: +`godot_rl_agents_examples-main\examples\MultiLevelRobot\addons` (replace all of the files or remove the addons folder in +the game example before pasting the one from the plugin). -##### In Godot editor, import the MultiLevelRobotEnv example. +#### In Godot editor, import the MultiLevelRobotEnv example. + +#### Open the testing scene: -##### Open the testing scene: ![testing scene](https://github.com/edbeeching/godot_rl_agents/assets/61947090/212ae90b-9077-472b-81b9-4f1a10fff1a1) -We'll use this scene to record the demonstrations. First, we have to modify the AIController settings to use the demo recorder mode. +We'll use this scene to record the demonstrations. First, we have to modify the AIController settings to use the demo +recorder mode. -##### Right click on `GameScene`, then click on `Editable Children`: +#### Right click on `GameScene`, then click on `Editable Children`: ![make game scene editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/c899ec23-45fd-41fa-a1f7-d9a4b836283e) -##### Right click on `Robot`, then click on `Editable Children`: +#### Right click on `Robot`, then click on `Editable Children`: ![make robot editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/16d6819f-77e9-491b-be45-900172b36a8e) -##### Set Control Mode to `Record Expert Demos` and write a file path to save the demos to: +#### Set Control Mode to `Record Expert Demos` and write a file path to save the demos to: ![image](https://github.com/edbeeching/godot_rl_agents/assets/61947090/be531a1e-14e9-4fb3-8055-698d3f99a1e5) -##### Add an `InputEventKey` to `Remove Last Episode Key`: +#### Add an `InputEventKey` to `Remove Last Episode Key`: + ![image](https://github.com/edbeeching/godot_rl_agents/assets/61947090/ff94ffb6-23e8-4c3e-8dc3-ca8d07cbfc45) -##### Set a key of your choice, then click on `OK`: +#### Set a key of your choice, then click on `OK`: + ![image](https://github.com/edbeeching/godot_rl_agents/assets/61947090/1d10a016-2944-411d-a4ab-cb00409fda04) -This key will be used to remove the last episode during recording. We can use this during demo recording if we recorded an episode with a non-optimal outcome (e.g. if the robot fell or hit an enemy robot, the episode timed out, etc.). +This key will be used to remove the last episode during recording. We can use this during demo recording if we recorded +an episode with a non-optimal outcome (e.g. if the robot fell or hit an enemy robot, the episode timed out, etc.). + +#### Set action_repeat to 10: -##### Set action_repeat to 10: -This will produce some input lag while recording demos, but this is what is set for training/inference as well. 
What it means is that the currently set action will repeat for 10 frames before the next action is set. Also, only once every 10 frames, the obs/action will be read and saved to the demo file. You can optionally set a lower value here, in that case you may also want to lower it in the `sync` node in `training_scene.tscn` and `testing_scene.tscn`. +This will produce some input lag while recording demos, but this is what is set for training/inference as well. What it +means is that the currently set action will repeat for 10 frames before the next action is set. Also, only once every 10 +frames, the obs/action will be read and saved to the demo file. You can optionally set a lower value here, in that case +you may also want to lower it in the `sync` node in `training_scene.tscn` and `testing_scene.tscn`. ![action repeat 10](https://github.com/edbeeching/godot_rl_agents/assets/61947090/50dd4ca3-1386-4435-a229-2becd71c42a1) +#### Open `RobotAIController.gd`: -##### Open `RobotAIController.gd`: (You can search for it in `Filter Files` box in the `FileSystem` if it's not showing up) ![image](https://github.com/edbeeching/godot_rl_agents/assets/61947090/49651e15-e1e9-4307-936d-7e6fbf434637) We need to change some things in the AIController to allow for demo recording to work properly. -##### Modify `set_action` and implement `get_action` +#### Modify `set_action` and implement `get_action` Find the set_action method, and replace the code with this: @@ -85,15 +108,19 @@ func set_action(action = null) -> void: ).limit_length(1.0) ``` -The way this works is that if we are running training or inference with a RL agent, the set_action method will be called with action provided. However, during demo recording, set_action will be called without any action provided, so we need to manually set the values. +The way this works is that if we are running training or inference with a RL agent, the set_action method will be called +with action provided. However, during demo recording, set_action will be called without any action provided, so we need +to manually set the values. -`set_action()` will be called just before `get_action()` so the demo recorder will record the currently applied action for the current state/observations. +`set_action()` will be called just before `get_action()` so the demo recorder will record the currently applied action +for the current state/observations. Now we can simplify the heuristic handling code (for when "human control" mode is used) in robot.gd. -##### Open `robot.gd` +#### Open `robot.gd` Change the `handle_movement` method to the following code: + ```gdscript func handle_movement(delta): var movement := Vector3() @@ -115,7 +142,8 @@ func handle_movement(delta): update_wheels_and_visual_rotation(delta) ``` -Let's also set the game to only use the last level. This will simplify the demo recording and training for this tutorial. +Let's also set the game to only use the last level. This will simplify the demo recording and training for this +tutorial. Find the `reset()` method and change it to: @@ -128,53 +156,80 @@ func reset(): previous_distance_to_goal = global_position.distance_to(current_goal_transform.origin) ``` -##### Record some demos +#### Record some demos + To record demos, press `F6` or click on `Run Current Scene`. -Record some demos of successfully completing the level. You can use the key previously set for removing the last episode if the robot hits an enemy or falls down during recording. 
+Record some demos of successfully completing the level. You can use the key previously set for removing the last episode +if the robot hits an enemy or falls down during recording. Here's a highly sped-up video of recording 18 episodes: https://github.com/edbeeching/godot_rl_agents/assets/61947090/7bdc19ba-6e88-431d-b87b-7ec3e0ce1a7c -Note: I found it difficult to control the robot with action repeat 10, and I removed a few episodes where the robot hit an enemy robot during recording so that they don't end up in the recorded demos file. I would recommend setting action repeat to a lower value like 6-8 (both in AIController and sync node in the two scenes mentioned previously). It's also possible to change the `speed up` property of the `sync` node while recording demos to make the process easier, as it will slow down or speed up the game according to the setting. +Note: I found it difficult to control the robot with action repeat 10, and I removed a few episodes where the robot hit +an enemy robot during recording so that they don't end up in the recorded demos file. I would recommend setting action +repeat to a lower value like 6-8 (both in AIController and sync node in the two scenes mentioned previously). It's also +possible to change the `speed up` property of the `sync` node while recording demos to make the process easier, as it +will slow down or speed up the game according to the setting. -Once you are done recording, click on `x` to close the game window (do not use the `Stop` button in the editor as that will not save the file), and you will see `demo.json` in the filesystem. +Once you are done recording, click on `x` to close the game window (do not use the `Stop` button in the editor as that +will not save the file), and you will see `demo.json` in the filesystem. + +#### Export the game -##### Export the game Click on `Project` > `Export` and export the game for your current OS. We will use the exported game for training. -##### Open conda terminal or venv terminal: -I use conda in this example, but you can use the corresponding commands with venv or the system that you are using for managing Python environments. +#### Open conda terminal or venv terminal: + +I use conda in this example, but you can use the corresponding commands with venv or the system that you are using for +managing Python environments. -##### Type `conda activate gdrl_il` -(replace gdrl_il with the name you're using for the virtual env, it should have godot rl agents and imitation library installed) +#### Type `conda activate gdrl_il` + +(replace gdrl_il with the name you're using for the virtual env, it should have godot rl agents and imitation library +installed) + +#### Move to the folder with the Imitation Learning Example -##### Move to the folder with the Imitation Learning Example Download the [sb3 imitation example](/examples/sb3_imitation.py) and `cd` into the folder where the file is. -##### Set the arguments and start the training +#### Set the arguments and start the training + E.g. on Windows: + ```` python sb3_imitation.py --env_path="PATH_TO_EXPORTED_GAME_EXE_FILE_HERE" --il_timesteps=250_000 --demo_files="PATH_TO_THE_RECORDED_demo.json_FILE_HERE" --eval_episode_count=20 --n_parallel=5 --speedup=15 ```` -Training should begin. As we set a small amount of timesteps, the results won't be perfect but it shouldn't take too long (may still take a while, you can reduce the timesteps if you wish to run a quick test). 
Beside increasing timesteps, you can open the script and modify the hyperaparameters to get better results. Having more high quality recorded demos can help too. You can load multiple files by adding them to the `--demo_files` argument, e.g. `--demo_files="file1_path" "file2_path" "file3_path"`. -After the training is done, an evaluation environment should open and you will see the trained agent solving the env for 20 episodes. +Training should begin. As we set a small amount of timesteps, the results won't be perfect but it shouldn't take too +long (may still take a while, you can reduce the timesteps if you wish to run a quick test). Beside increasing +timesteps, you can open the script and modify the hyperaparameters to get better results. Having more high quality +recorded demos can help too. You can load multiple files by adding them to the `--demo_files` argument, +e.g. `--demo_files="file1_path" "file2_path" "file3_path"`. +After the training is done, an evaluation environment should open and you will see the trained agent solving the env for +20 episodes. In my case, I got: ```Mean reward after evaluation: 5.906429767608643``` -The exact results you get may be different for various reasons, including the possibility that the hyperparameters and/or other variables may have changed since then. +The exact results you get may be different for various reasons, including the possibility that the hyperparameters +and/or other variables may have changed since then. For comparison, when training just with `--rl_timesteps=250_000` I got a reward of: ```Mean reward after evaluation: 9.194426536560059``` -The imitation-learned reward could be improved by tweaking hyperaparameters (the parameters provided in the script are not optimized), recording more higher quality demos, doing some RL timesteps after it, etc. -As this environment was designed and tested with PPO RL, in this case the environment is simple enough that PPO alone can learn it quickly from the reward function and imitation learning isn't necessary. -However, in more complex environments where it might be difficult to define a good dense reward function, learning from demonstrations and/or combining it with RL learning from sparse rewards could be helpful. +The imitation-learned reward could be improved by tweaking hyperaparameters (the parameters provided in the script are +not optimized), recording more higher quality demos, doing some RL timesteps after it, etc. +As this environment was designed and tested with PPO RL, in this case the environment is simple enough that PPO alone +can learn it quickly from the reward function and imitation learning isn't necessary. +However, in more complex environments where it might be difficult to define a good dense reward function, learning from +demonstrations and/or combining it with RL learning from sparse rewards could be helpful. There are a couple of other options to mention: -After imitation learning, you can continue model training with PPO using the environment rewards to further improve the results. This is done by adding an argument to the script, e.g. `--rl_timesteps=250_000`. +After imitation learning, you can continue model training with PPO using the environment rewards to further improve the +results. This is done by adding an argument to the script, e.g. `--rl_timesteps=250_000`. -You can set the script to export the trained model to onnx by adding e.g. `--onnx_export_path="model.onnx"`. 
That model can be then be copied to the game folder, and set in sync node in testing_scene to be used for inference without the Python server. +You can set the script to export the trained model to onnx by adding e.g. `--onnx_export_path="model.onnx"`. That model +can be then be copied to the game folder, and set in sync node in testing_scene to be used for inference without the +Python server. From 0222285827b03ece41fb351899f7323e51fe19b0 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Thu, 25 Jan 2024 23:08:14 +0100 Subject: [PATCH 08/19] imitation hyperparam update, set rewards to float --- examples/sb3_imitation.py | 53 +++++++++++-------- godot_rl/wrappers/stable_baselines_wrapper.py | 49 +++++++++++++---- 2 files changed, 69 insertions(+), 33 deletions(-) diff --git a/examples/sb3_imitation.py b/examples/sb3_imitation.py index 8e5d301b..7b08c320 100644 --- a/examples/sb3_imitation.py +++ b/examples/sb3_imitation.py @@ -63,16 +63,19 @@ help="Evaluate policy in an env after training in a single env. Will always visualize.", default=False, ) -parser.add_argument("--speedup", default=1, type=int, help="Whether to speed up the physics in the env") +parser.add_argument( + "--speedup", default=1, type=int, help="Whether to speed up the physics in the env" +) parser.add_argument( "--n_parallel", default=1, type=int, - help="How many instances of the environment executable to " "launch - requires --env_path to be set if > 1.", + help="How many instances of the environment executable to " + "launch - requires --env_path to be set if > 1.", ) parser.add_argument( "--il_timesteps", - default=100_000, + default=0, type=int, help="How many timesteps to train for using imitation learning.", ) @@ -149,37 +152,39 @@ def close_env(): policy_kwargs = dict(log_std_init=log(1.0)) learner = PPO( - batch_size=64, + batch_size=128, env=env, policy="MlpPolicy", learning_rate=0.0003, - clip_range=0.3, + clip_range=0.2, n_epochs=20, n_steps=64, ent_coef=0.0001, target_kl=0.025, policy_kwargs=policy_kwargs, + verbose=1, ) -reward_net = BasicRewardNet( - observation_space=env.observation_space, - action_space=env.action_space, - normalize_input_layer=RunningNorm, -) +if args.il_timesteps: + reward_net = BasicRewardNet( + observation_space=env.observation_space, + action_space=env.action_space, + normalize_input_layer=RunningNorm, + ) -gail_trainer = GAIL( - demonstrations=trajectories, - demo_batch_size=64, - gen_replay_buffer_capacity=64, - n_disc_updates_per_round=24, - venv=env, - gen_algo=learner, - reward_net=reward_net, - allow_variable_horizon=True, -) + gail_trainer = GAIL( + demonstrations=trajectories, + demo_batch_size=128, + gen_replay_buffer_capacity=512, + n_disc_updates_per_round=24, + venv=env, + gen_algo=learner, + reward_net=reward_net, + allow_variable_horizon=True, + ) -print("Starting Imitation Learning Training using GAIL:") -gail_trainer.train(args.il_timesteps) + print("Starting Imitation Learning Training using GAIL:") + gail_trainer.train(args.il_timesteps) if args.rl_timesteps: print("Starting RL Training:") @@ -197,7 +202,9 @@ def close_env(): speedup=args.speedup, ) env = VecMonitor(env) - mean_reward, _ = evaluate_policy(learner, env, n_eval_episodes=args.eval_episode_count) + mean_reward, _ = evaluate_policy( + learner, env, n_eval_episodes=args.eval_episode_count + ) print(f"Mean reward after evaluation: {mean_reward}") close_env() diff --git a/godot_rl/wrappers/stable_baselines_wrapper.py b/godot_rl/wrappers/stable_baselines_wrapper.py 
index 8d7493cf..aed3f780 100644 --- a/godot_rl/wrappers/stable_baselines_wrapper.py +++ b/godot_rl/wrappers/stable_baselines_wrapper.py @@ -11,17 +11,31 @@ class StableBaselinesGodotEnv(VecEnv): - def __init__(self, env_path: Optional[str] = None, n_parallel: int = 1, seed: int = 0, **kwargs) -> None: + def __init__( + self, + env_path: Optional[str] = None, + n_parallel: int = 1, + seed: int = 0, + **kwargs, + ) -> None: # If we are doing editor training, n_parallel must be 1 if env_path is None and n_parallel > 1: - raise ValueError("You must provide the path to a exported game executable if n_parallel > 1") + raise ValueError( + "You must provide the path to a exported game executable if n_parallel > 1" + ) # Define the default port port = kwargs.pop("port", GodotEnv.DEFAULT_PORT) # Create a list of GodotEnv instances self.envs = [ - GodotEnv(env_path=env_path, convert_action_space=True, port=port + p, seed=seed + p, **kwargs) + GodotEnv( + env_path=env_path, + convert_action_space=True, + port=port + p, + seed=seed + p, + **kwargs, + ) for p in range(n_parallel) ] @@ -42,7 +56,9 @@ def _check_valid_action_space(self) -> None: len(action_space.spaces) == 1 ), f"sb3 supports a single action space, this env contains multiple spaces {action_space}" - def step(self, action: np.ndarray) -> Tuple[Dict[str, np.ndarray], np.ndarray, np.ndarray, List[Dict[str, Any]]]: + def step( + self, action: np.ndarray + ) -> Tuple[Dict[str, np.ndarray], np.ndarray, np.ndarray, List[Dict[str, Any]]]: # Initialize lists for collecting results all_obs = [] all_rewards = [] @@ -70,7 +86,12 @@ def step(self, action: np.ndarray) -> Tuple[Dict[str, np.ndarray], np.ndarray, n obs = lod_to_dol(all_obs) # Return results - return {k: np.array(v) for k, v in obs.items()}, np.array(all_rewards), np.array(all_term), all_info + return ( + {k: np.array(v) for k, v in obs.items()}, + np.array(all_rewards, dtype=np.float32), + np.array(all_term), + all_info, + ) def reset(self) -> Dict[str, np.ndarray]: # Initialize lists for collecting results @@ -105,7 +126,9 @@ def action_space(self) -> gym.Space: def num_envs(self) -> int: return self.envs[0].num_envs * self.n_parallel - def env_is_wrapped(self, wrapper_class: type, indices: Optional[List[int]] = None) -> List[bool]: + def env_is_wrapped( + self, wrapper_class: type, indices: Optional[List[int]] = None + ) -> List[bool]: # Return a list indicating that no environments are wrapped return [False] * (self.envs[0].num_envs * self.n_parallel) @@ -116,7 +139,9 @@ def env_method(self): def get_attr(self, attr_name: str, indices=None) -> List[Any]: if attr_name == "render_mode": return [None for _ in range(self.num_envs)] - raise AttributeError("get attr not fully implemented in godot-rl StableBaselinesWrapper") + raise AttributeError( + "get attr not fully implemented in godot-rl StableBaselinesWrapper" + ) def seed(self, seed=None): raise NotImplementedError() @@ -128,16 +153,20 @@ def step_async(self, actions: np.ndarray) -> None: # Execute the step function asynchronously, not actually implemented in this setting self.results = self.step(actions) - def step_wait(self) -> Tuple[Dict[str, np.ndarray], np.ndarray, np.ndarray, List[Dict[str, Any]]]: + def step_wait( + self, + ) -> Tuple[Dict[str, np.ndarray], np.ndarray, np.ndarray, List[Dict[str, Any]]]: # Wait for the results from the asynchronous step return self.results def stable_baselines_training(args, extras, n_steps: int = 200000, **kwargs) -> None: if can_import("ray"): - print("WARNING, stable baselines and ray[rllib] 
are not compatable") + print("WARNING, stable baselines and ray[rllib] are not compatible") # Initialize the custom environment - env = StableBaselinesGodotEnv(env_path=args.env_path, show_window=args.viz, speedup=args.speedup, **kwargs) + env = StableBaselinesGodotEnv( + env_path=args.env_path, show_window=args.viz, speedup=args.speedup, **kwargs + ) env = VecMonitor(env) # Initialize the PPO model From d915548eb49c490752af6a933aadae942984c6ab Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Thu, 25 Jan 2024 23:13:54 +0100 Subject: [PATCH 09/19] Reformat --- examples/sb3_imitation.py | 11 +++------- godot_rl/wrappers/stable_baselines_wrapper.py | 20 +++++-------------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/examples/sb3_imitation.py b/examples/sb3_imitation.py index 7b08c320..d52e6e75 100644 --- a/examples/sb3_imitation.py +++ b/examples/sb3_imitation.py @@ -63,15 +63,12 @@ help="Evaluate policy in an env after training in a single env. Will always visualize.", default=False, ) -parser.add_argument( - "--speedup", default=1, type=int, help="Whether to speed up the physics in the env" -) +parser.add_argument("--speedup", default=1, type=int, help="Whether to speed up the physics in the env") parser.add_argument( "--n_parallel", default=1, type=int, - help="How many instances of the environment executable to " - "launch - requires --env_path to be set if > 1.", + help="How many instances of the environment executable to " "launch - requires --env_path to be set if > 1.", ) parser.add_argument( "--il_timesteps", @@ -202,9 +199,7 @@ def close_env(): speedup=args.speedup, ) env = VecMonitor(env) - mean_reward, _ = evaluate_policy( - learner, env, n_eval_episodes=args.eval_episode_count - ) + mean_reward, _ = evaluate_policy(learner, env, n_eval_episodes=args.eval_episode_count) print(f"Mean reward after evaluation: {mean_reward}") close_env() diff --git a/godot_rl/wrappers/stable_baselines_wrapper.py b/godot_rl/wrappers/stable_baselines_wrapper.py index aed3f780..1db812a9 100644 --- a/godot_rl/wrappers/stable_baselines_wrapper.py +++ b/godot_rl/wrappers/stable_baselines_wrapper.py @@ -20,9 +20,7 @@ def __init__( ) -> None: # If we are doing editor training, n_parallel must be 1 if env_path is None and n_parallel > 1: - raise ValueError( - "You must provide the path to a exported game executable if n_parallel > 1" - ) + raise ValueError("You must provide the path to a exported game executable if n_parallel > 1") # Define the default port port = kwargs.pop("port", GodotEnv.DEFAULT_PORT) @@ -56,9 +54,7 @@ def _check_valid_action_space(self) -> None: len(action_space.spaces) == 1 ), f"sb3 supports a single action space, this env contains multiple spaces {action_space}" - def step( - self, action: np.ndarray - ) -> Tuple[Dict[str, np.ndarray], np.ndarray, np.ndarray, List[Dict[str, Any]]]: + def step(self, action: np.ndarray) -> Tuple[Dict[str, np.ndarray], np.ndarray, np.ndarray, List[Dict[str, Any]]]: # Initialize lists for collecting results all_obs = [] all_rewards = [] @@ -126,9 +122,7 @@ def action_space(self) -> gym.Space: def num_envs(self) -> int: return self.envs[0].num_envs * self.n_parallel - def env_is_wrapped( - self, wrapper_class: type, indices: Optional[List[int]] = None - ) -> List[bool]: + def env_is_wrapped(self, wrapper_class: type, indices: Optional[List[int]] = None) -> List[bool]: # Return a list indicating that no environments are wrapped return [False] * (self.envs[0].num_envs * self.n_parallel) @@ -139,9 
+133,7 @@ def env_method(self): def get_attr(self, attr_name: str, indices=None) -> List[Any]: if attr_name == "render_mode": return [None for _ in range(self.num_envs)] - raise AttributeError( - "get attr not fully implemented in godot-rl StableBaselinesWrapper" - ) + raise AttributeError("get attr not fully implemented in godot-rl StableBaselinesWrapper") def seed(self, seed=None): raise NotImplementedError() @@ -164,9 +156,7 @@ def stable_baselines_training(args, extras, n_steps: int = 200000, **kwargs) -> if can_import("ray"): print("WARNING, stable baselines and ray[rllib] are not compatible") # Initialize the custom environment - env = StableBaselinesGodotEnv( - env_path=args.env_path, show_window=args.viz, speedup=args.speedup, **kwargs - ) + env = StableBaselinesGodotEnv(env_path=args.env_path, show_window=args.viz, speedup=args.speedup, **kwargs) env = VecMonitor(env) # Initialize the PPO model From c05e3ee6fd10bf86d0bf71ba37376d6dcef2edb1 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Fri, 26 Jan 2024 16:26:41 +0100 Subject: [PATCH 10/19] Update sb3_imitation.py Removes currently unused argument, removes the mention of resuming since the script currently doesn't implement resuming (although it's possible to resume by modifying the script or with the sb3 example script). --- examples/sb3_imitation.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/examples/sb3_imitation.py b/examples/sb3_imitation.py index d52e6e75..49bc5f1e 100644 --- a/examples/sb3_imitation.py +++ b/examples/sb3_imitation.py @@ -34,8 +34,7 @@ "--save_model_path", default=None, type=str, - help="The path to use for saving the trained sb3 model after training is complete. Saved model can be used later " - "to resume training. Extension will be set to .zip", + help="The path to use for saving the trained sb3 model after training is complete. Extension will be set to .zip", ) parser.add_argument( "--onnx_export_path", @@ -43,13 +42,6 @@ type=str, help="The Godot binary to use, do not include for in editor training", ) -parser.add_argument( - "--inference", - default=False, - action="store_true", - help="Instead of training, it will run inference on a loaded model for --timesteps steps. 
" - "Requires --resume_model_path to be set.", -) parser.add_argument( "--viz", action="store_true", From 50eb247485653c1b328f4a1436d34cbe80b532f8 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Fri, 26 Jan 2024 16:47:23 +0100 Subject: [PATCH 11/19] Reformat Reformat using black-24.1.0 --- godot_rl/wrappers/ray_wrapper.py | 6 +++--- tests/benchmark_env.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/godot_rl/wrappers/ray_wrapper.py b/godot_rl/wrappers/ray_wrapper.py index d1fc68a1..f1c875f2 100644 --- a/godot_rl/wrappers/ray_wrapper.py +++ b/godot_rl/wrappers/ray_wrapper.py @@ -181,9 +181,9 @@ def rllib_training(args, extras): checkpoint_at_end=not args.eval, restore=args.restore, local_dir=os.path.abspath(args.experiment_dir) or os.path.abspath("logs/rllib"), - trial_name_creator=lambda trial: f"{args.experiment_name}" - if args.experiment_name - else f"{trial.trainable_name}_{trial.trial_id}", + trial_name_creator=lambda trial: ( + f"{args.experiment_name}" if args.experiment_name else f"{trial.trainable_name}_{trial.trial_id}" + ), ) if args.export: raise NotImplementedError("Exporting is not (re)implemented yet") diff --git a/tests/benchmark_env.py b/tests/benchmark_env.py index d5cc4f6a..b85a44f9 100644 --- a/tests/benchmark_env.py +++ b/tests/benchmark_env.py @@ -4,7 +4,6 @@ we perform 10,000 actions and calculate the interactions per second in a variety of configurations """ - import time from godot_rl.core.godot_env import GodotEnv From 8bccfa66bf0dbb4bbfd7cc4745b280cbba1ba3b0 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Fri, 26 Jan 2024 19:09:21 +0100 Subject: [PATCH 12/19] Update IMITATION_LEARNING.md Reformat and other small improvements. --- docs/IMITATION_LEARNING.md | 56 +++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md index c7b608fb..cfe0d459 100644 --- a/docs/IMITATION_LEARNING.md +++ b/docs/IMITATION_LEARNING.md @@ -5,7 +5,8 @@ For imitation learning, we use the imitation library: https://github.com/HumanCo From the docs: > Imitation provides clean implementations of imitation and reward learning algorithms, under a unified and > user-friendly API. Currently, we have implementations of Behavioral Cloning, DAgger (with synthetic examples), -> density-based reward modeling, Maximum Causal Entropy Inverse Reinforcement Learning, Adversarial Inverse Reinforcement +> density-based reward modeling, Maximum Causal Entropy Inverse Reinforcement Learning, Adversarial Inverse +> Reinforcement > Learning, Generative Adversarial Imitation Learning, and Deep RL from Human Preferences. ### Installation: @@ -22,17 +23,17 @@ learning. This tutorial assumes you have Godot, Godot RL Agents, Imitation, and the quick-start guide from the readme of this repository and potentially the [custom env tutorial](https://github.com/edbeeching/godot_rl_agents/blob/main/docs/CUSTOM_ENV.md) as well. -#### Download all of the examples: +#### Download all the examples: https://github.com/edbeeching/godot_rl_agents_examples/tree/main (either clone or click on `Code` > `Download ZIP`) #### Update plugin: -At the time of writing this tutorials, envs don't currently have the plugin version that includes the demo recorder. +At the time of writing this tutorial, envs don't currently have the plugin version that includes the demo recorder. 
We'll use the MultiLevelRobotEnv example. First download -the [latest plugin from Github](https://github.com/edbeeching/godot_rl_agents_plugin) and then copy the `addons` folder +the [latest plugin from GitHub](https://github.com/edbeeching/godot_rl_agents_plugin) and then copy the `addons` folder from the plugin folder to the previously downloaded example folder: -`godot_rl_agents_examples-main\examples\MultiLevelRobot\addons` (replace all of the files or remove the addons folder in +`godot_rl_agents_examples-main\examples\MultiLevelRobot\addons` (replace all the files or remove the addons folder in the game example before pasting the one from the plugin). #### In Godot editor, import the MultiLevelRobotEnv example. @@ -46,11 +47,11 @@ recorder mode. #### Right click on `GameScene`, then click on `Editable Children`: -![make game scene editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/c899ec23-45fd-41fa-a1f7-d9a4b836283e) +![make game scene editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/91e5a74f-1186-4b28-bdbe-8032b9b9d6ab) #### Right click on `Robot`, then click on `Editable Children`: -![make robot editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/16d6819f-77e9-491b-be45-900172b36a8e) +![make robot editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/2e0bab9c-5b65-4b1e-843d-d56cc4882a0a) #### Set Control Mode to `Record Expert Demos` and write a file path to save the demos to: @@ -71,7 +72,8 @@ an episode with a non-optimal outcome (e.g. if the robot fell or hit an enemy ro This will produce some input lag while recording demos, but this is what is set for training/inference as well. What it means is that the currently set action will repeat for 10 frames before the next action is set. Also, only once every 10 -frames, the obs/action will be read and saved to the demo file. You can optionally set a lower value here, in that case +frames, the obs/action will be read and saved to the demo file. You can optionally set a lower value here to reduce lag, +in that case you may also want to lower it in the `sync` node in `training_scene.tscn` and `testing_scene.tscn`. ![action repeat 10](https://github.com/edbeeching/godot_rl_agents/assets/61947090/50dd4ca3-1386-4435-a229-2becd71c42a1) @@ -86,7 +88,7 @@ We need to change some things in the AIController to allow for demo recording to #### Modify `set_action` and implement `get_action` -Find the set_action method, and replace the code with this: +Find the set_action method, and replace the code of the method with: ```gdscript ## Returns the action that is currently applied to the robot. @@ -108,12 +110,16 @@ func set_action(action = null) -> void: ).limit_length(1.0) ``` -The way this works is that if we are running training or inference with a RL agent, the set_action method will be called -with action provided. However, during demo recording, set_action will be called without any action provided, so we need +The way this works is that if we are running training or inference with a RL agent, the `set_action` method will be +called +with action values provided. However, during demo recording, `set_acation` will be called without any action provided, +so we need to manually set the values. -`set_action()` will be called just before `get_action()` so the demo recorder will record the currently applied action -for the current state/observations. 
+> [!NOTE] +> `set_action()` will be called just before `get_action()`, so the demo recorder will record the currently applied +> action +> for the current state/observations. Now we can simplify the heuristic handling code (for when "human control" mode is used) in robot.gd. @@ -167,11 +173,16 @@ Here's a highly sped-up video of recording 18 episodes: https://github.com/edbeeching/godot_rl_agents/assets/61947090/7bdc19ba-6e88-431d-b87b-7ec3e0ce1a7c -Note: I found it difficult to control the robot with action repeat 10, and I removed a few episodes where the robot hit -an enemy robot during recording so that they don't end up in the recorded demos file. I would recommend setting action -repeat to a lower value like 6-8 (both in AIController and sync node in the two scenes mentioned previously). It's also -possible to change the `speed up` property of the `sync` node while recording demos to make the process easier, as it -will slow down or speed up the game according to the setting. +> [!NOTE] +> I found it difficult to control the robot with action repeat 10, and I removed a few episodes where the robot hit +> an enemy robot during recording so that they don't end up in the recorded demos file. I would recommend setting action +> repeat to a lower value like 6-8 (both in AIController and sync node in the two scenes mentioned previously). +> Another way to make this easier is to drag the sync.gd script to Sync node in both training and testing scene. +> An extended sync script is set in this example which set uses a 30 ticks per second physics setting, which is not +> ideal for manual control. +> It's also possible to change the `speed up` property of the `sync` node while recording demos to make the process +> easier, as it +> will slow down or speed up the game according to the setting. Once you are done recording, click on `x` to close the game window (do not use the `Stop` button in the editor as that will not save the file), and you will see `demo.json` in the filesystem. @@ -202,12 +213,13 @@ E.g. on Windows: python sb3_imitation.py --env_path="PATH_TO_EXPORTED_GAME_EXE_FILE_HERE" --il_timesteps=250_000 --demo_files="PATH_TO_THE_RECORDED_demo.json_FILE_HERE" --eval_episode_count=20 --n_parallel=5 --speedup=15 ```` -Training should begin. As we set a small amount of timesteps, the results won't be perfect but it shouldn't take too +Training should begin. As we set a small amount of timesteps, the results won't be perfect, but it shouldn't take too long (may still take a while, you can reduce the timesteps if you wish to run a quick test). Beside increasing -timesteps, you can open the script and modify the hyperaparameters to get better results. Having more high quality +timesteps, you can open the scripta and modify the hyperaparameters to get better results. Having more high quality recorded demos can help too. You can load multiple files by adding them to the `--demo_files` argument, e.g. `--demo_files="file1_path" "file2_path" "file3_path"`. -After the training is done, an evaluation environment should open and you will see the trained agent solving the env for +After the training is done, an evaluation environment should open, and you will see the trained agent solving the env +for 20 episodes. 
In my case, I got: @@ -219,7 +231,7 @@ For comparison, when training just with `--rl_timesteps=250_000` I got a reward ```Mean reward after evaluation: 9.194426536560059``` The imitation-learned reward could be improved by tweaking hyperaparameters (the parameters provided in the script are -not optimized), recording more higher quality demos, doing some RL timesteps after it, etc. +not optimized), recording more high quality demos, doing some RL timesteps after it, etc. As this environment was designed and tested with PPO RL, in this case the environment is simple enough that PPO alone can learn it quickly from the reward function and imitation learning isn't necessary. However, in more complex environments where it might be difficult to define a good dense reward function, learning from From a42fefe5780cb1d4e47fd065ab528d3e8b63f9f8 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Fri, 26 Jan 2024 19:10:08 +0100 Subject: [PATCH 13/19] Update IMITATION_LEARNING.md Typo fix --- docs/IMITATION_LEARNING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md index cfe0d459..e5e2d844 100644 --- a/docs/IMITATION_LEARNING.md +++ b/docs/IMITATION_LEARNING.md @@ -215,7 +215,7 @@ python sb3_imitation.py --env_path="PATH_TO_EXPORTED_GAME_EXE_FILE_HERE" --il_ti Training should begin. As we set a small amount of timesteps, the results won't be perfect, but it shouldn't take too long (may still take a while, you can reduce the timesteps if you wish to run a quick test). Beside increasing -timesteps, you can open the scripta and modify the hyperaparameters to get better results. Having more high quality +timesteps, you can open the script and modify the hyperaparameters to get better results. Having more high quality recorded demos can help too. You can load multiple files by adding them to the `--demo_files` argument, e.g. `--demo_files="file1_path" "file2_path" "file3_path"`. After the training is done, an evaluation environment should open, and you will see the trained agent solving the env From 0dec80ad449c3d087ed497b29efd8ce58af628b2 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Fri, 26 Jan 2024 19:20:19 +0100 Subject: [PATCH 14/19] Update IMITATION_LEARNING.md Another typo fix / clarification --- docs/IMITATION_LEARNING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md index e5e2d844..9f09b4d5 100644 --- a/docs/IMITATION_LEARNING.md +++ b/docs/IMITATION_LEARNING.md @@ -177,8 +177,8 @@ https://github.com/edbeeching/godot_rl_agents/assets/61947090/7bdc19ba-6e88-431d > I found it difficult to control the robot with action repeat 10, and I removed a few episodes where the robot hit > an enemy robot during recording so that they don't end up in the recorded demos file. I would recommend setting action > repeat to a lower value like 6-8 (both in AIController and sync node in the two scenes mentioned previously). -> Another way to make this easier is to drag the sync.gd script to Sync node in both training and testing scene. -> An extended sync script is set in this example which set uses a 30 ticks per second physics setting, which is not +> Another way to make this easier is to drag the defauklt sync.gd script to Sync node in both training and testing scene. 
+> This example uses an extended sync script with a 30 ticks per second physics setting, which is not > ideal for manual control. > It's also possible to change the `speed up` property of the `sync` node while recording demos to make the process > easier, as it From 8eec5d2ab7461bac245a13327dbddbf1d4ad2cb6 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Fri, 26 Jan 2024 19:22:06 +0100 Subject: [PATCH 15/19] Update IMITATION_LEARNING.md Another clarification --- docs/IMITATION_LEARNING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md index 9f09b4d5..ee8ce4ca 100644 --- a/docs/IMITATION_LEARNING.md +++ b/docs/IMITATION_LEARNING.md @@ -49,7 +49,7 @@ recorder mode. ![make game scene editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/91e5a74f-1186-4b28-bdbe-8032b9b9d6ab) -#### Right click on `Robot`, then click on `Editable Children`: +#### Expand `GameScene`, Right click on `Robot`, then click on `Editable Children`: ![make robot editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/2e0bab9c-5b65-4b1e-843d-d56cc4882a0a) From 03bdf8c0e9eba6389dbfe267d5f810f8bb7f84e5 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Sat, 27 Jan 2024 17:00:51 +0100 Subject: [PATCH 16/19] Update IMITATION_LEARNING.md Updates the tutorial images and completes the description of a step. --- docs/IMITATION_LEARNING.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md index ee8ce4ca..80f4f503 100644 --- a/docs/IMITATION_LEARNING.md +++ b/docs/IMITATION_LEARNING.md @@ -51,19 +51,19 @@ recorder mode. #### Expand `GameScene`, Right click on `Robot`, then click on `Editable Children`: -![make robot editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/2e0bab9c-5b65-4b1e-843d-d56cc4882a0a) +![make robot editable](https://github.com/edbeeching/godot_rl_agents/assets/61947090/08d603cc-abfc-4513-bcae-97bf6ab68084) -#### Set Control Mode to `Record Expert Demos` and write a file path to save the demos to: +#### Click on expanded `AIController3D`, set `Control Mode` to `Record Expert Demos` and write a file path to save the demos to: -![image](https://github.com/edbeeching/godot_rl_agents/assets/61947090/be531a1e-14e9-4fb3-8055-698d3f99a1e5) +![configure aicontroller3d](https://github.com/edbeeching/godot_rl_agents/assets/61947090/8dc56be1-a157-4bac-9917-ddfbbe8262eb) #### Add an `InputEventKey` to `Remove Last Episode Key`: -![image](https://github.com/edbeeching/godot_rl_agents/assets/61947090/ff94ffb6-23e8-4c3e-8dc3-ca8d07cbfc45) +![add inputeventkey](https://github.com/edbeeching/godot_rl_agents/assets/61947090/ff94ffb6-23e8-4c3e-8dc3-ca8d07cbfc45) #### Set a key of your choice, then click on `OK`: -![image](https://github.com/edbeeching/godot_rl_agents/assets/61947090/1d10a016-2944-411d-a4ab-cb00409fda04) +![set remove episode key](https://github.com/edbeeching/godot_rl_agents/assets/61947090/1d10a016-2944-411d-a4ab-cb00409fda04) This key will be used to remove the last episode during recording. We can use this during demo recording if we recorded an episode with a non-optimal outcome (e.g. if the robot fell or hit an enemy robot, the episode timed out, etc.). 
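Before starting training on a freshly recorded file, a quick sanity check of its contents can catch truncated or empty episodes early. The sketch below is an optional helper, not part of the example script; it assumes the layout that the loading loop in `examples/sb3_imitation.py` reads (a JSON list of episodes, each holding the observation list at index 0 and the action list at index 1), and the `demo.json` path is only a placeholder for wherever the demos were saved.

```python
# Optional sanity check for a recorded demo file.
# Assumes the layout read by examples/sb3_imitation.py:
# a JSON list of episodes, each stored as [observations, actions].
import json
import sys

import numpy as np

demo_path = sys.argv[1] if len(sys.argv) > 1 else "demo.json"  # placeholder path

with open(demo_path, "r") as f:
    episodes = json.load(f)

print(f"{demo_path}: {len(episodes)} episode(s)")
for idx, episode in enumerate(episodes):
    obs = np.array(episode[0])
    acts = np.array(episode[1])
    # Printing the shapes makes truncated or empty episodes easy to spot
    # before the data is converted into imitation Trajectory objects.
    print(f"episode {idx}: obs {obs.shape}, acts {acts.shape}")
```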
From 187106c8d2b5ab669ed9bbdee719a9f6683d061c Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Mon, 29 Jan 2024 01:19:52 +0100 Subject: [PATCH 17/19] Update IMITATION_LEARNING.md typo fix --- docs/IMITATION_LEARNING.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md index 80f4f503..2813f1ff 100644 --- a/docs/IMITATION_LEARNING.md +++ b/docs/IMITATION_LEARNING.md @@ -112,7 +112,7 @@ func set_action(action = null) -> void: The way this works is that if we are running training or inference with a RL agent, the `set_action` method will be called -with action values provided. However, during demo recording, `set_acation` will be called without any action provided, +with action values provided. However, during demo recording, `set_action` will be called without any action provided, so we need to manually set the values. @@ -177,7 +177,8 @@ https://github.com/edbeeching/godot_rl_agents/assets/61947090/7bdc19ba-6e88-431d > I found it difficult to control the robot with action repeat 10, and I removed a few episodes where the robot hit > an enemy robot during recording so that they don't end up in the recorded demos file. I would recommend setting action > repeat to a lower value like 6-8 (both in AIController and sync node in the two scenes mentioned previously). -> Another way to make this easier is to drag the defauklt sync.gd script to Sync node in both training and testing scene. +> Another way to make this easier is to drag the default sync.gd script to Sync node in both training and testing +> scene. > This example uses an extended sync script with a 30 ticks per second physics setting, which is not > ideal for manual control. > It's also possible to change the `speed up` property of the `sync` node while recording demos to make the process @@ -215,7 +216,7 @@ python sb3_imitation.py --env_path="PATH_TO_EXPORTED_GAME_EXE_FILE_HERE" --il_ti Training should begin. As we set a small amount of timesteps, the results won't be perfect, but it shouldn't take too long (may still take a while, you can reduce the timesteps if you wish to run a quick test). Beside increasing -timesteps, you can open the script and modify the hyperaparameters to get better results. Having more high quality +timesteps, you can open the script and modify the hyperparameters to get better results. Having more high quality recorded demos can help too. You can load multiple files by adding them to the `--demo_files` argument, e.g. `--demo_files="file1_path" "file2_path" "file3_path"`. After the training is done, an evaluation environment should open, and you will see the trained agent solving the env @@ -230,7 +231,7 @@ and/or other variables may have changed since then. For comparison, when training just with `--rl_timesteps=250_000` I got a reward of: ```Mean reward after evaluation: 9.194426536560059``` -The imitation-learned reward could be improved by tweaking hyperaparameters (the parameters provided in the script are +The imitation-learned reward could be improved by tweaking hyperparameters (the parameters provided in the script are not optimized), recording more high quality demos, doing some RL timesteps after it, etc. As this environment was designed and tested with PPO RL, in this case the environment is simple enough that PPO alone can learn it quickly from the reward function and imitation learning isn't necessary. 
From 02a87b7704b0358619d50d967527fb2a06cbeae1 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Mon, 29 Jan 2024 20:28:35 +0100 Subject: [PATCH 18/19] Update sb3_imitation.py Refactor for loop --- examples/sb3_imitation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/sb3_imitation.py b/examples/sb3_imitation.py index 49bc5f1e..94cb99d4 100644 --- a/examples/sb3_imitation.py +++ b/examples/sb3_imitation.py @@ -115,11 +115,11 @@ def close_env(): with open(file_path, "r") as file: data = json.load(file) - for i in range(0, len(data)): + for traj in data: trajectories.append( imitation.data.rollout.types.Trajectory( - obs=np.array(data[i][0]), - acts=np.array(data[i][1]), + obs=np.array(traj[0]), + acts=np.array(traj[1]), infos=None, terminal=True, ) From c811fa1566e757c9f4dcff736034b2f55bbc3743 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Tue, 30 Jan 2024 20:22:44 +0100 Subject: [PATCH 19/19] Update IMITATION_LEARNING.md --- docs/IMITATION_LEARNING.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/IMITATION_LEARNING.md b/docs/IMITATION_LEARNING.md index 2813f1ff..32359509 100644 --- a/docs/IMITATION_LEARNING.md +++ b/docs/IMITATION_LEARNING.md @@ -118,8 +118,15 @@ to manually set the values. > [!NOTE] > `set_action()` will be called just before `get_action()`, so the demo recorder will record the currently applied -> action -> for the current state/observations. +> action for the current state/observations. + +> [!TIP] +> The values in the array that `get_action()` returns should be in the same order as the action keys defined in +> `func get_action_space()`. For continous actions: +> If an action is size 1, that means add one value to the array. +> If an action is size 2, add two values for that action. +> If an action is size 2, and the next action is size 1, add two values for the first action, +> then one value for the second. Now we can simplify the heuristic handling code (for when "human control" mode is used) in robot.gd.