aws-deepracer-community · mattcamp · Apr 18, 2024 · May 1, 2024 · May 1, 2024 · May 1, 2024
diff --git a/bin/activate.sh b/bin/activate.sh
@@ -18,6 +18,10 @@ function dr-update-env {
     return 1
   fi
 
+  if [[ ! -z $DR_EXPERIMENT_NAME ]]; then
+    export DR_CONFIG="$DIR/experiments/$DR_EXPERIMENT_NAME/run.env"
+  fi
+
   if [[ -f "$DR_CONFIG" ]]; then
     LINES=$(grep -v '^#' $DR_CONFIG)
     for l in $LINES; do
@@ -26,10 +30,25 @@ function dr-update-env {
       eval "export $env_var=$env_val"
     done
   else
-    echo "File run.env does not exist."
+    echo "File ${DR_CONFIG} does not exist."
     return 1
   fi
 
+  if [[ ! -z $DR_EXPERIMENT_NAME ]]; then
+    if [[ -f "$DIR/experiments/$DR_EXPERIMENT_NAME/run.env" ]]; then
+      LINES=$(grep -v '^#' $DIR/experiments/$DR_EXPERIMENT_NAME/run.env)
+      for l in $LINES; do
+        env_var=$(echo $l | cut -f1 -d\=)
+        env_val=$(echo $l | cut -f2 -d\=)
+        eval "export $env_var=$env_val"
+      done
+    else
+      echo "File $DIR/experiments/$DR_EXPERIMENT_NAME/run.env does not exist."
+      return 1
+    fi
+  fi
+
+
   if [[ -z "${DR_RUN_ID}" ]]; then
     export DR_RUN_ID=0
   fi
@@ -54,12 +73,18 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
 DIR="$(dirname $SCRIPT_DIR)"
 export DR_DIR=$DIR
 
+EXPERIMENT_FLAG="$( grep DR_EXPERIMENT_NAME $DIR/system.env | grep -v \#)"
+
 if [[ -f "$1" ]]; then
   export DR_CONFIG=$(readlink -f $1)
   dr-update-env
 elif [[ -f "$DIR/run.env" ]]; then
   export DR_CONFIG="$DIR/run.env"
   dr-update-env
+elif [[ ! -z $EXPERIMENT_FLAG ]]; then
+  EXPERIMENT_NAME=$(echo $EXPERIMENT_FLAG | cut -f2 -d\=)
+  eval "export DR_CONFIG=$DIR/experiments/$EXPERIMENT_NAME/run.env"
+  dr-update-env
 else
   echo "No configuration file."
   return 1

diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh
@@ -3,7 +3,13 @@
 function dr-upload-custom-files {
   eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/)
   echo "Uploading files to $CUSTOM_TARGET"
-  aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/custom_files/ $CUSTOM_TARGET
+  # Sync from the experiment's custom_files directory when DR_EXPERIMENT_NAME
+  # is set; otherwise sync from the default custom_files directory.
+  if [[ -z $DR_EXPERIMENT_NAME ]]; then
+    aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync "$DR_DIR/custom_files/" "$CUSTOM_TARGET"
+  else
+    aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync "$DR_DIR/experiments/$DR_EXPERIMENT_NAME/custom_files/" "$CUSTOM_TARGET"
+  fi
 }
 
 function dr-upload-model {

diff --git a/defaults/template-system.env b/defaults/template-system.env
@@ -27,5 +27,6 @@ DR_WEBVIEWER_PORT=8100
 # DR_REMOTE_MINIO_URL=http://mynas:9000
 # DR_ROBOMAKER_CUDA_DEVICES=0
 # DR_SAGEMAKER_CUDA_DEVICES=0
+# DR_EXPERIMENT_NAME=
 # DR_TELEGRAF_HOST=telegraf
-# DR_TELEGRAF_PORT=8092
+# DR_TELEGRAF_PORT=8092
diff --git a/docs/multi_run.md b/docs/multi_run.md
@@ -1,4 +1,46 @@
-# Running Multiple Experiments
+# Managing experiments
+
+Often when training a model you may find that you want to run different training experiments with different settings, reward functions, action spaces, etc. 
+
+By default, DRfC assumes that all settings are stored in `run.env` and in the files inside the `custom_files/` directory. However, when running multiple sequential experiments, these locations can get cluttered with many files, and it can be tricky to keep track of which settings or files were used for a particular training run.
+
+DRfC has an optional feature which can be enabled to store all the config files for a particular training run in a dedicated sub-directory. 
+
+## To enable experiment sub-directories
+1. Create the initial directory structure for your experiments. The top-level directory must be called `experiments/` and must be in the root of your DRfC installation. Inside it, create a sub-directory for your first experiment, which in turn must contain a sub-directory called `custom_files`.
+
+    `mkdir` using the `-p` flag can create this for you in a single easy command (be sure to run from inside the main DRfC directory):
+
+    ```
+    mkdir -p experiments/test-1/custom_files
+    ```
+2. Move (or copy) `run.env` into the experiment directory:
+    ```
+    mv run.env experiments/test-1
+    ```
+
+    If you are using multiple workers then also move the `worker-#.env` files. 
+3. Create (or move) the files in the new experiment's custom_files directory for reward function, model metadata and hyperparameters.
+    ```
+    cp custom_files/* experiments/test-1/custom_files
+    ```
+4. Uncomment the `DR_EXPERIMENT_NAME` line in `system.env` and set it to your experiment name. It must match the name of your new sub-directory inside `experiments/`; in this example it should be set to `test-1`.
+5. Run `dr-update`, or restart your shell and re-source `bin/activate.sh`.
+6. Start training as normal using `dr-start-training`
+
+## To iterate on an experiment
+
+To create a new experiment based on a previous one, copy the entire experiment sub-directory to a new name and update the `DR_EXPERIMENT_NAME` line in `system.env`.
+
+```
+cp -av experiments/test-1 experiments/test-2
+```
+
+You should edit the `run.env` inside the new experiment folder to update the `DR_LOCAL_S3_MODEL_PREFIX` (and `DR_LOCAL_S3_PRETRAINED_PREFIX` if you are cloning the previous experiment's model).
+
+Don't forget to run `dr-update` after changing any files. 
+
+# Running Multiple Parallel Experiments
 
 It is possible to run multiple experiments on one computer in parallel. This is possible both in `swarm` and `compose` mode, and is controlled by `DR_RUN_ID` in `run.env`.
 

diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py
@@ -135,7 +135,10 @@
 
         else:  # i >= 2 
             #read in additional configuration file.  format of file must be worker#-run.env
-            location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'worker-{}.env'.format(i)))
+            if os.environ.get('DR_EXPERIMENT_NAME'):
+                location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'experiments', os.environ.get('DR_EXPERIMENT_NAME'),'worker-{}.env'.format(i)))
+            else:
+                location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'worker-{}.env'.format(i)))
             with open(location, 'r') as fh:
                 vars_dict = dict(
                     tuple(line.split('='))