From ee11e43e786052b1331c60afd12dfa236202e24c Mon Sep 17 00:00:00 2001 From: Matt Camp Date: Thu, 18 Apr 2024 10:03:17 +0200 Subject: [PATCH 1/8] initial commit of experiments changes --- bin/activate.sh | 15 +++++++++++++++ bin/scripts_wrapper.sh | 5 ++++- scripts/training/prepare-config.py | 5 ++++- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 127484e7..91d5aa95 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -30,6 +30,21 @@ function dr-update-env { return 1 fi + if [[ ! -z DR_EXPERIMENT_NAME ]]; then + if [[ -f "$DIR/experiments/$DR_EXPERIMENT_NAME/run.env" ]]; then + LINES=$(grep -v '^#' $DIR/experiments/$DR_EXPERIMENT_NAME/run.env) + for l in $LINES; do + env_var=$(echo $l | cut -f1 -d\=) + env_val=$(echo $l | cut -f2 -d\=) + eval "export $env_var=$env_val" + done + else + echo "File $DIR/experiments/$DR_EXPERIMENT_NAME/run.env does not exist." + return 1 + fi + fi + + if [[ -z "${DR_RUN_ID}" ]]; then export DR_RUN_ID=0 fi diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 623c70f4..6b74ebd8 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -3,7 +3,10 @@ function dr-upload-custom-files { eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" - aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/custom_files/ $CUSTOM_TARGET + if [[ -z DR_EXPERIMENT_NAME ]]; then + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/custom_files/ $CUSTOM_TARGET + else + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/experiments/$DR_EXPERIMENT_NAME/custom_files/ $CUSTOM_TARGET } function dr-upload-model { diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 2c493608..05ed82c0 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -135,7 +135,10 @@ else: # i >= 2 #read in additional configuration file. 
format of file must be worker#-run.env - location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'worker-{}.env'.format(i))) + if os.environ.get('DR_EXPERIMENT_NAME'): + location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'experiments', os.environ.get('DR_EXPERIMENT_NAME'),'worker-{}.env'.format(i))) + else: + location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'worker-{}.env'.format(i))) with open(location, 'r') as fh: vars_dict = dict( tuple(line.split('=')) From c9a863cc0cb312951f301f9ebac9a3083505382c Mon Sep 17 00:00:00 2001 From: Matt Camp Date: Wed, 1 May 2024 11:53:34 +0200 Subject: [PATCH 2/8] fix bash env var --- bin/scripts_wrapper.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 6b74ebd8..08ee0026 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -3,7 +3,7 @@ function dr-upload-custom-files { eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" - if [[ -z DR_EXPERIMENT_NAME ]]; then + if [[ -z $DR_EXPERIMENT_NAME ]]; then aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/custom_files/ $CUSTOM_TARGET else aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/experiments/$DR_EXPERIMENT_NAME/custom_files/ $CUSTOM_TARGET From 57412425306058e19475f85944e436c240260f9a Mon Sep 17 00:00:00 2001 From: Matt Camp Date: Wed, 1 May 2024 11:53:55 +0200 Subject: [PATCH 3/8] simplify experiments to use existing DR_CONFIG --- bin/activate.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/bin/activate.sh b/bin/activate.sh index 91d5aa95..0c960414 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -18,6 +18,10 @@ function dr-update-env { return 1 fi + if [[ ! 
-z $DR_EXPERIMENT_NAME ]]; then + export DR_CONFIG="$DIR/experiments/$DR_EXPERIMENT_NAME/run.env" + fi + if [[ -f "$DR_CONFIG" ]]; then LINES=$(grep -v '^#' $DR_CONFIG) for l in $LINES; do @@ -26,7 +30,7 @@ function dr-update-env { eval "export $env_var=$env_val" done else - echo "File run.env does not exist." + echo "File ${DR_CONFIG} does not exist." return 1 fi @@ -65,12 +69,19 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" DIR="$(dirname $SCRIPT_DIR)" export DR_DIR=$DIR +EXPERIMENT_FLAG="$( grep DR_EXPERIMENT_NAME $DIR/system.env )" + if [[ -f "$1" ]]; then export DR_CONFIG=$(readlink -f $1) dr-update-env elif [[ -f "$DIR/run.env" ]]; then export DR_CONFIG="$DIR/run.env" dr-update-env +elif [[ ! -z $EXPERIMENT_FLAG ]]; +then + EXPERIMENT_NAME=$(echo $EXPERIMENT_FLAG | cut -f2 -d\=) + eval "export DR_CONFIG=$DIR/experiments/$EXPERIMENT_NAME/run.env" + dr-update-env else echo "No configuration file." return 1 From 61b1749e3529e1e32976dd5a856ec82455019269 Mon Sep 17 00:00:00 2001 From: Matt Camp Date: Wed, 1 May 2024 11:55:15 +0200 Subject: [PATCH 4/8] add commented out var to system.env template --- defaults/template-system.env | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/defaults/template-system.env b/defaults/template-system.env index 317dfb70..577af4a2 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -26,4 +26,5 @@ DR_WEBVIEWER_PORT=8100 # DR_DISPLAY=:99 # DR_REMOTE_MINIO_URL=http://mynas:9000 # DR_ROBOMAKER_CUDA_DEVICES=0 -# DR_SAGEMAKER_CUDA_DEVICES=0 \ No newline at end of file +# DR_SAGEMAKER_CUDA_DEVICES=0 +# DR_EXPERIMENT_NAME= \ No newline at end of file From 7e8f7662ef1e7fb85124b75566e0e23ab39eea03 Mon Sep 17 00:00:00 2001 From: Matt Camp Date: Wed, 1 May 2024 12:20:37 +0200 Subject: [PATCH 5/8] Add docs --- docs/multi_run.md | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/multi_run.md 
b/docs/multi_run.md index d0e6de38..606a200b 100644 --- a/docs/multi_run.md +++ b/docs/multi_run.md @@ -1,4 +1,46 @@ -# Running Multiple Experiments +# Managing experiments + +Often when training a model you may find that you want to run different training experiments with different settings, reward functions, action spaces, etc. + +By default DRfC will assume that you store all the settings in `run.env` and in the files inside the `custom_files/` directory. However, when running multiple sequential experiments these folders can get cluttered with many files, and it can be tricky to keep track of what settings or files were used for a particular training run. + +DRfC has an optional feature which can be enabled to store all the config files for a particular training run in a dedicated sub-directory. + +## To enable experiment sub-directories +1. Create the initial directory structure for your experiments. The top-level directory must be called `experiments/` and must be in the root of your DRfC installation. Inside it, create a subdir for your first experiment, which must in turn contain a subdir called `custom_files`. + + `mkdir` using the `-p` flag can create this for you in a single easy command (be sure to run from inside the main DRfC directory): + + ``` + mkdir -p experiments/test-1/custom_files + ``` +2. Move (or copy) `run.env` into the experiment directory: + ``` + mv run.env experiments/test-1 + ``` + + If you are using multiple workers then also move the `worker-#.env` files. +3. Copy (or create) the reward function, model metadata and hyperparameters files in the new experiment's `custom_files` directory: + ``` + cp custom_files/* experiments/test-1/custom_files + ``` +4. Uncomment the `DR_EXPERIMENT_NAME` line in `system.env` and set it to your experiment name (which must match the name of your new subdir inside `experiments`; in this example it should be set to `test-1`). +5. Run `dr-update`, or restart your shell and re-source `bin/activate.sh`. +6.
Start training as normal using `dr-start-training` + +## To iterate on an experiment + +To create a new experiment based on a previous one just copy the entire experiment subdir to a new name and update the `DR_EXPERIMENT_NAME` line in system.env. + +``` +cp -av experiments/test-1 experiments/test-2 +``` + +You should edit the `run.env` inside the new experiment folder to update the `DR_LOCAL_S3_MODEL_PREFIX` (and `DR_LOCAL_S3_PRETRAINED_PREFIX` if you are cloning the previous experiment's model). + +Don't forget to run `dr-update` after changing any files. + +# Running Multiple Parallel Experiments It is possible to run multiple experiments on one computer in parallel. This is possible both in `swarm` and `compose` mode, and is controlled by `DR_RUN_ID` in `run.env`. From 4ef8dd89569585ee842a8c375da4533aeafc7c04 Mon Sep 17 00:00:00 2001 From: Matt Camp Date: Wed, 1 May 2024 12:27:55 +0200 Subject: [PATCH 6/8] Ignore commented out line in system.env --- bin/activate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/activate.sh b/bin/activate.sh index 0c960414..42d3eda7 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -69,7 +69,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" DIR="$(dirname $SCRIPT_DIR)" export DR_DIR=$DIR -EXPERIMENT_FLAG="$( grep DR_EXPERIMENT_NAME $DIR/system.env )" +EXPERIMENT_FLAG="$( grep DR_EXPERIMENT_NAME $DIR/system.env | grep -v \#)" if [[ -f "$1" ]]; then export DR_CONFIG=$(readlink -f $1) From a55a93df053b8565de8273344277f18230f7eefc Mon Sep 17 00:00:00 2001 From: Matt Camp Date: Wed, 1 May 2024 12:30:02 +0200 Subject: [PATCH 7/8] remove newline --- bin/activate.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 42d3eda7..d04aabaa 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -77,8 +77,7 @@ if [[ -f "$1" ]]; then elif [[ -f "$DIR/run.env" ]]; then export DR_CONFIG="$DIR/run.env" dr-update-env -elif [[ ! 
-z $EXPERIMENT_FLAG ]]; -then +elif [[ ! -z $EXPERIMENT_FLAG ]]; then EXPERIMENT_NAME=$(echo $EXPERIMENT_FLAG | cut -f2 -d\=) eval "export DR_CONFIG=$DIR/experiments/$EXPERIMENT_NAME/run.env" dr-update-env From 00e9d2f7aac3f0f092fdd3a3e6fc4470794108bc Mon Sep 17 00:00:00 2001 From: Matt Camp Date: Wed, 1 May 2024 12:31:55 +0200 Subject: [PATCH 8/8] fix missing $ --- bin/activate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/activate.sh b/bin/activate.sh index d04aabaa..03a431b4 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -34,7 +34,7 @@ function dr-update-env { return 1 fi - if [[ ! -z DR_EXPERIMENT_NAME ]]; then + if [[ ! -z $DR_EXPERIMENT_NAME ]]; then if [[ -f "$DIR/experiments/$DR_EXPERIMENT_NAME/run.env" ]]; then LINES=$(grep -v '^#' $DIR/experiments/$DR_EXPERIMENT_NAME/run.env) for l in $LINES; do