diff --git a/bin/activate.sh b/bin/activate.sh index 0b3e2e02..8d3cd26c 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -18,6 +18,10 @@ function dr-update-env { return 1 fi + if [[ ! -z $DR_EXPERIMENT_NAME ]]; then + export DR_CONFIG="$DIR/experiments/$DR_EXPERIMENT_NAME/run.env" + fi + if [[ -f "$DR_CONFIG" ]]; then LINES=$(grep -v '^#' $DR_CONFIG) for l in $LINES; do @@ -26,10 +30,25 @@ function dr-update-env { eval "export $env_var=$env_val" done else - echo "File run.env does not exist." + echo "File ${DR_CONFIG} does not exist." return 1 fi + if [[ ! -z $DR_EXPERIMENT_NAME ]]; then + if [[ -f "$DIR/experiments/$DR_EXPERIMENT_NAME/run.env" ]]; then + LINES=$(grep -v '^#' $DIR/experiments/$DR_EXPERIMENT_NAME/run.env) + for l in $LINES; do + env_var=$(echo $l | cut -f1 -d\=) + env_val=$(echo $l | cut -f2 -d\=) + eval "export $env_var=$env_val" + done + else + echo "File $DIR/experiments/$DR_EXPERIMENT_NAME/run.env does not exist." + return 1 + fi + fi + + if [[ -z "${DR_RUN_ID}" ]]; then export DR_RUN_ID=0 fi @@ -54,12 +73,18 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" DIR="$(dirname $SCRIPT_DIR)" export DR_DIR=$DIR +EXPERIMENT_FLAG="$( grep DR_EXPERIMENT_NAME $DIR/system.env | grep -v \#)" + if [[ -f "$1" ]]; then export DR_CONFIG=$(readlink -f $1) dr-update-env elif [[ -f "$DIR/run.env" ]]; then export DR_CONFIG="$DIR/run.env" dr-update-env +elif [[ ! -z $EXPERIMENT_FLAG ]]; then + EXPERIMENT_NAME=$(echo $EXPERIMENT_FLAG | cut -f2 -d\=) + eval "export DR_CONFIG=$DIR/experiments/$EXPERIMENT_NAME/run.env" + dr-update-env else echo "No configuration file." 
return 1 diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 27310f06..53aee913 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -3,7 +3,12 @@ function dr-upload-custom-files { eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" - aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/custom_files/ $CUSTOM_TARGET + # Sync from the experiment's custom_files/ when DR_EXPERIMENT_NAME is set, otherwise from the default custom_files/. + if [[ -z $DR_EXPERIMENT_NAME ]]; then + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/custom_files/ $CUSTOM_TARGET + else + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/experiments/$DR_EXPERIMENT_NAME/custom_files/ $CUSTOM_TARGET + fi } function dr-upload-model { diff --git a/defaults/template-system.env b/defaults/template-system.env index eafb4834..ca80a9e4 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -27,5 +27,6 @@ DR_WEBVIEWER_PORT=8100 # DR_REMOTE_MINIO_URL=http://mynas:9000 # DR_ROBOMAKER_CUDA_DEVICES=0 # DR_SAGEMAKER_CUDA_DEVICES=0 +# DR_EXPERIMENT_NAME= # DR_TELEGRAF_HOST=telegraf -# DR_TELEGRAF_PORT=8092 \ No newline at end of file +# DR_TELEGRAF_PORT=8092 diff --git a/docs/multi_run.md b/docs/multi_run.md index d0e6de38..606a200b 100644 --- a/docs/multi_run.md +++ b/docs/multi_run.md @@ -1,4 +1,46 @@ -# Running Multiple Experiments +# Managing experiments + +Often when training a model you may find that you want to run different training experiments with different settings, reward functions, action spaces, etc. + +By default DRfC will assume that you store all the settings in run.env and in the files inside the `custom_files/` directory, however when running multiple sequential experiments these folders can get cluttered with many files and it can be tricky to keep track of what settings or files were used for a particular training run. + +DRfC has an optional feature which can be enabled to store all the config files for a particular training run in a dedicated sub-directory. 
+ +## To enable experiment sub-directories +1. Create the initial directory structure for your experiments. The top-level directory must be called `experiments/` and must be in the root of your DRfC installation, along with a further subdir for your first experiment which must then contain a subdir called `custom_files`. + + `mkdir` using the `-p` flag can create this for you in a single easy command (be sure to run from inside the main DRfC directory): + + ``` + mkdir -p experiments/test-1/custom_files + ``` +2. Move (or copy) run.env into the experiment directory: + ``` + mv run.env experiments/test-1 + ``` + + If you are using multiple workers then also move the `worker-#.env` files. +3. Create (or move) the files in the new experiment's custom_files directory for reward function, model metadata and hyperparameters. + ``` + cp custom_files/* experiments/test-1/custom_files + ``` +4. Uncomment the `DR_EXPERIMENT_NAME` line in system.env and set it to your experiment name (which must match the name of your new subdir inside `experiments`; in this example it should be set to `test-1`). +5. Run `dr-update`, or restart your shell and re-source `bin/activate.sh`. +6. Start training as normal using `dr-start-training`. + +## To iterate on an experiment + +To create a new experiment based on a previous one, just copy the entire experiment subdir to a new name and update the `DR_EXPERIMENT_NAME` line in system.env. + +``` +cp -av experiments/test-1 experiments/test-2 +``` + +You should edit the `run.env` inside the new experiment folder to update the `DR_LOCAL_S3_MODEL_PREFIX` (and `DR_LOCAL_S3_PRETRAINED_PREFIX` if you are cloning the previous experiment's model). + +Don't forget to run `dr-update` after changing any files. + +# Running Multiple Parallel Experiments It is possible to run multiple experiments on one computer in parallel. This is possible both in `swarm` and `compose` mode, and is controlled by `DR_RUN_ID` in `run.env`. 
diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 3f692c6d..22e69681 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -135,7 +135,10 @@ else: # i >= 2 #read in additional configuration file. format of file must be worker#-run.env - location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'worker-{}.env'.format(i))) + if os.environ.get('DR_EXPERIMENT_NAME'): + location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'experiments', os.environ.get('DR_EXPERIMENT_NAME'),'worker-{}.env'.format(i))) + else: + location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'worker-{}.env'.format(i))) with open(location, 'r') as fh: vars_dict = dict( tuple(line.split('='))