From 74897c97ae3d1b4879f01064a7c9ac8145cb0877 Mon Sep 17 00:00:00 2001
From: Andrew Theurer
Date: Fri, 25 Oct 2024 13:41:27 -0400
Subject: [PATCH 1/2] support ilab 1.2 and various fixes

---
 ilab-client       | 349 ++++++++++++++++++++++++----------------------
 ilab-post-process | 145 ++++++++++++++-------
 multiplex.json    |  46 ++++---
 rickshaw.json     |   8 ++
 4 files changed, 323 insertions(+), 225 deletions(-)

diff --git a/ilab-client b/ilab-client
index dbac761..daad7ee 100755
--- a/ilab-client
+++ b/ilab-client
@@ -4,19 +4,51 @@
 exec >ilab-client-stderrout.txt
 exec 2>&1
 
+. /usr/bin/ilab-base || { echo "/usr/bin/ilab-base not found"; exit 1; }
+
 workflow="train"
 train_until="complete" # checkpoint:N, runavg:N, or complete
 nnodes=1
-gpus=1
-num_epochs=1
-save_samples=""
-max_seq_length=""
-effective_batch_size=""
-cpu_offload_optimizer=""
-cpu_offload_pin_memory=""
+train_profile="L40_x4"
+train_model_path="/home/models/granite-7b-redhat-lab"
+train_phased_mt_bench_judge="/root/.cache/instructlab/models/prometheus-8x7b-v2-0"
+train_phased_phase1_data="/home/data/training/jul19-knowledge-26k.jsonl"
+train_phased_phase1_num_epochs=2
+train_phased_phase1_samples_per_save=""
+train_phased_phase1_trim_samples=""
+train_phased_phase2_data="/usr/share/instructlab/sdg/datasets/skills.jsonl"
+train_phased_phase2_num_epochs=2
+train_phased_phase2_samples_per_save=""
+train_phased_phase2_trim_samples="15000"
+
+ilab_version=`ilab --version | awk '{print $3}'`
+# 0.19.1 = rhelai 1.2
+# 0.18.3 = rhelai 1.1.0
+
+pwd=`/bin/pwd`
+pushd /opt/app-root/lib/python3.11/site-packages
+find . -name METADATA | cpio -pdumv $pwd/instructlab-site-packages-metadata
+popd
 
-opts=$(getopt -q -o "" --longoptions "workflow:,num-cpus:,batch-size:,save-samples:,max-seq-len:,model:,data-path:,nnodes:,gpus:,num-epochs:,effective-batch-size:,deepspeed-cpu-offload-optimizer:,deepspeed-cpu-offload-optimizer-pin-memory:,train-until:" -n "getopt.sh" -- "$@");
+longopts=""
+longopts+="workflow:"
+longopts+=",sdg-num-cpus:"
+longopts+=",sdg-batch-size:"
+longopts+=",sdg-model:"
+longopts+=",sdg-gpus:"
+longopts+=",train-profile:"
+longopts+=",train-model-path:"
+longopts+=",train-phased-mt-bench-judge:"
+longopts+=",train-phased-phase1-data:"
+longopts+=",train-phased-phase1-num-epochs:"
+longopts+=",train-phased-phase1-samples-per-save:"
+longopts+=",train-phased-phase1-trim-data-samples:"
+longopts+=",train-phased-phase2-data:"
+longopts+=",train-phased-phase2-num-epochs:"
+longopts+=",train-phased-phase2-samples-per-save:"
+longopts+=",train-phased-phase2-trim-data-samples:"
+
+opts=$(getopt -q -o "" --longoptions "$longopts" -n "getopt.sh" -- "$@");
 if [ $? -ne 0 ]; then
     printf -- "\tUnrecognized option specified\n\n"
     exit 1
@@ -27,58 +59,61 @@ while true; do
     val=$1; shift
     case "$arg" in
         --workflow)
-        workflow="$val"
-        ;;
-
-        # The following options are for SDG
-        --num-cpus)
-        num_cpus="--num-cpus=$val"
-        ;;
-        --batch-size)
-        batch_size="--batch-size=$val"
-        ;;
-
-
-        # The following options are for SDG and training
-        --gpus)
-        gpus="--gpus=$val"
-        ;;
-        --model)
-        model=$val
-        ;;
-
-        # The following options are for training
-        --save-samples)
-        save_samples="--save-samples=$val"
-        ;;
-        --max-seq-len)
-        max_seq_len="--max-seq-len=$val"
-        ;;
-        --data-path)
-        data_path=$val
-        ;;
-        --nnodes)
-        nnodes=$val
-        ;;
-        --num-epochs)
-        num_epochs=$val
-        ;;
-        --effective-batch-size)
-        effective_batch_size="--effective_batch_size $val"
-        ;;
-        --deepspeed-cpu-offload-optimizer)
-        if [ $val == "1" ]; then
-            cpu_offload_optimizer="--deepspeed-cpu-offload-optimizer True"
-        fi
-        ;;
-        --deepspeed-cpu-offload-optimizer-pin-memory)
-        if [ $val == "1" ]; then
-            cpu_offload_pin_memory="--deepspeed-cpu-offload-optimizer-pin-memory True"
-        fi
-        ;;
-        --train-until)
-        train_until=$val
+            workflow="$val"
+            ;;
+
+        # The following options are for sdg
+        --sdg-model)
+            sdg_model=$val
+            ;;
+        --sdg-num-cpus)
+            sdg_num_cpus=" --num-cpus $val"
+            ;;
+        --sdg-gpus)
+            sdg_gpus=" --gpus $val"
+            ;;
+        --sdg-batch-size)
+            sdg_batch_size=" --batch-size $val"
+            ;;
+
+        # The following options are for training
+        --train-profile)
+            train_profile=$val
+            ;;
+        --train-model-path)
+            train_model_path=$val
+            ;;
+        --train-phased-mt-bench-judge)
+            train_phased_mt_bench_judge=$val
+            ;;
+        # For phase1
+        --train-phased-phase1-data)
+            train_phased_phase1_data=$val
             ;;
+        --train-phased-phase1-num-epochs)
+            train_phased_phase1_num_epochs=$val
+            ;;
+        --train-phased-phase1-samples-per-save)
+            train_phased_phase1_samples_per_save=$val
+            ;;
+        --train-phased-phase1-trim-data-samples)
+            train_phased_phase1_trim_samples=$val
+            ;;
+        # For phase2
+        --train-phased-phase2-data)
+            train_phased_phase2_data=$val
+            ;;
+        --train-phased-phase2-num-epochs)
+            train_phased_phase2_num_epochs=$val
+            ;;
+        --train-phased-phase2-samples-per-save)
+            train_phased_phase2_samples_per_save=$val
+            ;;
+        --train-phased-phase2-trim-data-samples)
+            train_phased_phase2_trim_samples=$val
+            ;;
+
+
         --)
             break
             ;;
@@ -88,106 +123,119 @@ while true; do
     esac
 done
 
-ilab config init --non-interactive
-cp /opt/app-root/src/.config/instructlab/config.yaml .
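+# A hypothetical invocation exercising the long options parsed above (the
+# values shown are illustrative, not defaults):
+#
+#   ilab-client --workflow train --train-profile L40_x4 \
+#       --train-model-path /home/models/granite-7b-redhat-lab \
+#       --train-phased-phase1-trim-data-samples 5000
+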
+# Valid values for $train_profile:
+# A100_H100_x2 A100_H100_x4 A100_H100_x8 L40_x4 L40_x8 L4_x8 train_a100x4x8
+ilab config init --non-interactive --train-profile /usr/share/instructlab/training/profiles/$train_profile.yaml
+
+ilab config show >ilab-config-show.yaml
 
 echo "workflow: $workflow"
 echo "nnodes: $nnodes"
 echo "gpus: $gpus"
 
 if [[ $workflow =~ ^train.* ]]; then
+    ilab model train --help >ilab-model-train-help.txt
     echo "train_until: $train_until"
-    echo "num_epochs: $num_epochs"
-    echo "save_sample: $save_samples"
-    echo "max_seq_length: $max_seq_length"
-    echo "effective_batch_size: $effective_batch_size"
+    #echo "num_epochs: $num_epochs"
+    #echo "save_sample: $save_samples"
+    #echo "max_seq_length: $max_seq_length"
+    #echo "effective_batch_size: $effective_batch_size"
     echo "cpu_offload_optimizer: $cpu_offload_optimizer"
     echo "cpu_offload_pin_memory: $cpu_offload_pin_memory"
-    train_cmd="ilab model train\
-    --model-path $model\
-    --data-path $data_path\
-    --nnodes=$nnodes\
-    $gpus\
-    --num-epochs=$num_epochs
-    --ckpt-output-dir .\
-    $save_samples\
-    $max_seq_length\
-    $effective_batch_size\
-    $cpu_offload_optimizer\
-    $cpu_offload_pin_memory"
-
-    echo "/home/models:"
-    ls -la /home/models
-    echo "/home/data:"
-    ls -la /home/data
+    mkdir -p e2e
 
-    echo "train cmd:"
-    echo "$train_cmd"
+    if [[ ! -z "$train_phased_phase1_trim_samples" ]]; then
+        if [[ ! -e /tmp/random-but-same ]]; then
+            exit_error "Could not find /tmp/random-but-same"
+        fi
+        original_num_samples=`wc -l $train_phased_phase1_data | awk '{print $1}'`
+        shuf_cmd="shuf -n $train_phased_phase1_trim_samples --random-source=/tmp/random-but-same $train_phased_phase1_data -o $train_phased_phase1_data.randomly-trimmed"
+        echo "Going to run: $shuf_cmd"
+        $shuf_cmd || exit_error "$shuf_cmd failed"
+        train_phased_phase1_data+=".randomly-trimmed"
+        echo "trimmed data:"
+        if [[ ! -e $train_phased_phase1_data ]]; then
+            exit_error "Could not find $train_phased_phase1_data"
+        fi
+        /bin/ls -l $train_phased_phase1_data
+        trimmed_num_samples=`wc -l $train_phased_phase1_data | awk '{print $1}'`
+        echo "scale=2; $original_num_samples / $trimmed_num_samples" | bc >phase1-trimmed-sample-ratio.txt
+    fi
 
-    count=0
-    rc=0
-
-    if [[ $train_until =~ ^checkpoint:([0-9]+) ]]; then
-        num_samples=${BASH_REMATCH[1]}
-        watch_regex="Model\ssaved\sin\shf_format/(.*)"
-    elif [[ $train_until =~ ^runavg:([0-9]+) ]]; then
-        num_samples=${BASH_REMATCH[1]}
-        watch_regex='RunningAvgSamplesPerSec'
-    else
-        echo "not watching for 'Model saved in' or 'RunningAvgSamplesPerSec' and will train for $num_epochs epochs"
-        num_samples=0
+    if [[ ! -z "$train_phased_phase2_trim_samples" ]]; then
+        if [[ ! -e /tmp/random-but-same ]]; then
+            exit_error "Could not find /tmp/random-but-same"
+        fi
+        original_num_samples=`wc -l $train_phased_phase2_data | awk '{print $1}'`
+        shuf_cmd="shuf -n $train_phased_phase2_trim_samples --random-source=/tmp/random-but-same $train_phased_phase2_data -o $train_phased_phase2_data.randomly-trimmed"
+        echo "Going to run: $shuf_cmd"
+        $shuf_cmd || exit_error "$shuf_cmd failed"
+        train_phased_phase2_data+=".randomly-trimmed"
+        echo "trimmed data:"
+        if [[ ! -e $train_phased_phase2_data ]]; then
+            exit_error "Could not find $train_phased_phase2_data"
+        fi
+        /bin/ls -l $train_phased_phase2_data
+        trimmed_num_samples=`wc -l $train_phased_phase2_data | awk '{print $1}'`
+        echo "scale=2; $original_num_samples / $trimmed_num_samples" | bc >phase2-trimmed-sample-ratio.txt
     fi
 
-    echo "watch_regex: [$watch_regex]"
-    echo "num_samples: [$num_samples]"
+    train_cmd="ilab model train --skip-user-confirm --strategy lab-multiphase"
+
+    train_cmd+=" --phased-base-dir e2e"
+    train_cmd+=" --model-path $train_model_path"
+
+    train_cmd+=" --phased-phase1-data $train_phased_phase1_data"
+    train_cmd+=" --phased-phase1-num-epochs $train_phased_phase1_num_epochs"
+    if [[ ! -z "$train_phased_phase1_samples_per_save" ]]; then
+        train_cmd+=" --phased-phase1-samples-per-save $train_phased_phase1_samples_per_save"
+    fi
+
+    train_cmd+=" --phased-phase2-data $train_phased_phase2_data"
+    train_cmd+=" --phased-phase2-num-epochs $train_phased_phase2_num_epochs"
+    if [[ ! -z "$train_phased_phase2_samples_per_save" ]]; then
+        train_cmd+=" --phased-phase2-samples-per-save $train_phased_phase2_samples_per_save"
+    fi
+
+    train_cmd+=" --phased-mt-bench-judge $train_phased_mt_bench_judge"
+
+    echo "train cmd:"
+    echo "$train_cmd"
+
     echo "Training:"
     snapshot_file=""
-    $train_cmd 2>&1 |
-    {
-        count=0
-        while read line; do
-            echo "$line" >>train.txt
-            if [[ $line =~ $watch_regex ]]; then
-                snapshot=${BASH_REMATCH[1]}
-                ((count++))
-                echo found $line
-                if [[ $train_until =~ "^checkpoint:([0-9]+)" ]]; then
-                    echo "Found snapshot: hf_format/$snapshot"
-                fi
-            fi
-            if [[ $count -ne 0 && $count -ge $num_samples ]]; then
-                echo "Found the checkpoint we needed: hf_format/$snapshot"
-                echo "hf_format/$snapshot" >last_checkpointed_model.txt
-                break
-            fi
-        done
-        echo "count: $count"
-        if [[ $count -lt $num_samples ]]; then
-            echo "did not get the number of $watch_regex samples, so exiting error"
-            exit 1
-        else
-            echo "Exiting without error"
-            exit 0
-        fi
-    }
+    date +%s >train-start-timestamp.txt
+    $train_cmd >train-stdout.txt 2>train-stderr.txt
     rc=$?
     wait
-    for file in training_params_and_metrics_global0.jsonl full_logs_global0.log train.txt; do
-        if [ -e $file ]; then
-            xz $file
-        fi
-    done
+    date +%s >train-stop-timestamp.txt
+
+    # Delete any file larger than 100 MB (usually model checkpoints)
+    find . -size +100M -type f -print | while read line; do /bin/rm -f $line; done
+
+    # Compress the rest
+    find . -type f | while read line; do xz $line; done
+
 elif [[ $workflow =~ ^sdg.* ]]; then
-    sdg_cmd="ilab data generate\
-    --model $model\
-    $num_cpus\
-    $batch_size\
-    $gpus\
-    --output-dir ."
+    ilab data generate --help >ilab-data-generate-help.txt
+    sdg_cmd="ilab data generate"
+    sdg_cmd+=" --model $sdg_model"
+    sdg_cmd+=" $sdg_num_cpus"
+    sdg_cmd+=" $sdg_batch_size"
+    sdg_cmd+=" $sdg_gpus"
+    sdg_cmd+=" --output-dir ."
 
     echo "sdg cmd:"
     echo "$sdg_cmd"
+    echo "running ilab taxonomy diff"
     ilab taxonomy diff
 
     # This assumes LoRa already exists and is not downloaded by InstructLab every time
@@ -201,42 +242,20 @@ elif [[ $workflow =~ ^sdg.* ]]; then
     ln -sf /home/data/taxonomy/knowledge/phoenix /opt/app-root/src/.local/share/instructlab/taxonomy/knowledge/phoenix
     ln -sf /home/data/taxonomy/knowledge/mtba /opt/app-root/src/.local/share/instructlab/taxonomy/knowledge/mtba
-    ilab diff
 
     # The following is needed for post-processing
     wc -l /usr/share/instructlab/sdg/datasets/skills.jsonl | awk '{print $1}' >skills-num-samples.txt
 
-    $sdg_cmd >sdg.txt 2>&1
+    $sdg_cmd >sdg-stdout.txt 2>sdg-stderr.txt
     rc=$?
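+    # Note: sdg-stderr.txt (not stdout) is what ilab-post-process later parses
+    # for the "Generation took" timings, so both logs are kept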
-    xz sdg.txt
+    for file in sdg-stdout.txt sdg-stderr.txt; do
+        xz $file &
+    done
+    wait
 else
     echo "Workflow [$workflow] not supported, exiting"
     exit 1
 fi
 
-if [[ $worklfow == "train+eval" ]]; then
-    if [[ ! -e last_checkpointed_model.txt ]]; then
-        echo "Error: could not find file: last_checkpointed_model.txt"
-        exit 1
-    fi
-    model_path=`cat last_checkpointed_model.txt`
-    if [ ! -e $model_path ]; then
-        echo "Could not find directory for checkpointed model: $model_path"
-        exit 1
-    fi
-    eval_cmd="ilab model evaluate"
-    eval_cmd+=" --benchmark mmlu_branch"
-    eval_cmd+=" --model $model_path"
-    eval_cmd+=" --tasks-dir tests/testdata/mmlu_branch/"
-    eval_cmd+=" --base-model $model"
-    echo "eval_cmd: $eval_cmd"
-    echo "Evaluating:"
-    $eval_cmd >eval.txt 2>&1
-
-else
-    echo "Deleting any snapshots because eval not happening after training"
-    rm -rf ht_format/*
-fi
-
-rm -rf ht_format/*
-
 exit $rc
diff --git a/ilab-post-process b/ilab-post-process
index 8906e7a..455bc34 100755
--- a/ilab-post-process
+++ b/ilab-post-process
@@ -9,6 +9,7 @@ import re
 import copy
 import math
 import json
+import yaml
 import argparse
 import glob
 from datetime import datetime
@@ -29,29 +30,17 @@ else:
 from toolbox.metrics import log_sample
 from toolbox.metrics import finish_samples
 
+params = {}
+
 class t_global(object):
     args = None
 
 def process_options():
     parser = argparse.ArgumentParser(description = 'Post process raw benchmark data into Common Data Model output')
 
-    # known options:
-    #--workflow)
-    #--save-samples)
-    #--max-seq-len)
-    #--model)
-    #--data-path)
-    #--nnodes)
-    #--gpus)
-    #--num-epochs)
-    #--effective-batch-size)
-    #--deepspeed-cpu-offload-optimizer)
-    #--deepspeed-cpu-offload-optimizer-pin-memory)
-    #--train-until)
-
     parser.add_argument('--workflow',
                         dest = 'workflow',
-                        help = 'sdg, train, or eval',
+                        help = 'sdg or train',
                         default = ""
                         )
 
@@ -59,13 +48,55 @@ def process_options():
     return()
 
+def traverse_yaml(o, prepend):
+    for key in o:
+        if isinstance(o[key], dict):
+            traverse_yaml(o[key], key)
+        else:
+            val_str = "%s" % (o[key])
+            if prepend == "":
+                params_name = "%s" % (key)
+            else:
+                params_name = "%s.%s" % (prepend,key)
+            print("%s: %s" % (params_name, val_str))
+            params[params_name] = val_str
+    return()
+
+
+def process_config_yaml():
+    with open("config.yaml") as stream:
+        try:
+            c=yaml.safe_load(stream)
+            traverse_yaml(c, "")
+        except yaml.YAMLError as exc:
+            print(exc)
+    return()
+
+
 def main():
     process_options()
-
     if t_global.args.workflow == '':
         print('workflow was not defined, exiting')
         return(1)
 
+    # While a crucible run contains parameters for a benchmark, it is possible
+    # that many parameters that are used are omitted by the params input. The
+    # post-processing script can add parameters if it can detect them from the
+    # generated benchmark data.
+
+    # Note: the cmdline options passed to this script are the crucible benchmark
+    # params
+
+    # First process what the params are from the config.yaml that is generated
+    # via 'ilab config init'
+    ## process_config_yaml()
+
+    # Next, find any matching params from cmdline opts and override any matching
+    # one with the new value.
+    #add_cmdline_opts()
+
+    print(params)
+
     # In any benchmark post-process script, the metrics generated need to be attributed to a
     # time-period (AKA benchmark-phase). The period which is used to report an official
     # result for the benchmark is the 'measurement' period. Other periods that may exist
@@ -80,39 +111,61 @@ def main():
 
     metric_files = []
 
-    period = { 'name': 'measurement', 'metric-files': [] }
-    file_id = 'global0'
-
-    desc = {'source' : 'ilab', 'class': 'throughput'}
-    names = {}
     if t_global.args.workflow == 'train':
-        desc['type'] = 'train-samples-sec';
-        iter_sample['primary-metric'] = 'train-samples-sec'
-        filename = 'training_params_and_metrics_global0.jsonl.xz'
-        with lzma.open(filename, 'rt') as file:
-            for line in file:
-                d = json.loads(line)
-                # file contents to parse (per line):
-                #{"epoch": 0, "step": 1, "rank": 0,
-                # "loss": 0.18146394193172455,
-                # "overall_throughput": 3.5244029279710176,
-                # "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
-                # "cuda_malloc_retries": 0,
-                # "num_loss_counted_tokens": 4940, "batch_size": 14,
-                # "total_loss": 0.4069821238517761, "gradnorm": null,
-                # "weight_norm": 557.9681396484375,
-                # "timestamp": "2024-07-18T22:46:41.628932"}
-                if 'epoch' in d.keys():
-                    dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%X.%f')
-                    ts = math.floor(dt.timestamp() * 1000)
-                    sample = {'end': ts, 'value': d['overall_throughput']}
-                    log_sample(file_id, desc, names, sample)
+        first_ts = None
+        last_ts = None
+        for phase in [1, 2]:
+            period = { 'name': 'phase' + str(phase), 'metric-files': [] }
+            file_id = 'phase' + str(phase)
+            desc = {'source' : 'ilab', 'class': 'throughput'}
+            names = {}
+            desc['type'] = 'train-samples-sec'
+            filename = 'e2e/phase' + str(phase) + '/checkpoints/training_params_and_metrics_global0.jsonl.xz'
+            print('Opening ' + filename)
+            with lzma.open(filename, 'rt') as file:
+                for line in file:
+                    d = json.loads(line)
+                    # file contents to parse (per line):
+                    #{"epoch": 0, "step": 1, "rank": 0,
+                    # "loss": 0.18146394193172455,
+                    # "overall_throughput": 3.5244029279710176,
+                    # "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
+                    # "cuda_malloc_retries": 0,
+                    # "num_loss_counted_tokens": 4940, "batch_size": 14,
+                    # "total_loss": 0.4069821238517761, "gradnorm": null,
+                    # "weight_norm": 557.9681396484375,
+                    # "timestamp": "2024-07-18T22:46:41.628932"}
+                    if 'epoch' in d.keys():
+                        dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%X.%f')
+                        ts = math.floor(dt.timestamp() * 1000)
+                        if first_ts is None:
+                            first_ts = ts
+                        sample = {'end': ts, 'value': d['overall_throughput']}
+                        log_sample(file_id, desc, names, sample)
+                        last_ts = ts
+            metric_file_name = finish_samples()
+            period['metric-files'].append(metric_file_name)
+            iter_sample['periods'].append(period)
+
+        # Now create the primary metric and the primary-period
+        iter_sample['primary-metric'] = 'actual-train-seconds'
+        period = { 'name': 'measurement', 'metric-files': [] }
+        file_id = 'measurement'
+        desc = {'source' : 'ilab', 'class': 'count', 'type': 'actual-train-seconds'}
+        names = {}
+        sample = {'begin': first_ts, 'end': last_ts, 'value': (last_ts - first_ts) / 1000}
+        log_sample(file_id, desc, names, sample)
         metric_file_name = finish_samples()
         period['metric-files'].append(metric_file_name)
         iter_sample['periods'].append(period)
+
     elif t_global.args.workflow == 'sdg':
         print('sdg')
-        desc['type'] = 'sdg-samples-sec';
+        iter_sample['primary-metric'] = 'sdg-samples-sec'
+        period = { 'name': 'measurement', 'metric-files': [] }
+        file_id = 'measurement'
+        desc = {'source' : 'ilab', 'class': 'throughput', 'type': 'sdg-samples-sec'}
+        names = {}
 
         num_skills = open("skills-num-samples.txt").readline().rstrip()
         print('skills num samples')
@@ -125,10 +179,8 @@ def main():
         # Skills are loaded by detecting the "dataset loaded" line with N samples,
         # where N = skills-num-samples.txt (which comes from counting lines in
         # /usr/share/instructlab/sdg/datasets/skills.jsonl during the benchmark
-        with lzma.open("sdg.txt.xz", 'rt') as file:
+        with lzma.open("sdg-stderr.txt.xz", 'rt') as file:
             for line in file:
-                #print('line')
-                #print(line)
                 if skipped_skills:
                     if reggy := re.search(r'^INFO\s(\d+-\d+-\d+\s\d+:\d+:\d+\d+,\d+).+Generation\stook\s(\d+\.\d+)s', line):
                         print('found generation line')
@@ -168,3 +220,4 @@ def main():
 
 if __name__ == "__main__":
     exit(main())
+
diff --git a/multiplex.json b/multiplex.json
index 7cb4d56..4d93a21 100644
--- a/multiplex.json
+++ b/multiplex.json
@@ -5,34 +5,54 @@
         "generic_string" : {
             "description" : "all types of strings",
             "args" : [
-                "model", "data-path"
+                "sdg-model",
+                "train-model-path",
+                "train-profile",
+                "train-data_path",
+                "train-model_path",
+                "train-phased-mt-bench-judge"
             ],
             "vals" : ".+"
         },
         "integer_ge_zero" : {
             "description" : "a whole number >= 0",
-            "args" : [ "batch-size" ],
+            "args" : [
+                "sdg-num-cpus",
+                "sdg-gpus",
+                "sdg-batch-size",
+                "train-model-path",
+                "train-profile",
+                "train-phased-phase1-data",
+                "train-phased-phase1-num-epochs",
+                "train-phased-phase1-samples-per-save",
+                "train-phased-phase1-trim-data-samples",
+                "train-phased-phase2-data",
+                "train-phased-phase2-num-epochs",
+                "train-phased-phase2-samples-per-save",
+                "train-phased-phase2-trim-data-samples",
+                "train-phased-mt-bench-judge"
+            ],
             "vals" : "[0-9]+"
         },
-        "integer_gt_zero" : {
-            "description" : "a whole number > 0",
-            "args" : [ "num-cpus", "nnodes", "gpus", "num-epochs", "effective-batch-size", "max-seq-length", "save-samples" ],
-            "vals" : "[1-9][0-9]*"
-        },
         "workflows" : {
             "description" : "what type of work to do",
             "args" : [ "workflow" ],
-            "vals" : "train|train+eval|eval|sdg|sdg+train|sdg+train+eval"
+            "vals" : "train|sdg"
         },
         "train_until" : {
             "description" : "checkpoint:N, or runavg:N, or complete (N is a positive int)",
-            "args" : [ "train-until" ],
+            "args" : [ "train.train-until" ],
             "vals" : "((checkpoint|runavg)\\:([1-9][0-9]*)|complete)"
         },
-        "bool_0_1" : {
-            "description" : "boolean as 0 (false) or 1 (true)",
-            "args" : [ "deepspeed-cpu-offload-optimizer", "deepspeed-cpu-offload-optimizer-pin-memory" ],
-            "vals" : "[0-1]"
+        "bool" : {
+            "description" : "boolean as True or False",
+            "args" : [
+                "train.checkpoint_at_epoch",
+                "train.deepspeed_cpu_offload_optimizer",
+                "train.is_padding_free",
+                "additional_args.deepspeed_cpu_offload_optimizer_pin_memory"
+            ],
+            "vals" : "[True|False]"
         }
     }
 }
diff --git a/rickshaw.json b/rickshaw.json
index 2ebd1d5..5cb9033 100644
--- a/rickshaw.json
+++ b/rickshaw.json
@@ -8,10 +8,18 @@
     },
     "client" : {
         "files-from-controller": [
+            {
+                "src": "%bench-dir%/random-but-same",
+                "dest": "/tmp/"
+            },
             {
                 "src": "%bench-dir%/ilab-get-runtime",
                 "dest": "/usr/bin/"
             },
+            {
+                "src": "%bench-dir%/ilab-base",
+                "dest": "/usr/bin/"
+            },
             {
                 "src": "%bench-dir%/ilab-client",
                 "dest": "/usr/bin/"

From 64ee8200a2662b0851e0d3ee249f1400451d7c57 Mon Sep 17 00:00:00 2001
From: Andrew Theurer
Date: Tue, 29 Oct 2024 10:29:54 -0400
Subject: [PATCH 2/2] more cleanup

---
 ilab-client       | 14 +------
 ilab-post-process | 44 -----------------------
 multiplex.json    | 90 +++++++++++++++++++----------------------------
 3 files changed, 37 insertions(+), 111 deletions(-)

diff --git a/ilab-client b/ilab-client
index daad7ee..65039ab 100755
--- a/ilab-client
+++ b/ilab-client
@@ -7,7 +7,6 @@ exec 2>&1
 
 . /usr/bin/ilab-base || { echo "/usr/bin/ilab-base not found"; exit 1; }
 
 workflow="train"
-train_until="complete" # checkpoint:N, runavg:N, or complete
 nnodes=1
 train_profile="L40_x4"
 train_model_path="/home/models/granite-7b-redhat-lab"
@@ -21,7 +20,7 @@ train_phased_phase2_num_epochs=2
 train_phased_phase2_samples_per_save=""
 train_phased_phase2_trim_samples="15000"
 
-ilab_version=`ilab --version | awk '{print $3}'`
+ilab --version | awk '{print $3}' >ilab-version.txt
 # 0.19.1 = rhelai 1.2
 # 0.18.3 = rhelai 1.1.0
 
@@ -112,8 +111,6 @@ while true; do
         --train-phased-phase2-trim-data-samples)
             train_phased_phase2_trim_samples=$val
             ;;
-
-
         --)
             break
             ;;
@@ -135,11 +132,4 @@ echo "gpus: $gpus"
 
 if [[ $workflow =~ ^train.* ]]; then
     ilab model train --help >ilab-model-train-help.txt
-    echo "train_until: $train_until"
-    #echo "num_epochs: $num_epochs"
-    #echo "save_sample: $save_samples"
-    #echo "max_seq_length: $max_seq_length"
-    #echo "effective_batch_size: $effective_batch_size"
-    echo "cpu_offload_optimizer: $cpu_offload_optimizer"
-    echo "cpu_offload_pin_memory: $cpu_offload_pin_memory"
     mkdir -p e2e
@@ -181,22 +173,18 @@ if [[ $workflow =~ ^train.* ]]; then
     fi
 
     train_cmd="ilab model train --skip-user-confirm --strategy lab-multiphase"
-
     train_cmd+=" --phased-base-dir e2e"
     train_cmd+=" --model-path $train_model_path"
-
     train_cmd+=" --phased-phase1-data $train_phased_phase1_data"
     train_cmd+=" --phased-phase1-num-epochs $train_phased_phase1_num_epochs"
     if [[ ! -z "$train_phased_phase1_samples_per_save" ]]; then
         train_cmd+=" --phased-phase1-samples-per-save $train_phased_phase1_samples_per_save"
     fi
-
     train_cmd+=" --phased-phase2-data $train_phased_phase2_data"
     train_cmd+=" --phased-phase2-num-epochs $train_phased_phase2_num_epochs"
    if [[ ! -z "$train_phased_phase2_samples_per_save" ]]; then
         train_cmd+=" --phased-phase2-samples-per-save $train_phased_phase2_samples_per_save"
     fi
-
     train_cmd+=" --phased-mt-bench-judge $train_phased_mt_bench_judge"
 
     echo "train cmd:"
diff --git a/ilab-post-process b/ilab-post-process
index 455bc34..39ec5c7 100755
--- a/ilab-post-process
+++ b/ilab-post-process
@@ -48,55 +48,12 @@ def process_options():
     return()
 
-def traverse_yaml(o, prepend):
-    for key in o:
-        if isinstance(o[key], dict):
-            traverse_yaml(o[key], key)
-        else:
-            val_str = "%s" % (o[key])
-            if prepend == "":
-                params_name = "%s" % (key)
-            else:
-                params_name = "%s.%s" % (prepend,key)
-            print("%s: %s" % (params_name, val_str))
-            params[params_name] = val_str
-    return()
-
-
-def process_config_yaml():
-    with open("config.yaml") as stream:
-        try:
-            c=yaml.safe_load(stream)
-            traverse_yaml(c, "")
-        except yaml.YAMLError as exc:
-            print(exc)
-    return()
-
-
 def main():
     process_options()
     if t_global.args.workflow == '':
         print('workflow was not defined, exiting')
         return(1)
 
-    # While a crucible run contains parameters for a benchmark, it is possible
-    # that many parameters that are used are omitted by the params input. The
-    # post-processing script can add parameters if it can detect them from the
-    # generated benchmark data.
-
-    # Note: the cmdline options passed to this script are the crucible benchmark
-    # params
-
-    # First process what the params are from the config.yaml that is generated
-    # via 'ilab config init'
-    ## process_config_yaml()
-
-    # Next, find any matching params from cmdline opts and override any matching
-    # one with the new value.
-    #add_cmdline_opts()
-
-    print(params)
-
     # In any benchmark post-process script, the metrics generated need to be attributed to a
     # time-period (AKA benchmark-phase). The period which is used to report an official
     # result for the benchmark is the 'measurement' period. Other periods that may exist
@@ -220,4 +177,3 @@ def main():
 
 if __name__ == "__main__":
     exit(main())
-
diff --git a/multiplex.json b/multiplex.json
index 4d93a21..242285b 100644
--- a/multiplex.json
+++ b/multiplex.json
@@ -1,58 +1,38 @@
 {
-    "presets": {
+  "presets": {
+  },
+  "validations": {
+    "generic_string" : {
+      "description" : "all types of strings",
+      "args" : [
+        "sdg-model",
+        "train-model-path",
+        "train-profile",
+        "train-phased-phase1-data",
+        "train-phased-phase2-data",
+        "train-phased-mt-bench-judge"
+      ],
+      "vals" : ".+"
     },
-    "validations": {
-        "generic_string" : {
-            "description" : "all types of strings",
-            "args" : [
-                "sdg-model",
-                "train-model-path",
-                "train-profile",
-                "train-data_path",
-                "train-model_path",
-                "train-phased-mt-bench-judge"
-            ],
-            "vals" : ".+"
-        },
-        "integer_ge_zero" : {
-            "description" : "a whole number >= 0",
-            "args" : [
-                "sdg-num-cpus",
-                "sdg-gpus",
-                "sdg-batch-size",
-                "train-model-path",
-                "train-profile",
-                "train-phased-phase1-data",
-                "train-phased-phase1-num-epochs",
-                "train-phased-phase1-samples-per-save",
-                "train-phased-phase1-trim-data-samples",
-                "train-phased-phase2-data",
-                "train-phased-phase2-num-epochs",
-                "train-phased-phase2-samples-per-save",
-                "train-phased-phase2-trim-data-samples",
-                "train-phased-mt-bench-judge"
-            ],
-            "vals" : "[0-9]+"
-        },
-        "workflows" : {
-            "description" : "what type of work to do",
-            "args" : [ "workflow" ],
-            "vals" : "train|sdg"
-        },
-        "train_until" : {
-            "description" : "checkpoint:N, or runavg:N, or complete (N is a positive int)",
-            "args" : [ "train.train-until" ],
-            "vals" : "((checkpoint|runavg)\\:([1-9][0-9]*)|complete)"
-        },
-        "bool" : {
-            "description" : "boolean as True or False",
-            "args" : [
-                "train.checkpoint_at_epoch",
-                "train.deepspeed_cpu_offload_optimizer",
-                "train.is_padding_free",
-                "additional_args.deepspeed_cpu_offload_optimizer_pin_memory"
-            ],
-            "vals" : "[True|False]"
-        }
+    "integer_ge_zero" : {
+      "description" : "a whole number >= 0",
+      "args" : [
+        "sdg-num-cpus",
+        "sdg-gpus",
+        "sdg-batch-size",
+        "train-phased-phase1-num-epochs",
+        "train-phased-phase2-num-epochs",
+        "train-phased-phase1-samples-per-save",
+        "train-phased-phase2-samples-per-save",
+        "train-phased-phase1-trim-data-samples",
+        "train-phased-phase2-trim-data-samples"
+      ],
+      "vals" : "[0-9]+"
+    },
+    "workflows" : {
+      "description" : "what type of work to do",
+      "args" : [ "workflow" ],
+      "vals" : "train|sdg"
+    }
+  }
 }