From 74897c97ae3d1b4879f01064a7c9ac8145cb0877 Mon Sep 17 00:00:00 2001
From: Andrew Theurer
Date: Fri, 25 Oct 2024 13:41:27 -0400
Subject: [PATCH 1/2] support ilab 1.2 and various fixes

---
 ilab-client       | 349 ++++++++++++++++++++++++----------------------
 ilab-post-process | 145 ++++++++++++++-------
 multiplex.json    |  46 ++++---
 rickshaw.json     |   8 ++
 4 files changed, 323 insertions(+), 225 deletions(-)

diff --git a/ilab-client b/ilab-client
index dbac761..daad7ee 100755
--- a/ilab-client
+++ b/ilab-client
@@ -4,19 +4,51 @@
 exec >ilab-client-stderrout.txt
 exec 2>&1
 
+. /usr/bin/ilab-base || { echo "/usr/bin/ilab-base not found"; exit 1; }
+
 workflow="train"
 train_until="complete" # checkpoint:N, runavg:N, or complete
 nnodes=1
-gpus=1
-num_epochs=1
-save_samples=""
-max_seq_length=""
-effective_batch_size=""
-cpu_offload_optimizer=""
-cpu_offload_pin_memory=""
+train_profile="L40_x4"
+train_model_path="/home/models/granite-7b-redhat-lab"
+train_phased_mt_bench_judge="/root/.cache/instructlab/models/prometheus-8x7b-v2-0"
+train_phased_phase1_data="/home/data/training/jul19-knowledge-26k.jsonl"
+train_phased_phase1_num_epochs=2
+train_phased_phase1_samples_per_save=""
+train_phased_phase1_trim_samples=""
+train_phased_phase2_data="/usr/share/instructlab/sdg/datasets/skills.jsonl"
+train_phased_phase2_num_epochs=2
+train_phased_phase2_samples_per_save=""
+train_phased_phase2_trim_samples="15000"
+
+ilab_version=`ilab --version | awk '{print $3}'`
+# 0.19.1 = rhelai 1.2
+# 0.18.3 = rhelai 1.1.0
+
+pwd=`/bin/pwd`
+pushd /opt/app-root/lib/python3.11/site-packages
+find . -name METADATA | cpio -pdumv $pwd/instructlab-site-packages-metadata
+popd
 
-opts=$(getopt -q -o "" --longoptions "workflow:,num-cpus:,batch-size:,save-samples:,max-seq-len:,model:,data-path:,nnodes:,gpus:,num-epochs:,effective-batch-size:,deepspeed-cpu-offload-optimizer:,deepspeed-cpu-offload-optimizer-pin-memory:,train-until:" -n "getopt.sh" -- "$@");
+longopts=""
+longopts+="workflow:"
+longopts+=",sdg-num-cpus:"
+longopts+=",sdg-batch-size:"
+longopts+=",sdg-model:"
+longopts+=",sdg-gpus:"
+longopts+=",train-profile:"
+longopts+=",train-model-path:"
+longopts+=",train-phased-mt-bench-judge:"
+longopts+=",train-phased-phase1-data:"
+longopts+=",train-phased-phase1-num-epochs:"
+longopts+=",train-phased-phase1-samples-per-save:"
+longopts+=",train-phased-phase1-trim-data-samples:"
+longopts+=",train-phased-phase2-data:"
+longopts+=",train-phased-phase2-num-epochs:"
+longopts+=",train-phased-phase2-samples-per-save:"
+longopts+=",train-phased-phase2-trim-data-samples:"
+
+opts=$(getopt -q -o "" --longoptions "$longopts" -n "getopt.sh" -- "$@");
 if [ $? -ne 0 ]; then
     printf -- "\tUnrecognized option specified\n\n"
     exit 1
@@ -27,58 +59,61 @@ while true; do
     val=$1; shift
     case "$arg" in
         --workflow)
-        workflow="$val"
-        ;;
-
-        # The following options are for SDG
-        --num-cpus)
-        num_cpus="--num-cpus=$val"
-        ;;
-        --batch-size)
-        batch_size="--batch-size=$val"
-        ;;
-
-
-        # The following options are for SDG and training
-        --gpus)
-        gpus="--gpus=$val"
-        ;;
-        --model)
-        model=$val
-        ;;
-
-        # The following options are for training
-        --save-samples)
-        save_samples="--save-samples=$val"
-        ;;
-        --max-seq-len)
-        max_seq_len="--max-seq-len=$val"
-        ;;
-        --data-path)
-        data_path=$val
-        ;;
-        --nnodes)
-        nnodes=$val
-        ;;
-        --num-epochs)
-        num_epochs=$val
-        ;;
-        --effective-batch-size)
-        effective_batch_size="--effective_batch_size $val"
-        ;;
-        --deepspeed-cpu-offload-optimizer)
-        if [ $val == "1" ]; then
-            cpu_offload_optimizer="--deepspeed-cpu-offload-optimizer True"
-        fi
-        ;;
-        --deepspeed-cpu-offload-optimizer-pin-memory)
-        if [ $val == "1" ]; then
-            cpu_offload_pin_memory="--deepspeed-cpu-offload-optimizer-pin-memory True"
-        fi
-        ;;
-        --train-until)
-        train_until=$val
+            workflow="$val"
+            ;;
+
+        # The following options are for sdg
+        --sdg-model)
+            sdg_model=$val
+            ;;
+        --sdg-num-cpus)
+            sdg_num_cpus=" --num-cpus $val"
+            ;;
+        --sdg-gpus)
+            sdg_gpus=" --gpus $val"
+            ;;
+        --sdg-batch-size)
+            sdg_batch_size=" --batch-size $val"
+            ;;
+
+        # The following options are for training
+        --train-profile)
+            train_profile=$val
+            ;;
+        --train-model-path)
+            train_model_path=$val
+            ;;
+        --train-phased-mt-bench-judge)
+            train_phased_mt_bench_judge=$val
+            ;;
+        # For phase1
+        --train-phased-phase1-data)
+            train_phased_phase1_data=$val
             ;;
+        --train-phased-phase1-num-epochs)
+            train_phased_phase1_num_epochs=$val
+            ;;
+        --train-phased-phase1-samples-per-save)
+            train_phased_phase1_samples_per_save=$val
+            ;;
+        --train-phased-phase1-trim-data-samples)
+            train_phased_phase1_trim_samples=$val
+            ;;
+        # For phase2
+        --train-phased-phase2-data)
+            train_phased_phase2_data=$val
+            ;;
+        --train-phased-phase2-num-epochs)
+            train_phased_phase2_num_epochs=$val
+            ;;
+        --train-phased-phase2-samples-per-save)
+            train_phased_phase2_samples_per_save=$val
+            ;;
+        --train-phased-phase2-trim-data-samples)
+            train_phased_phase2_trim_samples=$val
+            ;;
+
+
         --)
             break
             ;;
@@ -88,106 +123,119 @@ while true; do
     esac
 done
 
-ilab config init --non-interactive
-cp /opt/app-root/src/.config/instructlab/config.yaml .
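+# A hypothetical invocation exercising the long options parsed above (the
+# values shown are illustrative, not defaults):
+#
+#   ilab-client --workflow train --train-profile L40_x4 \
+#       --train-model-path /home/models/granite-7b-redhat-lab \
+#       --train-phased-phase1-trim-data-samples 5000
+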
+# Valid values for $train_profile:
+# A100_H100_x2 A100_H100_x4 A100_H100_x8 L40_x4 L40_x8 L4_x8 train_a100x4x8
+ilab config init --non-interactive --train-profile /usr/share/instructlab/training/profiles/$train_profile.yaml
+
+ilab config show >ilab-config-show.yaml
 
 echo "workflow: $workflow"
 echo "nnodes: $nnodes"
 echo "gpus: $gpus"
 
 if [[ $workflow =~ ^train.* ]]; then
+    ilab model train --help >ilab-model-train-help.txt
     echo "train_until: $train_until"
-    echo "num_epochs: $num_epochs"
-    echo "save_sample: $save_samples"
-    echo "max_seq_length: $max_seq_length"
-    echo "effective_batch_size: $effective_batch_size"
+    #echo "num_epochs: $num_epochs"
+    #echo "save_sample: $save_samples"
+    #echo "max_seq_length: $max_seq_length"
+    #echo "effective_batch_size: $effective_batch_size"
     echo "cpu_offload_optimizer: $cpu_offload_optimizer"
     echo "cpu_offload_pin_memory: $cpu_offload_pin_memory"
-    train_cmd="ilab model train\
-    --model-path $model\
-    --data-path $data_path\
-    --nnodes=$nnodes\
-    $gpus\
-    --num-epochs=$num_epochs
-    --ckpt-output-dir .\
-    $save_samples\
-    $max_seq_length\
-    $effective_batch_size\
-    $cpu_offload_optimizer\
-    $cpu_offload_pin_memory"
-
-    echo "/home/models:"
-    ls -la /home/models
-    echo "/home/data:"
-    ls -la /home/data
+    mkdir -p e2e
 
-    echo "train cmd:"
-    echo "$train_cmd"
+    if [[ ! -z "$train_phased_phase1_trim_samples" ]]; then
+        if [[ ! -e /tmp/random-but-same ]]; then
+            exit_error "Could not find /tmp/random-but-same"
+        fi
+        original_num_samples=`wc -l $train_phased_phase1_data | awk '{print $1}'`
+        shuf_cmd="shuf -n $train_phased_phase1_trim_samples --random-source=/tmp/random-but-same $train_phased_phase1_data -o $train_phased_phase1_data.randomly-trimmed"
+        echo "Going to run: $shuf_cmd"
+        $shuf_cmd || exit_error "$shuf_cmd failed"
+        train_phased_phase1_data+=".randomly-trimmed"
+        echo "trimmed data:"
+        if [[ ! -e $train_phased_phase1_data ]]; then
+            exit_error "Could not find $train_phased_phase1_data"
+        fi
+        /bin/ls -l $train_phased_phase1_data
+        trimmed_num_samples=`wc -l $train_phased_phase1_data | awk '{print $1}'`
+        echo "scale=2; $original_num_samples / $trimmed_num_samples" | bc >phase1-trimmed-sample-ratio.txt
+    fi
 
-    count=0
-    rc=0
-
-    if [[ $train_until =~ ^checkpoint:([0-9]+) ]]; then
-        num_samples=${BASH_REMATCH[1]}
-        watch_regex="Model\ssaved\sin\shf_format/(.*)"
-    elif [[ $train_until =~ ^runavg:([0-9]+) ]]; then
-        num_samples=${BASH_REMATCH[1]}
-        watch_regex='RunningAvgSamplesPerSec'
-    else
-        echo "not watching for 'Model saved in' or 'RunningAvgSamplesPerSec' and will train for $num_epochs epochs"
-        num_samples=0
+    if [[ ! -z "$train_phased_phase2_trim_samples" ]]; then
+        if [[ ! -e /tmp/random-but-same ]]; then
+            exit_error "Could not find /tmp/random-but-same"
+        fi
+        original_num_samples=`wc -l $train_phased_phase2_data | awk '{print $1}'`
+        shuf_cmd="shuf -n $train_phased_phase2_trim_samples --random-source=/tmp/random-but-same $train_phased_phase2_data -o $train_phased_phase2_data.randomly-trimmed"
+        echo "Going to run: $shuf_cmd"
+        $shuf_cmd || exit_error "$shuf_cmd failed"
+        train_phased_phase2_data+=".randomly-trimmed"
+        echo "trimmed data:"
+        if [[ ! -e $train_phased_phase2_data ]]; then
+            exit_error "Could not find $train_phased_phase2_data"
+        fi
+        /bin/ls -l $train_phased_phase2_data
+        trimmed_num_samples=`wc -l $train_phased_phase2_data | awk '{print $1}'`
+        echo "scale=2; $original_num_samples / $trimmed_num_samples" | bc >phase2-trimmed-sample-ratio.txt
     fi
 
-    echo "watch_regex: [$watch_regex]"
-    echo "num_samples: [$num_samples]"
+    train_cmd="ilab model train --skip-user-confirm --strategy lab-multiphase"
+
+    train_cmd+=" --phased-base-dir e2e"
+    train_cmd+=" --model-path $train_model_path"
+
+    train_cmd+=" --phased-phase1-data $train_phased_phase1_data"
+    train_cmd+=" --phased-phase1-num-epochs $train_phased_phase1_num_epochs"
+    if [[ ! -z "$train_phased_phase1_samples_per_save" ]]; then
+        train_cmd+=" --phased-phase1-samples-per-save $train_phased_phase1_samples_per_save"
+    fi
+
+    train_cmd+=" --phased-phase2-data $train_phased_phase2_data"
+    train_cmd+=" --phased-phase2-num-epochs $train_phased_phase2_num_epochs"
+    if [[ ! -z "$train_phased_phase2_samples_per_save" ]]; then
+        train_cmd+=" --phased-phase2-samples-per-save $train_phased_phase2_samples_per_save"
+    fi
+
+    train_cmd+=" --phased-mt-bench-judge $train_phased_mt_bench_judge"
+
+    echo "train cmd:"
+    echo "$train_cmd"
+
     echo "Training:"
     snapshot_file=""
-    $train_cmd 2>&1 |
-    {
-        count=0
-        while read line; do
-            echo "$line" >>train.txt
-            if [[ $line =~ $watch_regex ]]; then
-                snapshot=${BASH_REMATCH[1]}
-                ((count++))
-                echo found $line
-                if [[ $train_until =~ "^checkpoint:([0-9]+)" ]]; then
-                    echo "Found snapshot: hf_format/$snapshot"
-                fi
-            fi
-            if [[ $count -ne 0 && $count -ge $num_samples ]]; then
-                echo "Found the checkpoint we needed: hf_format/$snapshot"
-                echo "hf_format/$snapshot" >last_checkpointed_model.txt
-                break
-            fi
-        done
-        echo "count: $count"
-        if [[ $count -lt $num_samples ]]; then
-            echo "did not get the number of $watch_regex samples, so exiting error"
-            exit 1
-        else
-            echo "Exiting without error"
-            exit 0
-        fi
-    }
+    date +%s >train-start-timestamp.txt
+    $train_cmd >train-stdout.txt 2>train-stderr.txt
     rc=$?
     wait
-    for file in training_params_and_metrics_global0.jsonl full_logs_global0.log train.txt; do
-        if [ -e $file ]; then
-            xz $file
-        fi
-    done
+    date +%s >train-stop-timestamp.txt
+
+    # Delete any file larger than 100 MB (usually model checkpoints)
+    find . -size +100M -type f -print | while read line; do /bin/rm -f $line; done
+
+    # Compress the rest
+    find . -type f | while read line; do xz $line; done
+
 elif [[ $workflow =~ ^sdg.* ]]; then
-    sdg_cmd="ilab data generate\
-    --model $model\
-    $num_cpus\
-    $batch_size\
-    $gpus\
-    --output-dir ."
+    ilab data generate --help >ilab-data-generate-help.txt
+    sdg_cmd="ilab data generate"
+    sdg_cmd+=" --model $sdg_model"
+    sdg_cmd+=" $sdg_num_cpus"
+    sdg_cmd+=" $sdg_batch_size"
+    sdg_cmd+=" $sdg_gpus"
+    sdg_cmd+=" --output-dir ."
 
     echo "sdg cmd:"
     echo "$sdg_cmd"
+    echo "running ilab taxonomy diff"
     ilab taxonomy diff
 
     # This assumes LoRa already exists and is not downloaded by InstructLab every time
@@ -201,42 +242,20 @@ elif [[ $workflow =~ ^sdg.* ]]; then
     ln -sf /home/data/taxonomy/knowledge/phoenix /opt/app-root/src/.local/share/instructlab/taxonomy/knowledge/phoenix
     ln -sf /home/data/taxonomy/knowledge/mtba /opt/app-root/src/.local/share/instructlab/taxonomy/knowledge/mtba
-    ilab diff
 
     # The following is needed for post-processing
     wc -l /usr/share/instructlab/sdg/datasets/skills.jsonl | awk '{print $1}' >skills-num-samples.txt
 
-    $sdg_cmd >sdg.txt 2>&1
+    $sdg_cmd >sdg-stdout.txt 2>sdg-stderr.txt
     rc=$?
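+    # Note: sdg-stderr.txt (not stdout) is what ilab-post-process later parses
+    # for the "Generation took" timings, so both logs are kept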
-    xz sdg.txt
+    for file in sdg-stdout.txt sdg-stderr.txt; do
+        xz $file &
+    done
+    wait
 else
     echo "Workflow [$workflow] not supported, exiting"
     exit 1
 fi
 
-if [[ $worklfow == "train+eval" ]]; then
-    if [[ ! -e last_checkpointed_model.txt ]]; then
-        echo "Error: could not find file: last_checkpointed_model.txt"
-        exit 1
-    fi
-    model_path=`cat last_checkpointed_model.txt`
-    if [ ! -e $model_path ]; then
-        echo "Could not find directory for checkpointed model: $model_path"
-        exit 1
-    fi
-    eval_cmd="ilab model evaluate"
-    eval_cmd+=" --benchmark mmlu_branch"
-    eval_cmd+=" --model $model_path"
-    eval_cmd+=" --tasks-dir tests/testdata/mmlu_branch/"
-    eval_cmd+=" --base-model $model"
-    echo "eval_cmd: $eval_cmd"
-    echo "Evaluating:"
-    $eval_cmd >eval.txt 2>&1
-
-else
-    echo "Deleting any snapshots because eval not happening after training"
-    rm -rf ht_format/*
-fi
-
-rm -rf ht_format/*
-
 exit $rc
diff --git a/ilab-post-process b/ilab-post-process
index 8906e7a..455bc34 100755
--- a/ilab-post-process
+++ b/ilab-post-process
@@ -9,6 +9,7 @@ import re
 import copy
 import math
 import json
+import yaml
 import argparse
 import glob
 from datetime import datetime
@@ -29,29 +30,17 @@ else:
 from toolbox.metrics import log_sample
 from toolbox.metrics import finish_samples
 
+params = {}
+
 class t_global(object):
     args = None
 
 def process_options():
     parser = argparse.ArgumentParser(description = 'Post process raw benchmark data into Common Data Model output')
 
-    # known options:
-    #--workflow)
-    #--save-samples)
-    #--max-seq-len)
-    #--model)
-    #--data-path)
-    #--nnodes)
-    #--gpus)
-    #--num-epochs)
-    #--effective-batch-size)
-    #--deepspeed-cpu-offload-optimizer)
-    #--deepspeed-cpu-offload-optimizer-pin-memory)
-    #--train-until)
-
     parser.add_argument('--workflow',
                         dest = 'workflow',
-                        help = 'sdg, train, or eval',
+                        help = 'sdg or train',
                         default = ""
                         )
 
@@ -59,13 +48,55 @@ def process_options():
     return()
 
+def traverse_yaml(o, prepend):
+    for key in o:
+        if isinstance(o[key], dict):
+            traverse_yaml(o[key], key)
+        else:
+            val_str = "%s" % (o[key])
+            if prepend == "":
+                params_name = "%s" % (key)
+            else:
+                params_name = "%s.%s" % (prepend,key)
+            print("%s: %s" % (params_name, val_str))
+            params[params_name] = val_str
+    return()
+
+
+def process_config_yaml():
+    with open("config.yaml") as stream:
+        try:
+            c=yaml.safe_load(stream)
+            traverse_yaml(c, "")
+        except yaml.YAMLError as exc:
+            print(exc)
+    return()
+
+
 def main():
     process_options()
-
     if t_global.args.workflow == '':
         print('workflow was not defined, exiting')
         return(1)
 
+    # While a crucible run contains parameters for a benchmark, it is possible
+    # that many parameters that are used are omitted by the params input. The
+    # post-processing script can add parameters if it can detect them from the
+    # generated benchmark data.
+
+    # Note: the cmdline options passed to this script are the crucible benchmark
+    # params
+
+    # First process what the params are from the config.yaml that is generated
+    # via 'ilab config init'
+    ## process_config_yaml()
+
+    # Next, find any matching params from cmdline opts and override any matching
+    # one with the new value.
+    #add_cmdline_opts()
+
+    print(params)
+
     # In any benchmark post-process script, the metrics generated need to be attributed to a
     # time-period (AKA benchmark-phase). The period which is used to report an official
     # result for the benchmark is the 'measurement' period. Other periods that may exist
@@ -80,39 +111,61 @@ def main():
 
     metric_files = []
 
-    period = { 'name': 'measurement', 'metric-files': [] }
-    file_id = 'global0'
-
-    desc = {'source' : 'ilab', 'class': 'throughput'}
-    names = {}
     if t_global.args.workflow == 'train':
-        desc['type'] = 'train-samples-sec';
-        iter_sample['primary-metric'] = 'train-samples-sec'
-        filename = 'training_params_and_metrics_global0.jsonl.xz'
-        with lzma.open(filename, 'rt') as file:
-            for line in file:
-                d = json.loads(line)
-                # file contents to parse (per line):
-                #{"epoch": 0, "step": 1, "rank": 0,
-                # "loss": 0.18146394193172455,
-                # "overall_throughput": 3.5244029279710176,
-                # "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
-                # "cuda_malloc_retries": 0,
-                # "num_loss_counted_tokens": 4940, "batch_size": 14,
-                # "total_loss": 0.4069821238517761, "gradnorm": null,
-                # "weight_norm": 557.9681396484375,
-                # "timestamp": "2024-07-18T22:46:41.628932"}
-                if 'epoch' in d.keys():
-                    dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%X.%f')
-                    ts = math.floor(dt.timestamp() * 1000)
-                    sample = {'end': ts, 'value': d['overall_throughput']}
-                    log_sample(file_id, desc, names, sample)
+        first_ts = None
+        last_ts = None
+        for phase in [1, 2]:
+            period = { 'name': 'phase' + str(phase), 'metric-files': [] }
+            file_id = 'phase' + str(phase)
+            desc = {'source' : 'ilab', 'class': 'throughput'}
+            names = {}
+            desc['type'] = 'train-samples-sec'
+            filename = 'e2e/phase' + str(phase) + '/checkpoints/training_params_and_metrics_global0.jsonl.xz'
+            print('Opening ' + filename)
+            with lzma.open(filename, 'rt') as file:
+                for line in file:
+                    d = json.loads(line)
+                    # file contents to parse (per line):
+                    #{"epoch": 0, "step": 1, "rank": 0,
+                    # "loss": 0.18146394193172455,
+                    # "overall_throughput": 3.5244029279710176,
+                    # "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
+                    # "cuda_malloc_retries": 0,
+                    # "num_loss_counted_tokens": 4940, "batch_size": 14,
+                    # "total_loss": 0.4069821238517761, "gradnorm": null,
+                    # "weight_norm": 557.9681396484375,
+                    # "timestamp": "2024-07-18T22:46:41.628932"}
+                    if 'epoch' in d.keys():
+                        dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%X.%f')
+                        ts = math.floor(dt.timestamp() * 1000)
+                        if first_ts is None:
+                            first_ts = ts
+                        sample = {'end': ts, 'value': d['overall_throughput']}
+                        log_sample(file_id, desc, names, sample)
+                        last_ts = ts
+            metric_file_name = finish_samples()
+            period['metric-files'].append(metric_file_name)
+            iter_sample['periods'].append(period)
+
+        # Now create the primary metric and the primary-period
+        iter_sample['primary-metric'] = 'actual-train-seconds'
+        period = { 'name': 'measurement', 'metric-files': [] }
+        file_id = 'measurement'
+        desc = {'source' : 'ilab', 'class': 'count', 'type': 'actual-train-seconds'}
+        names = {}
+        sample = {'begin': first_ts, 'end': last_ts, 'value': (last_ts - first_ts) / 1000}
+        log_sample(file_id, desc, names, sample)
         metric_file_name = finish_samples()
         period['metric-files'].append(metric_file_name)
         iter_sample['periods'].append(period)
+
     elif t_global.args.workflow == 'sdg':
         print('sdg')
-        desc['type'] = 'sdg-samples-sec';
+        iter_sample['primary-metric'] = 'sdg-samples-sec'
+        period = { 'name': 'measurement', 'metric-files': [] }
+        file_id = 'measurement'
+        desc = {'source' : 'ilab', 'class': 'throughput', 'type': 'sdg-samples-sec'}
+        names = {}
 
         num_skills = open("skills-num-samples.txt").readline().rstrip()
         print('skills num samples')
@@ -125,10 +179,8 @@ def main():
         # Skills are loaded by detecting the "dataset loaded" line with N samples,
         # where N = skills-num-samples.txt (which comes from counting lines in
         # /usr/share/instructlab/sdg/datasets/skills.jsonl during the benchmark
-        with lzma.open("sdg.txt.xz", 'rt') as file:
+        with lzma.open("sdg-stderr.txt.xz", 'rt') as file:
             for line in file:
-                #print('line')
-                #print(line)
                 if skipped_skills:
                     if reggy := re.search(r'^INFO\s(\d+-\d+-\d+\s\d+:\d+:\d+\d+,\d+).+Generation\stook\s(\d+\.\d+)s', line):
                         print('found generation line')
@@ -168,3 +220,4 @@ def main():
 
 if __name__ == "__main__":
     exit(main())
+
diff --git a/multiplex.json b/multiplex.json
index 7cb4d56..4d93a21 100644
--- a/multiplex.json
+++ b/multiplex.json
@@ -5,34 +5,54 @@
         "generic_string" : {
             "description" : "all types of strings",
             "args" : [
-                "model", "data-path"
+                "sdg-model",
+                "train-model-path",
+                "train-profile",
+                "train-data_path",
+                "train-model_path",
+                "train-phased-mt-bench-judge"
             ],
             "vals" : ".+"
         },
         "integer_ge_zero" : {
             "description" : "a whole number >= 0",
-            "args" : [ "batch-size" ],
+            "args" : [
+                "sdg-num-cpus",
+                "sdg-gpus",
+                "sdg-batch-size",
+                "train-model-path",
+                "train-profile",
+                "train-phased-phase1-data",
+                "train-phased-phase1-num-epochs",
+                "train-phased-phase1-samples-per-save",
+                "train-phased-phase1-trim-data-samples",
+                "train-phased-phase2-data",
+                "train-phased-phase2-num-epochs",
+                "train-phased-phase2-samples-per-save",
+                "train-phased-phase2-trim-data-samples",
+                "train-phased-mt-bench-judge"
+            ],
             "vals" : "[0-9]+"
         },
-        "integer_gt_zero" : {
-            "description" : "a whole number > 0",
-            "args" : [ "num-cpus", "nnodes", "gpus", "num-epochs", "effective-batch-size", "max-seq-length", "save-samples" ],
-            "vals" : "[1-9][0-9]*"
-        },
         "workflows" : {
             "description" : "what type of work to do",
             "args" : [ "workflow" ],
-            "vals" : "train|train+eval|eval|sdg|sdg+train|sdg+train+eval"
+            "vals" : "train|sdg"
         },
         "train_until" : {
             "description" : "checkpoint:N, or runavg:N, or complete (N is a positive int)",
-            "args" : [ "train-until" ],
+            "args" : [ "train.train-until" ],
             "vals" : "((checkpoint|runavg)\\:([1-9][0-9]*)|complete)"
         },
-        "bool_0_1" : {
-            "description" : "boolean as 0 (false) or 1 (true)",
-            "args" : [ "deepspeed-cpu-offload-optimizer", "deepspeed-cpu-offload-optimizer-pin-memory" ],
-            "vals" : "[0-1]"
+        "bool" : {
+            "description" : "boolean as True or False",
+            "args" : [
+                "train.checkpoint_at_epoch",
+                "train.deepspeed_cpu_offload_optimizer",
+                "train.is_padding_free",
+                "additional_args.deepspeed_cpu_offload_optimizer_pin_memory"
+            ],
+            "vals" : "[True|False]"
         }
     }
 }
diff --git a/rickshaw.json b/rickshaw.json
index 2ebd1d5..5cb9033 100644
--- a/rickshaw.json
+++ b/rickshaw.json
@@ -8,10 +8,18 @@
     },
     "client" : {
         "files-from-controller": [
+            {
+                "src": "%bench-dir%/random-but-same",
+                "dest": "/tmp/"
+            },
             {
                 "src": "%bench-dir%/ilab-get-runtime",
                 "dest": "/usr/bin/"
             },
+            {
+                "src": "%bench-dir%/ilab-base",
+                "dest": "/usr/bin/"
+            },
             {
                 "src": "%bench-dir%/ilab-client",
                 "dest": "/usr/bin/"

From 64ee8200a2662b0851e0d3ee249f1400451d7c57 Mon Sep 17 00:00:00 2001
From: Andrew Theurer
Date: Tue, 29 Oct 2024 10:29:54 -0400
Subject: [PATCH 2/2] more cleanup

---
 ilab-client       | 14 +------
 ilab-post-process | 44 -----------------------
 multiplex.json    | 90 +++++++++++++++++++----------------------------
 3 files changed, 37 insertions(+), 111 deletions(-)

diff --git a/ilab-client b/ilab-client
index daad7ee..65039ab 100755
--- a/ilab-client
+++ b/ilab-client
@@ -7,7 +7,6 @@ exec 2>&1
 
 . /usr/bin/ilab-base || { echo "/usr/bin/ilab-base not found"; exit 1; }
 
 workflow="train"
-train_until="complete" # checkpoint:N, runavg:N, or complete
 nnodes=1
 train_profile="L40_x4"
 train_model_path="/home/models/granite-7b-redhat-lab"
@@ -21,7 +20,7 @@ train_phased_phase2_num_epochs=2
 train_phased_phase2_samples_per_save=""
 train_phased_phase2_trim_samples="15000"
 
-ilab_version=`ilab --version | awk '{print $3}'`
+ilab --version | awk '{print $3}' >ilab-version.txt
 # 0.19.1 = rhelai 1.2
 # 0.18.3 = rhelai 1.1.0
 
@@ -112,8 +111,6 @@ while true; do
         --train-phased-phase2-trim-data-samples)
             train_phased_phase2_trim_samples=$val
             ;;
-
-
         --)
             break
             ;;
@@ -135,11 +132,4 @@ echo "gpus: $gpus"
 
 if [[ $workflow =~ ^train.* ]]; then
     ilab model train --help >ilab-model-train-help.txt
-    echo "train_until: $train_until"
-    #echo "num_epochs: $num_epochs"
-    #echo "save_sample: $save_samples"
-    #echo "max_seq_length: $max_seq_length"
-    #echo "effective_batch_size: $effective_batch_size"
-    echo "cpu_offload_optimizer: $cpu_offload_optimizer"
-    echo "cpu_offload_pin_memory: $cpu_offload_pin_memory"
     mkdir -p e2e
@@ -181,22 +173,18 @@ if [[ $workflow =~ ^train.* ]]; then
     fi
 
     train_cmd="ilab model train --skip-user-confirm --strategy lab-multiphase"
-
     train_cmd+=" --phased-base-dir e2e"
     train_cmd+=" --model-path $train_model_path"
-
     train_cmd+=" --phased-phase1-data $train_phased_phase1_data"
     train_cmd+=" --phased-phase1-num-epochs $train_phased_phase1_num_epochs"
     if [[ ! -z "$train_phased_phase1_samples_per_save" ]]; then
         train_cmd+=" --phased-phase1-samples-per-save $train_phased_phase1_samples_per_save"
     fi
-
     train_cmd+=" --phased-phase2-data $train_phased_phase2_data"
     train_cmd+=" --phased-phase2-num-epochs $train_phased_phase2_num_epochs"
    if [[ ! -z "$train_phased_phase2_samples_per_save" ]]; then
         train_cmd+=" --phased-phase2-samples-per-save $train_phased_phase2_samples_per_save"
     fi
-
     train_cmd+=" --phased-mt-bench-judge $train_phased_mt_bench_judge"
 
     echo "train cmd:"
diff --git a/ilab-post-process b/ilab-post-process
index 455bc34..39ec5c7 100755
--- a/ilab-post-process
+++ b/ilab-post-process
@@ -48,55 +48,12 @@ def process_options():
     return()
 
-def traverse_yaml(o, prepend):
-    for key in o:
-        if isinstance(o[key], dict):
-            traverse_yaml(o[key], key)
-        else:
-            val_str = "%s" % (o[key])
-            if prepend == "":
-                params_name = "%s" % (key)
-            else:
-                params_name = "%s.%s" % (prepend,key)
-            print("%s: %s" % (params_name, val_str))
-            params[params_name] = val_str
-    return()
-
-
-def process_config_yaml():
-    with open("config.yaml") as stream:
-        try:
-            c=yaml.safe_load(stream)
-            traverse_yaml(c, "")
-        except yaml.YAMLError as exc:
-            print(exc)
-    return()
-
-
 def main():
     process_options()
     if t_global.args.workflow == '':
         print('workflow was not defined, exiting')
         return(1)
 
-    # While a crucible run contains parameters for a benchmark, it is possible
-    # that many parameters that are used are omitted by the params input. The
-    # post-processing script can add parameters if it can detect them from the
-    # generated benchmark data.
-
-    # Note: the cmdline options passed to this script are the crucible benchmark
-    # params
-
-    # First process what the params are from the config.yaml that is generated
-    # via 'ilab config init'
-    ## process_config_yaml()
-
-    # Next, find any matching params from cmdline opts and override any matching
-    # one with the new value.
-    #add_cmdline_opts()
-
-    print(params)
-
     # In any benchmark post-process script, the metrics generated need to be attributed to a
     # time-period (AKA benchmark-phase). The period which is used to report an official
     # result for the benchmark is the 'measurement' period. Other periods that may exist
@@ -220,4 +177,3 @@ def main():
 
 if __name__ == "__main__":
     exit(main())
-
diff --git a/multiplex.json b/multiplex.json
index 4d93a21..242285b 100644
--- a/multiplex.json
+++ b/multiplex.json
@@ -1,58 +1,38 @@
 {
-    "presets": {
+  "presets": {
+  },
+  "validations": {
+    "generic_string" : {
+      "description" : "all types of strings",
+      "args" : [
+        "sdg-model",
+        "train-model-path",
+        "train-profile",
+        "train-phased-phase1-data",
+        "train-phased-phase2-data",
+        "train-phased-mt-bench-judge"
+      ],
+      "vals" : ".+"
     },
-    "validations": {
-        "generic_string" : {
-            "description" : "all types of strings",
-            "args" : [
-                "sdg-model",
-                "train-model-path",
-                "train-profile",
-                "train-data_path",
-                "train-model_path",
-                "train-phased-mt-bench-judge"
-            ],
-            "vals" : ".+"
-        },
-        "integer_ge_zero" : {
-            "description" : "a whole number >= 0",
-            "args" : [
-                "sdg-num-cpus",
-                "sdg-gpus",
-                "sdg-batch-size",
-                "train-model-path",
-                "train-profile",
-                "train-phased-phase1-data",
-                "train-phased-phase1-num-epochs",
-                "train-phased-phase1-samples-per-save",
-                "train-phased-phase1-trim-data-samples",
-                "train-phased-phase2-data",
-                "train-phased-phase2-num-epochs",
-                "train-phased-phase2-samples-per-save",
-                "train-phased-phase2-trim-data-samples",
-                "train-phased-mt-bench-judge"
-            ],
-            "vals" : "[0-9]+"
-        },
-        "workflows" : {
-            "description" : "what type of work to do",
-            "args" : [ "workflow" ],
-            "vals" : "train|sdg"
-        },
-        "train_until" : {
-            "description" : "checkpoint:N, or runavg:N, or complete (N is a positive int)",
-            "args" : [ "train.train-until" ],
-            "vals" : "((checkpoint|runavg)\\:([1-9][0-9]*)|complete)"
-        },
-        "bool" : {
-            "description" : "boolean as True or False",
-            "args" : [
-                "train.checkpoint_at_epoch",
-                "train.deepspeed_cpu_offload_optimizer",
-                "train.is_padding_free",
-                "additional_args.deepspeed_cpu_offload_optimizer_pin_memory"
-            ],
-            "vals" : "[True|False]"
-        }
+    "integer_ge_zero" : {
+      "description" : "a whole number >= 0",
+      "args" : [
+        "sdg-num-cpus",
+        "sdg-gpus",
+        "sdg-batch-size",
+        "train-phased-phase1-num-epochs",
+        "train-phased-phase2-num-epochs",
+        "train-phased-phase1-samples-per-save",
+        "train-phased-phase2-samples-per-save",
+        "train-phased-phase1-trim-data-samples",
+        "train-phased-phase2-trim-data-samples"
+      ],
+      "vals" : "[0-9]+"
+    },
+    "workflows" : {
+      "description" : "what type of work to do",
+      "args" : [ "workflow" ],
+      "vals" : "train|sdg"
+    }
+  }
 }