forked from jssmith/ray-scheduler-prototype
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ray_prototype_sweep.sh
executable file
·149 lines (123 loc) · 6.98 KB
/
ray_prototype_sweep.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/bin/bash
# part 1: run real Ray with a sweep over workload type, number of tasks, and scale
#         (these all need to be run with 1 worker, since they are only for generating a trace)
# part 2: generate a trace for each of the previous runs
# part 3: sweep over the simulator (sweep over schedulers, number of nodes,
#         number of workers per node, and data transfer cost)
declare -a SCHEDULERS=("trivial" "location_aware" "transfer_aware" "delay" "trivial_local" "trivial_threshold_local" "location_aware_local" "transfer_aware_local" "transfer_aware_threshold_local")
# Sweep ranges for the simulator portion (part 3).
min_num_node_range=1
max_num_node_range=10
min_num_worker_range=2
max_num_worker_range=5
# Proportionate to number of tasks.
min_task_factor_range=1
max_task_factor_range=6
# Proportionate to object size, approximately.
min_scale_factor_range=1
max_scale_factor_range=5
now=$(date +"%m_%d_%Y_%H_%M_%S")
# Archive previous results (timestamp suffix) instead of deleting them.
sim_sweep_v_output_file="sim_sweep_verbose.csv"
# BUG FIX: original tested the misspelled, unset $sim_sweep_v_out_file; with the
# variable empty, `[ -f ]` is a one-argument test that is ALWAYS true, so the mv
# ran unconditionally and errored whenever the file did not exist.
if [ -f "$sim_sweep_v_output_file" ] ; then
    mv "$sim_sweep_v_output_file" "${sim_sweep_v_output_file}_${now}.csv"
fi
sim_sweep_output_file="sim_sweep.csv"
# BUG FIX: same misspelling ($sim_sweep_out_file) as above.
if [ -f "$sim_sweep_output_file" ] ; then
    mv "$sim_sweep_output_file" "${sim_sweep_output_file}_${now}.csv"
fi
# CSV header rows. (The original piped these through `paste -sd ','`, which is a
# no-op on a single input line and has been dropped.)
echo "workload,task_factor,scale,total_num_tasks,total_task_durations,total_num_objects,total_object_sizes,norm_critical_path,num_nodes,worker_per_node,total_workers,data_transfer_cost,scheduler,total job completion" >> "$sim_sweep_v_output_file"
echo "workload,total_num_tasks,total_task_durations,total_num_objects,total_object_sizes,norm_critical_path,num_nodes,worker_per_node,data_transfer_cost,scheduler,total job completion" >> "$sim_sweep_output_file"
# Absolute directory containing this script; traces are written beneath it.
dot="$(cd "$(dirname "$0")"; pwd)"
mkdir -p "$dot/traces/sweep/"
#******************external real Ray sweep******************************
###############################rnn########################################
#for t in `seq $min_task_factor_range $max_task_factor_range` #number of tasks factor (in the case of rnn this is num_steps_
#do
# for s in `seq $min_scale_factor_range $max_scale_factor_range` #scale factor (object sizes)
# do
# rm -r /tmp/raylogs/*
# echo running ray_rnn with 1 workers $t steps \(tasks factor\) and scale \(object size factor\) $((s*5))
# python $dot/workloads/rnn/rnn_ray_6_layers.py -w 1 -s $s -n $t
# #generate trace
# echo generating trace of ray_rnn with 1 worker $t steps \(tasks factor\) and scale \(object size factor\) $((s*5))
# python build_trace.py /tmp/raylogs
# mv trace.json $dot/traces/sweep/trace_rnn_t${t}_s${s}.json
# done
#done
#
##############################rl-pong########################################
#external real Ray sweep
#for t in `seq $min_task_factor_range $max_task_factor_range` #number of tasks factor (in the case of rnn this is num_steps_
#do
# for s in `seq $min_scale_factor_range $max_scale_factor_range` #scale factor (object sizes)
# do
# rm -r /tmp/raylogs/*
#	echo running rl-pong with 10 workers $((t*10)) iterations \(tasks factor\) and scale \(object size factor\) $((s*5))
# python $dot/workloads/rl_pong/driver.py --iterations $((t*10)) --workers 10
# #generate trace
# echo generating trace of rl-pong with 10 workers $((t*10)) iterations \(tasks factor\) and scale \(object size factor\) $((s*5))
# python build_trace.py /tmp/raylogs
# mv trace.json $dot/traces/sweep/trace_rl-pong_t${t}_s${s}.json
# done
#done
##############################alexnet########################################
#for t in `seq $min_task_factor_range $max_task_factor_range` #number of tasks factor (in the case of rnn this is num_steps_
#do
# for s in `seq 1 8` #scale factor (object sizes)
# do
# rm -r /tmp/raylogs/*
# echo running alexnet with 1 workers $t iterations \(tasks factor\) and scale \(object size factor\) $s
# python $dot/workloads/alexnet/driver.py --iterations $t --num_batches $s
# #generate trace
# echo generating trace of alexnet with $t iterations \(tasks factor\) and scale \(object size factor\) $s
# python build_trace.py /tmp/raylogs
# mv trace.json $dot/traces/sweep/trace_alexnet_t${t}_s${s}.json
# done
#done
##############################matrix multiplication########################################
#for t in `seq $min_task_factor_range $max_task_factor_range` #number of tasks factor (in the case of matrix multiplication this is the size of the matrix)
#do
# for s in `seq 1 3` #scale factor (object sizes)
# do
# rm -r /tmp/raylogs/*
#	echo running matrix multiplication with 2 workers, matrix size $((t*100)) \(tasks factor\) and block size \(object size factor\) $((s*10))
# python $dot/workloads/mat_mult.py --size $((t*100)) --block-size $((s*10))
# #generate trace
#	echo generating trace of matrix multiplication with matrix size $((t*100)) \(tasks factor\) and block size \(object size factor\) $((s*10))
# python build_trace.py /tmp/raylogs
# mv trace.json $dot/traces/sweep/trace_mat-mult_t${t}_s${s}.json
# done
#done
#***************internal simulator sweep**********************************
# The following constants are fixed according to EC2's m4.2xlarge.
# 1 gigabit/node network bandwidth.
dtc_log=-8
# 4 physical cores.
w=4
# 1ms database message delay.
db_delay=0.001
# Data transfer cost depends only on dtc_log: compute it once, not once per
# scheduler iteration (the original forked awk inside the innermost loop).
dtc=$(awk "BEGIN{print 10 ^ $dtc_log}")
for filename in "$dot"/traces/sweep/*.json; do
    # Traces are named trace_<workload>_t<n>_s<n>.json; parse the basename so
    # underscores in $dot cannot shift the cut fields (the original cut the
    # full path).
    trace_base=$(basename "$filename")
    workload_name=$(echo "$trace_base" | cut -d_ -f2)
    t=$(echo "$trace_base" | cut -d_ -f3)
    s=$(echo "$trace_base" | cut -d_ -f4 | cut -d. -f1)
    for n in $(seq "$min_num_node_range" "$max_num_node_range") #number of nodes
    do
        for sched in "${SCHEDULERS[@]}"
        do
            echo "running ray-scheduler-prototype on $filename trace with $sched scheduling policy, $n nodes, $w workers per node, $dtc data transfer cost, and $db_delay db delay"
            # replaytrace.py prints a colon-separated summary as its last line:
            # time:num_tasks:task_durations:num_objects:object_sizes:critical_path
            sim_result=$(python replaytrace.py "$n" "$w" "$dtc" "$db_delay" "$sched" false "$filename" 2>&1 | tail -n1)
            sim_time_result=$(echo "$sim_result" | cut -d: -f1)
            total_tasks_num=$(echo "$sim_result" | cut -d: -f2)
            total_task_durations=$(echo "$sim_result" | cut -d: -f3)
            total_num_objects=$(echo "$sim_result" | cut -d: -f4)
            total_object_sizes=$(echo "$sim_result" | cut -d: -f5)
            norm_critical_path=$(echo "$sim_result" | cut -d: -f6)
            # One CSV row per run. (The original `paste -sd ','` was a no-op on
            # a single line and has been dropped.)
            echo "$workload_name, $t, $s, $total_tasks_num, $total_task_durations, $total_num_objects, $total_object_sizes, $norm_critical_path, $n, $w, $(( n * w )), $dtc, $sched, $sim_time_result" >> "$sim_sweep_v_output_file"
            echo "$workload_name, $total_tasks_num, $total_task_durations, $total_num_objects, $total_object_sizes, $norm_critical_path, $n, $w, $dtc, $sched, $sim_time_result" >> "$sim_sweep_output_file"
        done
    done
done
#############################################################################################################
#call plotting script
#python ray_sched_plots.py "$dot/sim_sweep.csv"