-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathrun_exps.sh
executable file
·250 lines (220 loc) · 9.03 KB
/
run_exps.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
#!/usr/bin/env bash
#setup the python path
cwd=`pwd`
PY=`which python`
#first cmd line parameter is the dataset name
dataset=$1
#the path to the project
export MAIN_PATH=/home/user/REFL
# the path to the dataset, note $dataset, the dataset name passed as argument to script
export DATA_PATH=/home/user/REFL/dataset/data/${dataset}
#the path to the conda envirnoment
export CONDA_ENV=/home/user/anaconda3/envs/refl
#the path to the conda source script
export CONDA_PATH=/home/user/anaconda3/
#Set WANDB for logging the experiments results or do wandb login from terminal
export WANDB_API_KEY=""
# The entity or team used for WANDB logging, should be set correctly, typically should be set your WANDB userID
export WANDB_ENTITY=""
#RESET LD Library path
export LD_LIBRARY_PATH=""
#set the path to the config file, use the files with _exp suffix
cd core/evals
config_dir=`ls -1 configs/$dataset/*exp.*`
#other parameters is related to the server IPs and number of GPUs to utilize
#Set the list of IP addresses to run the experiment, each server will run a single experiment in a round robin fashion
declare -a IPS=('127.0.0.1')
#declare -a IPS=('10.0.0.1' '10.0.0.2' '10.0.0.3' '10.0.0.4')
#Set the list with the number of GPUs to utilize on each server
declare -a GPUS=("1")
#declare -a GPUS=("4" "4" "4" "4")
#number of servers
SERVERS=4
#Tag to identify the experiment on WANDB
tags="safa" #"r-safa" #"badstale" #"avail" #"central" #"motive2" "motive1" #"safa1" #"avail" #"stale" # #"avail" #"safa" "r-safa" "delta"
export TAGS=$tags
#---- Setting the main experimental parameters -----
#number of FL rounds
epochs=1000
#number of local training epochs to run on each client
steps="1"
#number of workers per round
workers="10" #"10 50 100"
#the preset deadline for each round
deadlines="100" # 300 for reddit and stackoverflow
# the aggregation algorithm, we use yogi for all benchmarks except for CIFAR10 for which we use FedAvg
aggregators="yogi" #"fedavg prox yogi"
#target number of clients to complete the round (80% as per Google recommendation)
targetratios="0.8" #"0.1" for safa #"0.1 0.3" # 0.8 is default, 0 means no round failures
#total number of clients in the experiment: 0: use the benchmarks default number of clients, otherwise set as needed (needs to be set for CIFAR10)
clients=0 #"0" "1000" "3000"
#the data partitioning method: -1 IID-uniform, 0 Fedscale mapping, 1 NONIID-Uniform, 2 NONIID-Zipfian, 3 NONIID-Balanced
partitions="1" #"-1 0 1 2 3"
#sampling seed set for randomizing the experiment, the experiment runs 3 times with seeds 0, 1 and 2 and average is taken
sampleseeds="0 1 2"
#introduce artificial dropouts
dropoutratio=0.0 #not used in experiments
# The overcommitted clients during the selection phase to account for possible dropouts, similar to oort it is set to 30%
overcommit="1.3"
# experiment type: 0 - we relay on round deadlines for completion similar to Google's system setting
# 1 - we wait for the target ratio to complete the round and there is no deadline simialr to setting in oort
exptypes="0" #"0 1"
# use Behaviour heterogenity or not: 0: do not use behaviour trace - always available, 1: use behaviour trace, dynamic client availability
randbehv="1" #0 1
#---- Selection scheme -----
#client sampling choice either Random or Oort, if REFL is enabled then this selection does not matter, default is random
samplers="random" #"random oort"
## ------- REFL related parameters ----------
### Stale-Aware Aggregation (SAA) Module
#whether to use stale aggregation -1:use REFL's Stale-Aware Aggregation, 0:no stale aggregation, otherwise the number of rounds for the threshold, e.g., SAFA use 5 for stale rounds threshold
stales="-1" #-1 1 5
# The multiplication factor for the weight/importance of new updates: 0: not used similar to 1:Equal weight as of the new updates, 2: divide by fixed weight 2 (half),
# If the value is negative, it indicated the method used by REFL-SAA module to manipulate (boost and damp) the stale updates: -1:Average stale rounds, -2:AdaSGD, -3:DynSGD, -4: REFL Method
stalefactors="-4" #"2 0 1 -1 -2 -3 -4"
# The beta value for the weighted average of the damping and scaling factors in the stale update
stalebeta=0.35 #0.65
#The scaling coefficient to boost the good stale updates
scalecoff=1
### Intelligent Participant Selection (IPS) Module
#whether to use availability prioritization 0: do not use priority, 1: use availability prioritization
availprios="1" #0 1
# probability for the oracle to get the availability right (Accuraccy level which should match from the average performance of the time-series model)
availprob=0.9
#wether to enable the adaptive selection of the clients: 0: do not use, 1: use
adaptselect=0 #0 1
#### Scaling the system capability of the devices
# by how much to scale the client devices: 1.0: same capabilities as per the device config file, 2.0: double the computational capabilities
scale_sys=1.0 #1.0 2.0
# percentage of the clients to apply the system capabilities scaling
scalesyspercent=0.0 #
#We adjust the SAFA experiments as they are extermely expensive to run, all online clients are invoked in each round!
if [ $tags == 'safa' ] || [ $tags == 'safa1' ] || [ $tags == 'r-safa' ] || [ $tags == 'n-safa' ];
then
epochs=250
partitions="1"
workers="10"
deadlines="100"
exptypes="0"
steps="1"
samplers="random"
aggregators="fedavg"
clients=3000
stales="5"
availprios="0"
randbehv=1
targetratios="0.8"
adaptselect=1
dropoutratio=0.0 #0.25
if [ $tags == 'safa' ] || [ $tags == 'safa1' ];
then
targetratios="0.1"
fi
if [ $tags == 'n-safa' ];
then
stales="0"
fi
fi
# this is for running the central mode experiments
if [ $tags == 'central' ];
then
clients=10
workers=10
exptypes=1
randbehv=-1
samplers="random"
aggregators="fedavg"
stales=0
availprios=0
scale_sys=1.0
partitions="-1 1 2 3"
fi
if [ $dataset == 'google_speech' ] || [ $dataset == 'google_speech_dummy' ]
then
config_dir=`ls -1 configs/google_speech/*exp.*`
else
if [ $dataset == 'openImg' ];
then
steps="5"
config_dir=`ls -1 configs/openimage/*exp.*`
else
if [[ $dataset == 'stackoverflow' || $dataset == 'reddit' ]]
then
steps="5"
fi
fi
fi
cd $cwd
# for loop to run the experiments
count=0
for f in $config_dir;
do
for exptype in $exptypes;
do
for worker in $workers;
do
for sampler in $samplers;
do
for aggregator in $aggregators;
do
for deadline in $deadlines;
do
for targetratio in $targetratios
do
for availprio in $availprios
do
for stale in $stales;
do
for step in $steps;
do
for part in $partitions;
do
for sampleseed in $sampleseeds
do
for stalefactor in $stalefactors;
do
#Export the variables for the yaml file
export EPOCHS=$epochs
export DATASET=$dataset
export WORKERS=$worker
export STALEUPDATES=$stale
export LOCALSTEPS=$steps
export DEADLINE=$deadline
export TARGETRATIO=$targetratio
export OVERCOMMIT=$overcommit
export SAMPLER=$sampler
export AGGREGATOR=$aggregator
export PARTITION=$part
export EXPTYPE=$exptype
export CLIENTS=$clients
export SAMPLESEED=$sampleseed
export AVAILPRIO=$availprio
export AVAILPROP=$availprob
export RANDBEHV=$randbehv
export ADAPT_SELECT=$adaptselect
export STALE_FACTOR=$stalefactor
export STALE_BETA=$stalebeta
export SCALE_COFF=$scalecoff
export DROPOUT_RATIO=$dropoutratio
export SCALE_SYS=$scale_sys
export SCALE_SYS_PERCENT=$scalesyspercent
echo "Settings: config=$f dataset=$dataset worker=$worker deadline=$deadline stale=$stale steps=$step sampler=$sampler aggregator=$aggregator exptype=$exptype clients=$CLIENTS"
index=`expr $count % $SERVERS`
echo "index: $index node info: ${IPS[$index]} ${GPUS[$index]}"
#invoke the manager to launch the PS and workers on target node
$PY $cwd/core/evals/manager.py submit $cwd/core/evals/$f ${IPS[$index]} ${GPUS[$index]}
sleep 5
#experiment counter
count=`expr $count + 1`
done
done
done
done
done
done
done
done
done
done
done
done
done