# conf.yml
# Configuration file of Oort training experiment
# ========== Cluster configuration ==========
# IP address of the parameter server (ps); it needs 1 GPU.
ps_ip: '10.0.0.5'

# One entry per worker, written as <worker_ip>:<number of GPUs on that worker>.
# If the ps and a worker are collocated on one node, decrease that worker's GPU count by 1.
# E.g., a master node with 4 GPUs gives 1 to the ps, so its worker entry is <ip>:3.
# Entries are quoted so the colon-separated value is always loaded as a string.
worker_ips:
- '10.0.0.3:1'
- '10.0.0.4:1'
- '10.0.0.2:1'
- '10.0.0.1:1'

# NOTE(review): presumably the directory the experiment is launched from on each node — confirm.
exp_path: $HOME/Oort/training

# SSH credentials (user name and private-key path).
# Both keys are indented so they load as children of `auth` rather than as top-level keys.
auth:
  # NOTE(review): empty by default — presumably filled in with the login user; confirm.
  ssh_user: ""
  ssh_private_key: ~/.ssh/id_rsa

# Commands to run, in order, before Oort itself can be started.
setup_commands:
- source $HOME/anaconda3/bin/activate oort
# Run "ifconfig" to make sure this is the right NIC for NCCL if the machine has multiple NICs.
- export NCCL_SOCKET_IFNAME='enp94s0f0'
# ========== Additional job configuration ==========
# Default values are specified in argParser.py, which also describes each parameter in more detail.
job_conf:
# Path of the log files.
- log_path: $HOME/Oort/training/evals
# Logs are generated under log_path/job_name/time_stamp.
- job_name: openimage
# Number of participants per round. The paper uses K=100; a large K will be much slower.
- total_worker: 100
# Dataset: openImg, google_speech, stackoverflow.
- data_set: openImg
# Path of the dataset.
- data_dir: $HOME/Oort/FLPerf/open_images
# Allocation of data to each client; falls back to the iid setting if not provided.
- data_mapfile: $HOME/Oort/FLPerf/open_images/clientDataMap
# Path of the client trace.
- client_path: $HOME/Oort/FLPerf/data/device_info/client_profile.pkl
# Client selection: random, oort.
- sample_mode: oort
# Models: shufflenet_v2_x2_0, mobilenet_v2, resnet34, albert-base-v2.
- model: shufflenet_v2_x2_0
# Commenting out this entry switches to "Fedprox".
- gradient_policy: yogi
# Penalty factor (\alpha in the paper); \alpha -> 0 turns this into (Oort w/o sys).
- round_penalty: 2.0
# Number of rounds between evaluations on the testing set.
- eval_interval: 20
# Number of training rounds. The paper uses 1000, though it may converge w/ ~400 rounds.
- epochs: 500
# NOTE(review): not described in this file — see argParser.py for its meaning.
- pacer_delta: 10