-
Notifications
You must be signed in to change notification settings - Fork 7
/
run_deepreduce.sh
executable file
·107 lines (88 loc) · 6.72 KB
/
run_deepreduce.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env bash
### tf_cnn_benchmarks ResNet20 ResNet50
./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=0 \
-x NCCL_IB_DISABLE=0 \
-np 8 -H 11.0.0.233:1,11.0.0.234:1,11.0.0.235:1,11.0.0.236:1,11.0.0.237:1,11.0.0.238:1,11.0.0.239:1,11.0.0.240:1, \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp1s0f1 -x NCCL_SOCKET_IFNAME=enp1s0f1 \
python \
./grace-benchmarks/tensorflow/Classification/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model=resnet20_v2 --data_name=cifar10 --batch_size=256 --weight_decay=0.0001 --optimizer=momentum --piecewise_learning_rate_schedule=0.1;163;0.01;245;0.001 --variable_update=horovod --train_dir={log_dir}/ckpts --summary_verbosity=1 --save_summaries_steps=10 --num_epochs=328 --eval_during_training_every_n_steps=200 --num_eval_epochs=8 --data_dir=/mnt/scratch/cifar-10-batches-py/
./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=0 \
-x NCCL_IB_DISABLE=0 \
-np 8 -H 11.0.0.233:1,11.0.0.234:1,11.0.0.235:1,11.0.0.236:1,11.0.0.237:1,11.0.0.238:1,11.0.0.239:1,11.0.0.240:1, \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp1s0f1 -x NCCL_SOCKET_IFNAME=enp1s0f1 \
./grace/env-tf1.14/bin/python \
./grace-benchmarks/tensorflow/Classification/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model=resnet50 --data_name=imagenet --batch_size=256 --weight_decay=1e-4 --optimizer=momentum --nodistortions --variable_update=horovod --train_dir={log_dir}/ckpts --summary_verbosity=1 --save_summaries_steps=100 --num_epochs=90 --eval_during_training_every_n_steps=625 --num_eval_epochs=8 --data_dir=/mnt/scratch/imagenet18/data/imagenet
### pytorch resnet20 cifar10 data volume and micro-benchmark
./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=0 \
-x NCCL_IB_DISABLE=0 \
-np 8 -H 11.0.0.233:1,11.0.0.234:1,11.0.0.235:1,11.0.0.236:1,11.0.0.237:1,11.0.0.238:1,11.0.0.239:1,11.0.0.240:1, \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp1s0f1 -x NCCL_SOCKET_IFNAME=enp1s0f1 \
./grace/env-tf1.14/bin/python \
./grace-benchmarks/torch/cifar10/trainer_grace.py \
-a resnet20 --data=/mnt/scratch/cifar-10-batches-py/ --log_volume \
--grace_config="{'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather', 'compress_ratio': 0.01, 'deepreduce':'index', 'index':'bloom', 'micro-benchmark':True}"
### NCF sparse/dense/deepreduce accuracy
./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=0 \
-x NCCL_IB_DISABLE=0 \
-np 2 -H 11.0.0.203:1,11.0.0.204:1 \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include ens1f0 -x NCCL_SOCKET_IFNAME=ens1f0 \
./grace/env-tf1.14/bin/python -W ignore::UserWarning \
./grace-benchmarks/torch/Recommendation/NCF/ncf_grace.py \
--data ./data/ml-20m/torch/cache/ml-20m \
--load_checkpoint_path ./grace/results/NCF/checkpoints/model_init.pth --seed 44 \
--weak_scaling --extra_wandb_tags=accuracy,10G --log_volume \
--grace_config="{'compressor': 'none', 'memory': 'none', 'communicator': 'allreduce'}"
./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=1 \
-x NCCL_IB_DISABLE=0 \
-np 8 -H 11.0.0.233:1,11.0.0.234:1,11.0.0.235:1,11.0.0.236:1,11.0.0.237:1,11.0.0.238:1,11.0.0.239:1,11.0.0.240:1, \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp136s0f0 -x NCCL_SOCKET_IFNAME=enp136s0f0 \
./grace/env-tf1.14/bin/python -W ignore::UserWarning \
./grace/src/grace-benchmarks/torch/Recommendation/NCF/ncf_grace.py \
--data ./grace/data/ml-20m/torch/cache/ml-20m \
--load_checkpoint_path ./grace/results/NCF/checkpoints/model_init.pth --seed 44 \
--weak_scaling --extra_wandb_tags=accuracy,10G --log_volume \
--grace_config="{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'index', 'index':'bloom'}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'value', 'value':'polyfit'}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'value', 'value':'qsgd', 'buckets_num':128, 'quantum_num': 32}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'both'}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'index', 'index':'bloom', 'policy':'p0', 'fpr':0.01}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'both', 'index':'bloom', 'policy':'random', 'fpr':0.01, 'value':'qsgd'}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'both', 'index':'bloom', 'policy':'random', 'fpr':0.01, 'value':'qsgd'}"
# SKCompress
./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=1 \
-x NCCL_IB_DISABLE=0 \
-np 8 -H 11.0.0.233:1,11.0.0.234:1,11.0.0.235:1,11.0.0.236:1,11.0.0.237:1,11.0.0.238:1,11.0.0.239:1,11.0.0.240:1, \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp136s0f0 -x NCCL_SOCKET_IFNAME=enp136s0f0 \
./grace/env-tf1.14/bin/python -W ignore::UserWarning \
./grace/src/grace-benchmarks/torch/Recommendation/NCF/ncf_grace.py \
--data ./grace/data/ml-20m/torch/cache/ml-20m \
--load_checkpoint_path ./grace/results/NCF/checkpoints/model_init.pth --seed 44 \
--weak_scaling --extra_wandb_tags=accuracy,10G \
--grace_config="{'compressor': 'SKCompressCPU','num_quantiles':128, 'sparsifier': 'threshold', 'threshold': 0.0, \
'memory': 'none', 'communicator': 'allgather'}" --micro_benchmark --log_volume
### NCF time breakdown
./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=0 \
-x NCCL_IB_DISABLE=1 \
-np 4 -H 10.0.0.133:1,10.0.0.134:1,10.0.0.135:1,10.0.0.136:1 \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp1s0f1 -x NCCL_SOCKET_IFNAME=enp1s0f1 \
./grace/env-tf1.14/bin/python -W ignore::UserWarning \
./grace/src/grace-benchmarks/torch/Recommendation/NCF/ncf_grace.py \
--data ./grace/data/ml-20m/torch/cache/ml-20m \
--load_checkpoint_path ./grace/results/NCF/checkpoints/model_init.pth --seed 44 \
--weak_scaling --extra_wandb_tags=timer,10G -e=3 --grads_accumulated=10 --log_time \
--grace_config="{'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather', 'compress_ratio': 0.1, 'deepreduce':'index', 'index':'bloom'}"