# config.yaml
expt_name: "default"
# Set this to different values to run the same config multiple times; it only
# exists to account for random variation
run_id: 0
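# For example, with Hydra's multirun mode the same config can be repeated by
# sweeping this value, e.g. appending "run_id=0,1,2 --multirun" to whatever
# command launches this config (the exact entry point depends on the repo's
# launch script).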
seed: 42
# Defined in a common place so it can be overridden in notebooks, which don't
# support the ":" interpolation syntax
cwd: ${hydra:runtime.cwd}
sync_bn: false
test_only: false
# Set this to force data parallel training. Num nodes should be 1.
data_parallel: false
dist_backend: nccl
pytorch:
  # This only works with the compiled version of torchvision, and might have
  # some memory issues?
  video_backend: "video_reader"
train:
  fn: 'train' # Which file in the func/ directory to use for training
  batch_size: 2
  # This can have the following structure:
  #   <module name in model>:<module name in ckpt>:<path to ckpt> <>...
  # By default it also supports just the <path to ckpt>, but the more complex
  # structure can be used to initialize separate parts of the model using
  # different checkpoints. If only 2 elements are specified with ":",
  # module_name_in_ckpt is assumed to be null.
  init_from_model: null
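  # For example (the module names and checkpoint paths below are hypothetical),
  # either of these forms would follow the format described above:
  #   init_from_model: /path/to/pretrained.pth
  #   init_from_model: backbone:backbone:/path/to/pretrained.pth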
  # Total epochs to train for
  num_epochs: 45
  # Evaluate during training every this many epochs
  eval_freq: 1
  # Shuffle data at train time
  shuffle_data: true
  # Store the best performing checkpoint
  store_best: false
  train_one_epoch_fn:
    _target_: func.train.train_one_epoch
    print_freq: 10
    print_large_freq: 1000 # How often to write image/video summaries
    grad_clip_params: ${opt.grad_clip} # DO NOT CHANGE HERE, change in opt
  # Set the following to store models every so many epochs. By default only
  # the last checkpoint and the best checkpoint will be stored.
  save_freq: null
  # Number of minutes to save at, same as above -- save_intermediates must be
  # set to true to save like this
  save_freq_min: 60 # At least save every 60 mins
  # Whether or not to save the intermediate models
  save_intermediates: false
  loss_wts:
    cls_action: 1.0
    cls_verb: 1.0
    cls_noun: 1.0
    pred: 1.0
    feat: 1.0
    distill: 150.0
    ens_distill: 0.0
    distill_feat: 0.0
    distill_feat_mse: 0.0
    # Past predictions; default 0 to be backward compatible
    past_cls_action: 0.0
    past_cls_verb: 0.0
    past_cls_noun: 0.0
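  # Presumably these weights scale the corresponding loss terms before they are
  # summed, i.e. total_loss = sum_k loss_wts[k] * loss_k, so a weight of 0.0
  # disables that term (an assumption based on the names; the exact combination
  # lives in the training/loss code).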
eval:
  batch_size: null # Will be automatically derived from the train config if null
  eval_fn:
    _target_: func.train.evaluate
    store: true
    store_endpoint: logits
    only_run_featext: false
model:
  backbone_dim: 2048
  # Uses the backbone dim if null. Don't use interpolation here, since the
  # backbone dim might be updated in the code.
  intermediate_featdim: null
  backbone_last_n_modules_to_drop: 2 # Avg pool and linear layer
  dropout: 0.0
  # Set to a number to project the temp_agg and future features to this
  # dimension using an MLP before applying the NCE loss.
  # Note this is also applied when doing the L2 regression loss, so the name is
  # a bit of a misnomer.
  project_dim_for_nce: null
  # Set to true to also add a regression head, which is used for dense
  # anticipation when predicting the duration of an action
  add_regression_head: false
  bn:
    eps: 0.001
    mom: 0.1
  # Set this to true if you want the temporally aggregated feature dim to be
  # the same as the original backbone's (backbone_dim). This will add a linear
  # layer to achieve that. It's useful when training future predictive models,
  # with the future feature average as the target.
  same_temp_agg_dim: false
  # Set this to true to use the class mappings to get the other predictions,
  # e.g., verb/noun from action, instead of adding additional linear layers.
  # Only applicable when predicting multiple output classes.
  use_cls_mappings: false
  # Apply the classifier on the past predictions too
  classifier_on_past: false
opt:
  # Not using an overall LR anymore, since everything is now defined per
  # module.
  # Use a list format to specify per-layer LRs and WD. Each element is
  # [module_name, LR, WD], where the module_name "__all__" means all params.
  # Note that if there is any overlap between parameters, those params will be
  # updated as many times as they appear in the list. It WILL NOT give the last
  # entry the highest precedence. (TODO future)
  # The first term can also be a list of module names, to set the same LR and
  # WD for a bunch of modules.
  lr_wd: [[__all__, 0.1, 0.0001]]
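  # For example (module names here are hypothetical), a smaller LR for the
  # backbone and a shared setting for two head modules could be written as:
  #   lr_wd: [[backbone, 0.01, 0.0001], [[future_predictor, classifier], 0.1, 0.0001]]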
  # Set this to true to also scale the LR by the batch size (normally it will
  # only be scaled by the number of replicas, so the LR is specified for a given
  # per-replica batch size). This allows specifying an LR per batch element,
  # which is useful when doing sweeps over batch size.
  scale_lr_by_bs: false
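  # For example (illustrative numbers, following the description above): with
  # an LR of 0.1, 8 replicas and a per-replica batch size of 2, the effective
  # LR would be 0.1 * 8 = 0.8 by default, and 0.1 * 8 * 2 = 1.6 with
  # scale_lr_by_bs: true.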
  # Set this to true to only train the last classifier layer. Also, this will
  # set all BN layers to not compute mean/var at runtime.
  classifier_only: false
  bias_bn_wd_scale: 1.0 # Scale the WD for bias and BN layers by this amount
  grad_clip:
    max_norm: null # By default, no clipping
    norm_type: 2
  warmup:
    _target_: common.scheduler.Warmup
    init_lr_ratio: 0.0 # Warm up from this ratio of the original LRs
    num_epochs: 0 # Warm up for this many epochs (taken out of the total epochs)
moco:
  _target_: moco.moco.builder.MoCo
  dim: 128
  K: 65536
  m: 0.999
  T: 0.2 # From MoCo-v2
  mlp: true # From MoCo-v2
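# The moco block above follows Hydra's _target_ convention; presumably the code
# builds it with something like hydra.utils.instantiate(cfg.moco, base_encoder).
# That call is only a sketch -- the actual call site and arguments may differ.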
defaults:
  - train_eval_op: basic
  - train_eval_op/cls_loss_acc_fn: basic
  - train_eval_op/reg_criterion: mse
  - opt/optimizer: sgd
  - model/backbone: r2plus1d_34
  - model/temporal_aggregator: mean
  - model/future_predictor: identity
  - model/temporal_aggregator_after_future_pred: identity
  - model/classifier: linear
  - opt/scheduler: warmup_multi_step
  # Any keys with the dataset_train prefix, like dataset_train2, etc., will all
  # be used for training by concatenating all those datasets. So you can use
  # multiple datasets in training by adding +dataset_train2=hmdb51/train to the
  # command line config (see the combined example after this defaults list).
  # Note that this only works with standard datasets; ConcatDataset can't
  # handle overly customized datasets like the ones we use in EpicKitchens.
  - dataset@dataset_train: dundee50salads/anticipation_train
  # Any keys with the dataset_eval prefix will all be evaluated on separately.
  # The postfix will be used to identify which dataset the results are on.
  # So you can use more than one evaluation dataset that way, by adding it in
  # the command line config, like +dataset_eval2=hmdb51/val.
  - dataset@dataset_eval: dundee50salads/anticipation_val
  - data@data_train: default
  - data@data_eval: default
  # Load any common dataset files that will be used to create other dataset
  # elements.
  - dataset/epic_kitchens/common
  - dataset/epic_kitchens100/common
  - dataset/dundee50salads/common
  - dataset/dundee50salads/annot_reader_fn: orig
  - dataset/egtea/common
  # Overrides
  - override hydra/launcher: submitit_slurm
  - override hydra/job_logging: colorlog
  - override hydra/hydra_logging: colorlog
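# For example, combining the two mechanisms described in the defaults list
# above (the hmdb51 config names are the ones from those comments and are only
# illustrative), extra train/eval datasets can be added on the command line:
#   +dataset_train2=hmdb51/train +dataset_eval2=hmdb51/val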
hydra:
  job:
    name: "AVT"
  launcher:
    # All params in https://github.com/facebookresearch/hydra/blob/master/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/config.py
    timeout_min: 2880
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: ${hydra.launcher.gpus_per_node}
    # This is the memory requested per node, so all GPUs on a given node will
    # share this memory.
    mem_gb: 450
    nodes: 2
    # Use these parameters through + options in hydra:
    # partition: learnfair
    # max_num_timeout: 3
    # constraint: ${hydra.launcher.gpu_type} # Any, or could say [volta|pascal]
    # comment: ""
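    # For example, to request a specific partition without editing this file,
    # one could append "+hydra.launcher.partition=learnfair" to the launch
    # command (the partition name is just the one mentioned above).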
  run:
    dir: ./outputs/ # Specified in the launch script
  sweep:
    dir: ${hydra.run.dir}
    # Output subdirectory for sweep runs.
    subdir: ${hydra.job.num} # ${hydra.job.override_dirname}