# (1) each train_file (json) contains a python list, where each item is {'image': img_path, 'caption': text_or_list_of_text}
# (2) a two-element sublist is also accepted: the 1st element is the annotation json file as in (1), the 2nd is image_root, which is joined with `image` (the image path)
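# An illustrative annotation file for (1) might look like the following (paths and captions are made up):
#   [
#     {"image": "subdir/0001.jpg", "caption": "a dog running on the beach"},
#     {"image": "subdir/0002.jpg", "caption": ["a red car", "a car parked on a street"]}
#   ]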
data_root: ${oc.env:SL_DATA_DIR}/videos_images
anno_root_pt: ${oc.env:SL_DATA_DIR}/anno_pretrain
anno_root_downstream: ${oc.env:SL_DATA_DIR}/anno_downstream
available_corpus:
  # Each value is a list containing the annotation path and the image root dir, and
  # optionally a third element (`image` or `video`) indicating whether it
  # is an image-text or video-text dataset. If not provided, it defaults to `image`.
  cc3m: ['${anno_root_pt}/cc3m_train.json', '${data_root}/cc3m_224'] # '${}' must be quoted when used inside `[]`
  cc12m: ['${anno_root_pt}/cc12m.json', '${data_root}/cc12m_224']
  sbu: ['${anno_root_pt}/sbu.json', '${data_root}/sbu_224']
  vg: ['${anno_root_pt}/vg.json', '${data_root}/vg']
  coco: ['${anno_root_pt}/coco.json', '${data_root}/coco']
  f30k_val: ['${anno_root_downstream}/flickr30k_val.json', '${data_root}/f30k']
  f30k_test: ['${anno_root_downstream}/flickr30k_test.json', '${data_root}/f30k']
  # use a third element to indicate that the input is video instead of image
  msrvtt_1k_test: ['${anno_root_downstream}/msrvtt_test1k.json', '${data_root}/msrvtt_2fps_224', video]
  webvid: ['${anno_root_pt}/webvid_train.json', '${data_root}/webvid_2fps_224', video]
  coco_vg: # datasets are easily combined by listing their entries
    - ${available_corpus.coco}
    - ${available_corpus.vg}
  webvid_cc3m:
    - ${available_corpus.webvid}
    - ${available_corpus.cc3m}
  webvid_14m:
    - ${available_corpus.webvid}
    - ${available_corpus.cc3m}
    - ${available_corpus.coco}
    - ${available_corpus.vg}
    - ${available_corpus.sbu}
    - ${available_corpus.cc12m}
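  # A new corpus could be added with the same pattern; the names and paths below
  # are hypothetical placeholders, shown only for illustration:
  # my_image_corpus: ['${anno_root_pt}/my_images.json', '${data_root}/my_images_224']
  # my_video_corpus: ['${anno_root_pt}/my_videos.json', '${data_root}/my_videos_2fps_224', video]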
train_corpus: coco_vg
train_file: ${available_corpus[${train_corpus}]}
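# e.g., with `train_corpus: coco_vg` the line above resolves to the coco and vg entries:
# [['${anno_root_pt}/coco.json', '${data_root}/coco'], ['${anno_root_pt}/vg.json', '${data_root}/vg']]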
test_file:
  msrvtt_1k_test: ${available_corpus.msrvtt_1k_test}
text_encoder: bert-base-uncased
bert_config: configs/config_bert.json
vit_type: beit # one of the keys in ${vit_zoo}
vit_zoo: # map from vit_type to huggingface model name
  beit: microsoft/beit-base-patch16-224-pt22k-ft22k
vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]}
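# e.g., with `vit_type: beit` this resolves to microsoft/beit-base-patch16-224-pt22k-ft22k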
temporal_vision_encoder:
  enable: False
  num_layers: 2
  update_pooler_embed: False
  add_temporal_embed: False # whether to add temporal embed to encoded frames
image_res: 224
embed_dim: 256
video_input: # video input settings
  num_frames: 1
  reader: decord # one of [decord, av]
  sample_type: rand # one of [rand, middle]
  num_frames_test: 4 # num_frames during inference/test
  sample_type_test: middle
max_txt_l:
  image: 32
  video: 32
batch_size:
  image: 128
  video: 96
batch_size_test:
  image: 100
  video: 100
k_test: 128
temp: 0.07
mlm_prob: 0.5
loss_weight:
  itc: 1.0
  mlm: 1.0
  itm: 1.0
itm_hard_neg: True
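# (the key names above suggest the total pretraining loss is the weighted sum
#  itc * L_itc + mlm * L_mlm + itm * L_itm; this is inferred from the config keys, not verified against the training code)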
optimizer:
  opt: adamW
  lr: 1e-4
  opt_betas: [0.9, 0.999] # default
  weight_decay: 0.02
  max_grad_norm: -1 # set a positive float to enable gradient clipping; -1 disables it
  different_lr: # use a different lr for some modules, e.g., larger lr for new modules
    enable: False
    module_names: [temporal_vision_encoder, ]
    lr: 1e-3
scheduler:
  sched: cosine
  epochs: 10
  min_lr_multi: 0.01 # min_lr will be `optimizer.lr * min_lr_multi`
  warmup_epochs: 1 # float
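  # e.g., with optimizer.lr = 1e-4 and min_lr_multi = 0.01, the minimum lr is 1e-4 * 0.01 = 1e-6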
output_dir: None # output dir
resume: False # if True, load optimizer and scheduler states as well
pretrained_path: None # path to pretrained model weights, for resume only?
evaluate: False
# `eval_frame_ensemble`: how do we aggregate scores if `video_input.num_frames_test` > `video_input.num_frames`
# `concat`: concat frames before input to the multi-modal encoder, i.e., early fusion
# `mean`, `max`, `lse`: mean/max/lse-pool scores after the multi-modal encoder, i.e., late fusion, as in ClipBERT
eval_frame_ensemble: concat # [concat, max, mean, lse]
eval_x_only: False
eval_offload: False # offload image gpu tensors to cpu to save memory; useful when hitting OOM errors
device: cuda
seed: 42
log_freq: 100
dist_url: env://
distributed: True
fp16: True
debug: False
num_workers: 12
wandb:
  enable: False
  entity: None # username or team name under which to store the runs, see https://docs.wandb.ai/ref/python/init
  project: pretrain # set up in your command line
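  # A hypothetical command-line override (assuming the launch script accepts
  # OmegaConf-style dotted key=value overrides; verify against the actual scripts):
  #   ... wandb.enable=True wandb.entity=<your_username> wandb.project=pretrain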