-- eval_visulization.lua
require 'torch'
require 'nn'
require 'nngraph'
-- local imports
require 'visu.DataLoaderResNetEval'
local utils = require 'misc.utils'
require 'visu.LanguageModel_visu'
local net_utils = require 'misc.net_utils'
require 'misc.optim_updates'
require 'gnuplot'
require 'xlua'
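-- dependencies: torch7 with cutorch/cunn (and cudnn if -backend cudnn is used),
-- plus gnuplot and xlua; the visu.* and misc.* modules are part of this repository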
-------------------------------------------------------------------------------
-- Input arguments and options
-------------------------------------------------------------------------------
cmd = torch.CmdLine()
cmd:text()
cmd:text('Evaluate an Image Captioning model and visualize attention')
cmd:text()
cmd:text('Options')
-- Model settings
--[[
cmd:option('-dataset','flickr30k','')
cmd:option('-input_h5','/data/flickr30k/cocotalk.h5','path to the h5file containing the preprocessed dataset')
cmd:option('-input_json','/data/flickr30k/cocotalk.json','path to the json file containing additional info and vocab')
cmd:option('-cnn_model','../image_model/resnet-152.t7','path to the CNN model file containing the weights (a ResNet-152 Torch .t7 model)')
]]--
cmd:option('-input_h5','/data/coco/cocotalk.h5','path to the h5file containing the preprocessed dataset')
cmd:option('-input_json','/data/coco/cocotalk.json','path to the json file containing additional info and vocab')
cmd:option('-cnn_model','../image_model/resnet-152.t7','path to the CNN model file containing the weights (a ResNet-152 Torch .t7 model)')
cmd:option('-checkpoint_path', 'save/coco_val_1', 'folder to load the model checkpoint from (empty = this folder)')
--[[
cmd:option('-input_h5','/data/coco/cocotalk_test.h5','path to the h5file containing the preprocessed dataset')
cmd:option('-input_json','/data/coco/cocotalk_test.json','path to the json file containing additional info and vocab')
cmd:option('-input_vocab_json','/data/coco/cocotalk.json','path to the json file containing additional info and vocab')
cmd:option('-cnn_model','../image_model/resnet-152.t7','path to the CNN model file containing the weights (a ResNet-152 Torch .t7 model)')
]]--
cmd:option('-start_from', 'model_id1_36.t7', 'path to a model checkpoint to initialize model weights from. Empty = don\'t')
cmd:option('-beam_size', 3, 'beam size to use for beam search when sampling captions')
--cmd:option('-checkpoint_path', 'save/flickr30k_512x1_1', 'folder to save checkpoints into (empty = this folder)')
cmd:option('-drop_prob_lm', 0.5, 'strength of dropout in the Language Model RNN')
cmd:option('-rnn_size',512,'size of the rnn in number of hidden nodes in each layer')
cmd:option('-num_layers',1,'number of layers in the RNN')
cmd:option('-input_encoding_size',512,'the encoding size of each token in the vocabulary, and the image.')
cmd:option('-batch_size',10,'batch size in number of images per batch (there will be batch_size * seq_per_img sentences)')
cmd:option('-fc_size',2048,'dimension of the fully-connected (global) image feature from the CNN')
cmd:option('-conv_size',2048,'number of channels of the spatial conv feature maps from the CNN')
cmd:option('-seq_per_img',5,'number of captions per image. E.g. coco has 5 sents/image')
cmd:option('-val_images_use', -1, 'how many images to use when periodically evaluating the validation loss? (-1 = all)')
cmd:option('-save_checkpoint_every', 3, 'how often to save a model checkpoint?')
cmd:option('-language_eval', 0, 'Evaluate language as well (1 = yes, 0 = no)? BLEU/CIDEr/METEOR/ROUGE_L? requires coco-caption code from Github.')
-- misc
cmd:option('-backend', 'cudnn', 'nn|cudnn')
cmd:option('-id', '1', 'an id identifying this run/job. used in cross-val and appended when writing progress files')
cmd:option('-seed', 123, 'random number generator seed to use')
cmd:option('-gpuid', 0, 'which gpu to use. -1 = use CPU')
cmd:text()
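-- example invocation (values are the defaults above; adjust to your setup):
--   th eval_visulization.lua -gpuid 0 -batch_size 10 -beam_size 3 \
--     -checkpoint_path save/coco_val_1 -start_from model_id1_36.t7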
-------------------------------------------------------------------------------
-- Basic Torch initializations
-------------------------------------------------------------------------------
local opt = cmd:parse(arg)
--torch.manualSeed(opt.seed)
torch.setdefaulttensortype('torch.FloatTensor') -- for CPU
if opt.gpuid >= 0 then
require 'cutorch'
require 'cunn'
if opt.backend == 'cudnn' then require 'cudnn' end
--cutorch.manualSeed(opt.seed)
cutorch.setDevice(opt.gpuid + 1) -- note +1 because lua is 1-indexed
end
-------------------------------------------------------------------------------
-- Create the Data Loader instance
-------------------------------------------------------------------------------
local loader = DataLoader{h5_file = opt.input_h5, json_file = opt.input_json, neighbor_h5 = opt.nn_neighbor,
batch_size = opt.batch_size, seq_per_img = opt.seq_per_img, thread_num = opt.thread_num}
--local loader = DataLoader{h5_file = opt.input_h5, json_file = opt.input_json, vocab_json_file = opt.input_vocab_json,neighbor_h5 = opt.nn_neighbor,
-- batch_size = opt.batch_size, seq_per_img = opt.seq_per_img, thread_num = opt.thread_num}
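-- note: opt.nn_neighbor and opt.thread_num are not declared as command-line
-- options above, so they are passed to the loader as nil; the loader is
-- presumably expected to fall back to its own defaults in that case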
-------------------------------------------------------------------------------
-- Initialize the networks
-------------------------------------------------------------------------------
-- create protos from scratch
-- initialize the language model
local lmOpt = {}
lmOpt.vocab_size = loader:getVocabSize()
lmOpt.input_encoding_size = opt.input_encoding_size
lmOpt.rnn_size = opt.rnn_size
lmOpt.num_layers = opt.num_layers
lmOpt.dropout = opt.drop_prob_lm
lmOpt.seq_length = loader:getSeqLength()
lmOpt.batch_size = opt.batch_size * opt.seq_per_img
lmOpt.fc_size = opt.fc_size
lmOpt.conv_size = opt.conv_size
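-- lmOpt mirrors the architecture the checkpoint was trained with: by default
-- a 1-layer RNN with 512 hidden units and 512-d encodings on top of 2048-d
-- fc/conv CNN features, processing batch_size * seq_per_img sequences at once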
local loaded_checkpoint
if opt.start_from ~= '' then -- load the checkpoint we will initialize from
local loaded_checkpoint_path = path.join(opt.checkpoint_path, opt.start_from)
print(loaded_checkpoint_path)
loaded_checkpoint = torch.load(loaded_checkpoint_path)
end
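-- the checkpoint is expected to hold the CNN protos (cnn_conv_fix, cnn_conv,
-- cnn_fc) and the flattened language model parameters in 'lmparam' (see below)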
-- build the model protos
local protos = {}
protos.lm = nn.LanguageModel(lmOpt):cuda()
-- initialize the ConvNet
if opt.start_from ~= '' then -- reuse the CNN protos stored in the checkpoint
protos.cnn_conv_fix = loaded_checkpoint.protos.cnn_conv_fix:cuda()
protos.cnn_conv = loaded_checkpoint.protos.cnn_conv:cuda()
protos.cnn_fc = loaded_checkpoint.protos.cnn_fc:cuda()
else
local cnn_raw = torch.load(opt.cnn_model)
-- note: opt.finetune_start_layer is not exposed as a command-line option in
-- this script, so the builders receive nil and fall back to their defaults
protos.cnn_conv_fix = net_utils.build_residual_cnn_conv_fix(cnn_raw,
{backend = opt.backend, start_layer_num = opt.finetune_start_layer}):cuda()
protos.cnn_conv = net_utils.build_residual_cnn_conv(cnn_raw,
{backend = opt.backend, start_layer_num = opt.finetune_start_layer}):cuda()
protos.cnn_fc = net_utils.build_residual_cnn_fc(cnn_raw,
{backend = opt.backend}):cuda()
end
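-- the expanders replicate each image feature seq_per_img times so the feature
-- batch lines up with the seq_per_img ground-truth captions per image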
protos.expanderConv = nn.FeatExpanderConv(opt.seq_per_img):cuda()
protos.expanderFC = nn.FeatExpander(opt.seq_per_img):cuda()
protos.transform_cnn_conv = net_utils.transform_cnn_conv(opt.conv_size):cuda()
-- criterion for the language model
protos.crit = nn.LanguageModelCriterion():cuda()
params, grad_params = protos.lm:getParameters()
cnn1_params, cnn1_grad_params = protos.cnn_conv:getParameters()
print('total number of parameters in LM: ', params:nElement())
print('total number of parameters in CNN_conv: ', cnn1_params:nElement())
assert(params:nElement() == grad_params:nElement())
assert(cnn1_params:nElement() == cnn1_grad_params:nElement())
if opt.start_from ~= '' then -- copy the language model parameters from the checkpoint
params:copy(loaded_checkpoint.lmparam)
end
protos.lm:createClones()
-- create clones and ensure parameter sharing. we have to do this
-- all the way here at the end because calls such as :cuda() and
-- :getParameters() reshuffle memory around.
collectgarbage() -- "yeah, sure why not"
-------------------------------------------------------------------------------
-- Evaluation fun(ction)
-------------------------------------------------------------------------------
local function evaluate_split(split, evalopt)
local val_images_use = utils.getopt(evalopt, 'val_images_use', -1)
print('=> evaluating ...')
-- set the networks to evaluation mode
protos.cnn_conv:evaluate()
protos.cnn_fc:evaluate()
protos.lm:evaluate()
protos.cnn_conv_fix:evaluate()
local n = 0
local loss_sum = 0
local loss_evals = 0
local predictions = {}
local vocab = loader:getVocab()
local imgId_cell = {}
local nbatch = math.ceil(val_images_use / opt.batch_size)
if val_images_use == -1 then
nbatch = loader:getnBatch(split)
end
loader:init_rand(split)
loader:reset_iterator(split)
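-- buffer for all attention weights: (seq_length+1) time steps x
-- (seq_per_img * nbatch * batch_size) sequences x 50 weights per step
-- (presumably 7x7 = 49 spatial locations plus the visual sentinel)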
local atten_out_all = torch.FloatTensor(loader:getSeqLength()+1, opt.seq_per_img*nbatch*opt.batch_size, 50):zero()
--for n, data in loader:run({split = split, size_image_use = val_images_use}) do
for n = 1, nbatch do
local data = loader:run({split = split, size_image_use = val_images_use})
xlua.progress(n,nbatch)
-- convert the data to cuda
data.images = data.images:cuda()
data.labels = data.labels:cuda()
-- forward the model to get loss
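-- CNN pipeline: fixed conv layers -> finetunable conv layers -> a transform
-- of the conv features -> fc head for the global image feature; both feature
-- streams are then expanded to seq_per_img copies per image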
local feats_conv_fix = protos.cnn_conv_fix:forward(data.images)
local feats_conv = protos.cnn_conv:forward(feats_conv_fix)
local feat_conv_t = protos.transform_cnn_conv:forward(feats_conv)
local feats_fc = protos.cnn_fc:forward(feats_conv)
local expanded_feats_conv = protos.expanderConv:forward(feat_conv_t)
local expanded_feats_fc = protos.expanderFC:forward(feats_fc)
local logprobs, atten = protos.lm:forward({expanded_feats_conv, expanded_feats_fc, data.labels})
--local loss = protos.crit:forward({logprobs, data.labels})
--loss_sum = loss_sum + loss
--loss_evals = loss_evals + 1
-- forward the model to also get generated samples for each image
local sampleOpt = {beam_size = opt.beam_size}
--local seq, atten = protos.lm:sample({feat_conv_t, feats_fc, vocab}, sampleOpt)
local sents, count = net_utils.decode_sequence(vocab, data.labels)
local s = (n-1)*opt.batch_size*opt.seq_per_img + 1
atten_out_all:narrow(2, s, opt.batch_size*opt.seq_per_img):copy(atten)
for k=1,#sents do
local idx = math.floor((k-1)/opt.seq_per_img)+1
local img_id = data.img_id[idx]
local entry
--if imgId_cell[img_id] == nil then -- make sure there is only one caption per image.
--imgId_cell[img_id] = 1
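-- for each word, store 1 - atten[m][k][1]; assuming the first attention
-- column is the visual sentinel weight, this is the probability that word m
-- of caption k was grounded in the image rather than the language model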
local prob_tmp = {}
for m = 1, count[k] do
table.insert(prob_tmp, 1-atten[m][k][1])
end
entry = {image_id = img_id, caption = sents[k], prob = prob_tmp}
table.insert(predictions, entry)
--end
end
end
local lang_stats
if opt.language_eval == 1 then
lang_stats = net_utils.language_eval(predictions, {id = opt.id, dataset = opt.dataset})
end
return predictions, lang_stats, atten_out_all
end
local split_predictions, lang_stats, atten_out_all = evaluate_split('test', {val_images_use = opt.val_images_use, verbose = opt.verbose})
if lang_stats then
print(lang_stats)
end
utils.write_json('visu_gt_test.json', split_predictions)
torch.save('atten_gt_test_1.t7', atten_out_all)