Discrepancy between evaluate_generator() and predict_generator() in binary image classification #264

Closed

gundalav opened this issue Jan 27, 2018 · 2 comments

@gundalav
I have the following code. The data set can be downloaded here or here. The data set contains images categorized as cat or dog.

This code trains a classifier on the cat and dog images, so that given a picture it can tell whether it shows a cat or a dog. It is motivated by this page. Below is the fully running code:

library(keras)
library(tidyverse)


# Organize dataset --------------------------------------------------------
options(warn = -1)

# The input
original_dataset_dir <- "data/kaggle_cats_dogs/original/"


# Create new organized dataset directory ----------------------------------

base_dir <- "data/kaggle_cats_dogs_small/"
dir.create(base_dir)

model_dir <- paste0(base_dir, "model/")
dir.create(model_dir)

train_dir <- file.path(base_dir, "train")
dir.create(train_dir)

validation_dir <- file.path(base_dir, "validation")
dir.create(validation_dir)

test_dir <- file.path(base_dir, "test")
dir.create(test_dir)

train_cats_dir <- file.path(train_dir, "cats")
dir.create(train_cats_dir)

train_dogs_dir <- file.path(train_dir, "dogs")
dir.create(train_dogs_dir)

validation_cats_dir <- file.path(validation_dir, "cats")
dir.create(validation_cats_dir)

validation_dogs_dir <- file.path(validation_dir, "dogs")
dir.create(validation_dogs_dir)

test_cats_dir <- file.path(test_dir, "cats")
dir.create(test_cats_dir)

test_dogs_dir <- file.path(test_dir, "dogs")
dir.create(test_dogs_dir)

# Copy files from the original dataset to the newly created directories
fnames <- paste0("cat.", 1:1000, ".jpg")
dum <- file.copy(
  file.path(original_dataset_dir, fnames),
  file.path(train_cats_dir)
)


fnames <- paste0("cat.", 1001:1500, ".jpg")
dum <- file.copy(
  file.path(original_dataset_dir, fnames),
  file.path(validation_cats_dir)
)

fnames <- paste0("cat.", 1501:2000, ".jpg")
dum <- file.copy(
  file.path(original_dataset_dir, fnames),
  file.path(test_cats_dir)
)

fnames <- paste0("dog.", 1:1000, ".jpg")
dum <- file.copy(
  file.path(original_dataset_dir, fnames),
  file.path(train_dogs_dir)
)

fnames <- paste0("dog.", 1001:1500, ".jpg")
dum <- file.copy(
  file.path(original_dataset_dir, fnames),
  file.path(validation_dogs_dir)
)

fnames <- paste0("dog.", 1501:2000, ".jpg")
dum <- file.copy(
  file.path(original_dataset_dir, fnames),
  file.path(test_dogs_dir)
)

options(warn = 0)

# Making model ------------------------------------------------------------


conv_base <- application_vgg16(
  weights = "imagenet",
  include_top = FALSE,
  input_shape = c(150, 150, 3)
)


model <- keras_model_sequential() %>%
  conv_base() %>%
  layer_flatten() %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")

summary(model)

length(model$trainable_weights)
freeze_weights(conv_base)
length(model$trainable_weights)



# Train model -------------------------------------------------------------

train_datagen <- image_data_generator(
  rescale = 1 / 255,
  rotation_range = 40,
  width_shift_range = 0.2,
  height_shift_range = 0.2,
  shear_range = 0.2,
  zoom_range = 0.2,
  horizontal_flip = TRUE,
  fill_mode = "nearest"
)

# Note that the validation data shouldn't be augmented!
test_datagen <- image_data_generator(rescale = 1 / 255)

train_generator <- flow_images_from_directory(
  train_dir, # Target directory
  train_datagen, # Data generator
  target_size = c(150, 150), # Resizes all images to 150 × 150
  shuffle = FALSE,
  batch_size = 20,
  class_mode = "binary" # binary_crossentropy loss for binary labels
)

test_generator <- flow_images_from_directory(
  test_dir, # Target directory
  train_datagen, # Data generator
  target_size = c(150, 150), # Resizes all images to 150 × 150
  shuffle = FALSE,
  batch_size = 20,
  class_mode = "binary" # binary_crossentropy loss for binary labels
)

validation_generator <- flow_images_from_directory(
  validation_dir,
  test_datagen,
  target_size = c(150, 150),
  shuffle = FALSE,
  batch_size = 20,
  class_mode = "binary"
)


# Fine tuning -------------------------------------------------------------


unfreeze_weights(conv_base, from = "block3_conv1")

# Compile model -----------------------------------------------------------



model %>% compile(
  loss = "binary_crossentropy",
  optimizer = optimizer_rmsprop(lr = 2e-5),
  metrics = c("accuracy")
)


# Evaluate by epochs -------------------------------------------------------

# This creates plots of accuracy across epochs (slow)
history <- model %>% fit_generator(
  train_generator,
  steps_per_epoch = 100,
  epochs = 50,
  validation_data = validation_generator,
  validation_steps = 50
)

Evaluation gives the following great result:

> model %>% evaluate_generator(test_generator, steps = 50)
$loss
[1] 0.262562

$acc
[1] 0.923

But then, I tried to 'manually' check the prediction accuracy in the following way:

predict <- model %>%
  predict_generator(test_generator, steps = 50, verbose = 1)

# Eye-ball check the predictions
# Essentially, predict_proba is the probability of the image being a dog
stat_df <- as.tibble(cbind(predict, test_generator$filenames, test_generator$classes)) %>%
  rename(
    predict_proba = V1,
    filename = V2,
    test_label = V3
  ) %>%
  mutate(predicted_label = ifelse(predict_proba > 0.5, 1, 0)) %>%
  # sample_n(size = 20) %>%
  mutate(predicted_label = as.integer(predicted_label)) %>%
  mutate(predicted_label_name = ifelse(predicted_label == 0, "cats", "dogs")) %>%
  separate(filename, into = c("true_label", "fname"), sep = "/")


> stat_df %>% head()
# A tibble: 6 x 6
#         predict_proba true_label        fname test_label predicted_label predicted_label_name
#                 <chr>      <chr>        <chr>      <chr>           <int>                <chr>
# 1 3.82422604161547e-06       cats cat.1501.jpg          0               1                 dogs
# 2 7.63378269391404e-18       cats cat.1502.jpg          0               1                 dogs
# 3 5.30394572706427e-07       cats cat.1503.jpg          0               1                 dogs
# 4 5.94179291934537e-19       cats cat.1504.jpg          0               1                 dogs
# 5 8.22905276436359e-05       cats cat.1505.jpg          0               1                 dogs
# 6 0.000384396902518347       cats cat.1506.jpg          0               0                 cats
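
Note that the predict_proba column above prints as <chr>: cbind() coerces the numeric predictions to character once they are combined with the filename strings, so the predict_proba > 0.5 comparison is lexicographic rather than numeric (which is how a probability of 3.8e-06 can come out as predicted_label = 1). A minimal sketch that keeps the probabilities numeric, assuming the predict matrix and test_generator objects from above:

# Sketch: build the data frame with tibble() so predict_proba stays numeric,
# avoiding the character coercion that cbind() introduces
stat_df_num <- tibble(
  predict_proba = as.numeric(predict),
  filename      = test_generator$filenames,
  test_label    = test_generator$classes
) %>%
  mutate(predicted_label = as.integer(predict_proba > 0.5)) %>%
  mutate(predicted_label_name = ifelse(predicted_label == 0, "cats", "dogs")) %>%
  separate(filename, into = c("true_label", "fname"), sep = "/")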

Proportion of predicted labels:

> stat_df %>% group_by(predicted_label_name) %>% summarise(n=n())
# A tibble: 2 x 2
  predicted_label_name     n
                 <chr> <int>
1                 cats   197
2                 dogs   803

Number of predictions correctly labeled as dog or cat:

> stat_df %>% filter(true_label == predicted_label_name & true_label == "dogs")  %>% dim()
[1] 432   6
> stat_df %>% filter(true_label == predicted_label_name & true_label == "cats")  %>% dim()
[1] 129   6

Which says that out of 803 predictions, only 432 are correctly predicted as dogs (around 54% accuracy). Why is that? Where did I go wrong?

Note that evaluate_generator() gives around 92% accuracy. What's the correct interpretation?

How can I resolve the difference?

@jjallaire (Member)

There appear to be problems preserving the order of the inputs using predict_generator() (this is an issue in the core Keras library I think). See #149. In the meantime I would just use standard predict or predict_on_batch.
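
For illustration, a minimal sketch of that workaround, assuming the model and test_dir objects from the code above; image_load() and image_to_array() are the keras helpers for reading single images:

# Load the test images into an array in a known order, then call predict()
# directly, so predictions line up with file names by construction.
test_files <- list.files(test_dir, recursive = TRUE, full.names = TRUE)

x_test <- array(0, dim = c(length(test_files), 150, 150, 3))
for (i in seq_along(test_files)) {
  img <- image_load(test_files[i], target_size = c(150, 150))
  x_test[i, , , ] <- image_to_array(img) / 255  # same 1/255 rescaling as the generator
}

preds <- model %>% predict(x_test)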

@mizia42 commented Mar 23, 2020

Setting shuffle = FALSE on the generators passed to evaluate_generator() and predict_generator() fixed the issue for me.
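
For reference, a sketch of that fix, assuming the model, test_dir, and test_datagen objects defined above; the reset() call (exposed on the underlying Keras iterator via reticulate) is an extra precaution so prediction starts from the first batch:

test_generator <- flow_images_from_directory(
  test_dir,
  test_datagen,                 # rescale only; no augmentation at test time
  target_size = c(150, 150),
  batch_size = 20,
  class_mode = "binary",
  shuffle = FALSE               # keep directory order so rows match filenames
)

test_generator$reset()          # start from the first batch
probs <- model %>% predict_generator(test_generator, steps = 50)
mean(as.integer(probs > 0.5) == test_generator$classes)  # should track evaluate_generator()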
