From 0760f07aebc025ac755a2f57040d00311c06641e Mon Sep 17 00:00:00 2001
From: Vlad Dracula
Date: Fri, 6 Oct 2023 10:46:58 +1000
Subject: [PATCH] use simpler method to create test set than keras method

---
 episodes/02-image-data.md      | 208 +++++++++++++++++----------------
 episodes/scripts/image-data.py |  80 +++++++++----
 2 files changed, 166 insertions(+), 122 deletions(-)

diff --git a/episodes/02-image-data.md b/episodes/02-image-data.md
index af6b8451..92b202e7 100644
--- a/episodes/02-image-data.md
+++ b/episodes/02-image-data.md
@@ -245,13 +245,13 @@
 The min, max, and mean pixel values are 0.0 , 255.0 , and 87.0 respectively.
 After normalization, the min, max, and mean pixel values are 0.0 , 1.0 , and 0.0 respectively.
 ```
 
-Of course, if we have a large number of images to process we do not want to perform these steps one at a time. As you might have guessed, `tf.keras.utils` also provides a function to load an entire directories: `image_dataset_from_directory()`
+Of course, if there are a large number of images to prepare, we do not want to copy and paste these steps for each image in our dataset.
 
-Here we will load an entire directory of images to create a test dataset.
+Here we will use `for` loops to find all of the images in a directory and prepare them to create a test dataset that aligns with our training dataset.
 
-**CINIC-10**
+### CINIC-10 Test Dataset Preparation
 
-Out test dataset is a sample of images from an existing image dataset known as [CINIC-10] (CINIC-10 Is Not ImageNet or CIFAR-10) that was designed to be used as a drop-in alternative to the CIFAR-10 dataset we used in the introduction.
+Our test dataset is a sample of images from an existing image dataset known as [CINIC-10] (CINIC-10 Is Not ImageNet or CIFAR-10) that was designed to be used as a drop-in alternative to the CIFAR-10 dataset we used in the introduction.
 
 The test image directory was set up to have the following structure:
 
 ```
 main_directory/
 ...class_a/
 ......image_1.jpg
 ......image_2.jpg
 ...class_b/
 ......image_1.jpg
 ......image_2.jpg
 ```
 
-If we use this directory structure, keras will automatically infer the image labels.
+We will use this structure to create two lists: one of the image filenames and one of the image labels (the subfolder names). We can then use these lists to resize each image, convert it to an array, and normalize it.
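+
+Before writing the loops, it can help to see what `os.listdir()` returns for this layout. Here is a minimal sketch; the folder names shown are an assumption based on the ten CINIC-10 classes used below:
+
+```python
+import os
+
+test_image_dir = 'D:/20230724_CINIC10/test_images'
+
+# the top level contains one folder per class; the folder names become our labels
+print(os.listdir(test_image_dir))
+# expected output: ['airplane', 'automobile', 'bird', ..., 'truck']
+```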
 ```python
-# load the required libraries
-from keras.utils import image_dataset_from_directory
+import os
+import numpy as np
+
+# load_img and img_to_array were introduced earlier in this episode
+from keras.utils import load_img, img_to_array
 
-# define the image directory
+# set the main directory
 test_image_dir = 'D:/20230724_CINIC10/test_images'
 
-# read in the images, infer the labels, and resize to match training dataset
-test_images = image_dataset_from_directory(test_image_dir, labels='inferred', batch_size=None, image_size=(32,32), shuffle=False)
+# make two lists of the subfolders (i.e. class or label) and filenames
+test_filenames = []
+test_labels = []
+
+for dn in os.listdir(test_image_dir):
+
+    for fn in os.listdir(os.path.join(test_image_dir, dn)):
+
+        test_filenames.append(fn)
+        test_labels.append(dn)
+
+# prepare the images
+# create an empty numpy array to hold the processed images
+test_images = np.empty((len(test_filenames), 32, 32, 3), dtype=np.float32)
+
+# use the label and filename lists to process each image
+for i in range(len(test_filenames)):
+
+    # set the path to the image
+    img_path = os.path.join(test_image_dir, test_labels[i], test_filenames[i])
+
+    # load the image and resize at the same time
+    img = load_img(img_path, target_size=(32,32))
+
+    # convert to an array
+    img_arr = img_to_array(img)
+
+    # normalize
+    test_images[i] = img_arr/255.0
+
+print(test_images.shape)
+print(test_images.__class__)
 ```
 
 ```output
-Found 10000 files belonging to 10 classes.
+(10000, 32, 32, 3)
+<class 'numpy.ndarray'>
 ```
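+
+As an aside, the two nested `for` loops above can be written more compactly with `pathlib` from the standard library. This is an equivalent sketch rather than part of the lesson code; it assumes the same directory layout and builds the same two lists (the ordering of entries may differ):
+
+```python
+from pathlib import Path
+
+test_filenames = []
+test_labels = []
+
+# every file sits two levels down: class_folder/image_file
+for img_path in Path(test_image_dir).glob('*/*'):
+    test_filenames.append(img_path.name)       # the filename
+    test_labels.append(img_path.parent.name)   # the class folder it came from
+```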
 
-In most cases, after loading your images and preprocessing them to match your training dataset attributes, you will divide them up into training, validation, and test datasets. We already loaded training and validation data in Episode 1, so this test dataset will be used to test our model predictions in a later episode. Moreover, because the CINIC-10 data is intended to be a drop-in replacement for CIFAR-10, we just need to normalize the data to be on the same scale as our training data.
+::::::::::::::::::::::::::::::::::::: challenge
+Training and Test Sets
 
-```python
-# normalize test images
-import tensorflow as tf
+Take a look at the training and test sets we created.
+
+Q1. How many samples does the training set have and are the classes well balanced?
+Q2. How many samples does the test set have and are the classes well balanced?
+
+Hint 1: Check the object class to understand what methods are available.
+Hint 2: Use the `train_labels` object to find out if the classes are well balanced.
+
+:::::::::::::::::::::::: solution
+
+Q1. Training Set
 
-# define a function to normalize each image in the test set
-def process(image,label):
-    image = tf.cast(image/255. ,tf.float32)
-    return image,label
+```python
+print('The training set is of type', train_images.__class__)
+print('The training set has', train_images.shape[0], 'samples.\n')
 
-test_images = test_images.map(process)
+print('The number of labels in our training set and the number of images in each class are:\n')
+print(np.unique(train_labels, return_counts=True))
 ```
 
 ```output
+The training set is of type <class 'numpy.ndarray'>
+The training set has 50000 samples.
+
+The number of labels in our training set and the number of images in each class are:
+
+(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8),
+ array([5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000], dtype=int64))
 ```
-TODO now a MapDataset! this will affect challenge below too but might be good to present these different datasets...getting too complicated
+
+Q2. Test Set (we can use the same code as for the training set)
+
+```python
+print('The test set is of type', test_images.__class__)
+print('The test set has', test_images.shape[0], 'samples.\n')
 
-### Data Splitting
+print('The number of labels in our test set and the number of images in each class are:\n')
+print(np.unique(test_labels, return_counts=True))
+```
 
-In the previous episode we saw that the keras installation includes the Cifar-10 dataset and that by using the 'cifar10.load_data()' method the returned data is split into two (train and validations sets) but there was not a test dataset.
+```output
+The test set is of type <class 'numpy.ndarray'>
+The test set has 10000 samples.
 
-When using a different dataset, or loading your own set of images, you will do the splits yourself.
+The number of labels in our test set and the number of images in each class are:
 
-To split the cleaned dataset into a training and test set we will use a very convenient function from sklearn called `train_test_split`. This function takes a number of parameters which are extensively explained [train_test_split]:
+(array(['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog',
+ 'horse', 'ship', 'truck'], dtype='<U10'),
+ array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], dtype=int64))
+```
+
+:::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::: challenge
+Data Splitting Example
+
+To split a dataset into training and test sets, sklearn provides a very convenient function, `train_test_split`:
+
-The training set has 50000 samples.
+`sklearn.model_selection.train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)`
 
-The number of labels in our training set and the number images in each class are:
+Take a look at the help and use this function to split an imaginary dataset into an 80/20 train/test split using stratified sampling.
 
-(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8),
- array([5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000], dtype=int64))
-```
 
 :::::::::::::::::::::::: solution
 
-Q2. We used a different function to load the test images which means it will be of a different class. The function `image_dataset_from_directory` returns a [tf.data.Dataset] object
+Noting there are a couple of ways to do this, here is one example:
 
-```python
-print('The test set is of type', test_images.__class__)
-print('The test set has', len(test_images), 'samples.\n')
-#print(test_images)
-```
-
-```output
-The training set is of type 
-
-
-```
-
-This is an object type we have not see before but we know keras inferred the labels from the directory structure. This dataset object was designed for performance and extracting information from it is not as straightforward.
+```python
+from sklearn.model_selection import train_test_split
 
-```python
-labels = []
-for (image,label) in test_images:
-    labels.append(label.numpy())
-labels = pd.Series(labels)
-count = labels.value_counts().sort_index()
-print(count)
-```
-
-```output
-airplane 1000
-automobile 1000
-bird 1000
-cat 1000
-deer 1000
-dog 1000
-frog 1000
-horse 1000
-ship 1000
-truck 1000
-dtype: int64
+X_train, X_test, y_train, y_test = train_test_split(image_dataset, target, test_size=0.2, random_state=42, shuffle=True, stratify=target)
 ```
+
+- The first two parameters are the dataset (X) and the corresponding targets (y), i.e. the class labels.
+- Next is the named parameter `test_size`; this is the fraction of the dataset used for testing. In this case `0.2` means 20% of the data will be used for testing.
+- `random_state` controls the shuffling of the dataset; setting this value will reproduce the same results (assuming you give the same integer) every time it is called.
+- `shuffle` can be either `True` or `False`; it controls whether the rows of the dataset are shuffled before splitting. It defaults to `True`.
+- `stratify` is a more advanced parameter that controls how the split is done. By setting it to `target`, the train and test sets the function returns will have roughly the same class proportions (in terms of the number of images per class) as the full dataset. The short sketch after this list illustrates the effect.
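+
+To see what `stratify` does, here is a minimal sketch using a small made-up label array (not the lesson data): with a deliberately imbalanced set of labels, the stratified split keeps the original 90/10 class ratio in both the train and test portions.
+
+```python
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+# hypothetical labels: 90 samples of class 0 and 10 of class 1
+y = np.array([0] * 90 + [1] * 10)
+X = np.arange(100).reshape(-1, 1)  # dummy feature column so there is something to split
+
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
+
+# both portions keep the original 90/10 ratio
+print(np.unique(y_tr, return_counts=True))  # counts: 72 and 8
+print(np.unique(y_te, return_counts=True))  # counts: 18 and 2
+```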
 
 :::::::::::::::::::::::::::::::::
 
 ::::::::::::::::::::::::::::::::::::::::::::::::
 
-There are other preprocessing steps you might need to take for your particular problem. We will discuss a few commons one briefly before getting back to our model.
-
 ### Image Colours
 
diff --git a/episodes/scripts/image-data.py b/episodes/scripts/image-data.py
index ed10fc53..e059c862 100644
--- a/episodes/scripts/image-data.py
+++ b/episodes/scripts/image-data.py
@@ -52,41 +52,75 @@
 # extract the min, max, and mean pixel values AFTER
 print('After normalization, the min, max, and mean pixel values are', new_img_arr_norm.min(), ',', new_img_arr_norm.max(), ', and', new_img_arr_norm.mean().round(), 'respectively.')
 
-#### Load multiple images at the same time
+#### CINIC-10 Test Dataset Preparation
 
-from keras.utils import image_dataset_from_directory
-test_image_dir = 'D:/20230724_CINIC10/test_images'
-test_images = image_dataset_from_directory(test_image_dir, labels='inferred', batch_size=None, image_size=(32,32), shuffle=False)
-
-# need to normalize
-import tensorflow as tf
+# Load multiple images into a single object to process multiple images at the same time
 
-def process(image,label):
-    image = tf.cast(image/255. ,tf.float32)
-    return image,label
+# The test image directory has the following structure:
+# main_directory/
+# ...class_a/
+# ......image_1.jpg
+# ......image_2.jpg
+# ...class_b/
+# ......image_1.jpg
+# ......image_2.jpg
 
-test_images = test_images.map(process)
+import os
+import numpy as np
 
-# now a MapDataset! this will affect
+# set the main directory
+test_image_dir = 'D:/20230724_CINIC10/test_images'
 
+# make two lists of the subfolders (i.e. class or label) and filenames
+test_filenames = []
+test_labels = []
+
+for dn in os.listdir(test_image_dir):
+
+    for fn in os.listdir(os.path.join(test_image_dir, dn)):
+
+        test_filenames.append(fn)
+        test_labels.append(dn)
+
+# prepare the images
+# create an empty numpy array to hold the processed images
+test_images = np.empty((len(test_filenames), 32, 32, 3), dtype=np.float32)
+
+# use the label and filename lists to process each image
+for i in range(len(test_filenames)):
+
+    # set the path to the image
+    img_path = os.path.join(test_image_dir, test_labels[i], test_filenames[i])
+
+    # load the image and resize at the same time
+    img = load_img(img_path, target_size=(32,32))
+
+    # convert to an array
+    img_arr = img_to_array(img)
+
+    # normalize
+    test_images[i] = img_arr/255.0
+
+print(test_images.shape)
+print(test_images.__class__)
+
 # Challenge TRAINING AND TEST SETS
 
 # Q1
 print('The training set is of type', train_images.__class__)
 print('The training set has', train_images.shape[0], 'samples.\n')
 
-import numpy as np
 print('The number of labels in our training set and the number images in each class are:\n')
-np.unique(train_labels, return_counts=True)
+print(np.unique(train_labels, return_counts=True))
 
 # Q2
 print('The test set is of type', test_images.__class__)
-print('The test set has', len(test_images), 'samples.\n')
-#print(test_images)
-
-labels = []
-for (image,label) in test_images:
-    labels.append(label.numpy())
-labels = pd.Series(labels)
-count = labels.value_counts().sort_index()
-print(count)
+print('The test set has', test_images.shape[0], 'samples.\n')
+
+print('The number of labels in our test set and the number of images in each class are:\n')
+print(np.unique(test_labels, return_counts=True))
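+
+# Optional: train_labels are integers 0-9 while test_labels are folder-name strings.
+# A minimal sketch to map the strings to matching integer codes, assuming the integer
+# classes follow the alphabetical order of the class names (the order np.unique
+# printed above):
+class_names = sorted(set(test_labels))
+test_label_ids = np.array([class_names.index(lbl) for lbl in test_labels])
+print(test_label_ids.shape)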
+
+
+# Challenge Data Splitting Example
+
+from sklearn.model_selection import train_test_split
+
+# image_dataset and target are stand-ins for an imaginary dataset and its class labels
+X_train, X_test, y_train, y_test = train_test_split(image_dataset, target, test_size=0.2, random_state=42, shuffle=True, stratify=target)
\ No newline at end of file