Add Hot Encoding to the RNN model #447

Open: wants to merge 1 commit into base: develop
166 changes: 99 additions & 67 deletions modules/rnn_cc_detection/training_code/rnn_model_training.py
@@ -8,11 +8,16 @@

import tensorflow as tf
import sklearn as sk
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
# from tensorflow.keras import layers
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.preprocessing.text import one_hot
# from tensorflow.keras.models import load_model
# from tensorflow.keras.utils import to_categorical
from keras import layers
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.models import load_model
from keras.utils import to_categorical

parser = argparse.ArgumentParser()
parser.add_argument(
@@ -68,6 +73,7 @@
    help='Where to store the train model',
    type=str,
    required=False,
    default="model",
)
args = parser.parse_args()

@@ -128,54 +134,71 @@


# Change each letter in the state to an integer that represents it uniquely. We 'encode' them.
df['state'] = df['state'].apply(lambda x: [[int_of_letters[i]] for i in x])
# df['state'] = df['state'].apply(lambda x: [[int_of_letters[i]] for i in x])
# So far, only 1 feature per letter
features_per_sample = 1
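# --- Illustrative sketch (reviewer note, not part of this diff) ---
# Assuming int_of_letters is the dict built earlier in the script that maps
# each state letter to a unique integer, the encoding step being replaced
# works like this (the mapping and state below are hypothetical):
demo_int_of_letters = {'a': 1, 'b': 2, 'r': 3}  # hypothetical mapping
demo_encoded = [[demo_int_of_letters[i]] for i in 'abr']
assert demo_encoded == [[1], [2], [3]]  # one single-integer feature per letter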

# Calculate the maximum sequence length
max_seq_length = max(len(sequence) for sequence in df['state'])

# Convert the data into the appropriate shape
# x_data is a list of lists. The 1st dimension is the outtuple, the second the letter. Each letter is now an int value. shape=(num_outtuples, features_per_sample)
x_data = df['state'].to_numpy()
if args.verbose:
    print(f'There are {len(x_data)} outtuples')
# y_data is a list of ints that are 0 or 1. One integer per outtuple. shape=(num_outtuples, 1)
y_data = df['label'].to_numpy()
if args.verbose:
    print(f'There are {len(y_data)} labels')
# Search the sample with max len in the training. It should already be cut by the csv_read function to a max. Here we just check
max_length_of_outtupple = max([len(sublist) for sublist in df.state.to_list()])
if args.verbose:
    print(
        f'The max len of the letters in all outtuples is: {max_length_of_outtupple}'
    )

# Here x_data is an array of lists [[]]
if args.verbose:
    print(
        f'x_data type {type(x_data)} of shape {x_data.shape}. x_data[0] type is {type(x_data[0])}'
    )
    print(f'x_data[0] is {x_data[0]}')


# Padding.
# Since not all outtuples have the same amount of letters, we need to add padding at the end
# Transforms the list to a 2D Numpy array of shape (num_samples, num_timesteps)
# num_timesteps is either the maxlen argument if provided, or the length of the longest sequence otherwise.
# Sequences that are shorter than num_timesteps are padded with value at the end.
# padding: 'pre' or 'post': pad either before or after each sequence.
# truncating: 'pre' or 'post': remove values from sequences larger than maxlen, either at the beginning or at the end of the sequences.
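# --- Illustrative sketch (reviewer note, not part of this diff) ---
# A minimal demonstration of pad_sequences with the options described above;
# the sequences are hypothetical:
demo_padded = pad_sequences([[1, 2], [3, 4, 5, 6]], maxlen=3, padding='post', truncating='post')
# [1, 2] is zero-padded at the end to [1, 2, 0];
# [3, 4, 5, 6] is truncated at the end to [3, 4, 5].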
# One-hot encoding
x_data = []
for sequence in df['state']:
    one_hot_sequence = [to_categorical(int_of_letters[letter[0]], num_classes=vocabulary_size) for letter in sequence]
    padded_sequence = pad_sequences([one_hot_sequence], maxlen=max_seq_length, padding='post', truncating='post')[0]
    x_data.append(padded_sequence.flatten())
# x_data = np.array(x_data)

# If the input is a string
# padded_x_data = pad_sequences(x_data, maxlen=max_length_of_outtupple, padding='post', value='0', dtype=object )
# x_data_oh = [to_categorical(sample, num_classes=vocabulary_size) for sample in x_data]
x_data_oh = np.array(x_data)
# y_data remains the same
y_data = df['label'].to_numpy()
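# --- Illustrative sketch (reviewer note, not part of this diff) ---
# to_categorical turns an integer class index into a one-hot vector of length
# num_classes, so each letter contributes vocabulary_size values; after
# padding and flattening, one sample holds max_seq_length * vocabulary_size
# values. A hypothetical example:
demo_one_hot = to_categorical(2, num_classes=5)
# demo_one_hot -> [0., 0., 1., 0., 0.]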

# If the inputs are integers
padded_x_data = pad_sequences(
    x_data, maxlen=max_length_of_outtupple, padding='post'
)
if args.verbose:
    print(
        f'padded_x_data is of type {type(padded_x_data)}, of shape {padded_x_data.shape}. padded_x_data[0] type is {type(padded_x_data[0])}. Shape of second list is {padded_x_data[0].shape}'
    )
# Convert the data into the appropriate shape
# x_data is a list of lists. The 1st dimension is the outtuple, the second the letter. Each letter is now an int value. shape=(num_outtuples, features_per_sample)
# x_data = df['state'].to_numpy()
# if args.verbose:
# print(f'There are {len(x_data)} outtuples')
# # y_data is a list of ints that are 0 or 1. One integer per outtuple. shape=(num_outtuples, 1)
# y_data = df['label'].to_numpy()
# if args.verbose:
# print(f'There are {len(y_data)} labels')
# # Search the sample with max len in the training. It should already be cut by the csv_read function to a max. Here we just check
# max_length_of_outtupple = max([len(sublist) for sublist in df.state.to_list()])
# if args.verbose:
# print(
# f'The max len of the letters in all outtuples is: {max_length_of_outtupple}'
# )

# # Here x_data is an array of lists [[]]
# if args.verbose:
# print(
# f'x_data type {type(x_data)} of shape {x_data.shape}. x_data[0] type is {type(x_data[0])}'
# )
# print(f'x_data[0] is {x_data[0]}')



"""Since we are using one hot ecoding, we do not need padding"""
# # Padding.
# # Since not all outtuples have the same amount of letters, we need to add padding at the end
# # Transforms the list to a 2D Numpy array of shape (num_samples, num_timesteps)
# # num_timesteps is either the maxlen argument if provided, or the length of the longest sequence otherwise.
# # Sequences that are shorter than num_timesteps are padded with value at the end.
# # padding: 'pre' or 'post': pad either before or after each sequence.
# # truncating: 'pre' or 'post': remove values from sequences larger than maxlen, either at the beginning or at the end of the sequences.
#
# # If the input is a string
# # padded_x_data = pad_sequences(x_data, maxlen=max_length_of_outtupple, padding='post', value='0', dtype=object )
#
# # If the inputs are integers
# padded_x_data = pad_sequences(
# x_data, maxlen=max_length_of_outtupple, padding='post'
# )
# if args.verbose:
# print(
# f'padded_x_data is of type {type(padded_x_data)}, of shape {padded_x_data.shape}. padded_x_data[0] type is {type(padded_x_data[0])}. Shape of second list is {padded_x_data[0].shape}'
# )


# Split the data in training and testing
@@ -184,33 +207,41 @@
# For now, just use all the data

# Split the one-hot
# train_x_data = x_data_oh
# train_y_data = y_data
train_x_data = x_data_oh
train_y_data = y_data

# Split the padded data only without one-hot
train_x_data = padded_x_data
train_y_data = y_data
# train_x_data = padded_x_data
# train_y_data = y_data


# # Hyperparameters
# # Real data
# # Store the dimensions
# # batch_size = 100 # group of outtuples as a batch
# num_outtuples = train_x_data.shape[0] # number_of_outtuples in general
# # max_length_of_outtupple # max amount of letters in each outtuple (500 now)
#
# # In the case of one-hot encoding, the number of features per letter per sample is 50, which is the vocabulary size
# # features_per_sample = vocabulary_size # amount of positions of the hot encoding (50 letters, so 50)
# # print(f'We have as input shape: {num_outtuples}, {max_length_of_outtupple}, {features_per_sample}')
# # input_shape = (max_length_of_outtupple, features_per_sample)
#
# # In the case of not using one-hot encoding, the number of features per sample is 1, because we only have one value
# # The number of time steps is the number of letters, since one letter is one time step; the max is 500
# timesteps = max_length_of_outtupple
# input_shape = (timesteps, features_per_sample)
# print(
# f'We have as shape: Num of samples: {num_outtuples}, Num of letters per sample (timesteps): {timesteps}, each letter has {features_per_sample} values. The input shape is {input_shape}'
# )

max_length_of_outtupple = max([len(sublist) for sublist in df.state.to_list()])

# Hyperparameters
# Real data
# Store the dimensions
# batch_size = 100 # group of outtuples as a batch
num_outtuples = train_x_data.shape[0] # number_of_outtuples in general
# max_length_of_outtupple # max amount of letters in each outtuple (500 now)

# In the case of one-hot encoding, the number of features per letter per sample is 50, which is the vocabulary size
# features_per_sample = vocabulary_size # amount of positions of the hot encoding (50 letters, so 50)
# print(f'We have as input shape: {num_outtuples}, {max_length_of_outtupple}, {features_per_sample}')
# input_shape = (max_length_of_outtupple, features_per_sample)

# In the case of not using one-hot encoding, the number of features per sample is 1, because we only have one value
# The number of time steps is the number of letters, since one letter is one time step; the max is the letter cap, which is 500
num_outtuples = train_x_data.shape[0]
# timesteps = train_x_data.shape[1]
timesteps = max_length_of_outtupple
input_shape = (timesteps, features_per_sample)
print(
    f'We have as shape: Num of samples: {num_outtuples}, Num of letters per sample (timesteps): {timesteps}, each letter has {features_per_sample} values. The input shape is {input_shape}'
)
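# --- Illustrative sketch (reviewer note, not part of this diff) ---
# Note on the assumed shapes: with the flattened one-hot encoding above, each
# row of train_x_data holds max_seq_length * vocabulary_size values (e.g.
# 500 letters * 50 vocabulary entries = 25000), rather than (timesteps, 1).
# A hedged sanity check, assuming the names defined earlier in the script:
assert train_x_data.shape == (num_outtuples, max_seq_length * vocabulary_size)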

# num_epochs = 500

Expand All @@ -223,6 +254,7 @@
# Create the model of RNN
model = tf.keras.models.Sequential()
model.add(layers.Embedding(vocabulary_size, 16, mask_zero=True))
# model.add(layers.Embedding(vocabulary_size, 16, mask_zero=True), input_shape=(timesteps, vocabulary_size))
# GRU is the main RNN layer, inputs: A 3D tensor, with shape [batch, timesteps, feature]
model.add(
    layers.Bidirectional(
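# --- Illustrative sketch (reviewer note, not part of this diff, which is truncated above) ---
# A minimal, self-contained example of the kind of Bidirectional GRU stack the
# comments describe; layer sizes and the final Dense layer are assumptions,
# not the PR's actual code:
import tensorflow as tf
from keras import layers

demo_model = tf.keras.models.Sequential()
demo_model.add(layers.Embedding(50, 16, mask_zero=True))  # 50-letter vocabulary -> 16-dim vectors, index 0 reserved for padding
demo_model.add(layers.Bidirectional(layers.GRU(64)))      # GRU consumes a 3D tensor [batch, timesteps, feature]
demo_model.add(layers.Dense(1, activation='sigmoid'))     # one sigmoid unit for a binary label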