From aa7f80bf3ffb47533cae5aa79b00ffe2e6138215 Mon Sep 17 00:00:00 2001 From: tahifahimi Date: Sun, 4 Feb 2024 20:18:30 -0700 Subject: [PATCH] add Hot Encoding to the RNN model --- .../training_code/rnn_model_training.py | 166 +++++++++++------- 1 file changed, 99 insertions(+), 67 deletions(-) diff --git a/modules/rnn_cc_detection/training_code/rnn_model_training.py b/modules/rnn_cc_detection/training_code/rnn_model_training.py index 1ed4cc41b..11e662f04 100644 --- a/modules/rnn_cc_detection/training_code/rnn_model_training.py +++ b/modules/rnn_cc_detection/training_code/rnn_model_training.py @@ -8,11 +8,16 @@ import tensorflow as tf import sklearn as sk -from tensorflow.keras import layers -from tensorflow.keras.preprocessing.sequence import pad_sequences -from tensorflow.keras.preprocessing.text import one_hot -from tensorflow.keras.models import load_model -from tensorflow.keras.utils import to_categorical +# from tensorflow.keras import layers +# from tensorflow.keras.preprocessing.sequence import pad_sequences +# from tensorflow.keras.preprocessing.text import one_hot +# from tensorflow.keras.models import load_model +# from tensorflow.keras.utils import to_categorical +from keras import layers +from keras.preprocessing.sequence import pad_sequences +from keras.preprocessing.text import one_hot +from keras.models import load_model +from keras.utils import to_categorical parser = argparse.ArgumentParser() parser.add_argument( @@ -68,6 +73,7 @@ help='Where to store the train model', type=str, required=False, + default="model", ) args = parser.parse_args() @@ -128,54 +134,71 @@ # Change the letters in the state to an integer representing it uniquely. We 'encode' them. -df['state'] = df['state'].apply(lambda x: [[int_of_letters[i]] for i in x]) +# df['state'] = df['state'].apply(lambda x: [[int_of_letters[i]] for i in x]) # So far, only 1 feature per letter features_per_sample = 1 +# Calculate the maximum sequence length +max_seq_length = max(len(sequence) for sequence in df['state']) -# Convert the data into the appropriate shape -# x_data is a list of lists. The 1st dimension is the outtuple, the second the letter. Each letter is now an int value. shape=(num_outuples, features_per_sample) -x_data = df['state'].to_numpy() -if args.verbose: - print(f'There are {len(x_data)} outtuples') -# y_data is a list of ints that are 0 or 1. One integer per outtupple. shape=(num_outuples, 1) -y_data = df['label'].to_numpy() -if args.verbose: - print(f'There are {len(y_data)} labels') -# Search the sample with max len in the training. It should be already cuted by the csv_read function to a max. Here we just check -max_length_of_outtupple = max([len(sublist) for sublist in df.state.to_list()]) -if args.verbose: - print( - f'The max len of the letters in all outtuples is: {max_length_of_outtupple}' - ) - -# Here x_data is a array of lists [[]] -if args.verbose: - print( - f'x_data type {type(x_data)} of shape {x_data.shape}. x_data[0] type is {type(x_data[0])}' - ) - print(f'x_data[0] is {x_data[0]}') - - -# Padding. -# Since not all outtuples have the same amount of letters, we need to add padding at the end -# Transforms the list to a 2D Numpy array of shape (num_samples, num_timesteps) -# num_timesteps is either the maxlen argument if provided, or the length of the longest sequence otherwise. -# Sequences that are shorter than num_timesteps are padded with value at the end. -# padding: 'pre' or 'post': pad either before or after each sequence. 
-# truncating: 'pre' or 'post': remove values from sequences larger than maxlen, either at the beginning or at the end of the sequences.
+# One-hot encoding
+x_data = []
+for sequence in df['state']:
+    one_hot_sequence = [to_categorical(int_of_letters[letter[0]], num_classes=vocabulary_size) for letter in sequence]
+    padded_sequence = pad_sequences([one_hot_sequence], maxlen=max_seq_length, padding='post', truncating='post')[0]
+    x_data.append(padded_sequence.flatten())
+# x_data = np.array(x_data)
 
-# If the input is a string
-# padded_x_data = pad_sequences(x_data, maxlen=max_length_of_outtupple, padding='post', value='0', dtype=object )
 
+# x_data_oh = [to_categorical(sample, num_classes=vocabulary_size) for sample in x_data]
+x_data_oh = np.array(x_data)
 
+# y_data remains the same
+y_data = df['label'].to_numpy()
 
-# If the input are integers
-padded_x_data = pad_sequences(
-    x_data, maxlen=max_length_of_outtupple, padding='post'
-)
-if args.verbose:
-    print(
-        f'padded_x_data is of type {type(padded_x_data)}, of shape {padded_x_data.shape}. padded_x_data[0] type is {type(padded_x_data[0])}. Shape of second list is {padded_x_data[0].shape}'
-    )
 
+# Convert the data into the appropriate shape
+# x_data is a list of lists. The 1st dimension is the outtuple, the second the letter. Each letter is now an int value. shape=(num_outuples, features_per_sample)
+# x_data = df['state'].to_numpy()
+# if args.verbose:
+#     print(f'There are {len(x_data)} outtuples')
+# # y_data is a list of ints that are 0 or 1. One integer per outtupple. shape=(num_outuples, 1)
+# y_data = df['label'].to_numpy()
+# if args.verbose:
+#     print(f'There are {len(y_data)} labels')
+# # Search the sample with max len in the training. It should be already cuted by the csv_read function to a max. Here we just check
+# max_length_of_outtupple = max([len(sublist) for sublist in df.state.to_list()])
+# if args.verbose:
+#     print(
+#         f'The max len of the letters in all outtuples is: {max_length_of_outtupple}'
+#     )
+
+# # Here x_data is a array of lists [[]]
+# if args.verbose:
+#     print(
+#         f'x_data type {type(x_data)} of shape {x_data.shape}. x_data[0] type is {type(x_data[0])}'
+#     )
+#     print(f'x_data[0] is {x_data[0]}')
+
+
+"""Since we are using one-hot encoding, we do not need the separate padding step below"""
+# # Padding.
+# # Since not all outtuples have the same amount of letters, we need to add padding at the end
+# # Transforms the list to a 2D Numpy array of shape (num_samples, num_timesteps)
+# # num_timesteps is either the maxlen argument if provided, or the length of the longest sequence otherwise.
+# # Sequences that are shorter than num_timesteps are padded with value at the end.
+# # padding: 'pre' or 'post': pad either before or after each sequence.
+# # truncating: 'pre' or 'post': remove values from sequences larger than maxlen, either at the beginning or at the end of the sequences.
+#
+# # If the input is a string
+# # padded_x_data = pad_sequences(x_data, maxlen=max_length_of_outtupple, padding='post', value='0', dtype=object )
+#
+# # If the input are integers
+# padded_x_data = pad_sequences(
+#     x_data, maxlen=max_length_of_outtupple, padding='post'
+# )
+# if args.verbose:
+#     print(
+#         f'padded_x_data is of type {type(padded_x_data)}, of shape {padded_x_data.shape}. padded_x_data[0] type is {type(padded_x_data[0])}. Shape of second list is {padded_x_data[0].shape}'
+#     )
 
 
 # Split the data in training and testing
@@ -184,33 +207,41 @@
 
 # For now, just use all the data
 
 # Split the one-hot
-# train_x_data = x_data_oh
-# train_y_data = y_data
+train_x_data = x_data_oh
+train_y_data = y_data
 
 # Split the padded data only without one-hot
-train_x_data = padded_x_data
-train_y_data = y_data
+# train_x_data = padded_x_data
+# train_y_data = y_data
+
+# # Hyperparameters
+# # Real data
+# # Store the dimensions
+# # batch_size = 100 # group of outtuples as a batch
+# num_outtuples = train_x_data.shape[0] # number_of_outtuples in general
+# # max_length_of_outtupple # max amount of letters in each outtuple (500 now)
+#
+# # In the case of hot-encoding, the amount of features per letter per sample, is 50, which is the vocabulary size
+# # features_per_sample = vocabulary_size # amount of positions of the hot encoding (50 letters, so 50)
+# # print(f'We have as input shape: {num_outtuples}, {max_length_of_outtupple}, {features_per_sample}')
+# # input_shape = (max_length_of_outtupple, features_per_sample)
+#
+# # In the case of not using hot-encoding, the amount of features per sample is 1, because we only have one value
+# # The amount of time steps is the amount of letters, since one letter is one time step, which is the amount of letters max, which 500
+# timesteps = max_length_of_outtupple
+# input_shape = (timesteps, features_per_sample)
+# print(
+#     f'We have as shape: Num of samples: {num_outtuples}, Num of letters per sample (timesteps): {timesteps}, each letter has {features_per_sample} values. The input shape is {input_shape}'
+# )
+
+max_length_of_outtupple = max([len(sublist) for sublist in df.state.to_list()])
 
 # Hyperparameters
-# Real data
-# Store the dimensions
-# batch_size = 100 # group of outtuples as a batch
-num_outtuples = train_x_data.shape[0] # number_of_outtuples in general
-# max_length_of_outtupple # max amount of letters in each outtuple (500 now)
-
-# In the case of hot-encoding, the amount of features per letter per sample, is 50, which is the vocabulary size
-# features_per_sample = vocabulary_size # amount of positions of the hot encoding (50 letters, so 50)
-# print(f'We have as input shape: {num_outtuples}, {max_length_of_outtupple}, {features_per_sample}')
-# input_shape = (max_length_of_outtupple, features_per_sample)
-
-# In the case of not using hot-encoding, the amount of features per sample is 1, because we only have one value
-# The amount of time steps is the amount of letters, since one letter is one time step, which is the amount of letters max, which 500
+num_outtuples = train_x_data.shape[0]
+# timesteps = train_x_data.shape[1]
 timesteps = max_length_of_outtupple
 input_shape = (timesteps, features_per_sample)
-print(
-    f'We have as shape: Num of samples: {num_outtuples}, Num of letters per sample (timesteps): {timesteps}, each letter has {features_per_sample} values. The input shape is {input_shape}'
-)
 
 
 # num_epochs = 500
@@ -223,6 +254,7 @@
 # Create the model of RNN
 model = tf.keras.models.Sequential()
 model.add(layers.Embedding(vocabulary_size, 16, mask_zero=True))
+# model.add(layers.Embedding(vocabulary_size, 16, mask_zero=True, input_shape=(timesteps, vocabulary_size)))
 # GRU is the main RNN layer, inputs: A 3D tensor, with shape [batch, timesteps, feature]
 model.add(
     layers.Bidirectional(
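
Note on the encoding step introduced by this patch: the following is a minimal, self-contained sketch of what the new to_categorical + pad_sequences loop produces, using a toy three-letter alphabet in place of the script's real int_of_letters mapping and vocabulary_size (both of which are built earlier in rnn_model_training.py). It is illustration only, assuming the same keras imports the patch uses; the actual training pipeline is the one in the diff.

    import numpy as np
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical

    # Toy stand-ins, assumptions for illustration only
    int_of_letters = {'a': 0, 'b': 1, 'c': 2}
    vocabulary_size = len(int_of_letters)          # 50 in the real script
    states = ['ab', 'cab']                         # two outtuples of different lengths
    max_seq_length = max(len(s) for s in states)   # 3

    x_data = []
    for sequence in states:
        # one vector of length vocabulary_size per letter -> shape (len(sequence), vocabulary_size)
        one_hot_sequence = [to_categorical(int_of_letters[letter], num_classes=vocabulary_size)
                            for letter in sequence]
        # pad along the time axis so every sample has max_seq_length rows, then flatten as the patch does
        padded = pad_sequences([one_hot_sequence], maxlen=max_seq_length,
                               padding='post', truncating='post')[0]
        x_data.append(padded.flatten())

    x_data_oh = np.array(x_data)
    print(x_data_oh.shape)   # (2, 9): (num_samples, max_seq_length * vocabulary_size)

Shape consideration: the flattened vectors have length max_seq_length * vocabulary_size per sample. Keeping the unflattened (max_seq_length, vocabulary_size) form is what a recurrent layer expects as a 3D batch, whereas the Embedding layer already present in the model expects integer letter indices rather than one-hot vectors.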