Added GRU_using_numbers #316

Open
wants to merge 1 commit into base: develop
296 changes: 296 additions & 0 deletions modules/rnn-cc-detection/training_code/GRU_using_numbers
@@ -0,0 +1,296 @@
import sys
import numpy as np
import pandas as pd
import argparse
from sklearn import metrics
from sklearn.model_selection import train_test_split
from random import shuffle

import tensorflow as tf
import sklearn as sk
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical

parser = argparse.ArgumentParser()
parser.add_argument(
    '-D',
    '--dataset_file',
    help='File containing data for training',
    type=str,
    required=True,
)
parser.add_argument(
    '-M',
    '--max_letters',
    help='Max sequence length',
    type=int,
    required=False,
    default=500,
)
parser.add_argument(
    '-m',
    '--min_letters',
    help='Min sequence length',
    type=int,
    required=False,
    default=5,
)
parser.add_argument(
    '-v',
    '--verbose',
    help='Level of verbosity',
    # type=bool would treat any non-empty string as True, so use a flag instead
    action='store_true',
    required=False,
)
parser.add_argument(
    '-b',
    '--batch_size',
    help='Size of the minibatch',
    type=int,
    required=False,
    default=100,
)
parser.add_argument(
    '-e',
    '--epochs',
    help='Number of epochs in training',
    type=int,
    required=False,
    default=200,
)
parser.add_argument(
    '-S',
    '--model_file',
    help='Where to store the trained model',
    type=str,
    required=False,
)
args = parser.parse_args()
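# Example invocation (hypothetical file and model names, shown for reference only):
#   python GRU_using_numbers -D dataset.csv -v -e 200 -b 100 -S gru_model.h5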


if args.verbose:
    # Versions
    print(f'Numpy: {np.__version__}')
    print(f'TensorFlow: {tf.__version__}')
    print(f'Pandas: {pd.__version__}')
    print(f'Sklearn: {sk.__version__}')

# Load the dataset
# Cut the letters in the state down to a maximum length.
# Better to do it here in read_csv so we don't use memory later. Here those lines never go into memory.
f = lambda x: x[: args.max_letters]
with open(args.dataset_file, 'rb') as csvfile:
    df = pd.read_csv(
        csvfile,
        delimiter='|',
        names=['note', 'label', 'model_id', 'state'],
        skipinitialspace=True,
        converters={'state': f},
    )

if args.verbose:
    print(df.describe())


# Clean the dataset
df.dropna(axis=0, how='any', inplace=True)
df.drop(axis=1, columns=['note', 'model_id'], inplace=True)

# Delete the strings of letters with less than a certain amount
indexNames = df[df['state'].str.len() < args.min_letters].index
df.drop(indexNames, inplace=True)


# Add a new column to the dataframe with the label. The label is 'Normal' for the normal data and 'Malicious' for the malware data
df.loc[df.label.str.contains('Normal'), 'label'] = 'Normal'
df.loc[df.label.str.contains('Botnet'), 'label'] = 'Malicious'
df.loc[df.label.str.contains('Malware'), 'label'] = 'Malicious'

# Change the labels from Malicious/Normal to 1/0 integers in the df
df.label.replace('Malicious', 1, inplace=True)
df.label.replace('Normal', 0, inplace=True)


# Convert each of the stratosphere letters to an integer. There are 50 letters.
# Index 0 is reserved for the padding value (the Embedding layer below uses
# mask_zero=True), so the letters are encoded starting from 1.
vocabulary = list('abcdefghiABCDEFGHIrstuvwxyzRSTUVWXYZ1234567890,.+*')
int_of_letters = {}
for i, letter in enumerate(vocabulary):
    int_of_letters[letter] = i + 1
if args.verbose:
    print(
        f'There are {len(int_of_letters)} letters in total. From letter index {min(int_of_letters.values())} to letter index {max(int_of_letters.values())}.'
    )
vocabulary_size = len(vocabulary)

# Convert the letters in the state to their integer encoding. We 'encode' them
# (a tiny demo is printed below when running with --verbose).
# A one-hot alternative is kept here for reference, unused because the Embedding
# layer below already maps each integer index to a dense vector:
# max_seq_length = max(len(sequence) for sequence in df['state'])
# x_data = []
# for sequence in df['state']:
#     one_hot_sequence = [to_categorical(int_of_letters[letter], num_classes=vocabulary_size + 1) for letter in sequence]
#     padded_sequence = pad_sequences([one_hot_sequence], maxlen=max_seq_length, padding='post', truncating='post')[0]
#     x_data.append(padded_sequence.flatten())
# x_data = np.array(x_data)

df['state'] = df['state'].apply(lambda x: [int_of_letters[i] for i in x])
x_data = df['state'].to_list()
# So far, only 1 feature per letter
features_per_sample = 1
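# A minimal illustration of the encoding above (toy string, printed only when
# verbose; not part of the training flow).
if args.verbose:
    print('Encoding of "abc":', [int_of_letters[c] for c in 'abc'])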


# Convert the data into the appropriate shape
# x_data is a list of lists. The 1st dimension is the outtuple, the second the letter. Each letter is now an int value.
if args.verbose:
    print('There are {} outtuples'.format(len(x_data)))
# y_data is a list of ints that are 0 or 1, one per outtuple; cast to float32 so Keras accepts it. shape=(num_outtuples,)
y_data = df['label'].to_numpy().astype('float32')
if args.verbose:
    print('There are {} labels'.format(len(y_data)))
# Search for the sample with max length in the training data. It should already be cut to a max by the read_csv converter. Here we just check.
max_length_of_outtupple = max([len(sublist) for sublist in df.state.to_list()])
if args.verbose:
    print(
        'The max len of the letters in all outtuples is: {}'.format(
            max_length_of_outtupple
        )
    )

# Here x_data is a list of lists
if args.verbose:
    print(
        f'x_data type {type(x_data)} with {len(x_data)} outtuples. x_data[0] type is {type(x_data[0])}'
    )
    print(f'x_data[0] is {x_data[0]}')


# Padding.
# Since not all outtuples have the same amount of letters, we need to add padding at the end
# Transforms the list to a 2D Numpy array of shape (num_samples, num_timesteps)
# num_timesteps is either the maxlen argument if provided, or the length of the longest sequence otherwise.
# Sequences that are shorter than num_timesteps are padded at the end with the `value` argument (0 by default).
# padding: 'pre' or 'post': pad either before or after each sequence.
# truncating: 'pre' or 'post': remove values from sequences larger than maxlen, either at the beginning or at the end of the sequences.

# If the input is a string
# padded_x_data = pad_sequences(x_data, maxlen=max_length_of_outtupple, padding='post', value='0', dtype=object )

# If the input are integers
padded_x_data = pad_sequences(
x_data, maxlen=max_length_of_outtupple, padding='post'
)
if args.verbose:
    print(
        f'padded_x_data is of type {type(padded_x_data)}, of shape {padded_x_data.shape}. padded_x_data[0] type is {type(padded_x_data[0])}. Shape of second list is {padded_x_data[0].shape}'
    )
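# A minimal sketch of the pad_sequences behaviour described above (toy values,
# printed only when verbose; not part of the training flow): shorter sequences
# get 0s appended, longer ones are truncated to maxlen.
if args.verbose:
    demo = pad_sequences([[3, 1], [5, 2, 4, 7]], maxlen=3, padding='post', truncating='post')
    print(f'pad_sequences demo: {demo.tolist()}')  # [[3, 1, 0], [5, 2, 4]]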


# Split the data in training and testing
# train_data, test_data = train_test_split(df, test_size=0.2, shuffle=True)

# For now, just use all the data

# Split the one-hot
# train_x_data = x_data_oh
# train_y_data = y_data

# Split the padded data only without one-hot
train_x_data = padded_x_data
train_y_data = y_data
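# A possible hold-out split for later evaluation (a sketch only, using the
# already-imported train_test_split; left commented out because for now all
# the data is used for training and model.fit below holds out 10% via
# validation_split):
# train_x_data, test_x_data, train_y_data, test_y_data = train_test_split(
#     padded_x_data, y_data, test_size=0.2, shuffle=True
# )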


# Hyperparameters
# Real data
# Store the dimensions
# batch_size = 100 # group of outtuples as a batch
num_outtuples = train_x_data.shape[0] # number_of_outtuples in general
# max_length_of_outtupple # max amount of letters in each outtuple (500 now)

# In the case of hot-encoding, the amount of features per letter per sample, is 50, which is the vocabulary size
# features_per_sample = vocabulary_size # amount of positions of the hot encoding (50 letters, so 50)
# print(f'We have as input shape: {num_outtuples}, {max_length_of_outtupple}, {features_per_sample}')
# input_shape = (max_length_of_outtupple, features_per_sample)

# In the case of not using hot-encoding, the amount of features per sample is 1, because we only have one value per letter
# The number of time steps is the max number of letters per outtuple (one letter is one time step), i.e. 500 by default
timesteps = max_length_of_outtupple
input_shape = (timesteps, features_per_sample)
print(
f'We have as shape: Num of samples: {num_outtuples}, Num of letters per sample (timesteps): {timesteps}, each letter has {features_per_sample} values. The input shape is {input_shape}'
)

# num_epochs = 500

# With integer encoding, the shape of the input is now: (2200, 500)
# 2200, the amount of outtuples
# 500, the padded amount of letters in each outtuple
# (With one-hot encoding there would be a third dimension of size 50, the vocabulary size.)


# Create the model of RNN
model = tf.keras.models.Sequential()
# input_dim is vocabulary_size + 1 because index 0 is reserved for padding (mask_zero=True)
model.add(layers.Embedding(vocabulary_size + 1, 16, mask_zero=True))

# GRU is the main RNN layer, inputs: A 3D tensor, with shape [batch, timesteps, feature]
model.add(layers.Bidirectional(layers.GRU(64, return_sequences=True), merge_mode='concat'))
model.add(layers.Bidirectional(layers.GRU(32, return_sequences=False), merge_mode='concat'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))
# Fully connected layer with 1 neuron output
# Final output value between 0 and 1 as probability
model.compile(
loss='binary_crossentropy',
optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
metrics=['accuracy']
)
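# A quick sanity check (a sketch, printed only when verbose): run one padded
# sample through the untrained model to confirm the (batch, timesteps) ->
# (batch, 1) probability mapping described above.
if args.verbose:
    probe = model.predict(train_x_data[:1], verbose=0)
    print(f'Probe prediction shape: {probe.shape}')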


# Train the model
# validation_split already separates the data into training and validation sets
history = model.fit(
    train_x_data,
    train_y_data,
    epochs=args.epochs,
    batch_size=args.batch_size,
    validation_split=0.1,
    verbose=1,
    shuffle=True,
)

if args.verbose:
    model.summary()

# Only save if a model file was given (the -S/--model_file argument is optional)
if args.model_file:
    model.save(args.model_file, overwrite=False)

# To plot the results
import matplotlib.pyplot as plt

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'ro', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')

plt.title('Training and validation accuracy')
plt.legend()
plt.savefig('test_results_acc.png')

plt.close()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.savefig('test_results_loss.png')
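# A minimal sketch of reusing the saved model later (assumes -S/--model_file
# was passed above; load_model is already imported at the top of this script):
# reloaded = load_model(args.model_file)
# print(reloaded.predict(train_x_data[:1], verbose=0))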