From 4dd08f972ee6274318e157b6530e1663a1c5ca5e Mon Sep 17 00:00:00 2001 From: Madala Gopichand Date: Thu, 6 Apr 2023 11:58:36 +0530 Subject: [PATCH] Added a new module for anomaly detection This script can be used to train a model for anomaly detector using browser's User agents --- modules/anomaly_detector/train.py | 86 +++++++++++++++++++++++++++++++ requirements.txt | 1 + 2 files changed, 87 insertions(+) create mode 100644 modules/anomaly_detector/train.py diff --git a/modules/anomaly_detector/train.py b/modules/anomaly_detector/train.py new file mode 100644 index 000000000..cadce98bd --- /dev/null +++ b/modules/anomaly_detector/train.py @@ -0,0 +1,86 @@ +import argparse +import pandas as pd +import numpy as np +import tensorflow as tf +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report +from transformers import BertTokenizer, TFBertForSequenceClassification + + +def preprocess_data(df): + # Preprocess the User agent column + df['User agent'] = df['User agent'].str.extract(r'\(([^\)]*)\)')[0] + df['User agent'] = df['User agent'].str.split(' ').str[0] + + # Encode the categorical variables + le = LabelEncoder() + df['User agent'] = le.fit_transform(df['User agent']) + df['IP address'] = le.fit_transform(df['IP address']) + + return df + + +def tokenize_data(data, tokenizer, max_length): + inputs = tokenizer.batch_encode_plus( + data, + add_special_tokens=True, + max_length=max_length, + padding='max_length', + return_attention_mask=True, + truncation=True + ) + return np.array(inputs['input_ids']), np.array(inputs['attention_mask']) + + +def build_model(num_labels): + model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels) + model.layers[0].trainable = False + model.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-5), + loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='acc')]) + return model + + +def main(data_path, test_size, max_length, batch_size, epochs, verbose): + # Load the dataset + df = pd.read_csv(data_path) + + # Preprocess the dataset + df = preprocess_data(df) + + # Tokenize the data + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + X_text = df['Text'].values + X_input_ids, X_attention_masks = tokenize_data(X_text, tokenizer, max_length) + + # Split the data into training and testing sets + X = np.hstack((df[['User agent', 'IP address']].values, X_input_ids)) + y = df['Label'].values + X_train, X_test, y_train, y_test, train_masks, test_masks = train_test_split(X, y, X_attention_masks, + test_size=test_size, + random_state=42) + + # Build and train the model + num_labels = len(np.unique(y_train)) + model = build_model(num_labels) + history = model.fit([X_train, train_masks], y_train, validation_data=([X_test, test_masks], y_test), + batch_size=batch_size, epochs=epochs, verbose=verbose) + + # Predict on the test set + y_pred = model.predict([X_test, test_masks]).argmax(axis=-1) + + # Print the classification report + if verbose==1: + print(classification_report(y_test, y_pred)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', type=str, default='path/to/dataset.csv', help='Path to the input data file.') + parser.add_argument('--test_size', type=float, default=0.2, help='Fraction of the dataset to be used for testing.') + parser.add_argument('--max_length', type=int, default=128, help='Maximum length of input sequence.') + parser.add_argument('--batch_size', type=int, default=32, help='Batch size for training the model.') + parser.add_argument('--epochs', type=int, default=5, help='Number of epochs for training the model.') + parser.add_argument('--verbose', type=int, default=1,help='Verbosity mode. 0 = silent, 1 = printing into command line') + parser.add_argument diff --git a/requirements.txt b/requirements.txt index 11e74a010..5e1de4b33 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ certifi==2019.11.28 tensorflow==2.4.1 colorama==0.4.4 Keras==2.4.3 +transformers==4.27.4 validators==0.18.2 ipwhois==1.2.0 matplotlib==3.1.2