-
Notifications
You must be signed in to change notification settings - Fork 1
/
datasets.py
114 lines (84 loc) · 4.69 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import ember
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import os
def get_ember_data(data_dir, seed=42):
    """Build a pseudo-labelled training set from EMBER's unlabeled samples.

    The metadata and vectorized features are loaded through ``ember``. Only
    metadata rows with ``label == -1`` are kept for training; a row is given
    the label 1 when its ``avclass`` entry is present and 0 when it is null.
    The training labels returned by ``ember`` are discarded.

    Labelling the unlabeled rows through ``avclass`` is only required for the
    dual input DNN, but it is done for every model for uniformity.

    Args:
        data_dir: Directory passed to ``ember.read_metadata`` and
            ``ember.read_vectorized_features``.
        seed: ``random_state`` forwarded to ``sklearn.utils.shuffle``.

    Returns:
        Tuple ``(X_train_unlab, X_test, y_train_unlab, y_test)``; the test
        arrays are returned exactly as ``ember`` provides them.
    """
    metadata = ember.read_metadata(data_dir)
    features, _, X_test, y_test = ember.read_vectorized_features(data_dir)

    # Restrict to the unlabeled rows, then split them on whether an avclass
    # entry exists for the sample.
    unlabeled = metadata[metadata["label"] == -1]
    has_avclass = unlabeled["avclass"].notna()
    mal_rows = unlabeled.loc[has_avclass].index
    ben_rows = unlabeled.loc[~has_avclass].index

    # The metadata index values are used as row positions in the feature
    # matrix.
    benign = features[ben_rows]
    malicious = features[mal_rows]

    # Benign block first, malicious block second; labels follow that order.
    stacked_X = np.vstack((benign, malicious))
    stacked_y = np.concatenate(
        (np.zeros(benign.shape[0]), np.ones(malicious.shape[0]))
    )

    X_train_unlab, y_train_unlab = shuffle(stacked_X, stacked_y, random_state=seed)
    return X_train_unlab, X_test, y_train_unlab, y_test
def _open_sorel_split(data_dir, split, ndim):
    """Memory-map the ``X_<split>.dat`` / ``y_<split>.dat`` pair in ``data_dir``."""
    # The label file is opened first because its length determines the
    # number of rows requested from the feature file.
    labels = np.memmap(
        os.path.join(data_dir, f"y_{split}.dat"), dtype=np.float32, mode="r"
    )
    features = np.memmap(
        os.path.join(data_dir, f"X_{split}.dat"),
        dtype=np.float32,
        mode="r",
        shape=(labels.shape[0], ndim),
    )
    return features, labels


def get_sorel_data(data_dir, seed=42, ndim=2381):
    """Open the SOREL feature/label files as read-only float32 memory maps.

    The training arrays are read from ``X_val.dat`` / ``y_val.dat`` and the
    test arrays from ``X_test.dat`` / ``y_test.dat``. Nothing is copied into
    memory and nothing is shuffled.

    Args:
        data_dir: Directory containing the four ``.dat`` files.
        seed: Currently unused. Kept so the signature matches the other
            loaders and existing callers keep working.
        ndim: Number of feature columns per sample. Defaults to 2381, the
            width that was previously hard-coded.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of ``np.memmap`` arrays;
        each ``X`` has shape ``(len(y), ndim)``.
    """
    X_train, y_train = _open_sorel_split(data_dir, "val", ndim)
    X_test, y_test = _open_sorel_split(data_dir, "test", ndim)
    return X_train, X_test, y_train, y_test
def _split_by_time_label(X, df, av_provider, tag):
    """Split one class's features and AV verdicts into train and test parts.

    Rows of ``df`` whose ``label`` column is ``'train'`` / ``'test'`` define
    the two parts. ``tag`` only affects the names printed with the shapes.
    """
    is_train = (df.label == 'train').values
    is_test = (df.label == 'test').values

    # The index labels of ``df`` are used as row positions in ``X``.
    X_train = X[list(df.index[is_train])]
    X_test = X[list(df.index[is_test])]

    # Take the verdicts from the very rows that matched instead of re-using
    # the index labels as positions in the value array. The two agree only
    # when the CSV index is exactly 0..len(df)-1; the mask is right either way.
    y_target = df[av_provider].values
    y_target_train = y_target[is_train]
    y_target_test = y_target[is_test]

    print(f"X_train_{tag}", X_train.shape)
    print(f"X_test_{tag}", X_test.shape)
    print(f"y_target_train_{tag}", y_target_train.shape)
    print(f"y_target_test_{tag}", y_target_test.shape)
    return X_train, X_test, y_target_train, y_target_test


def get_av_data(data_dir, family, av_provider, seed=42):
    """Load benign and malware samples with true labels and one AV's verdicts.

    Feature matrices are memory-mapped from ``benign/X_merged_Apr19.dat`` and
    ``malware/X_mal.dat``; the accompanying CSVs supply a ``label`` column
    (``'train'`` / ``'test'``) used for the split and an ``av_provider``
    column used as the target. Malware rows get true label 1 and benign rows
    true label 0. Train and test sets are shuffled independently with the
    same ``seed``. The shapes of the intermediate arrays are printed.

    Args:
        data_dir: Root directory holding the ``.dat`` and ``.csv`` files.
        family: Inserted into the malware CSV file name
            (``malware_test_labeled_{family}_2022d.csv``).
        av_provider: Name of the CSV column holding the AV verdicts.
        seed: ``random_state`` forwarded to ``sklearn.utils.shuffle``.
            Defaults to 42 like the other loaders in this module.

    Returns:
        Tuple ``(X_train, X_test, y_target_train, y_target_test, y_train,
        y_test)``.
    """
    # NOTE(review): the malware matrix is read as float64 while the benign
    # one is float32 - confirm this matches how X_mal.dat was written.
    X_ben = np.memmap(os.path.join(data_dir, 'benign/X_merged_Apr19.dat'), dtype=np.float32, mode='r', shape=(36226, 2381))
    X_mal = np.memmap(os.path.join(data_dir, 'malware/X_mal.dat'), dtype=np.float64, mode='r', shape=(391008, 2381))

    df_ben = pd.read_csv(os.path.join(data_dir, 'benign_with_ts_vt_final_labels_2022.csv'), index_col=0)
    df_mal = pd.read_csv(os.path.join(data_dir, f'malware_test_labeled_{family}_2022d.csv'), index_col=0)

    # Split each class into train and test based on the time-derived label.
    X_train_mal, X_test_mal, y_target_train_mal, y_target_test_mal = _split_by_time_label(
        X_mal, df_mal, av_provider, "mal"
    )
    X_train_ben, X_test_ben, y_target_train_ben, y_target_test_ben = _split_by_time_label(
        X_ben, df_ben, av_provider, "ben"
    )

    # Create the true labels: 1 for malware, 0 for benign.
    y_train_mal = np.ones(len(y_target_train_mal))
    y_train_ben = np.zeros(len(y_target_train_ben))
    y_test_mal = np.ones(len(y_target_test_mal))
    y_test_ben = np.zeros(len(y_target_test_ben))

    # Malware block first, benign block second, for every stacked array.
    X_train = np.vstack((X_train_mal, X_train_ben))
    X_test = np.vstack((X_test_mal, X_test_ben))
    y_target_train = np.concatenate((y_target_train_mal, y_target_train_ben))
    y_target_test = np.concatenate((y_target_test_mal, y_target_test_ben))
    y_train = np.concatenate((y_train_mal, y_train_ben))
    y_test = np.concatenate((y_test_mal, y_test_ben))

    # Internal sanity checks on the stacked arrays.
    assert X_train.shape[0] == y_train.shape[0], "Shape of X and y do not match!"
    assert X_test.shape[0] == y_test.shape[0], "Shape of X_test and y_test do not match!"
    assert X_train.shape[0] == y_target_train.shape[0], "Shape of X and y_target_train do not match!"
    assert X_test.shape[0] == y_target_test.shape[0], "Shape of X_test and y_target_test do not match!"

    # Shuffle the data to mix malware and benign samples.
    X_train, y_target_train, y_train = shuffle(X_train, y_target_train, y_train, random_state=seed)
    X_test, y_target_test, y_test = shuffle(X_test, y_target_test, y_test, random_state=seed)
    return X_train, X_test, y_target_train, y_target_test, y_train, y_test