# Nathan Dullea
# Artificial Intelligence Project
# Using sklearn to classify malicious URLs
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Split a URL into tokens on slash, dot, dash, and underscore
def makeTokens(url):
    tokenList = []
    tokensBySlash = str(url).split('/')
    for i in tokensBySlash:
        tokensByDot = i.split('.')
        for j in tokensByDot:
            tokensByDash = j.split('-')
            for k in tokensByDash:
                tokensByUnderscore = k.split('_')
                for l in tokensByUnderscore:
                    tokenList.append(l)
    # Turn into a set to remove duplicates, then back into a list
    tokenList = list(set(tokenList))
    # Drop 'com': it appears in nearly every URL, so it carries no signal
    if 'com' in tokenList:
        tokenList.remove('com')
    return tokenList
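
# For illustration (hypothetical URL): makeTokens('http://www.example.com/a_b-c')
# returns ['http:', '', 'www', 'example', 'a', 'b', 'c'] up to ordering
# (a set is used, so order varies; 'com' has been removed, and the empty
# string comes from the '//' after the scheme).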

def setupDataAndClassifier():
    # Read the labeled data from CSV into a DataFrame
    data = pd.read_csv('/Users/nathandullea/Desktop/data.csv')
    # Convert to an array and shuffle the rows in place
    alldata = np.array(data)
    np.random.shuffle(alldata)
    # Separate labels and URLs into lists
    y = [d[1] for d in alldata]     # labels ('good' or 'bad')
    urls = [d[0] for d in alldata]  # raw URL strings
    # TfidfVectorizer is equivalent to CountVectorizer followed by a TF-IDF
    # transform; makeTokens replaces the default tokenizer
    vectorizer = TfidfVectorizer(tokenizer=makeTokens)
    X = vectorizer.fit_transform(urls)
    # Hold out 20% of the data for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # Create and train the logistic regression model
    lgs = LogisticRegression()
    lgs.fit(X_train, y_train)
    # Print the mean accuracy on the held-out test set
    print(lgs.score(X_test, y_test))

setupDataAndClassifier()
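
# A minimal usage sketch (assumption: setupDataAndClassifier() is modified to
# end with `return vectorizer, lgs` so the fitted objects can be reused; the
# URLs below are hypothetical):
#
#   vectorizer, lgs = setupDataAndClassifier()
#   X_new = vectorizer.transform(['www.google.com/search', 'example.com/login.php?id=1'])
#   print(lgs.predict(X_new))  # predicted labels, e.g. ['good' 'bad']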