fake_news_classifier.py
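# Fake-news tweet classifier for the MediaEval 2015 dataset: cleans the tweet
# text, vectorizes it with TF-IDF, trains an SVM, and reports the F1 score,
# confusion matrix, and timing on the held-out test set.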
import csv
import re
import time

import pandas as pd
import nltk
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score
# Fetch the NLTK resources used below: stopwords, the WordNet lemmatizer data,
# and the English word list
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')

# This function encodes the given string as ASCII (ignoring characters that
# cannot be encoded) and returns the decoded result, stripping non-ASCII text
def encode_Ascii(text):
    encoded_string = text.encode("ascii", "ignore")
    return encoded_string.decode()
# This function tokenizes the given text on whitespace and lemmatizes each token
def lem_dataFrame(text):
    tkn = nltk.tokenize.WhitespaceTokenizer()
    lmt = nltk.stem.WordNetLemmatizer()
    return [lmt.lemmatize(w) for w in tkn.tokenize(text)]
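# With the lemmatizer's default noun part of speech, e.g. "apples were running"
# becomes ['apple', 'were', 'running']: plurals are reduced, but verb forms are
# left untouched unless a POS tag is supplied.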
# This function collects every distinct hashtag (lowercased) found in the data
def get_hashtags(df):
    list_hashtags = []
    for tweet in df.values:
        for tag in re.findall(r"\B#\w*[a-zA-Z]+\w*", tweet):
            if tag.lower() not in list_hashtags:
                list_hashtags.append(tag.lower())
    return list_hashtags
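# The pattern requires at least one letter, so "#sandy" matches but "#123" does
# not, and the leading \B skips '#' characters embedded mid-word (e.g. "a#b").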
words = set(nltk.corpus.words.words())  # a set of all English words from the NLTK corpus
stop_words = set(nltk.corpus.stopwords.words('english'))  # English stopwords, hoisted into a set for fast lookup

# This function keeps only English words longer than two characters from a list of strings
def keep_english(word_list):
    english_words = []
    for w in word_list:
        if w.lower() in words and len(w) > 2:
            english_words.append(w)
    return english_words
# This function returns the average word length of a sentence (0 for an empty string)
def avg_word(sentence):
    words = sentence.split()
    if len(words) == 0:
        return 0
    return sum(len(word) for word in words) / len(words)
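# Note: avg_word is never called below; presumably it was kept for optional
# average-word-length feature experiments.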
def data_clean(filename):
    tweet_dataset = pd.read_table(filename, sep='\t', quoting=csv.QUOTE_NONE)  # read the tab-separated file into a DataFrame
    tweet_dataset.drop(['tweetId', 'userId', 'imageId(s)', 'timestamp', 'username'], axis=1, inplace=True)  # drop the columns the classifier does not use
    tweet_dataset['tweetText'] = tweet_dataset['tweetText'].apply(str.lower)
    tweet_dataset['tweetText'] = tweet_dataset['tweetText'].str.replace(r'\\n|\\r|\\', '', regex=True)  # remove escaped newline/carriage-return sequences and stray backslashes
    tweet_dataset['tweetText'] = tweet_dataset['tweetText'].str.replace(r'(https|http)?:(\s)*\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', ' ', regex=True)  # remove URLs (including edge cases)
    # print(tweet_dataset)
    tweet_dataset['tweetText'] = tweet_dataset['tweetText'].apply(encode_Ascii)  # remove non-ASCII characters
    hashtags = get_hashtags(tweet_dataset['tweetText'])  # collect all hashtags before the '#' is deleted
    words.update(tag.lstrip('#') for tag in hashtags)  # add hashtag text as known words so keep_english does not delete it (update() mutates the set; union() would discard the result)
    tweet_dataset['tweetText'] = tweet_dataset['tweetText'].str.replace(r'(@[A-Za-z0-9_]+)|[^\w\s]', '', regex=True)  # remove @-mentions and all punctuation
    tweet_dataset['tweetText'] = tweet_dataset['tweetText'].apply(lem_dataFrame)  # lemmatize the words
    tweet_dataset['tweetText'] = tweet_dataset['tweetText'].apply(lambda tweet: [word for word in tweet if word not in stop_words])  # remove English stopwords
    tweet_dataset['tweetText'] = tweet_dataset['tweetText'].apply(keep_english)
    tweet_dataset['tweetText'] = tweet_dataset['tweetText'].apply(' '.join)
    tweet_dataset['label'] = tweet_dataset['label'].apply(lambda label: label if label != 'humor' else 'fake')  # fold the 'humor' class into 'fake'
    return tweet_dataset
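# The two data files below are assumed to be the MediaEval 2015 "Verifying
# Multimedia Use" tab-separated sets, with tweetId, tweetText, userId,
# imageId(s), username, timestamp, and label columns.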
starttrain = time.time()
training_set = data_clean("mediaeval-2015-trainingset.txt")
# training_set.info();
indexNames = training_set[training_set['tweetText'] == ""].index  # rows whose text was emptied entirely by cleaning
training_set.drop(indexNames, inplace=True)
# print(training_set)
# Good setting - min_df=0.0001, max_df=0.1
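# As floats, min_df and max_df are document-frequency proportions: a term
# (unigram or bigram) must appear in at least 0.01% and at most 10% of tweets,
# and the vocabulary is capped at the 2000 most frequent surviving terms.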
tf_idf = TfidfVectorizer(min_df=0.0001, max_df=0.1, ngram_range=(1, 2), max_features=2000)
count_vector = tf_idf.fit_transform(training_set.tweetText)
# print(count_vector.shape)
# Alternative classifiers:
#   MultinomialNB()
#   svm.SVC()
#   RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0)
#   KNeighborsClassifier(n_neighbors=10)
# Apart from svm, these need to be imported from their respective sklearn
# modules; the import lines are sketched below.
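# For reference (these module paths are standard scikit-learn):
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier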
clf = svm.SVC()
clf.fit(count_vector, list(training_set['label']))  # SVC accepts the sparse matrix directly; densifying it wastes memory
endtrain = time.time()
starttest = time.time()
test_dataset = data_clean("mediaeval-2015-testset.txt")
test_vector = tf_idf.transform(test_dataset.tweetText)  # transform only: reuse the vocabulary fitted on the training set
print(test_vector.shape)
prediction = clf.predict(test_vector)
endtest = time.time()
cm = confusion_matrix(list(test_dataset.label), prediction)
print("F1 accuracy:")
print(f1_score(list(test_dataset['label']),prediction,average = 'micro'))
print("Confusion Matrix:")
print(cm)
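# scikit-learn orders confusion-matrix labels alphabetically, so rows are the
# true classes and columns the predicted ones, in the order 'fake', 'real'
# (after 'humor' has been folded into 'fake' by data_clean).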
print("Training: ",endtrain - starttrain) # prints total calculation time of the whole algorithm
print("Testing: ",endtest - starttest) # prints total calculation time of the whole algorithm