training.py
# -*- coding: utf-8 -*-
import os
import nltk
import pickle
#positive tweets will be used for training
pos_tweets = []
for path, subdirs, files in os.walk('/home/mohamed/Twitter/Positive'):
    for filename in files:
        f = os.path.join(path, filename)
        with open(f, 'r') as file_data:
            pos_tweets.append((file_data.read(), 'positive'))
#negative tweets will be used for training
neg_tweets = []
for path, subdirs, files in os.walk('/home/mohamed/Twitter/Negative'):
    for filename in files:
        f = os.path.join(path, filename)
        with open(f, 'r') as file_data:
            neg_tweets.append((file_data.read(), 'negative'))
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))
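
# At this point every entry of `tweets` is a (token_list, label) pair, e.g.
# (['love', 'this', 'phone'], 'positive') -- an illustrative example, not real data.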

def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    # nltk.FreqDist maps each word (key) to its frequency (value)
    wordlist = nltk.FreqDist(wordlist)
    word_features = list(wordlist.keys())
    return word_features
word_features = get_word_features(get_words_in_tweets(tweets))

# turn a tokenised tweet into a dictionary of contains(word) features
def extract_features(document):
    document_words = set(document)  # eliminate duplicates
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
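
# Illustrative example (hypothetical vocabulary, not taken from the training data):
# if word_features were ['good', 'bad'], then extract_features(['good', 'day'])
# would return {'contains(good)': True, 'contains(bad)': False}.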
# define the training set
training_set = nltk.classify.apply_features(extract_features, tweets)
#training our classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)
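
# Optional check (not in the original script): NLTK's Naive Bayes classifier can
# report its strongest features, e.g. classifier.show_most_informative_features(10)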
# saving the classifier object
with open("classifier_object.pickle", 'wb') as f:
    pickle.dump(classifier, f)
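
# ---------------------------------------------------------------------------
# Usage sketch (an illustration, not part of the original script): one way the
# saved classifier could be loaded and applied to a new tweet. It assumes the
# pickle file was just written above and reuses the same tokenisation rule as
# the training code (lower-case, keep tokens of length >= 3).
# ---------------------------------------------------------------------------
with open("classifier_object.pickle", 'rb') as f:
    saved_classifier = pickle.load(f)

new_tweet = "I really love this phone"  # hypothetical example tweet
new_tokens = [e.lower() for e in new_tweet.split() if len(e) >= 3]
print(saved_classifier.classify(extract_features(new_tokens)))  # 'positive' or 'negative'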