# sentence_fasttext.py (forked from lettergram/sentence-classification)
'''
Written by Austin Walters
Last Edit: October 23, 2019
For use on austingwalters.com

A FastText model to classify a sentence as one
of the common sentence types:
Question, Statement, Command, Exclamation

Mostly taken from the Keras examples:
https://github.com/keras-team/keras

Based on Joulin et al.'s paper:
"Bags of Tricks for Efficient Text Classification"
https://arxiv.org/abs/1607.01759
'''

from __future__ import print_function

import numpy as np

import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D

# load_encoded_data is provided by sentence_types.py in this repository
from sentence_types import load_encoded_data


def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    """
    # Build ngram_value shifted copies of the list; zipping them together
    # yields one tuple per n-gram.
    gram_list = []
    for i in range(ngram_value):
        gram_list.append(input_list[i:])
    return set(zip(*gram_list))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of lists (sequences) by appending n-gram values.

    Example: adding bi-grams
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-grams
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        # Slide windows of every size from 2 up to ngram_range over the
        # sequence, appending the index of each n-gram found in token_indice.
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences


# Set parameters:
# ngram_range = 3  # add tri-gram features
# ngram_range = 2  # add bi-gram features
ngram_range = 1    # uni-gram features only
max_words = 10000
maxlen = 500
batch_size = 32
embedding_dims = 50
epochs = 5

x_train, x_test, y_train, y_test = load_encoded_data(data_split=0.8)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
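# e.g. keras.utils.to_categorical([0, 2], num_classes=3) returns
# [[1., 0., 0.], [0., 0., 1.]] -- one row per label, one column per class
# (illustrative labels; the real ones come from load_encoded_data above)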

print('Average train sequence length: {}'.format(
    np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(
    np.mean(list(map(len, x_test)), dtype=int)))

# Optionally augment the uni-gram token sequences with n-gram features
if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))

    # Create a set of unique n-grams from the training set
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping each n-gram to a unique integer.
    # Integer values start above max_words in order to
    # avoid collisions with existing (uni-gram) features.
    start_index = max_words + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_words becomes one more than the highest feature index,
    # i.e. the input dimension of the Embedding layer below
    max_words = np.max(list(indice_token.keys())) + 1

    # Augment x_train and x_test with the n-gram features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)

    print('Average train sequence length: {}'.format(
        np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(
        np.mean(list(map(len, x_test)), dtype=int)))
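
# Worked example (hypothetical indices): with max_words = 10000,
# start_index is 10001, so if token_indice maps the bi-gram (1, 3) to
# 10001, add_ngram turns [1, 3, 7] into [1, 3, 7, 10001] before padding.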
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
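# pad_sequences zero-pads (or truncates) each sequence to length maxlen, e.g.
# sequence.pad_sequences([[1, 2, 3]], maxlen=5) -> array([[0, 0, 1, 2, 3]])
# (pre-padding is the Keras default)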

print('Constructing model!')

model = Sequential()
# fastText: embed each token (uni-grams plus any added n-gram features),
# average the embeddings over the sequence, and classify with one softmax layer
model.add(Embedding(max_words, embedding_dims, input_length=maxlen))
model.add(GlobalAveragePooling1D())
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
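
# Optional: print a layer-by-layer summary
# (Embedding -> GlobalAveragePooling1D -> Dense)
model.summary()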

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])
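
# Illustrative follow-up (a sketch, not part of the original script):
# inspect a few individual predictions from the trained model.
predictions = model.predict(x_test[:5], batch_size=batch_size)
print('Predicted classes:', np.argmax(predictions, axis=-1))
print('True classes:     ', np.argmax(y_test[:5], axis=-1))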