forked from shivaneej/Genessay
-
Notifications
You must be signed in to change notification settings - Fork 0
/
matrixGen.py
68 lines (60 loc) · 2.19 KB
/
matrixGen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import csv
import re
from nltk.util import ngrams
import numpy as np
def extractNGrams(sentence, n):
    """Split *sentence* into word n-grams joined by underscores.

    The text is lower-cased, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, and the result is split into
    tokens on any run of whitespace.

    Args:
        sentence: Raw text to tokenize.
        n: Number of consecutive tokens in each n-gram.

    Returns:
        A list of strings, one per window of ``n`` consecutive tokens in
        order of appearance (e.g. ``"the_quick"`` for ``n == 2``). The list
        is empty when the text has fewer than ``n`` tokens.
    """
    sentence = sentence.lower()
    sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence)
    # split() with no argument breaks on *any* whitespace run. Splitting on a
    # literal space kept newline/tab-joined words together as one token and
    # let a bare newline through as an empty n-gram component.
    tokens = sentence.split()
    # n staggered views of the token list zipped together form the sliding
    # window; this yields the same tuples as an unpadded
    # nltk.util.ngrams(tokens, n).
    windows = zip(*(tokens[offset:] for offset in range(n)))
    return ['_'.join(window) for window in windows]
def makeMatrix(matrixDict, uniqueNGrams):
    """Pad *matrixDict* in place so it forms a dense square matrix.

    After the call every n-gram in *uniqueNGrams* has a row, and every row
    has an entry for every n-gram in *uniqueNGrams*; entries that were
    missing are set to 0. Existing counts are left untouched.

    Args:
        matrixDict: Mapping of row n-gram -> {column n-gram -> count}.
        uniqueNGrams: Collection of all n-grams that label rows and columns.
    """
    # An n-gram that never had a neighbour has no row yet. saveMatrix looks
    # up a row for every unique n-gram, so create the missing rows first.
    for ngram in uniqueNGrams:
        matrixDict.setdefault(ngram, {})

    # Fill the gaps in every row with explicit zero counts.
    for row in matrixDict.values():
        for ngram in uniqueNGrams:
            row.setdefault(ngram, 0)
def saveMatrix(filename, matrixDict, uniqueNGrams):
    """Write the co-occurrence matrix to *filename* as a comma-separated file.

    The first row is a single-space corner cell followed by the n-grams as
    column headers. Each following row starts with an n-gram label and then
    holds that n-gram's count for every column. A row or cell missing from
    *matrixDict* is written as 0.

    Args:
        filename: Path of the CSV file to create or overwrite.
        matrixDict: Mapping of row n-gram -> {column n-gram -> count}.
        uniqueNGrams: Collection of n-grams labelling both rows and columns.
    """
    # Snapshot the labels once so header and rows share one column order.
    columns = list(uniqueNGrams)
    # The context manager guarantees the file is flushed and closed, even if
    # writing fails part-way through.
    with open(filename, 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        filewriter.writerow([" "] + columns)
        for rowname in columns:
            counts = matrixDict.get(rowname, {})
            filewriter.writerow(
                [rowname] + [counts.get(col, 0) for col in columns]
            )
def wordContext(listNGrams, uniqueNGrams, matrixDict):
    """Accumulate adjacency counts for one sequence of n-grams.

    Each pair of consecutive n-grams increments two cells: the count of the
    second n-gram in the first one's row, and the count of the first n-gram
    in the second one's row.

    Args:
        listNGrams: N-grams in order of appearance.
        uniqueNGrams: Set of n-grams seen so far; extended in place with the
            contents of *listNGrams*.
        matrixDict: Mapping of n-gram -> {neighbour n-gram -> count};
            updated in place.

    Returns:
        The same *matrixDict* object, after updating.
    """
    uniqueNGrams.update(listNGrams)
    for left, right in zip(listNGrams, listNGrams[1:]):
        # Count the pair in the left n-gram's row ...
        forward = matrixDict.setdefault(left, {})
        forward[right] = forward.get(right, 0) + 1
        # ... and again, mirrored, in the right n-gram's row.
        backward = matrixDict.setdefault(right, {})
        backward[left] = backward.get(left, 0) + 1
    return matrixDict
def updateMatrices(data_dir):
    """Rebuild and save the n-gram adjacency matrices for a dataset.

    Reads the file at ``data_dir + DATASET_FILENAME``, splits its text into
    samples on blank lines, and for each entry of ``OP_FILE_NAMES`` (n-gram
    size 1, 2, ...) builds the adjacency-count matrix over all samples and
    writes it as CSV to ``data_dir + '/' + <output name>``.

    Args:
        data_dir: Directory path holding the dataset file; the output CSV
            files are written there too. Paths are formed by plain string
            concatenation.
    """
    dataset_file = data_dir + DATASET_FILENAME
    # The dataset is the same for every n-gram size: read it once, and let
    # the context manager close the handle instead of leaking one per pass.
    with open(dataset_file, 'r') as sample_file:
        samples = sample_file.read().split("\n\n")

    for size, op_file_name in enumerate(OP_FILE_NAMES, start=1):
        # Fresh matrix and vocabulary for each n-gram size.
        matrixDict = {}
        uniqueNGrams = set()
        for sample in samples:
            listNgrams = extractNGrams(sample, size)
            matrixDict = wordContext(listNgrams, uniqueNGrams, matrixDict)
        makeMatrix(matrixDict, uniqueNGrams)
        saveMatrix(data_dir + '/' + op_file_name, matrixDict, uniqueNGrams)
# Dataset file name; it is appended directly to the data directory path,
# hence the leading slash.
DATASET_FILENAME = "/og.txt"

# Output CSV names ordered by n-gram size: index i holds the (i + 1)-gram
# matrix.
OP_FILE_NAMES = [
    "unigrams_a.csv",
    "bigrams_a.csv",
    "trigrams_a.csv",
]

# Number of n-gram sizes processed, one per output file.
N = len(OP_FILE_NAMES)