-
Notifications
You must be signed in to change notification settings - Fork 0
/
manigen.py
144 lines (104 loc) · 3.88 KB
/
manigen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# -*- coding: UTF-8 -*-
# Random Text generator
from random import randint,choice
import nltk,re,json
# Closing punctuation candidates: format_sentence appends one of these,
# chosen at random, to every generated sentence.
SENT_END = ['.', '!', '?','--','...']
# Tokens treated as punctuation: format_sentence removes the first token
# of a sentence when it is one of these.
PUNC = ['.', '!', '?','--','...','`','\'',',']
def generate_manifesto(file_in, file_out='data.json', count=50):
    """Generate random sentences from a corpus and save them as JSON.

    Builds a ManifestoGenerator from the text file *file_in*, produces
    *count* sentences with the bigram generator followed by *count*
    sentences with the Markov generator, and writes them as a JSON array
    of strings to *file_out*.

    Args:
        file_in: Path of the plain-text corpus to learn from.
        file_out: Path of the JSON file to write (default 'data.json',
            relative to the current working directory).
        count: Number of sentences requested from each generator
            (default 50).

    Returns:
        1, always (kept for backward compatibility with existing callers).
    """
    m = ManifestoGenerator(file_in)
    data = []
    # Bigram phrases first, then Markov phrases, matching the layout of
    # previously generated files.
    for make_phrase in (m.bigram_phrase, m.markov_phrase):
        for _ in range(count):
            phrase = make_phrase()
            # A generator can come back with nothing (for example when the
            # corpus has no sentence boundary); skip instead of crashing
            # inside format_sentence.
            if phrase:
                data.append(m.format_sentence(phrase))
    data = json.dumps(data, separators=(', ', ': '), indent="\t")
    with open(file_out, 'w') as f:
        f.write(data)
    return 1
class ManifestoGenerator:
    """Produce pseudo-random sentences that imitate a plain-text corpus.

    The corpus is tokenized with NLTK once at construction time. Two
    generators are offered: bigram_phrase() walks a first-order
    (word -> next word) model, markov_phrase() walks a second-order
    ((word, word) -> next word) model. format_sentence() turns the token
    list returned by either generator into a printable string; it relies
    on the module-level PUNC and SENT_END lists.

    Attributes:
        tokens: Every word/punctuation token of the corpus, in order.
        sent_lengths: Token count of each corpus sentence; sampled to
            decide how long a generated phrase should be.
        cdf: Mapping from a token to the frequency distribution of the
            tokens that follow it in the corpus.
    """

    # format_sentence() replaces a final word shorter than this with a
    # longer corpus token.
    MIN_LAST_WORD_LEN = 5

    def __init__(self, file_in):
        """Read the corpus at *file_in*, tokenize it and build the bigram model."""
        with open(file_in) as f:
            text = f.read()
        self.sent_lengths = [
            len(nltk.word_tokenize(sent)) for sent in nltk.sent_tokenize(text)
        ]
        self.tokens = nltk.word_tokenize(text)
        self.cdf = nltk.ConditionalFreqDist(nltk.bigrams(self.tokens))

    def bigram_phrase(self, min_count=1):
        """Return a list of tokens produced by a random bigram walk.

        The walk starts at a random corpus token and takes as many steps
        as a randomly sampled corpus sentence has tokens, so the result
        holds one token more than that length.

        Args:
            min_count: Only follow a word pair seen at least this many
                times in the corpus. When no follower of the current word
                qualifies, every follower is allowed again. The default
                of 1 keeps every follower.

        Returns:
            The generated tokens, or [] when the corpus is empty.
        """
        if not self.tokens or not self.sent_lengths:
            return []
        curr_word = choice(self.tokens)
        sentence = [curr_word]
        for _ in range(choice(self.sent_lengths)):
            followers = self.cdf[curr_word]
            candidates = [w for w, seen in followers.items() if seen >= min_count]
            if not candidates:
                candidates = list(followers)
            if candidates:
                curr_word = choice(candidates)
            else:
                # Dead end: the word is never followed by anything (e.g.
                # it is the very last corpus token). Restart the walk
                # from a random token instead of raising IndexError.
                curr_word = choice(self.tokens)
            sentence.append(curr_word)
        return sentence

    # NOTE: a check that generated sentences do not copy long runs of words
    # verbatim from the corpus was sketched here but never enabled.

    def _trigram_model(self):
        """Return (transitions, end_keys, end_key_set), built once and cached.

        transitions maps a (word, word) pair to the list of tokens that
        follow it in the corpus (duplicates kept, so frequent followers
        are picked more often). end_keys lists, in corpus order, the
        pairs whose second word ends with '.', '?' or '!'; the token
        after such a pair starts a new sentence. end_key_set holds the
        same pairs for fast membership tests. The cache assumes
        self.tokens is not modified after the first call.
        """
        cached = getattr(self, '_trigram_cache', None)
        if cached is None:
            transitions = {}
            end_keys = []
            prev2 = prev1 = ''
            for word in self.tokens:
                if prev1 != '' and prev2 != '':
                    key = (prev2, prev1)
                    if key in transitions:
                        transitions[key].append(word)
                    else:
                        transitions[key] = [word]
                        if prev1.endswith(('.', '?', '!')):
                            end_keys.append(key)
                prev2, prev1 = prev1, word
            cached = (transitions, end_keys, set(end_keys))
            self._trigram_cache = cached
        return cached

    def markov_phrase(self):
        """Return a list of tokens produced by a second-order Markov walk.

        The walk begins right after a randomly chosen sentence boundary
        and jumps to another random boundary whenever it reaches the end
        of a sentence or a pair with no recorded follower. The number of
        steps is the length of a randomly sampled corpus sentence; steps
        spent jumping emit no token, so the result is shorter than that.

        Returns:
            The generated tokens; [] when the corpus contains no
            sentence boundary to start from.
        """
        transitions, end_keys, end_key_set = self._trigram_model()
        if not end_keys or not self.sent_lengths:
            return []
        key = ()  # matches no transition, so the first step picks a start
        sentence = []
        for _ in range(choice(self.sent_lengths)):
            if key in transitions:
                word = choice(transitions[key])
                sentence.append(word)
                key = (key[1], word)
                if key in end_key_set:
                    key = choice(end_keys)
            else:
                key = choice(end_keys)
        return sentence

    def format_sentence(self, sent_list):
        """Turn a list of generated tokens into a printable sentence.

        Drops one leading punctuation token, title-cases the first word,
        removes '.' tokens and all-digit tokens, makes sure the last word
        has at least MIN_LAST_WORD_LEN characters, joins the words with
        spaces, appends a random terminator from SENT_END and tidies the
        spacing around punctuation.

        Args:
            sent_list: Tokens as returned by bigram_phrase() or
                markov_phrase(). The list is not modified.

        Returns:
            The formatted sentence, or '' when no word is left to format.
        """
        words = list(sent_list) if sent_list else []
        if words and words[0] in PUNC:
            del words[0]
        if words:
            words[0] = words[0].title()
        words = [w for w in words if w != '.' and not w.isdigit()]
        if not words:
            return ''
        if len(words[-1]) < self.MIN_LAST_WORD_LEN:
            # Drawing once from the long tokens picks from the same
            # distribution as redrawing from all tokens until a long one
            # turns up, but cannot spin forever on a corpus that has none.
            long_tokens = [
                t for t in self.tokens if len(t) >= self.MIN_LAST_WORD_LEN
            ]
            if long_tokens:
                words[-1] = choice(long_tokens)
        sent_string = " ".join(words)
        sent_string += choice(SENT_END)
        # Remove the space the join put in front of , ; ' " :
        sent_string = re.sub(r'\s(,|;|\'|\"|\:)', r'\1', sent_string)
        # Collapse a run of punctuation marks to its last character.
        sent_string = re.sub(r'(,|;|\'|\"|\:|`)+', r'\1', sent_string)
        return sent_string
# Script entry point: build the manifesto from the bundled corpus file
# when this module is executed directly rather than imported.
if __name__ == "__main__":
    generate_manifesto("arttech.txt")