-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompileTrainingData.py
72 lines (65 loc) · 2.18 KB
/
compileTrainingData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import io
import codecs
import format
from pathlib import Path
# --- Configuration -----------------------------------------------------------
# Display name of the account owner as it appears in the exported dialogues.
# Must be lowercase, e.g. "first-name last-name".
vkName = "имя фамилия"
# Prefix that marks a line written by the account owner.
myNameVkOpt = '/./' + vkName
# Root directory that is searched recursively for dialogue files.
dialoguesPath = 'C:/dialogues'

# --- Shared state used by writeLines() / composeFile() -----------------------
# Parallel lists: alines[i] is the other person's line, blines[i] the reply.
alines, blines = [], []
# True while lines belong to the other person, False while they are ours.
writeToA = True
# True once one reply line has been taken for the current exchange.
blockB = False

# --- Output ------------------------------------------------------------------
# Opened (and truncated) as soon as the module is loaded; closed by composeFile().
trainc = codecs.open('train.txt', 'w', encoding='utf-8')
def writeLines(lines, oppNameVkOpt):
    """Collect prompt/reply pairs from one dialogue into ``alines``/``blines``.

    A line starting with ``oppNameVkOpt`` switches to the other person's
    messages; a line starting with the module-level ``myNameVkOpt`` switches
    to the account owner's messages. From each exchange the function keeps:

    * the last line of the other person's text directly before one of our
      headers (stored in ``alines``), and
    * the first line of our answer (stored in ``blines``).

    Every stored line is passed through ``format.format`` (the project-local
    ``format`` module imported at the top of the file, not the builtin).

    ``alines`` and ``blines`` are later read index-by-index, so they are kept
    aligned: a prompt that never received a reply is replaced by the next
    prompt, and a reply that has no pending prompt is dropped. Without this,
    one such message would shift every following pair by one.

    Args:
        lines: Lines of one dialogue, in order.
        oppNameVkOpt: Header prefix that marks the other person's messages.

    Side effects:
        Appends to the module-level ``alines``/``blines`` and updates the
        module-level flags ``writeToA`` and ``blockB``.
    """
    global writeToA
    global blockB
    for i, line in enumerate(lines):
        if line.startswith(oppNameVkOpt):
            # New message from the other person: a fresh reply may be taken.
            writeToA = True
            blockB = False
        elif line.startswith(myNameVkOpt):
            writeToA = False
        elif writeToA:
            # Only the line immediately before our header is used as a prompt.
            next_is_mine = (
                i + 1 < len(lines) and lines[i + 1].startswith(myNameVkOpt)
            )
            if not next_is_mine:
                continue
            prompt = format.format(line)
            if len(alines) > len(blines):
                # The previous prompt was never answered; overwrite it so the
                # two lists stay index-aligned.
                alines[-1] = prompt
            else:
                alines.append(prompt)
        elif not blockB:
            # Take the first line of our answer, but only if a prompt is
            # actually waiting for it.
            if len(alines) > len(blines):
                blines.append(format.format(line))
            blockB = True
def composeFile():
    """Write the collected prompt/reply pairs to the training file.

    Pairs are taken index-by-index from the module-level ``alines`` and
    ``blines``. Each kept pair becomes one line of the form
    ``prompt<TAB>reply<NEWLINE>`` in ``trainc``. A pair is skipped when either
    side is empty or when either side contains ``'http'``.

    The output file ``trainc`` is always closed before returning, even if a
    write fails.
    """
    print("lines in a: ", len(alines))
    print("lines in b: ", len(blines))
    try:
        # zip stops at the shorter list, so a trailing line without a partner
        # is ignored.
        for prompt, reply in zip(alines, blines):
            if prompt == '' or reply == '':
                continue
            # Both sides must be tested explicitly: `'http' in (a or b)` only
            # ever looks at `a`, because `or` returns its first truthy operand.
            if 'http' in prompt or 'http' in reply:
                continue
            trainc.write(prompt.strip('\n'))
            trainc.write('\t')
            trainc.write(reply.strip('\n'))
            trainc.write('\n')
        # Reports how many prompt lines were iterated, not how many pairs
        # were written.
        print("formatted lines: ", len(alines))
    finally:
        trainc.close()
def main():
    """Read every dialogue file under ``dialoguesPath`` and build the training file.

    For each regular file found recursively below the root directory:

    * the other person's name is taken from the name of the file's parent
      folder (lowercased, cut at the first ``'('``) and prefixed with ``/./``;
    * the line list is seeded with that header, an empty line, our own header
      (``myNameVkOpt``) and another empty line;
    * every line of the file that does not start with a tab is stripped,
      lowercased and appended;
    * the result is handed to ``writeLines``.

    After all files are processed, ``composeFile`` writes the output.
    """
    root = Path(dialoguesPath)
    dialogue_files = [entry for entry in root.glob('**/*') if entry.is_file()]

    for dialogue_path in dialogue_files:
        with io.open(dialogue_path, encoding='utf-8') as handle:
            folder = os.path.basename(os.path.dirname(handle.name)).lower()
            oppNameVkOpt = "/./" + folder.split('(')[0]

            # Seed with both headers so each file starts from a known state.
            lines = [oppNameVkOpt, '', myNameVkOpt, '']
            lines.extend(
                raw.rstrip('\n').strip().lower()
                for raw in handle
                if not raw.startswith('\t')
            )
        writeLines(lines, oppNameVkOpt)

    composeFile()
# Script entry point. NOTE(review): there is no `if __name__ == "__main__":`
# guard, so the whole pipeline also runs whenever this module is imported.
main()