-
Notifications
You must be signed in to change notification settings - Fork 3
/
text_select_part3.py
61 lines (49 loc) · 2.12 KB
/
text_select_part3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
# Directory the original author described as the source of the txt files.
# NOTE(review): not referenced anywhere else in this script - confirm it is still needed.
saved_path = "E:/Thesis stuff/10k abbot/python/10ktxt/"
# Input directory: the loop further down lists this folder and reads every file in it.
selected_path = "E:/Thesis stuff/10k abbot/python/Multiple 10k/10k_select/"
# Output directory: extracted sections are written here under the input file's name.
item1_path = "E:/Thesis stuff/10k abbot/python/10kcap/"
# Names of the input files to process (listed once, when the script starts).
list_txt = os.listdir(selected_path)
# Names currently in the output directory. The list itself is not used by the
# code below; the call does fail early if the output directory does not exist.
saved_txt = os.listdir(item1_path)
def allindices(text, sub, listindex=None):
    """Return the start index of every occurrence of ``sub`` in ``text``.

    Occurrences are reported in ascending order and may overlap, because the
    search resumes one character after each hit (``"aaa"`` contains ``"aa"``
    at 0 and 1).

    Args:
        text: String to search in.
        sub: Substring to look for.
        listindex: Optional list to append the indices to. When omitted, a
            new list is created for this call, so results never leak from
            one call into the next.

    Returns:
        The list holding the indices (``listindex`` itself when one was
        passed in, otherwise the newly created list).
    """
    # A literal ``[]`` default would be created once and shared by every
    # call, making results accumulate; use a None sentinel instead.
    if listindex is None:
        listindex = []
    i = text.find(sub)
    while i >= 0:
        listindex.append(i)
        # Advance by one character only, so overlapping matches are found.
        i = text.find(sub, i + 1)
    return listindex
# Markers delimiting the section to extract: the kept text runs from an
# occurrence of the start marker up to (not including) the matching
# occurrence of the end marker.
# NOTE(review): 'ITEM XX.' looks like a placeholder - confirm the intended
# start marker before running.
item1_begin = 'ITEM XX.'
item3_begin = 'ITEM 3.'
# A slice shorter than this many characters is rejected and the next
# occurrence is tried instead (presumably to skip table-of-contents hits -
# TODO confirm).
MIN_SECTION_LENGTH = 2000

for text in list_txt:
    # Read the whole document up front. The context manager guarantees the
    # handle is closed on every path, including files that are skipped, and
    # plain read mode avoids requiring write permission on the input.
    with open(os.path.join(selected_path, text), "r", encoding="utf-8") as source_file:
        file_read = source_file.read()

    # Positions of every start and end marker. A fresh list is passed
    # explicitly so indices can never carry over from a previous file.
    begin = allindices(file_read, item1_begin, listindex=[])
    end = allindices(file_read, item3_begin, listindex=[])

    # Try the first pair of markers, then the second pair.
    for attempt in (0, 1):
        try:
            save_txt = file_read[begin[attempt]:end[attempt]].strip()
        except IndexError:
            # A marker is missing for this attempt: skip the file silently.
            break
        if len(save_txt) >= MIN_SECTION_LENGTH:
            # Save the selection under the same file name in the output folder.
            with open(os.path.join(item1_path, text), "w", encoding="utf-8") as saved_file:
                saved_file.write(save_txt)
            print(str(attempt + 1) + " works for " + text)
            break
    else:
        # Both candidate slices existed but were too short.
        print("not here " + text)