-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild.py
executable file
·392 lines (279 loc) · 10.6 KB
/
build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
#!/usr/bin/env python3
import app
import fnmatch
import json
import logging
import os
import traceback
# Ordered table of the books, one "book_name=>title" entry per line.
# Each entry is later split on "=>": the left side names the output file
# (<book_name>.txt) and the right side is compared against the stripped lines
# of the source text to find where that book starts. The title of the
# following entry marks where the book ends, so the order of the entries
# matters. The empty line produced by the opening triple quote is skipped
# when the table is parsed.
titles = """
genesis=>The First Book of Moses: Called Genesis
exodus=>The Second Book of Moses: Called Exodus
leviticus=>The Third Book of Moses: Called Leviticus
numbers=>The Fourth Book of Moses: Called Numbers
deuteronomy=>The Fifth Book of Moses: Called Deuteronomy
joshua=>The Book of Joshua
judges=>The Book of Judges
ruth=>The Book of Ruth
1_samuel=>The First Book of Samuel
2_samuel=>The Second Book of Samuel
1_kings=>The First Book of the Kings
2_kings=>The Second Book of the Kings
1_chronicles=>The First Book of the Chronicles
2_chronicles=>The Second Book of the Chronicles
ezra=>Ezra
nehemiah=>The Book of Nehemiah
esther=>The Book of Esther
job=>The Book of Job
psalms=>The Book of Psalms
proverbs=>The Proverbs
ecclesiastes=>Ecclesiastes
songs=>The Song of Solomon
isaiah=>The Book of the Prophet Isaiah
jeremiah=>The Book of the Prophet Jeremiah
lamentations=>The Lamentations of Jeremiah
ezekiel=>The Book of the Prophet Ezekiel
daniel=>The Book of Daniel
hosea=>Hosea
joel=>Joel
amos=>Amos
obadiah=>Obadiah
jonah=>Jonah
micah=>Micah
nahum=>Nahum
habakkuk=>Habakkuk
zephaniah=>Zephaniah
haggai=>Haggai
zechariah=>Zechariah
malachi=>Malachi
matthew=>The Gospel According to Saint Matthew
mark=>The Gospel According to Saint Mark
luke=>The Gospel According to Saint Luke
john=>The Gospel According to Saint John
acts=>The Acts of the Apostles
romans=>The Epistle of Paul the Apostle to the Romans
1_corinthians=>The First Epistle of Paul the Apostle to the Corinthians
2_corinthians=>The Second Epistle of Paul the Apostle to the Corinthians
galatians=>The Epistle of Paul the Apostle to the Galatians
ephesians=>The Epistle of Paul the Apostle to the Ephesians
philippians=>The Epistle of Paul the Apostle to the Philippians
colossians=>The Epistle of Paul the Apostle to the Colossians
1_thessalonians=>The First Epistle of Paul the Apostle to the Thessalonians
2_thessalonians=>The Second Epistle of Paul the Apostle to the Thessalonians
1_timothy=>The First Epistle of Paul the Apostle to Timothy
2_timothy=>The Second Epistle of Paul the Apostle to Timothy
titus=>The Epistle of Paul the Apostle to Titus
philemon=>The Epistle of Paul the Apostle to Philemon
hebrews=>The Epistle of Paul the Apostle to the Hebrews
james=>The General Epistle of James
1_peter=>The First Epistle General of Peter
2_peter=>The Second General Epistle of Peter
1_john=>The First Epistle General of John
2_john=>The Second Epistle General of John
3_john=>The Third Epistle General of John
jude=>The General Epistle of Jude
revelation=>The Revelation of Saint John the Divine
"""
# Book names (the left-hand side of the titles table) that make up the
# New Testament. A book whose name appears here is written to the "nt"
# output directory; every other book goes to the "ot" directory.
nt = [
    # gospels and acts
    "matthew", "mark", "luke", "john", "acts",
    # epistles
    "romans", "1_corinthians", "2_corinthians", "galatians", "ephesians",
    "philippians", "colossians", "1_thessalonians", "2_thessalonians",
    "1_timothy", "2_timothy", "titus", "philemon", "hebrews", "james",
    "1_peter", "2_peter", "1_john", "2_john", "3_john", "jude",
    # apocalypse
    "revelation",
]
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the complete source file as a list of lines.
file_spec = app.dirs.data.join("10.txt.utf-8")
lines = app.read_list_file(file_spec)

content = []          # scripture lines that survive the filter below
total = 0             # how many times the Genesis title has been seen so far
is_add_lines = False  # True while the scan is inside the scripture text

# Keep only the scripture portion of the source file:
#   * empty lines are dropped;
#   * nothing is kept until the Genesis title has been seen twice
#     (NOTE(review): presumably the first hit is a table of contents - confirm
#     against the source file);
#   * lines containing "***" and the New Testament divider line are dropped;
#   * the scan stops at the end-of-ebook marker.
for line in lines:
    if not line:
        continue

    if "The First Book of Moses: Called Genesis" in line:
        total += 1
        if total >= 2:
            is_add_lines = True

    if "*** END OF THE PROJECT GUTENBERG EBOOK THE KING JAMES VERSION OF THE BIBLE ***" in line:
        # Nothing after the end marker is scripture. Leaving the loop here
        # guarantees the flag cannot be switched back on by a later line, so
        # trailing text can never leak into the last book.
        is_add_lines = False
        break

    if is_add_lines and "***" not in line and "The New Testament of the King James Bible" not in line:
        content.append(line)
# Parse the titles table above into its non-empty "book_name=>title"
# entries, preserving the order in which they are listed.
keys = [entry for entry in titles.splitlines() if entry]
def find_book_bounds(content, title, next_title):
    """Locate one book inside the combined scripture lines.

    Lines are compared after ``strip()``, so surrounding whitespace on a
    heading line does not prevent a match.

    Args:
        content: list of text lines to scan.
        title: heading line that opens the book.
        next_title: heading line of the following book, or ``""`` when the
            book is the last one and runs to the end of ``content``.

    Returns:
        A ``(index_begin, index_end)`` tuple. ``index_begin`` is the index of
        the title line, or -1 when the title is not found. ``index_end`` is
        the index of the next book's title line, or -1 when it is not found
        or when ``next_title`` is empty.
    """
    index_begin = -1
    index_end = -1
    for index_line, line in enumerate(content):
        stripped = line.strip()
        if index_begin < 0:
            # still looking for the start of the book
            if stripped == title:
                index_begin = index_line
        elif next_title and stripped == next_title:
            # An empty next_title must never match: otherwise a
            # whitespace-only line (which strips to "") would be mistaken
            # for the end of the last book and truncate it.
            index_end = index_line
            break
    return index_begin, index_end


try:
    # Walk the ordered "book_name=>title" entries. The lines of each book
    # are everything between its own title line and the title line of the
    # next entry; the last book has no next entry and runs to the end.
    for index, key in enumerate(keys):
        book_name, title = key.split("=>")

        next_title = ""
        if index < len(keys) - 1:
            next_title = keys[index + 1].split("=>")[1]

        index_begin, index_end = find_book_bounds(content, title, next_title)

        if index_begin < 0:
            # No file is written for a book whose title never shows up;
            # report it instead of skipping silently.
            logging.warning("title not found for book '%s'; no file written", book_name)
            continue

        # The title line itself is excluded from the book content.
        if index_end > 0:
            book_content = content[index_begin + 1:index_end]
        else:
            book_content = content[index_begin + 1:]

        # Old Testament directory by default, New Testament when listed in nt.
        dir_output = app.dirs.data.text.ot
        if book_name in nt:
            dir_output = app.dirs.data.text.nt

        file_spec = dir_output.join(f"{book_name}.txt")
        app.write_list_file(file_spec, book_content)
except Exception:
    # Top-level boundary of the script: report the failure and carry on
    # with the next stage.
    traceback.print_exc()
def is_verse_marker(token):
    """Return True when *token* has the form ``<digits>:<digits>``, e.g. "3:16".

    Tokens with no colon, with non-numeric parts, or with more than one
    colon are not markers.
    """
    before, separator, after = token.partition(":")
    return bool(separator) and before.isdigit() and after.isdigit()


def split_verses(buffer):
    """Split one long string of scripture into one string per verse.

    The string is tokenised on single spaces. Every token that is a
    chapter:verse marker starts a new verse, which runs up to (but not
    including) the next marker; the last verse runs to the end. Anything
    before the first marker is discarded.

    Args:
        buffer: the text of a book as a single string.

    Returns:
        A list of strings, each beginning with its chapter:verse marker.
        The list is empty when the string contains no marker.
    """
    tokens = buffer.split(" ")
    starts = [position for position, token in enumerate(tokens) if is_verse_marker(token)]
    # Each verse ends where the next one begins; the last ends with the tokens.
    ends = starts[1:] + [len(tokens)]
    return [" ".join(tokens[begin:end]) for begin, end in zip(starts, ends)]


def group_by_chapter(verse_lines):
    """Group "chapter:verse text" lines into a list of chapters.

    The first space-separated token of each line is taken to be the
    chapter:verse marker and is removed from the stored text. A new chapter
    list is started whenever the chapter number differs from the one
    currently being collected (which starts at 1). Blank lines are ignored.

    Args:
        verse_lines: iterable of strings, one verse per item.

    Returns:
        A list of lists of verse texts. There is always at least one
        (possibly empty) chapter list.

    Raises:
        ValueError: when a non-blank line does not start with a number
            before the first ":".
    """
    chapters = [[]]
    chapter_number = 1
    for line in verse_lines:
        if not line:
            continue
        marker, _, text = line.partition(" ")
        chapter = int(marker.split(":")[0])
        if chapter != chapter_number:
            chapter_number = chapter
            chapters.append([])
        chapters[-1].append(text)
    return chapters


try:
    dir_source = app.dirs.data.text

    # The per-book files written earlier still carry the line wrapping of
    # the source text and may have stray text before the first verse.
    # Each file is rewritten with exactly one verse per line and then
    # exported as JSON (a list of chapters, each a list of verse texts).
    for root, directories, files in os.walk(dir_source.to_string(), topdown=True):
        for file_name in fnmatch.filter(files, "*.txt"):
            file_spec = os.path.join(root, file_name)
            lines = app.read_list_file(file_spec)

            # Glue all non-empty lines into one string, separated by single
            # spaces, and trim the ends.
            buffer = " ".join(line for line in lines if line).strip()

            # Anything before the first "1:1" is not verse text.
            index = buffer.find("1:1")
            if index < 0:
                # Without a first verse there is nothing to rebuild. Leave
                # this file alone and keep going with the others instead of
                # aborting the whole run.
                logging.warning("no 1:1 marker found in %s; file left untouched", file_spec)
                continue
            buffer = buffer[index:]

            # Rewrite the text file with one verse per line.
            new_contents = split_verses(buffer)
            app.write_list_file(file_spec, new_contents)

            # Derive the JSON target path from the text path.
            # NOTE(review): both replace() calls act on every occurrence in
            # the full path, so a parent directory whose name contains
            # "text" would be rewritten too - confirm the directory layout.
            file_spec_json = file_spec.replace(".txt", ".json")
            file_spec_json = file_spec_json.replace("text", "json")

            # Read the rewritten file back and group its verses by chapter.
            lines = app.read_list_file(file_spec)
            json_content = group_by_chapter(lines)

            with open(file_spec_json, "w", encoding="utf-8") as f:
                f.write(json.dumps(json_content, indent=4, ensure_ascii=False))
except Exception:
    # Top-level boundary of the script: report the failure.
    traceback.print_exc()

logging.info("complete")