-
Notifications
You must be signed in to change notification settings - Fork 0
/
splitbooks.py
117 lines (96 loc) · 4.35 KB
/
splitbooks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import csv
from hebrew_numbers import gematria_to_int
from pathlib import Path
import sys
import zipfile
class BookSplitter:
    """Split a tab-separated sheet export into one TSV file per book.

    The input is read row by row.  A row whose second column is ``'0'``
    carries a ``sefer/perek`` location in its first column (presumably the
    heading row of a chapter -- confirm against the sheet layout); the
    location is mapped to an output file stem by :meth:`get_file_stem`.
    Consecutive rows that map to the same stem are written to one file,
    ``<outdirpath>/<input file stem>/<book>.tsv``.
    """

    # Output root used when the constructor is called without an argument.
    DEFAULT_OUTDIR = r'C:\Users\Marc\code\miqra-data\source'

    def __init__(self, outdirpath=None):
        """Create a splitter.

        Args:
            outdirpath: Directory under which one sub-directory per input
                file is created.  Defaults to ``DEFAULT_OUTDIR``.
        """
        if outdirpath is None:
            outdirpath = self.DEFAULT_OUTDIR
        self.outdirpath = Path(outdirpath)
        # Running total of rows handed to save_book_rows by this instance.
        self.output_rows = 0

    def split_books(self, filepath):
        """Split the TSV file at *filepath* into per-book TSV files.

        Args:
            filepath: Path (str or Path) of the tab-separated input file.

        Raises:
            Exception: If the number of rows read differs from the number
                of rows saved for this file, e.g. when rows precede the
                first heading row and therefore belong to no book.
            ValueError: From get_file_stem when a heading row's location
                does not contain exactly one '/'.
        """
        self._path_obj = Path(filepath)
        print(filepath)
        book = ''
        book_rows = []
        input_rows = 0
        # output_rows accumulates across calls; remember where this call
        # started so the verification below only looks at this file.
        rows_before = self.output_rows
        # newline='' lets the csv module do its own line-ending handling.
        with open(filepath, encoding='utf-8', newline='') as tsvfile:
            for row in csv.reader(tsvfile, delimiter='\t'):
                input_rows += 1
                # Blank/short rows have no second column; they are kept as
                # ordinary rows of the current book.
                if len(row) > 1 and str(row[1]) == '0':
                    new_book = self.get_file_stem(row[0])
                    if new_book != book:
                        if book:
                            self.save_book_rows(book, book_rows)
                        book = new_book
                        book_rows = []
                book_rows.append(row)
        # Flush whatever is still pending after the last row.
        self.save_book_rows(book, book_rows)

        # verify and cleanup
        written_rows = self.output_rows - rows_before
        print('')
        print(' Input Rows: {}'.format(input_rows))
        print('Output Rows: {}'.format(written_rows))
        if input_rows != written_rows:
            raise Exception('input rows does not equal output rows')

    def get_file_stem(self, s):
        """Return the output file stem for the location string *s*.

        Args:
            s: Location of the form ``'<sefer>/<perek>'``.

        Returns:
            * For 'ספר בראשית': one of three stems chosen by the numeric
              value of perek (below 12, below 37, otherwise).
            * For 'ספר תהלים': one of five stems chosen by the numeric
              value of perek (below 42, 73, 90, 107, otherwise).
            * Otherwise, if perek contains a space: ``'<sefer> - <first
              word of perek>'`` with '"' replaced by U+05F4.
            * Otherwise: sefer unchanged.

        Raises:
            ValueError: If *s* does not contain exactly one '/'.
        """
        sefer, perek = s.split('/')
        if sefer == 'ספר בראשית':
            chapter = gematria_to_int(perek)
            if chapter < 12:
                return 'ספר בראשית - א-יא'
            if chapter < 37:
                return 'ספר בראשית - יב-לו'
            return 'ספר בראשית - לז-נ'
        if sefer == 'ספר תהלים':
            chapter = gematria_to_int(perek)
            if chapter < 42:
                return 'ספר תהלים - ראשון'
            if chapter < 73:
                return 'ספר תהלים - שני'
            if chapter < 90:
                return 'ספר תהלים - שלישי'
            if chapter < 107:
                return 'ספר תהלים - רביעי'
            return 'ספר תהלים - חמישי'
        if ' ' in perek:
            section = perek.split(' ')[0]
            # replace quotes, which can make a filename invalid, with proper gershayim
            section = section.replace('"', chr(0x05f4))
            return '{} - {}'.format(sefer, section)
        return sefer

    def save_book_rows(self, book, rows):
        """Write *rows* to ``<outdirpath>/<input stem>/<book>.tsv``.

        Args:
            book: File stem of the book; when empty nothing is written.
            rows: Sequence of rows (each a sequence of cell strings).

        The rows are added to ``output_rows`` even when *book* is empty and
        nothing is written, so a sheet without any heading row still passes
        the row-count check in split_books.
        """
        print('Saving {} rows...'.format(len(rows)))
        if book:
            dirpath = Path(self.outdirpath, self._path_obj.stem)
            # parents=True: the output root itself may not exist yet.
            dirpath.mkdir(parents=True, exist_ok=True)
            # Append the suffix instead of with_suffix(), which would cut
            # off everything after a '.' inside the book name.
            filepath = Path(dirpath, book + '.tsv')
            print('Saving to {}...'.format(filepath))
            with open(filepath, 'w', newline='', encoding='utf-8') as outfile:
                # To show TSV files nicely, Github requires any cell that has a quotation mark in it to:
                # 1. have the whole cell surrounded with quotation marks
                # 2. have any quotation marks doubled
                # The "excel-tab" dialect built into the Python csv module does this.
                tsvwriter = csv.writer(outfile, dialect='excel-tab')
                tsvwriter.writerows(rows)
        self.output_rows += len(rows)
if __name__ == '__main__':
    import shutil

    # Single-file alternative: BookSplitter().split_books(sys.argv[1])

    # Directory holding the downloaded sheet exports and the zip archive.
    dirname = r'C:\Users\marc\code\miqra-scripts\downloadfromsheets-cache'
    filenames = [
        'חמש מגילות.tsv',
        'כתובים אחרונים.tsv',
        'נביאים אחרונים.tsv',
        'נביאים ראשונים.tsv',
        'ספרי אמת.tsv',
        'תורה.tsv',
        'README.tsv',
        'templates תבניות.tsv'
    ]

    # Each sheet gets its own splitter instance, so row counts start at zero.
    for sheet_name in filenames:
        sheet_path = Path(dirname, sheet_name)
        BookSplitter().split_books(sheet_path)

    # Unpack the archive next to itself, then copy two HTML pages out of it.
    extract_dir = Path(dirname, 'Miqra_al_pi_ha-Masorah')
    archive_path = Path(dirname, 'Miqra_al_pi_ha-Masorah.zip')
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(extract_dir)

    html_copies = (
        ('README.html', r'C:\Users\Marc\code\miqra-data\source\README.html'),
        ('templates תבניות.html', r'C:\Users\Marc\code\miqra-data\source\templates תבניות.html'),
    )
    for html_name, destination in html_copies:
        shutil.copyfile(Path(extract_dir, html_name), destination)