-
Notifications
You must be signed in to change notification settings - Fork 0
/
logic.py
143 lines (105 loc) · 4.62 KB
/
logic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import re
import csv
import pandas as pd
import pypandoc
from docx import Document
from flask import Flask
from pathlib import Path
from operator import itemgetter
from utils import parse_date, remove_file
# Flask application object; the '/' route defined later in this file is
# registered on it, and it is what the __main__ guard runs.
app = Flask(__name__)
def everything_function(f: Path) -> None:
    """Run the full document-to-chronology pipeline on *f*.

    The pipeline converts *f* to Markdown, trims the Markdown, extracts the
    numbered paragraphs to CSV, extracts the dates from those paragraphs and
    finally writes ``draft-chronology.docx``. All intermediate files use fixed
    names in the current working directory.

    Every step's input file (including *f* itself) is disposable; only the
    output of the last step is kept. Cleanup happens in ``finally`` so that
    intermediates are removed even when a step raises, instead of being left
    behind by a half-finished run.

    Args:
        f: Path of the source document to process. It is deleted afterwards.

    Raises:
        Whatever exception a pipeline step raises; it propagates after cleanup.
    """
    # Each entry is (step function, [input path, output path]).
    steps = [
        (convert_to_markdown, [f, Path('input.md')]),
        (clean_markdown_document, [Path('input.md'), Path('cleaned.md')]),
        (extract_numbered_items_to_csv, [Path('cleaned.md'), Path('all_dates.csv')]),
        (extract_dates, [Path('all_dates.csv'), Path('dates_extracted.csv')]),
        (create_word_document_from_csv, [Path('dates_extracted.csv'), Path('draft-chronology.docx')]),
    ]
    try:
        for step, args in steps:
            step(*args)
    finally:
        # Remove every consumed input that actually exists. On failure some
        # intermediates were never created, so guard with exists() rather than
        # relying on how remove_file treats a missing path.
        for _, (consumed, _produced) in steps:
            if consumed.exists():
                remove_file(consumed)
def convert_to_markdown(input_file: Path, output_file: Path) -> Path:
    """Convert *input_file* to Markdown via pypandoc.

    Args:
        input_file: Document to convert.
        output_file: Destination passed to pypandoc as ``outputfile``.

    Returns:
        *output_file*, unchanged, so the call can be chained.
    """
    destination = str(output_file)
    pypandoc.convert_file(input_file, 'md', outputfile=destination)
    return output_file
def clean_markdown_document(input_file: Path, output_file: Path) -> None:
    """Trim a Markdown document down to its numbered body.

    Everything before the first occurrence of ``"1."`` is dropped. If a
    footnote definition (``"[^1]:"``) appears after that point, it and
    everything following it are dropped as well. When the text contains no
    ``"1."`` at all it is written out unchanged.

    Args:
        input_file: Markdown file to read (UTF-8).
        output_file: File that receives the trimmed text (UTF-8).
    """
    content = input_file.read_text(encoding='utf-8')

    list_start = content.find("1.")
    footnotes_start = content.find("[^1]:")

    if list_start == -1:
        # No numbered item found: nothing to anchor the trimming on.
        cleaned_content = content
    elif footnotes_start > list_start:
        # Footnotes follow the list: strip both the preamble and the
        # footnotes. (Slicing from 0 here would keep the preamble, which is
        # inconsistent with the no-footnote branch.)
        cleaned_content = content[list_start:footnotes_start]
    else:
        # No footnotes, or they precede the list: strip the preamble only.
        cleaned_content = content[list_start:]

    output_file.write_text(cleaned_content, encoding='utf-8')
    print(f"Markdown document cleaned and saved as {output_file}")
def extract_numbered_items_to_csv(input_file: Path, output_file: Path) -> None:
    """Extract numbered paragraphs from Markdown text into a two-column CSV.

    An item starts at ``<digits>.`` and runs up to the newline that precedes
    either the next ``<digits>. `` marker or the end of the text, so an item
    may span several lines. The CSV has a ``Number,Text`` header followed by
    one row per item, with surrounding whitespace stripped from the text.

    Args:
        input_file: Markdown file to read (UTF-8).
        output_file: CSV file to write (UTF-8).
    """
    content = input_file.read_text(encoding='utf-8')

    # The pattern needs a newline after every item. Without this, the last
    # item of a file that lacks a trailing newline is silently dropped.
    if not content.endswith('\n'):
        content += '\n'

    pattern = r'(\d+)\.\s*(.*?)\n(?=\d+\.\s|$)'
    matches = re.findall(pattern, content, re.DOTALL)

    with output_file.open('w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Number', 'Text'])
        writer.writerows([number, text.strip()] for number, text in matches)

    print(f"Data has been extracted and saved to {output_file}")
def extract_dates(input_file: Path, output_file: Path) -> None:
    """Build a date-sorted CSV of every date mentioned in the numbered paragraphs.

    Reads a CSV with ``Number`` and ``Text`` columns, searches each text with
    a set of date regexes, and passes every hit to ``parse_date``. Each hit
    for which ``parse_date`` returns a truthy value yields one output row, so
    a paragraph appears once per recognised date. Rows are sorted by the
    parsed date and written with the date formatted as ``YYYY-MM-DD``.

    Args:
        input_file: CSV with ``Number`` and ``Text`` columns (UTF-8).
        output_file: CSV to write with ``Date``, ``Text`` and
            ``Paragraph Number`` columns (UTF-8).
    """
    # Numeric d/m/y, "1 January 2020", "January 1, 2020" and ISO forms.
    date_regexes = [
        re.compile(expression)
        for expression in (
            r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
            r'\b(\d{1,2} [A-Za-z]{3,9} \d{2,4})\b',
            r'\b([A-Za-z]{3,9} \d{1,2}, \d{2,4})\b',
            r'\b(\d{4}-\d{2}-\d{2})\b',
        )
    ]
    columns = ['Date', 'Text', 'Paragraph Number']
    found = []

    with input_file.open('r', encoding='utf-8') as source:
        for record in csv.DictReader(source):
            number = record['Number']
            body = record['Text']
            for regex in date_regexes:
                for candidate in regex.findall(body):
                    parsed = parse_date(candidate)
                    if not parsed:
                        # Looked like a date but could not be parsed.
                        continue
                    found.append({'Date': parsed, 'Text': body, 'Paragraph Number': number})

    found.sort(key=itemgetter('Date'))

    with output_file.open('w', newline='', encoding='utf-8') as sink:
        writer = csv.DictWriter(sink, fieldnames=columns)
        writer.writeheader()
        for item in found:
            writer.writerow({
                'Date': item['Date'].strftime("%Y-%m-%d"),
                'Text': item['Text'],
                'Paragraph Number': item['Paragraph Number'],
            })

    print(f"Date extraction completed and saved to {output_file}")
def create_word_document_from_csv(input_file: Path, output_file: Path) -> None:
    """Render the dates CSV as a three-column table in a Word document.

    The document gets a level-1 heading "Draft Chronology" followed by a
    'Table Grid' styled table: one header row, then one row per CSV record
    with the ``Date``, ``Text`` and ``Paragraph Number`` values as strings.

    Args:
        input_file: CSV with ``Date``, ``Text`` and ``Paragraph Number`` columns.
        output_file: Path the .docx file is saved to.
    """
    columns = ('Date', 'Text', 'Paragraph Number')
    frame = pd.read_csv(input_file)

    document = Document()
    document.add_heading('Draft Chronology', level=1)

    table = document.add_table(rows=1, cols=3)
    table.style = 'Table Grid'

    # Header row: the column titles are the CSV column names.
    for cell, title in zip(table.rows[0].cells, columns):
        cell.text = title

    # One table row per CSV record, in file order.
    for _, record in frame.iterrows():
        cells = table.add_row().cells
        for cell, column in zip(cells, columns):
            cell.text = str(record[column])

    document.save(output_file)
    print(f"Word document '{output_file}' created successfully.")
@app.route('/')
def main() -> str:
    """Handle ``GET /``: run the chronology pipeline and report completion.

    Returns:
        A short plain-text confirmation. Flask requires a view to return a
        response; returning ``None`` makes Flask raise ``TypeError`` and
        answer with HTTP 500 even though the pipeline succeeded.
    """
    # NOTE(review): the input path is a placeholder and must be replaced with
    # the real source document before this route is usable.
    everything_function(Path('your_input_file_here'))
    return 'Draft chronology created.'
# Start the Flask server only when this file is executed directly, not when
# it is imported.
# NOTE(review): debug=True turns on Flask's debug mode; confirm this entry
# point is only used for local development.
if __name__ == '__main__':
    app.run(debug=True)