-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_freqs.py
120 lines (98 loc) · 3.37 KB
/
merge_freqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import argparse
import sqlite3
import yaml
from logging import getLogger,config
from pathlib import Path
# Configure logging from the YAML file expected in the current working
# directory, then create the module-level logger used by every function below.
with open("./logging_config.yaml", mode="r", encoding="utf-8") as config_file:
    logging_config = yaml.safe_load(config_file)

config.dictConfig(logging_config)
logger = getLogger(__name__)
def fn_gather_local(input_dirname: str, output_filepath: str):
    """Copy the ``freqs`` rows of every input database into one table.

    Creates the table ``local_freqs`` in the SQLite database at
    *output_filepath*, then visits the ``*.db`` files directly inside
    *input_dirname* in sorted order, attaching each one and appending its
    ``(word, freq)`` rows. Rows are copied as they are; nothing is summed
    here.

    Args:
        input_dirname: Directory searched (non-recursively) for ``*.db`` files.
        output_filepath: Path of the SQLite database that receives the rows.

    Raises:
        sqlite3.OperationalError: Raised by SQLite, for example when
            ``local_freqs`` already exists in the output database or an
            input file has no ``freqs`` table.
    """
    # Imported locally so the function does not rely on a new file-level import.
    from contextlib import closing

    # Get input files
    input_files = sorted(Path(input_dirname).glob("*.db"))
    logger.info(f"{len(input_files)} files exist in the input directory")

    logger.info("Creating table to gather local frequencies...")
    # The sqlite3 connection context manager only commits or rolls back;
    # closing() is what actually releases the database file when we are done.
    with closing(sqlite3.connect(output_filepath)) as conn:
        # TEXT, not STRING: SQLite gives a column declared STRING numeric
        # affinity, so a word made of digits such as "2024" would be stored
        # as a number instead of as text.
        conn.execute(
            """
            CREATE TABLE local_freqs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word TEXT NOT NULL,
                freq INTEGER NOT NULL
            );
            """
        )
        conn.commit()

        # Insert all records of the local frequency tables into the gathering table
        logger.info("Start gathering records from local frequency tables...")
        for input_file in input_files:
            logger.info(f"Processing '{input_file.name}'")

            # Bind the path as a parameter so that quotes or other special
            # characters in a file or directory name cannot break the SQL.
            conn.execute("ATTACH DATABASE ? AS tmpdb;", (str(input_file),))

            conn.execute(
                """
                INSERT INTO local_freqs (word,freq)
                SELECT word,freq
                FROM tmpdb.freqs;
                """
            )
            # Commit before detaching: SQLite refuses to detach a database
            # that is still part of an open transaction.
            conn.commit()

            conn.execute("DETACH tmpdb;")

    logger.info("Finished gathering records from local frequency tables")
def fn_merge_local(output_filepath: str):
    """Collapse ``local_freqs`` into one summed row per word.

    Creates the table ``freqs`` in the SQLite database at *output_filepath*,
    fills it with each distinct ``word`` of ``local_freqs`` together with the
    sum of its ``freq`` values, and finally drops ``local_freqs``.

    Args:
        output_filepath: Path of the SQLite database holding ``local_freqs``.

    Raises:
        sqlite3.OperationalError: Raised by SQLite, for example when
            ``freqs`` already exists or ``local_freqs`` is missing.
    """
    # Imported locally so the function does not rely on a new file-level import.
    from contextlib import closing

    logger.info("Start merging local frequencies...")
    # The sqlite3 connection context manager only commits or rolls back;
    # closing() is what actually releases the database file when we are done.
    with closing(sqlite3.connect(output_filepath)) as conn:
        # TEXT, not STRING: SQLite gives a column declared STRING numeric
        # affinity, so distinct words made of digits (e.g. "007" and "7")
        # would be coerced to the same number when inserted.
        conn.execute(
            """
            CREATE TABLE freqs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word TEXT NOT NULL,
                freq INTEGER NOT NULL
            );
            """
        )
        conn.commit()

        # Consolidate local frequencies: one row per word with the total count
        conn.execute(
            """
            INSERT INTO freqs (word,freq)
            SELECT word,SUM(freq)
            FROM local_freqs
            GROUP BY word;
            """
        )
        conn.commit()

        # The gathered rows are no longer needed once they have been summed
        conn.execute("DROP TABLE local_freqs;")
        conn.commit()

    logger.info("Finished merging local frequencies")
def main(args):
    """Run the steps selected on the command line.

    Reads ``input_dirname``, ``output_filepath``, ``gather_local`` and
    ``merge_local`` from *args*. When ``gather_local`` is set the gathering
    step runs first; when ``merge_local`` is set the merging step runs after
    it. With neither flag set nothing is executed.

    Args:
        args: Object exposing the four attributes listed above, e.g. the
            namespace returned by ``argparse``.
    """
    # Read every option up front, before anything is logged or executed
    source_dir, target_db, do_gather, do_merge = (
        args.input_dirname,
        args.output_filepath,
        args.gather_local,
        args.merge_local,
    )

    logger.debug(args)

    if do_gather:
        fn_gather_local(source_dir, target_db)
    if do_merge:
        fn_merge_local(target_db)
if __name__ == "__main__":
    # Command-line entry point: parse the options, check that the selected
    # steps have the paths they need, then hand over to main().
    parser = argparse.ArgumentParser(
        description=(
            "Gather the freqs tables of several SQLite files into one "
            "database and merge them into per-word totals."
        )
    )
    parser.add_argument(
        "-i",
        "--input-dirname",
        type=str,
        help="directory containing the *.db files to gather",
    )
    parser.add_argument(
        "-o",
        "--output-filepath",
        type=str,
        help="SQLite database file that receives the gathered/merged table",
    )
    parser.add_argument(
        "--gather-local",
        action="store_true",
        help="copy the freqs rows of every input file into local_freqs",
    )
    parser.add_argument(
        "--merge-local",
        action="store_true",
        help="sum local_freqs per word into freqs and drop local_freqs",
    )
    args = parser.parse_args()

    # Both paths default to None; report a usage error here rather than
    # letting a step fail later with a TypeError on a None path.
    if args.gather_local and args.input_dirname is None:
        parser.error("--gather-local requires -i/--input-dirname")
    if (args.gather_local or args.merge_local) and args.output_filepath is None:
        parser.error(
            "-o/--output-filepath is required with --gather-local or --merge-local"
        )

    main(args)