Fix Python code style (#11)
tushuhei authored Nov 24, 2021
1 parent 9e30272 commit 1fdfbf8
Showing 4 changed files with 237 additions and 229 deletions.
52 changes: 26 additions & 26 deletions scripts/build_model.py
@@ -23,41 +23,41 @@


 def rollup(weights_filename: str, model_filename: str, scale: int = 1000):
-  """Rolls up the weights and outputs a model in JSON with integer scores.
+  """Rolls up the weights and outputs a model in JSON with integer scores.

   Args:
     weights_filename (str): A file path for the input weights file.
     model_filename (str): A file path for the output model file.
     scale (int, optional): A scale factor for the output score.
   """
-  decision_trees: typing.Dict[str, float] = dict()
-  with open(weights_filename) as f:
-    for row in f:
-      feature = row.split('\t')[0]
-      score = float(row.split('\t')[1])
-      decision_trees.setdefault(feature, 0)
-      decision_trees[feature] += score
-  with open(model_filename, 'w') as f:
-    decision_trees_intscore = dict(
-        (item[0], int(item[1] * scale)) for item in decision_trees.items())
-    json.dump(decision_trees_intscore, f)
+  decision_trees: typing.Dict[str, float] = dict()
+  with open(weights_filename) as f:
+    for row in f:
+      feature = row.split('\t')[0]
+      score = float(row.split('\t')[1])
+      decision_trees.setdefault(feature, 0)
+      decision_trees[feature] += score
+  with open(model_filename, 'w') as f:
+    decision_trees_intscore = dict(
+        (item[0], int(item[1] * scale)) for item in decision_trees.items())
+    json.dump(decision_trees_intscore, f)


 def main():
-  parser = argparse.ArgumentParser(description=__doc__)
-  parser.add_argument('weight_file',
-                      help='A file path for the learned weights.')
-  parser.add_argument(
-      '-o',
-      '--outfile',
-      help='A file path to export a model file. (default: model.json)',
-      default='model.json')
-  args = parser.parse_args()
-  weights_filename = args.weight_file
-  model_filename = args.outfile
-  rollup(weights_filename, model_filename)
-  print('Model file is exported as', model_filename)
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.add_argument(
+      'weight_file', help='A file path for the learned weights.')
+  parser.add_argument(
+      '-o',
+      '--outfile',
+      help='A file path to export a model file. (default: model.json)',
+      default='model.json')
+  args = parser.parse_args()
+  weights_filename = args.weight_file
+  model_filename = args.outfile
+  rollup(weights_filename, model_filename)
+  print('Model file is exported as', model_filename)


 if __name__ == '__main__':
-  main()
+  main()
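As a quick illustration of what rollup() computes (the feature names and scores below are invented; the real weights file comes from a training step outside this commit), duplicate features are summed and then scaled to integers:

import json
import typing

# Invented tab-separated (feature, score) rows; 'UW3:は' appears twice to
# show that duplicate features are summed before scaling.
rows = ['UW3:は\t0.12', 'UW3:は\t0.03', 'BW2:です\t-0.05']
decision_trees: typing.Dict[str, float] = {}
for row in rows:
  feature, score = row.split('\t')
  decision_trees.setdefault(feature, 0)
  decision_trees[feature] += float(score)
# Same int-scaling as rollup() with the default scale=1000.
print(json.dumps({k: int(v * 1000) for k, v in decision_trees.items()},
                 ensure_ascii=False))
# -> {"UW3:は": 150, "BW2:です": -50}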
31 changes: 16 additions & 15 deletions scripts/encode_data.py
@@ -18,22 +18,23 @@


 def main():
-  parser = argparse.ArgumentParser(description=__doc__)
-  parser.add_argument(
-      'source_data',
-      help='''File path of the source training data to extract features.''')
-  parser.add_argument('-o',
-                      '--outfile',
-                      help='''Output file path for the encoded training data.
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.add_argument(
+      'source_data',
+      help='''File path of the source training data to extract features.''')
+  parser.add_argument(
+      '-o',
+      '--outfile',
+      help='''Output file path for the encoded training data.
       (default: encoded_data.txt)''',
-                      default='encoded_data.txt')
-  args = parser.parse_args()
-  source_filename = args.source_data
-  train_data_filename = args.outfile
-  feature_extractor.process(source_filename, train_data_filename)
-  print('\033[92mEncoded training data is output to: %s\033[0m' %
-        (train_data_filename))
+      default='encoded_data.txt')
+  args = parser.parse_args()
+  source_filename = args.source_data
+  train_data_filename = args.outfile
+  feature_extractor.process(source_filename, train_data_filename)
+  print('\033[92mEncoded training data is output to: %s\033[0m' %
+        (train_data_filename))


 if __name__ == '__main__':
-  main()
+  main()
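Taken together, the three scripts in this commit form a small data pipeline: load_knbc.py (below) writes the chunked source text, encode_data.py encodes it into training data via feature_extractor.process, and build_model.py folds learned weights into the JSON model. A hypothetical end-to-end run from the repository root, using each script's default file names (weights.txt stands in for the output of the training step, which is not part of this diff):

  python scripts/load_knbc.py -o source.txt
  python scripts/encode_data.py source.txt -o encoded_data.txt
  python scripts/build_model.py weights.txt -o model.json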
152 changes: 79 additions & 73 deletions scripts/load_knbc.py
@@ -27,110 +27,116 @@


 class KNBCHTMLParser(HTMLParser):
-  """Parses the HTML files in the KNBC corpus and outputs the chunks."""
-  def __init__(self, split_tab: bool = True):
-    super().__init__()
-    self.chunks = ['']
-    self.n_rows = 0
-    self.n_cols = 0
-    self.current_word = None
-    self.split_tab = split_tab
-
-  def handle_starttag(self, tag, _):
-    if tag == 'tr':
-      self.n_rows += 1
-      self.n_cols = 0
-      self.current_word = None
-    if tag == 'td':
-      self.n_cols += 1
-
-  def handle_endtag(self, tag):
-    if tag != 'tr': return
-    if (self.n_rows > 2 and self.n_cols == 1
-        and (self.split_tab or self.current_word == '文節区切り')):
-      self.chunks.append('')
-    if self.n_cols == 5:
-      self.chunks[-1] += self.current_word
-
-  def handle_data(self, data):
-    if self.n_cols == 1:
-      self.current_word = data
+  """Parses the HTML files in the KNBC corpus and outputs the chunks."""
+
+  def __init__(self, split_tab: bool = True):
+    super().__init__()
+    self.chunks = ['']
+    self.n_rows = 0
+    self.n_cols = 0
+    self.current_word = None
+    self.split_tab = split_tab
+
+  def handle_starttag(self, tag, _):
+    if tag == 'tr':
+      self.n_rows += 1
+      self.n_cols = 0
+      self.current_word = None
+    if tag == 'td':
+      self.n_cols += 1
+
+  def handle_endtag(self, tag):
+    if tag != 'tr':
+      return
+    if (self.n_rows > 2 and self.n_cols == 1 and
+        (self.split_tab or self.current_word == '文節区切り')):
+      self.chunks.append('')
+    if self.n_cols == 5:
+      self.chunks[-1] += self.current_word
+
+  def handle_data(self, data):
+    if self.n_cols == 1:
+      self.current_word = data
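To make the row bookkeeping concrete, here is a sketch that feeds the parser a tiny hand-written table in the KNBC layout (two header rows, five-column morpheme rows with the surface form in the first cell, and a single-cell separator row). The HTML is invented for illustration and assumes KNBCHTMLParser above is in scope:

p = KNBCHTMLParser(split_tab=True)
p.feed('<table>'
       '<tr><td>ID</td></tr>'  # rows 1-2: headers, skipped by the n_rows > 2 guard
       '<tr><td>S-ID</td></tr>'
       '<tr><td>今日</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>'
       '<tr><td>は</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>'
       '<tr><td>文節区切り</td></tr>'  # single-cell row: starts a new chunk
       '<tr><td>晴れ</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>'
       '</table>')
print(p.chunks)  # -> ['今日は', '晴れ']

Each five-column row contributes its first cell to the current chunk; with split_tab=False, only single-cell rows containing 文節区切り (a bunsetsu boundary marker) start a new chunk.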


 def break_before_open_parentheses(chunks: typing.List[str]):
-  """Adds chunk breaks before every open parentheses.
+  """Adds chunk breaks before every open parentheses.

   Args:
     chunks (List[str]): Source chunks.
   Returns:
     Processed chunks.
   """
-  out: typing.List[str] = []
-  for chunk in chunks:
-    if '(' in chunk:
-      index = chunk.index('(')
-      if index > 0: out.append(chunk[:index])
-      out.append(chunk[index:])
-    else:
-      out.append(chunk)
-  return out
+  out: typing.List[str] = []
+  for chunk in chunks:
+    if '(' in chunk:
+      index = chunk.index('(')
+      if index > 0:
+        out.append(chunk[:index])
+      out.append(chunk[index:])
+    else:
+      out.append(chunk)
+  return out
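A one-line illustration of the splitting behavior (the input chunks are invented; assumes the function above is in scope):

print(break_before_open_parentheses(['京都', '観光(嵐山)です', '(笑']))
# -> ['京都', '観光', '(嵐山)です', '(笑']
# A break lands before each '(' unless the parenthesis is already chunk-initial.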


 def postprocess(chunks: typing.List[str]):
-  """Applies some processes to modify the extracted chunks.
+  """Applies some processes to modify the extracted chunks.

   Args:
     chunks (List[str]): Source chunks.
   Returns:
     Processed chunks.
   """
-  chunks = break_before_open_parentheses(chunks)
-  return chunks
+  chunks = break_before_open_parentheses(chunks)
+  return chunks


 def download_knbc(target_dir: str):
-  """Downloads the KNBC corpus and extracts files.
+  """Downloads the KNBC corpus and extracts files.

   Args:
     target_dir: A path to the directory to expand files.
   """
-  os.makedirs(target_dir, exist_ok=True)
-  download_file_path = os.path.join(target_dir, 'knbc.tar.bz2')
-  try:
-    urllib.request.urlretrieve(RESOURCE_URL, download_file_path)
-  except urllib.error.HTTPError:
-    print(f'\033[91mResource unavailable: {RESOURCE_URL}\033[0m')
-    raise
-  with tarfile.open(download_file_path, 'r:bz2') as t:
-    t.extractall(path=target_dir)
+  os.makedirs(target_dir, exist_ok=True)
+  download_file_path = os.path.join(target_dir, 'knbc.tar.bz2')
+  try:
+    urllib.request.urlretrieve(RESOURCE_URL, download_file_path)
+  except urllib.error.HTTPError:
+    print(f'\033[91mResource unavailable: {RESOURCE_URL}\033[0m')
+    raise
+  with tarfile.open(download_file_path, 'r:bz2') as t:
+    t.extractall(path=target_dir)
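Usage is a single call; note that main() below only triggers it when the extracted html/ directory is missing, so repeated runs skip the network round trip:

download_knbc('data')  # fetches knbc.tar.bz2 into data/ and extracts it in place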


 def main():
-  parser = argparse.ArgumentParser(description=__doc__)
-  parser.add_argument('-o',
-                      '--outfile',
-                      help='''File path to output the training data.
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.add_argument(
+      '-o',
+      '--outfile',
+      help='''File path to output the training data.
       (default: source.txt)''',
-                      default='source.txt')
-  args = parser.parse_args()
-  outfile = args.outfile
-  html_dir = 'data/KNBC_v1.0_090925_utf8/html/'
-  if not os.path.isdir(html_dir):
-    download_knbc('data')
-  with open(outfile, 'w') as f:
-    for file in sorted(os.listdir(html_dir)):
-      if file[-11:] != '-morph.html': continue
-      parser = KNBCHTMLParser(split_tab=False)
-      data = open(os.path.join(html_dir, file)).read()
-      parser.feed(data)
-      chunks = parser.chunks
-      chunks = postprocess(chunks)
-      if len(chunks) < 2: continue
-      f.write(utils.SEP.join(chunks) + '\n')
-  print('\033[92mTraining data is output to: %s\033[0m' % (outfile))
+      default='source.txt')
+  args = parser.parse_args()
+  outfile = args.outfile
+  html_dir = 'data/KNBC_v1.0_090925_utf8/html/'
+  if not os.path.isdir(html_dir):
+    download_knbc('data')
+  with open(outfile, 'w') as f:
+    for file in sorted(os.listdir(html_dir)):
+      if file[-11:] != '-morph.html':
+        continue
+      parser = KNBCHTMLParser(split_tab=False)
+      data = open(os.path.join(html_dir, file)).read()
+      parser.feed(data)
+      chunks = parser.chunks
+      chunks = postprocess(chunks)
+      if len(chunks) < 2:
+        continue
+      f.write(utils.SEP.join(chunks) + '\n')
+  print('\033[92mTraining data is output to: %s\033[0m' % (outfile))


 if __name__ == '__main__':
-  main()
+  main()
