Skip to content

Commit

Permalink
clean build include/exclude usage
Browse files Browse the repository at this point in the history
  • Loading branch information
barrust committed Jan 20, 2024
1 parent 99bbd88 commit c36ebe4
Showing 1 changed file with 102 additions and 136 deletions.
238 changes: 102 additions & 136 deletions scripts/build_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,16 @@ def load_file(filename, encoding="utf-8"):
yield fobj


def load_include_exclude(filename, encoding="utf-8"):
    """Yield each whitespace-separated token from an include/exclude word list.

    Lines beginning with ``#`` are treated as comments and skipped.  Every
    remaining line is split on whitespace and each token is yielded
    lower-cased.

    Args:
        filename (str): path to the include/exclude file
        encoding (str): encoding used to open the file; defaults to "utf-8"
    Yields:
        str: each word from the file, lower-cased
    """
    with load_file(filename=filename, encoding=encoding) as fobj:
        for line in fobj:
            # startswith() is safe on an empty string, unlike line[0]
            if line.startswith("#"):
                continue
            # split() with no argument already discards surrounding
            # whitespace, so no separate strip() is needed
            for token in line.split():
                yield token.lower()


def export_word_frequency(filepath, word_frequency):
"""Export a word frequency as a json object
Expand Down Expand Up @@ -80,10 +90,10 @@ def build_word_frequency(filepath, language, output_path):
"""
# NLTK is only needed in this portion of the project
try:
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
import nltk # type: ignore
from nltk.tag import pos_tag # type: ignore
from nltk.tokenize import WhitespaceTokenizer # type: ignore
from nltk.tokenize.toktok import ToktokTokenizer # type: ignore
except ImportError as ex:
raise ImportError("To build a dictioary from scratch, NLTK is required!\n{}".format(ex.message))

Expand Down Expand Up @@ -237,11 +247,9 @@ def clean_english(word_frequency, filepath_exclude, filepath_include, filepath_d
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)
for line in load_include_exclude(filepath_exclude):
if line in word_frequency:
word_frequency.pop(line)

# Use a dictionary to clean up everything else...
final_words_to_remove = []
Expand All @@ -263,13 +271,11 @@ def clean_english(word_frequency, filepath_exclude, filepath_include, filepath_d
word_frequency[word] = MINIMUM_FREQUENCY

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip().lower()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
for line in load_include_exclude(filepath_include):
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency

Expand Down Expand Up @@ -349,11 +355,9 @@ def clean_spanish(word_frequency, filepath_exclude, filepath_include, filepath_d
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)
for line in load_include_exclude(filepath_exclude):
if line in word_frequency:
word_frequency.pop(line)

# Use a dictionary to clean up everything else...
final_words_to_remove = []
Expand All @@ -375,13 +379,11 @@ def clean_spanish(word_frequency, filepath_exclude, filepath_include, filepath_d
word_frequency[word] = MINIMUM_FREQUENCY

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip().lower()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
for line in load_include_exclude(filepath_include):
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency

Expand Down Expand Up @@ -410,11 +412,9 @@ def clean_italian(word_frequency, filepath_exclude, filepath_include, filepath_d
# TODO: other possible fixes?

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)
for line in load_include_exclude(filepath_exclude):
if line in word_frequency:
word_frequency.pop(line)

# Use a dictionary to clean up everything else...
final_words_to_remove = []
Expand All @@ -436,13 +436,11 @@ def clean_italian(word_frequency, filepath_exclude, filepath_include, filepath_d
word_frequency[word] = MINIMUM_FREQUENCY

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip().lower()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
for line in load_include_exclude(filepath_include):
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency

Expand Down Expand Up @@ -486,11 +484,9 @@ def clean_german(word_frequency, filepath_exclude, filepath_include, filepath_di
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)
for line in load_include_exclude(filepath_exclude):
if line in word_frequency:
word_frequency.pop(line)

# Use a dictionary to clean up everything else...
final_words_to_remove = []
Expand All @@ -512,13 +508,11 @@ def clean_german(word_frequency, filepath_exclude, filepath_include, filepath_di
word_frequency[word] = MINIMUM_FREQUENCY

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip().lower()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
for line in load_include_exclude(filepath_include):
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency

Expand Down Expand Up @@ -562,11 +556,9 @@ def clean_french(word_frequency, filepath_exclude, filepath_include, filepath_di
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)
for line in load_include_exclude(filepath_exclude):
if line in word_frequency:
word_frequency.pop(line)

# Use a dictionary to clean up everything else...
final_words_to_remove = []
Expand All @@ -588,13 +580,11 @@ def clean_french(word_frequency, filepath_exclude, filepath_include, filepath_di
word_frequency[word] = MINIMUM_FREQUENCY

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip().lower()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
for line in load_include_exclude(filepath_include):
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency

Expand Down Expand Up @@ -638,11 +628,9 @@ def clean_portuguese(word_frequency, filepath_exclude, filepath_include, filepat
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)
for line in load_include_exclude(filepath_exclude):
if line in word_frequency:
word_frequency.pop(line)

# Use a dictionary to clean up everything else...
final_words_to_remove = []
Expand All @@ -664,13 +652,11 @@ def clean_portuguese(word_frequency, filepath_exclude, filepath_include, filepat
word_frequency[word] = MINIMUM_FREQUENCY

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip().lower()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
for line in load_include_exclude(filepath_include):
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency

Expand Down Expand Up @@ -733,20 +719,16 @@ def clean_russian(word_frequency, filepath_exclude, filepath_include):
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)
for line in load_include_exclude(filepath_exclude):
if line in word_frequency:
word_frequency.pop(line)

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip().lower()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
for line in load_include_exclude(filepath_include):
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency

Expand Down Expand Up @@ -790,20 +772,16 @@ def clean_arabic(word_frequency, filepath_exclude, filepath_include):
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)
for line in load_include_exclude(filepath_exclude):
if line in word_frequency:
word_frequency.pop(line)

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip().lower()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
for line in load_include_exclude(filepath_include):
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency

Expand Down Expand Up @@ -847,20 +825,16 @@ def clean_basque(word_frequency, filepath_exclude, filepath_include):
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)
for line in load_include_exclude(filepath_exclude):
if line in word_frequency:
word_frequency.pop(line)

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip().lower()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
for line in load_include_exclude(filepath_include):
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency

Expand Down Expand Up @@ -932,20 +906,16 @@ def clean_latvian(word_frequency, filepath_exclude, filepath_include):
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)
for line in load_include_exclude(filepath_exclude):
if line in word_frequency:
word_frequency.pop(line)

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip().lower()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
for line in load_include_exclude(filepath_include):
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency

Expand Down Expand Up @@ -1047,11 +1017,9 @@ def clean_dutch(word_frequency, filepath_exclude, filepath_include, filepath_dic
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)
for line in load_include_exclude(filepath_exclude):
if line in word_frequency:
word_frequency.pop(line)

# Use a dictionary to clean up everything else...
final_words_to_remove = []
Expand All @@ -1073,13 +1041,11 @@ def clean_dutch(word_frequency, filepath_exclude, filepath_include, filepath_dic
word_frequency[word] = MINIMUM_FREQUENCY

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip().lower()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
for line in load_include_exclude(filepath_include):
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency

Expand Down

0 comments on commit c36ebe4

Please sign in to comment.