Skip to content

Commit

Permalink
add Italian dictionary (#167)
Browse files Browse the repository at this point in the history
  • Loading branch information
barrust authored Dec 28, 2023
1 parent 82784ca commit 264cc0c
Show file tree
Hide file tree
Showing 8 changed files with 3,383 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## Version 0.7.4

* Leveraged the dictionary files from [levidromelist](https://www.levidromelist.com/levidrome-list/dictionary) to attempt to clean up the `en`, `es`, `fr`, `pt`, `de`, and `nl` dictionaries; attempts to resolve issues #164, #155, #150, #140, #115, and #107; see [issue #126](https://github.com/barrust/pyspellchecker/issues/126)
* Added `Italian` language support; see [#167](https://github.com/barrust/pyspellchecker/pull/167)

## Version 0.7.3

Expand Down
69 changes: 67 additions & 2 deletions scripts/build_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
Basque Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.au.gz
Latvian Input: https://huggingface.co/datasets/RaivisDejus/latvian-text
Dutch Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.nl.gz
Italian Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.it.gz
Requirements:
The script requires more than the standard library to run in its
entirety. You will also need to install the NLTK package to build a
Expand Down Expand Up @@ -88,7 +89,7 @@ def build_word_frequency(filepath, language, output_path):

nltk.download("averaged_perceptron_tagger")
word_frequency = Counter()
if language == "es":
if language in ["es", "it"]:
tok = ToktokTokenizer()
else:
tok = WhitespaceTokenizer()
Expand Down Expand Up @@ -385,6 +386,67 @@ def clean_spanish(word_frequency, filepath_exclude, filepath_include, filepath_d
return word_frequency


def clean_italian(word_frequency, filepath_exclude, filepath_include, filepath_dictionary):
    """Clean an Italian word frequency list

    The mapping is modified in place and the same object is returned.

    Args:
        word_frequency: Mapping of word -> frequency to clean (mutated in place).
        filepath_exclude: Passed to `load_file`; each line is a flagged
            misspelling to remove.
        filepath_include: Passed to `load_file`; each line is a word to add
            back in (lower-cased) if it is missing after cleaning.
        filepath_dictionary: Passed to `load_file`; each line is a reference
            dictionary word used to filter and back-fill the frequency list.

    Returns:
        The cleaned `word_frequency` mapping.
    """
    letters = set("abcdefghijklmnopqrstuvwxyzáéíóúüàèìòù")

    # fix issues with words containing other characters
    # (collect first: the mapping cannot be resized while it is being iterated)
    invalid_chars = [key for key in word_frequency if not set(key).issubset(letters)]
    for misfit in invalid_chars:
        word_frequency.pop(misfit)

    # remove words seen no more than MINIMUM_FREQUENCY times
    small_frequency = [key for key in word_frequency if word_frequency[key] <= MINIMUM_FREQUENCY]
    for misfit in small_frequency:
        word_frequency.pop(misfit)

    # TODO: other possible fixes?

    # remove flagged misspellings; entries that are not present are ignored
    with load_file(filepath_exclude) as fobj:
        for line in fobj:
            word_frequency.pop(line.strip(), None)

    # Use a dictionary to clean up everything else...
    # The list keeps file order for the back-fill step below.
    dictionary_words = []
    with load_file(filepath_dictionary) as fobj:
        for line in fobj:
            # line[:1] (rather than line[0]) tolerates an empty string
            if line[:1] in letters and line.islower():
                dictionary_words.append(line.strip())
    # Set copy for O(1) membership tests; scanning the list for every word in
    # word_frequency would be quadratic.
    dictionary_lookup = set(dictionary_words)

    final_words_to_remove = [word for word in word_frequency if word not in dictionary_lookup]
    for word in final_words_to_remove:
        word_frequency.pop(word)

    # back-fill dictionary words that the corpus did not contain
    for word in dictionary_words:
        if word not in word_frequency:
            word_frequency[word] = MINIMUM_FREQUENCY

    # Add known missing words back in (ugh)
    with load_file(filepath_include) as fobj:
        for line in fobj:
            line = line.strip().lower()
            if not line:
                # a blank line would otherwise register "" as a word
                continue
            if line in word_frequency:
                print("{} is already found in the dictionary! Skipping!".format(line))
            else:
                word_frequency[line] = MINIMUM_FREQUENCY

    return word_frequency


def clean_german(word_frequency, filepath_exclude, filepath_include, filepath_dictionary):
"""Clean a German word frequency list
Expand Down Expand Up @@ -1034,7 +1096,7 @@ def _parse_args():
"--language",
required=True,
help="The language being built",
choices=["en", "es", "de", "fr", "pt", "ru", "ar", "lv", "eu", "nl"],
choices=["en", "es", "de", "fr", "pt", "ru", "ar", "lv", "eu", "nl", "it"],
)
parser.add_argument(
"-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json"
Expand Down Expand Up @@ -1102,6 +1164,9 @@ def _parse_args():
elif args.language == "es":
dict_path = os.path.abspath("{}/levidromelist-dicts/spanish.txt".format(data_path))
word_frequency = clean_spanish(word_frequency, exclude_filepath, include_filepath, dict_path)
elif args.language == "it":
dict_path = os.path.abspath("{}/levidromelist-dicts/italian.txt".format(data_path))
word_frequency = clean_italian(word_frequency, exclude_filepath, include_filepath, dict_path)
elif args.language == "de":
dict_path = os.path.abspath("{}/levidromelist-dicts/new_german.txt".format(data_path))
word_frequency = clean_german(word_frequency, exclude_filepath, include_filepath, dict_path)
Expand Down
Empty file added scripts/data/it_exclude.txt
Empty file.
Binary file added scripts/data/it_full.json.gz
Binary file not shown.
Empty file added scripts/data/it_include.txt
Empty file.
Loading

0 comments on commit 264cc0c

Please sign in to comment.