Skip to content

Commit

Permalink
Merge pull request #71 from Glitchy-Tozier/Small-Fixes
Browse files Browse the repository at this point in the history
Add/improve scripts
  • Loading branch information
dariogoetz authored Nov 27, 2023
2 parents e9a5d5b + 013c89f commit 86f0621
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 8 deletions.
14 changes: 13 additions & 1 deletion scripts/ngrams/clean_uni_leipzig_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,25 @@

args = parser.parse_args()

# Check if input file exists
if not os.path.exists(args.infile):
    print(f"Input file '{args.infile}' does not exist.")
    # `raise SystemExit` works even when the `site`-provided `exit` helper is
    # unavailable (e.g. `python -S`).
    raise SystemExit(1)
# Check if output file exists
if os.path.exists(args.outfile):
    print(
        f"Warning: Output file '{args.outfile}' already exists and will be overwritten."
    )

# Delete leading line numbers: keep only the second tab-separated field of
# every line (a line without any tab is kept unchanged, like `cut -f2` does).
# Doing this in Python instead of shelling out avoids passing unquoted,
# user-supplied paths to a shell and does not require a `cut` binary.
with open(args.infile, encoding="utf-8") as fp:
    sentences = []
    for line in fp:
        fields = line.rstrip("\n").split("\t")
        sentences.append(fields[1] if len(fields) > 1 else fields[0])

# Every extracted sentence is terminated by exactly one line break.
s = "".join(sentence + "\n" for sentence in sentences)

# Replace 4 out of 5 line breaks with spaces. The counter lives in the
# lambda's default argument so it is shared across all matches.
res = re.sub(
    "(\n)", lambda m, c=itertools.count(): m.group() if next(c) % 5 == 4 else " ", s
)

with open(args.outfile, "w", encoding="utf-8") as fp:
    fp.write(res)
67 changes: 67 additions & 0 deletions scripts/ngrams/filter_impossible_ngrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
This script checks your layout-config and removes all impossible ngrams from a specified ngrams-directory.
"""

import yaml
import os
import shutil


def load_yaml_from_file(yaml_file):
    """Parse the YAML document stored at ``yaml_file`` and return its content.

    The file is decoded explicitly as UTF-8 so that non-ASCII layout symbols
    are read the same way regardless of the platform's default encoding.

    Args:
        yaml_file: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document.
    """
    with open(yaml_file, "r", encoding="utf-8") as file:
        # `safe_load` accepts an open stream, so no intermediate string is needed.
        return yaml.safe_load(file)


def filter_ngrams(layout_chars, ngram_dir, output_dir):
    """Copy the ``.txt`` files of ``ngram_dir`` to ``output_dir``, dropping impossible ngrams.

    Each line is expected to look like ``<frequency> <ngram>``. A line is
    written to the output file unchanged if every character of its ngram is
    contained in ``layout_chars``; otherwise it is omitted.

    Args:
        layout_chars: Container of the characters that may appear in an ngram.
        ngram_dir: Directory containing the ngram files to read.
        output_dir: Directory the filtered files are written to. It is
            created if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(ngram_dir):
        if not filename.endswith(".txt"):
            continue

        input_filepath = os.path.join(ngram_dir, filename)
        output_filepath = os.path.join(output_dir, filename)
        with open(input_filepath, "r", encoding="utf-8") as input_file, open(
            output_filepath, "w", encoding="utf-8"
        ) as output_file:
            print("Processing", input_filepath, "→", output_filepath)
            for line in input_file:
                # Split line into frequency and ngram
                _frequency, separator, ngram = line.partition(" ")
                if not separator:
                    # Blank or malformed line: there is no ngram to keep.
                    continue

                # Strip only the line terminator. The ngram itself may contain
                # whitespace, and the last line of a file may lack a newline,
                # in which case no character must be removed.
                if ngram.endswith("\n"):
                    ngram = ngram[:-1]

                if all(char in layout_chars for char in ngram):
                    output_file.write(line)


# Example usage: adjust the paths below before running this script.
yaml_file = "config/keyboard/my_keyboard_config.yml"  # Specify the keyboard-config here
ignore_in_layout = "☒■⇩⇘⇧⇗♕⇇↜⇉↝♛"  # ♔
ngram_dir = "ngrams/made_up_dir"
output_dir = "ngrams/made_up_dir_reduced"

yaml_data = load_yaml_from_file(yaml_file)

# Collect every element found in the keys of the base layout.
# NOTE(review): presumably each key is a string (or list) of the characters it
# can produce on the different layers - confirm against the config schema.
layout_chars = set()
print("\nCharacters in Layout:")
for row in yaml_data["base_layout"]["keys"]:
    for key in row:
        print(key)
        layout_chars.update(key)

# Drop the symbols listed in `ignore_in_layout`. `difference_update` silently
# skips symbols that are not part of the layout, whereas `set.remove` would
# abort the script with a KeyError.
layout_chars.difference_update(ignore_in_layout)

filter_ngrams(layout_chars, ngram_dir, output_dir)
43 changes: 43 additions & 0 deletions scripts/ngrams/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import argparse
import sys
import os


def main(ngrams_directory):
    """Rewrite the n-gram files of ``ngrams_directory`` with percentage frequencies.

    The files ``1-grams.txt``, ``2-grams.txt`` and ``3-grams.txt`` are
    processed in place. Every line is expected to look like
    ``<frequency> <ngram>``; each frequency is replaced by
    ``100 * frequency / total`` where ``total`` is the sum of all frequencies
    in that file. Blank lines are dropped.

    A file whose frequencies sum to zero cannot be normalized. It is left
    untouched (with a warning on stderr) instead of being truncated and then
    failing with a ZeroDivisionError.

    Args:
        ngrams_directory: Path of the directory containing the n-gram files.

    Raises:
        OSError: If one of the three files cannot be opened.
        ValueError: If a non-blank line does not start with a number followed
            by a space.
    """
    filenames = [
        os.path.join(ngrams_directory, "1-grams.txt"),
        os.path.join(ngrams_directory, "2-grams.txt"),
        os.path.join(ngrams_directory, "3-grams.txt"),
    ]

    for filename in filenames:
        total = 0
        entries = []  # (frequency, remainder of the line incl. line break)
        with open(filename, encoding="utf-8") as ngrams:
            for line in ngrams:
                if not line.strip():
                    continue
                freq_str, letters = line.split(" ", 1)
                freq = float(freq_str)

                entries.append((freq, letters))
                total += freq

        if total == 0:
            # Nothing to scale by; do not open the file for writing, since
            # that would discard its content.
            if entries:
                print(
                    f"Warning: frequencies in '{filename}' sum to zero; file left unchanged.",
                    file=sys.stderr,
                )
            continue

        with open(filename, "w", encoding="utf-8") as ngrams:
            for freq, letters in entries:
                ngrams.write(str(100 * freq / total) + " " + letters)


# Command-line entry point: validate the directory argument, then normalize.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        # Adjacent string literals are concatenated verbatim, so the
        # separating space after the first sentence must be written out.
        description="Normalize n-gram frequencies in a directory of n-gram files. "
        "Normalization converts absolute frequencies into percentages of "
        "how often an n-gram occurs within the corpus."
    )
    parser.add_argument(
        "ngrams_directory", help="Path to the directory containing the n-gram files."
    )
    args = parser.parse_args()

    if not os.path.isdir(args.ngrams_directory):
        print(
            "Error: Invalid n-gram directory path. Please provide a valid directory path.",
            file=sys.stderr,
        )
        sys.exit(1)

    main(args.ngrams_directory)
37 changes: 30 additions & 7 deletions scripts/remove_duplicate_found_layouts.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,53 @@
"""
This script removes all the duplicate layouts from your `solutions.txt`-file.
This script removes all the duplicate layouts from a specified file.
"""

def main():
import argparse
import os


def remove_duplicates(filename):
    """Remove repeated lines from ``filename`` in place, keeping first occurrences.

    Lines are compared without their line terminator, so a final line that
    lacks a trailing newline is still recognised as a duplicate of an earlier
    line. The file is only rewritten when at least one duplicate was found;
    in that case every remaining line is terminated by a newline.

    A short summary is printed to stdout.

    Args:
        filename: Path of the text file containing one layout per line.
    """
    with open(filename, encoding="utf-8") as layouts:
        all_layouts = [line.rstrip("\n") for line in layouts]

    # Dict keys are unique and keep insertion order, which deduplicates in
    # linear time while preserving the original ordering.
    unique_layouts = list(dict.fromkeys(all_layouts))

    if len(all_layouts) == len(unique_layouts):
        print("There are no duplicate Layouts.")
        return

    # Write all unique layouts to the same file, replacing the old text.
    with open(filename, "w", encoding="utf-8") as layouts:
        layouts.writelines(layout + "\n" for layout in unique_layouts)

    # Display results
    print("Updated file!")
    print("Original count:", len(all_layouts))
    print("New count: ", len(unique_layouts))


def main():
    """Parse the command line and remove duplicate layouts from the given file.

    Expects exactly one positional argument, the file to process. If that
    path is not an existing regular file, an error message is written to
    stderr and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Remove duplicate layouts from a file."
    )
    parser.add_argument("filename", help="Name of the file to process")
    args = parser.parse_args()

    # `isfile` also rejects directories, which would otherwise fail later in
    # `open`. `parser.exit` reports on stderr and does not depend on the
    # `site`-provided `exit` helper.
    if not os.path.isfile(args.filename):
        parser.exit(
            1,
            f"Error: The file '{args.filename}' does not exist or is not a regular file.\n",
        )

    remove_duplicates(args.filename)


if __name__ == "__main__":
    main()

0 comments on commit 86f0621

Please sign in to comment.