clusterization.py
import argparse
import glob
import json
import os

import freetype
import numpy as np
from PIL import ImageFont, Image, ImageDraw
from sklearn.cluster import MiniBatchKMeans

RUSSIAN_LETTERS = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
FONT_COLOR = "#000000"
BACKGROUND_COLOR = "#FFFFFF"


def clusterization(
    font_path: str = "resources/fonts/NotoSans.ttf",
    output_path: str = "resources/letter_replacement/clusterization.json",
    font_size: int = 13,
    clusters: int = 500,
) -> dict[str, list[str]]:
    """
    Cluster images of the characters supported by a given font.

    :param font_path: path to the font used to render character images (default: resources/fonts/NotoSans.ttf).
    :param output_path: path to the output JSON file (default: resources/letter_replacement/clusterization.json).
    :param font_size: font size in pixels (default: 13).
    :param clusters: number of clusters into which the set of character images is divided (default: 500).
    :return: a dictionary mapping each Russian letter to the list of characters from the same cluster.
    """
    face = freetype.Face(font_path)
    image_font = ImageFont.truetype(font_path, max(font_size - 2, 8))
    supported_chars = sorted(chr(c) for c, _ in face.get_chars() if c)
    max_len = 0
    for c in supported_chars:
        line_width, line_height = image_font.getsize(c)
        max_len = max(max_len, line_height, line_width)
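
    # Render each supported character centred on a max_len x max_len grayscale canvas;
    # save a copy under letter_images/all/ and keep the PIL image for later per-cluster dumps.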
    letter_images_dirname = "letter_images"
    all_images_dirname = os.path.join(letter_images_dirname, "all")
    os.makedirs(all_images_dirname, exist_ok=True)
    char2image = {}
    for ind, c in enumerate(supported_chars):
        image = Image.new("L", (max_len, max_len), color=BACKGROUND_COLOR)
        draw = ImageDraw.Draw(image)
        w, h = draw.textsize(c, font=image_font)
        draw.text(
            xy=((max_len - w) / 2, (max_len - h) / 2),
            text=c,
            fill=FONT_COLOR,
            font=image_font,
        )
        image.save(f"{all_images_dirname}/{ind}.jpg")
        char2image[c] = image
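
    # Flatten each image into a normalised pixel vector and cluster the vectors with
    # MiniBatchKMeans: characters that render similarly end up in the same cluster.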
    char2vector = {
        ch: np.asarray(image).reshape(-1).astype(np.float32) / 255.0
        for ch, image in char2image.items()
    }
    clusterizer = MiniBatchKMeans(n_clusters=clusters)
    cluster_labels = clusterizer.fit_predict(list(char2vector.values()))
    char2cluster = {ch: cluster for ch, cluster in zip(char2image.keys(), cluster_labels)}
    cluster2char = [[] for _ in range(clusterizer.n_clusters)]
    for ch, cluster in char2cluster.items():
        cluster2char[cluster].append(ch)
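
    # For every Russian letter (lower- and uppercase) dump the images of its cluster mates
    # into letter_images/<letter> (or letter_images/capital_<letter>) for visual inspection.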
    for lower_letter in RUSSIAN_LETTERS:
        for is_upper, letter in enumerate([lower_letter, lower_letter.upper()]):
            cluster = char2cluster[letter]
            letter_name = f"capital_{lower_letter}" if is_upper else lower_letter
            letter_dir = f"{letter_images_dirname}/{letter_name}"
            os.makedirs(letter_dir, exist_ok=True)
            for file in glob.glob(letter_dir + "/*"):
                os.remove(file)
            for similar_letter in cluster2char[cluster]:
                char2image[similar_letter].save(f"{letter_dir}/{similar_letter}.jpg")
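
    # Build the final mapping (each Russian letter to every character in its cluster)
    # and write it to output_path as UTF-8 JSON.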
    result = {ch: cluster2char[char2cluster[ch]] for ch in RUSSIAN_LETTERS + RUSSIAN_LETTERS.upper()}
    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(result, json_file, ensure_ascii=False)
    return result


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--font-path", type=str, default="resources/fonts/NotoSans.ttf")
    parser.add_argument("--output-path", type=str, default="resources/letter_replacement/clusterization.json")
    parser.add_argument("--font-size", type=int, default=13)
    parser.add_argument("--clusters", type=int, default=500)
    args = parser.parse_args()
    clusterization(**vars(args))
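
# Example invocation (the arguments mirror the defaults above; adjust paths to your setup):
#   python clusterization.py --font-path resources/fonts/NotoSans.ttf --font-size 13 --clusters 500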