# parse_unicode_tables.py
# Forked from fionapigott/emoji-counter.
import fileinput
import pickle
from collections import defaultdict
# ----------------------------------------------------------------------
# Output encoding of the generated tables.
#
#   encoding    -- codec every character is encoded with before storage
#   start_byte  -- number of leading bytes sliced off each encoded character
#   start_bytes -- prefix that must be put back in front of a stored byte
#                  string before it can be decoded again
# ----------------------------------------------------------------------
# UTF-8 (active): nothing is stripped, so nothing has to be restored.
encoding, start_byte, start_bytes = "utf-8", 0, ""
# UTF-32 alternative: drop the 4-byte prefix "\xff\xfe\x00\x00" (the byte
# order mark the codec writes on little-endian machines). To switch, use:
#   encoding, start_byte, start_bytes = "utf-32", 4, "\xff\xfe\x00\x00"

# Running counter; its string form becomes the "name" of each table entry.
uid = 0
# Final table: encoded emoji -> record describing its possible modifiers.
emoji_dict_with_info = {}
# Intermediate table: property name (second ";"-separated field of a data
# line) -> list of encoded characters that carry that property.
emoji_dict = defaultdict(list)
# Read the Unicode emoji data file and group every listed character, already
# encoded in `encoding`, by the property named on its line.
#
# A data line is expected to look like
#     <code point or FIRST..LAST range> ; <property> # <comment>
# Everything after the first "#" is ignored.
for line in fileinput.FileInput("emoji_data.txt"):
    # Blank lines and whole-line comments carry no code points.
    stripped_line = line.strip()
    if not stripped_line or stripped_line.startswith("#"):
        continue
    code_point_data = [x.strip() for x in line.split("#")[0].split(";")]
    # The regional letter indicator range is a special case that is handled
    # separately (as flag letters), so it is skipped here. The comparison
    # has to be made against the code point field, not the whole field list,
    # otherwise it can never match. Note that skipping these letters means
    # they no longer consume uid values before the flag entries are built.
    if code_point_data[0] in ["1F1E6..1F1FF"]:
        continue
    # A usable line needs both a code point field and a property field.
    if len(code_point_data) < 2:
        continue
    # parse the range of code points
    if ".." in code_point_data[0]:
        char_range = code_point_data[0].split("..")
        first_code_point = int(char_range[0], 16)
        last_code_point = int(char_range[1], 16)
        # The range is inclusive on both ends.
        range_list = [unichr(x) for x in range(first_code_point, last_code_point + 1)]
    else:
        try:
            range_list = [unichr(int(code_point_data[0], 16))]
        except ValueError:
            # Not a parseable code point: ignore the line.
            continue
    # store the emoji under its property name
    for emoji in range_list:
        emoji_dict[code_point_data[1]].append(emoji.encode(encoding)[start_byte:])
# Seed the output table with every character that carries the "Emoji" or the
# "Emoji_Presentation" property. Any other property list collected in
# emoji_dict could be counted the same way.
for emoji in set(emoji_dict["Emoji_Presentation"] + emoji_dict["Emoji"]):
    # emoji_dict["Emoji"] also includes characters such as "#" and digits.
    # Those are a single byte in UTF-8, so anything that short is left out.
    utf8_form = (start_bytes + emoji).decode(encoding).encode("utf-8")
    if len(utf8_form) <= 1:
        continue
    emoji_dict_with_info[emoji] = {
        "modifiers_possible_before": [],
        "modifiers_possible_after": [],
        "can_stand_alone": True,
        "look_for_modifiers": False,
        "name": str(uid)
    }
    # Every stored entry gets its own id.
    uid += 1
# Record, for every modifier base already present in the table, each
# modifier that may follow it.
for base in emoji_dict["Emoji_Modifier_Base"]:
    modifiers = emoji_dict["Emoji_Modifier"]
    info = emoji_dict_with_info.get(base)
    if info is None:
        # This base was not in the original set, so it is ignored. To
        # include it instead, create an entry for it here with
        # "can_stand_alone" and "look_for_modifiers" set to True and
        # "name" set to str(uid), then increment uid.
        continue
    for modifier in modifiers:
        info["modifiers_possible_after"].append(modifier)
        info["look_for_modifiers"] = True
# Flags: register each regional letter indicator (U+1F1E6..U+1F1FF) so that
# any second regional letter may follow it. This covers all possible
# two-letter combinations; a single letter cannot stand alone.
flag_range_list = [unichr(cp) for cp in range(0x1F1E6, 0x1F1FF + 1)]
encoded_letters = [ch.encode(encoding)[start_byte:] for ch in flag_range_list]
for first_letter in encoded_letters:
    emoji_dict_with_info[first_letter] = {
        # Each entry gets its own copy of the list of possible followers.
        "modifiers_possible_after": list(encoded_letters),
        "modifiers_possible_before": [],
        "can_stand_alone": False,
        "look_for_modifiers": True,
        "name": str(uid)
    }
    uid += 1
# Keycaps: the combining enclosing keycap character (given here as UTF-8
# bytes) is only counted when one of the characters in `numbers` comes
# before it, so it is registered with those as its possible "before"
# modifiers and cannot stand alone.
keycap = "\xe2\x83\xa3"
# "#" followed by the digits 0-9.
# NOTE(review): "*" (U+002A) is not listed here -- confirm whether that is
# intentional.
numbers = ["\x23", "\x30", "\x31", "\x32", "\x33", "\x34", "\x35", "\x36", "\x37", "\x38", "\x39"]
keycap_key = keycap.decode("utf-8").encode(encoding)[start_byte:]
keycap_prefixes = [
    symbol.decode("utf-8").encode(encoding)[start_byte:] for symbol in numbers
]
emoji_dict_with_info[keycap_key] = {
    "modifiers_possible_after": [],
    "modifiers_possible_before": keycap_prefixes,
    "can_stand_alone": False,
    "look_for_modifiers": True,
    "name": str(uid)
}
# Write the generated tables to disk and report where they went.
#
# Files are opened in `with` blocks so they are flushed and closed even if
# pickling fails. Text mode ("w") is kept on purpose: the default pickle
# protocol under Python 2 is ASCII-only, and whatever reads these files has
# to use the matching mode.

# For UTF-8 an extra lookup table is produced: first byte of each stored
# emoji -> total number of bytes in that emoji.
if encoding == "utf-8":
    unicode_markers = {
        emoji_bytes[0]: len(emoji_bytes)
        for emoji_bytes in emoji_dict_with_info
        if len(emoji_bytes) > 1
    }
    with open("unicode_markers_{}.pkl".format(encoding), "w") as markers_file:
        pickle.dump(unicode_markers, markers_file)

# Dump the dictionary
with open("emoji_dict_{}.pkl".format(encoding), "w") as dict_file:
    pickle.dump(emoji_dict_with_info, dict_file)

# A single parenthesised argument prints the same text as the bare print
# statement did.
print("Dictionary of emoji created at emoji_dict_{}.pkl".format(encoding))
print("Encoding is {}".format(encoding))
# Debugging
## print the entire thing, so you know what you're dealing with
## you could use the ids to selectively not count emoji, or selectively count emoji
#print "emoji, byte string in {}, id, with before modifiers, with after modifiers".format(encoding)
#for emoji in [x[0] for x in sorted(emoji_dict_with_info.items(), key = lambda x: int(x[1]["name"]))]:
# print((start_bytes + emoji).decode(encoding).encode("utf-8") + " , " +
# str([emoji]).strip("][") + " , " +
# emoji_dict_with_info[emoji]["name"] + " , " +
# " ".join([(start_bytes + emoji + m).decode(encoding).encode("utf-8") for m in emoji_dict_with_info[emoji]["modifiers_possible_after"]]) + " , " +
# " ".join([(start_bytes + emoji + m).decode(encoding).encode("utf-8") for m in emoji_dict_with_info[emoji]["modifiers_possible_before"]]))