-
Notifications
You must be signed in to change notification settings - Fork 847
/
Copy pathcustom_components.py
79 lines (64 loc) Β· 3.15 KB
/
custom_components.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from typing import List
import jieba
from tokenizers import NormalizedString, PreTokenizedString, Regex, Tokenizer
from tokenizers.decoders import Decoder
from tokenizers.models import BPE
from tokenizers.normalizers import Normalizer
from tokenizers.pre_tokenizers import PreTokenizer
class JiebaPreTokenizer:
def jieba_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
splits = []
# we need to call `str(normalized_string)` because jieba expects a str,
# not a NormalizedString
for token, start, stop in jieba.tokenize(str(normalized_string)):
splits.append(normalized_string[start:stop])
return splits
# We can also easily do it in one line:
# return [normalized_string[w[1] : w[2]] for w in jieba.tokenize(str(normalized_string))]
def odd_number_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
# Just an odd example...
splits = []
last = 0
for i, char in enumerate(str(normalized_string)):
if char.isnumeric() and int(char) % 2 == 1:
splits.append(normalized_string[last:i])
last = i
# Don't forget the last one
splits.append(normalized_string[last:])
return splits
def pre_tokenize(self, pretok: PreTokenizedString):
# Let's call split on the PreTokenizedString to split using `self.jieba_split`
pretok.split(self.jieba_split)
# Here we can call `pretok.split` multiple times if we want to apply
# different algorithm, but we generally just need to call it once.
pretok.split(self.odd_number_split)
class CustomDecoder:
def decode(self, tokens: List[str]) -> str:
return "".join(tokens)
class CustomNormalizer:
def normalize(self, normalized: NormalizedString):
# Most of these can be replaced by a `Sequence` combining some provided Normalizer,
# (ie Sequence([ NFKC(), Replace(Regex("\s+"), " "), Lowercase() ])
# and it should be the preferred way. That being said, here is an example of the kind
# of things that can be done here:
normalized.nfkc()
normalized.filter(lambda char: not char.isnumeric())
normalized.replace(Regex("\s+"), " ")
normalized.lowercase()
# This section shows how to attach these custom components to the Tokenizer
tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
tok.decoder = Decoder.custom(CustomDecoder())
input = "ζ°Έεζθ£
ι₯°εζιε
¬εΈ"
print("PreTokenize:", input)
print(tok.pre_tokenizer.pre_tokenize_str(input))
# [('ζ°Έε', (0, 2)), ('ζθ£
', (2, 4)), ('ι₯°ε', (4, 6)), ('ζιε
¬εΈ', (6, 10))]
input = "112233"
print("PreTokenize:", input)
print(tok.pre_tokenizer.pre_tokenize_str(input))
# [('1', (0, 1)), ('122', (1, 4)), ('3', (4, 5)), ('3', (5, 6))]
input = "1234 βπ’π©π©π¬ π±π₯π’π―π’ ππ πΉβ―πΆπ ππππ£ ππ£ππππ!"
print("Normalize:", input)
print(tok.normalizer.normalize_str(input))
# " hello there my dear dear friend!"