-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvinegar_tools.py
120 lines (94 loc) · 3.08 KB
/
vinegar_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import functools
from collections import defaultdict
_LOOSE_ENGLISH_IC_ = 0.062
_ERROR_MARGIN_ = 0.05
_STRICT_ENGLISH_IC_ = 0.0615
_HASH_MAP_ = defaultdict(list)
_target_wordlist_file_ = 'resources/english_wordlist.txt'
def _make_HASH_MAP_():
for line in open(_target_wordlist_file_, 'r'):
_HASH_MAP_[hash(line[:-1])].append(line[:-1])
@functools.lru_cache(10000)
def is_english_word(word: str) -> bool:
if not _HASH_MAP_:
_make_HASH_MAP_()
hash_chain = _HASH_MAP_.get(hash(word))
result = (word in hash_chain) if hash_chain else False
return result
@functools.lru_cache(100)
def is_text_english(text: str) -> bool:
if len(text) == 0:
return True
for possible_length in range(1, len(text) + 1):
word = text[:possible_length]
rest = text[possible_length:]
if is_english_word(word) and is_text_english(rest):
return True
return False
def is_text_partially_english(text: str, lower_bound: int) -> int:
"""if a text of at least length lower_bound from beginning --> true
ex: "youewrewr" with lower_bound 3 -> true
ex: "youewrewr" with lower_bound 4 -> false
"""
if not (0 <= lower_bound <= len(text)):
return False
for possible_length in range(lower_bound, len(text) + 1):
current_text = text[:possible_length]
if is_text_english(current_text):
return True
return False
def is_text_within_allowed_English_IC_margin(text: str) -> bool:
if _LOOSE_ENGLISH_IC_ * (1 - _ERROR_MARGIN_) <= calculate_IC(text):
return True
else:
return False
def calculate_IC(text: str) -> float:
occurrence_list = [text.count(i) for i in set(text)]
text_length = len(text)
index_of_coincidence = 0
for x in occurrence_list:
index_of_coincidence += x * (x - 1)
index_of_coincidence /= text_length * (text_length - 1)
return index_of_coincidence
def find_key_length(text: str) -> int:
key_length = 0
MAXIMUM_KEY_LENGTH = 100
# keep history of last 3
IC_triad = []
for _ in range(min(len(text), MAXIMUM_KEY_LENGTH + 1)):
key_length += 1
text_groups = [[] for _ in range(key_length)]
for i in range(len(text)):
text_groups[i % key_length].append(text[i])
current_IC = 0
for text_group in text_groups:
current_IC += calculate_IC(''.join(text_group))
current_IC /= key_length
print(f'Current length: %s, IC: %s' % (key_length, current_IC))
if len(IC_triad) > 3:
IC_triad.pop(0)
IC_triad.append(current_IC)
# IC of a key component satisfy:
# * if key_length = 1 --> IC > IC_2
# * else IC_(X-1) < IC_(X) > IC_(X+1)
if key_length == 1:
continue
elif key_length == 2:
if IC_triad[0] > _STRICT_ENGLISH_IC_ and IC_triad[0] > IC_triad[1]:
key_length = 1
break
elif key_length == 3:
if IC_triad[0] > _STRICT_ENGLISH_IC_ and IC_triad[0] > IC_triad[1]:
key_length = 2
break
else:
if IC_triad[1] > _STRICT_ENGLISH_IC_ and IC_triad[0] < IC_triad[1] > IC_triad[2]:
key_length -= 2
break
return key_length
if __name__ == '__main__':
print(is_english_word('hello'))
target = 'wellhellothererrrr'
print(target, len(target))
print(is_text_english(target))
print(is_text_partially_english(target, len(target) - 3))