-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathGibberish.class.php
executable file
·189 lines (168 loc) · 6.78 KB
/
Gibberish.class.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
<?php
// Copyright Oliver Lillie 2011
// $Id: Whitechars.php 1938 2011-06-09 13:25:22Z buggedcom $ $Rev: 1938 $
/**
* Tests text content for gibberish input such as
* tapoktrpasawe
* qweasd qwa as
* aıe qwo ıak kqw
* qwe qwe qwe a
*
* @link http://stackoverflow.com/questions/6297991/is-there-any-way-to-detect-strings-like-putjbtghguhjjjanika
* @link https://github.com/rrenaud/Gibberish-Detector
* @link http://en.wikipedia.org/wiki/Markov_chain
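* Usage: Gibberish::train() builds a model file from sample texts, and
* Gibberish::test($text, $lib_path, $raw) scores $text against that file.
* test() returns true when $text looks like gibberish, false when it does
* not, the raw score when $raw is true, or -1 when the model file cannot
* be loaded.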
* @author Oliver Lillie
* @author Rob Renaud Python implementation
*/
class Gibberish
{
    /**
     * Alphabet of the model. Every character outside this set is removed
     * before training or scoring, and the position of each character in this
     * string is its row/column index in the transition matrix.
     */
    protected static $_accepted_characters = 'abcdefghijklmnopqrstuvwxyz ';

    /**
     * Scores a piece of text against a model file written by train().
     *
     * @param string $text     The text to check.
     * @param string $lib_path Path to the serialized model file.
     * @param bool   $raw      When strictly true, return the numeric score
     *                         instead of a verdict.
     * @return bool|float|int  true if the score is at or below the stored
     *                         threshold (gibberish), false otherwise; the
     *                         score itself when $raw is true; -1 when the
     *                         model file is missing, unreadable or malformed.
     */
    public static function test($text, $lib_path, $raw=false)
    {
        if(is_file($lib_path) === false)
        {
            // TODO throw error?
            return -1;
        }

        $serialized = file_get_contents($lib_path);
        if($serialized === false)
        {
            // TODO throw error?
            return -1;
        }

        // SECURITY: unserialize() can instantiate arbitrary classes. The model
        // only contains arrays and floats, so object creation is disabled
        // where the runtime supports it (PHP 7.0+). Never point $lib_path at
        // a file that untrusted users can write.
        $trained_library = PHP_VERSION_ID >= 70000
            ? unserialize($serialized, array('allowed_classes' => false))
            : unserialize($serialized);

        // Reject anything that does not have the shape train() writes, rather
        // than continuing with undefined indexes and a meaningless score.
        if(is_array($trained_library) === false
            || isset($trained_library['matrix'], $trained_library['threshold']) === false
            || is_array($trained_library['matrix']) === false)
        {
            // TODO throw error?
            return -1;
        }

        $value = self::_averageTransitionProbability($text, $trained_library['matrix']);
        if($raw === true)
        {
            return $value;
        }

        return $value <= $trained_library['threshold'];
    }

    /**
     * Lower-cases the input and strips every character that is not part of
     * the model alphabet. This keeps the model small by ignoring punctuation,
     * infrequent symbols, etc.
     *
     * @param string $line
     * @return string
     */
    protected static function _normalise($line)
    {
        // The character class is derived from the alphabet so that the two
        // can never drift apart (a mismatch would cause undefined-index
        // lookups in the transition matrix).
        $pattern = '/[^'.preg_quote(self::$_accepted_characters, '/').']/';

        return (string) preg_replace($pattern, '', strtolower((string) $line));
    }

    /**
     * Maps each accepted character to its index in the transition matrix.
     *
     * @return array character => integer index
     */
    protected static function _characterIndex()
    {
        return array_flip(str_split(self::$_accepted_characters));
    }

    /**
     * Scores every usable line of a sample file against the given matrix.
     *
     * Lines with fewer than two accepted characters contain no transition at
     * all; their score is a constant that says nothing about the model, so
     * they are skipped.
     *
     * @param string $path            Sample file, one phrase per line.
     * @param array  $log_prob_matrix Matrix of log transition probabilities.
     * @return array List of scores; empty when the file cannot be read or
     *               holds no usable line.
     */
    protected static function _scoreLines($path, $log_prob_matrix)
    {
        $lines = file($path);
        if($lines === false)
        {
            return array();
        }

        $scores = array();
        foreach($lines as $line)
        {
            if(strlen(self::_normalise($line)) < 2)
            {
                continue;
            }
            $scores[] = self::_averageTransitionProbability($line, $log_prob_matrix);
        }

        return $scores;
    }

    /**
     * Trains the model and writes it to $lib_path.
     *
     * Character-pair transitions are counted in $big_text_file, turned into
     * log probabilities, and a threshold is chosen halfway between the lowest
     * scoring good sample and the highest scoring bad sample.
     *
     * When one of the input files does not exist, the problems are echoed as
     * HTML and false is returned.
     *
     * @param string $big_text_file  Large corpus of normal text.
     * @param string $good_text_file Sample phrases that are not gibberish, one per line.
     * @param string $bad_text_file  Sample phrases that are gibberish, one per line.
     * @param string $lib_path       Where to store the serialized model.
     * @return bool true when the model was written; false when an input is
     *              missing or unusable, when the samples cannot be separated,
     *              or when the model could not be saved.
     */
    public static function train($big_text_file, $good_text_file, $bad_text_file, $lib_path)
    {
        $errors = array();
        if(is_file($big_text_file) === false)
        {
            $errors[] = 'specified big_text_file does not exist';
        }
        if(is_file($good_text_file) === false)
        {
            $errors[] = 'specified good_text_file does not exist';
        }
        if(is_file($bad_text_file) === false)
        {
            $errors[] = 'specified bad_text_file does not exist';
        }
        if($errors)
        {
            echo 'File Errors(s):<br>';
            echo implode('<br>', $errors).'<br><br>';
            return false;
        }

        $pos = self::_characterIndex();
        $size = count($pos);

        // Assume we have seen 10 of each character pair. This acts as a kind of
        // prior or smoothing factor. This way, if we see a character transition
        // live that we've never observed in the past, we won't assume the entire
        // string has 0 probability.
        $log_prob_matrix = array_fill(0, $size, array_fill(0, $size, 10));

        // Count transitions from the big text file, taken
        // from http://norvig.com/spell-correct.html
        // The corpus is streamed line by line so that it never has to fit in
        // memory as a whole. Transitions are counted within a line only.
        $handle = fopen($big_text_file, 'rb');
        if($handle === false)
        {
            return false;
        }
        while(($line = fgets($handle)) !== false)
        {
            $clean = self::_normalise($line);
            $length = strlen($clean);
            for($i = 1; $i < $length; $i++)
            {
                $log_prob_matrix[$pos[$clean[$i - 1]]][$pos[$clean[$i]]] += 1;
            }
        }
        fclose($handle);

        // Normalize the counts so that they become log probabilities.
        // We use log probabilities rather than straight probabilities to avoid
        // numeric underflow issues with long texts.
        // This contains a justification:
        // http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/
        foreach($log_prob_matrix as $from => $row)
        {
            $total = (float) array_sum($row);
            foreach($row as $to => $count)
            {
                $log_prob_matrix[$from][$to] = log($count / $total);
            }
        }

        // Find the probability of generating a few arbitrarily chosen good and
        // bad phrases.
        $good_probs = self::_scoreLines($good_text_file, $log_prob_matrix);
        $bad_probs = self::_scoreLines($bad_text_file, $log_prob_matrix);

        // Without at least one usable sample on each side there is nothing to
        // separate (and min()/max() must not be called on an empty array).
        if(!$good_probs || !$bad_probs)
        {
            return false;
        }

        // Assert that we actually are capable of detecting the junk.
        $min_good_probs = min($good_probs);
        $max_bad_probs = max($bad_probs);
        if($min_good_probs <= $max_bad_probs)
        {
            return false;
        }

        // And pick a threshold halfway between the worst good and best bad inputs.
        $threshold = ($min_good_probs + $max_bad_probs) / 2;

        // save matrix
        return file_put_contents($lib_path, serialize(array(
            'matrix' => $log_prob_matrix,
            'threshold' => $threshold,
        ))) > 0;
    }

    /**
     * Returns the average transition probability of a line through the
     * log-probability matrix.
     *
     * @param string $line            Text to score; it is normalised first.
     * @param array  $log_prob_matrix Matrix of log transition probabilities,
     *                                indexed [from][to] by character position.
     * @return float exp() of the accumulated log probability divided by the
     *               number of transitions (or by 1 when there are none).
     */
    public static function _averageTransitionProbability($line, $log_prob_matrix)
    {
        $pos = self::_characterIndex();
        $clean = self::_normalise($line);
        $length = strlen($clean);

        // NOTE(review): a sum of log probabilities would normally start at
        // 0.0; starting at 1.0 multiplies every score by exp(1 / transitions).
        // It is left unchanged on purpose: thresholds in existing model files
        // were computed with this same offset, so changing it would require
        // retraining every stored model.
        $log_prob = 1.0;
        $transition_ct = 0;
        for($i = 1; $i < $length; $i++)
        {
            $log_prob += $log_prob_matrix[$pos[$clean[$i - 1]]][$pos[$clean[$i]]];
            $transition_ct += 1;
        }

        // The exponentiation translates from log probs to probs.
        return exp($log_prob / max($transition_ct, 1));
    }
}