-
Notifications
You must be signed in to change notification settings - Fork 1
/
sampling_misc.py
221 lines (208 loc) · 8.59 KB
/
sampling_misc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
"""
All of the sampling that doesn't quite fit into sampler.py. Most likely
because we are not looking at trend with depth.
"""
from __future__ import division
import sys
from collections import deque
import pandas as pd
import numpy as np
from string_comparisons import prefix_substring_match
import arbitrary_probs_util as string_util
sys.path.append('..')
import merge_alg
def sample_stat_failure_chain_dist(num_samples, probs, depth):
"""Sample to estimate the stationary distribution of failure chain length.
Args:
num_samples: The number of samples to estimate from.
probs: The probabilities of seeing each letter in a set.
depth: The depth to go to in order to start sampling.
Returns: List representing the probability vector.
"""
alphabet = string_util.get_default_alphabet(len(probs))
observed = []
for _ in xrange(num_samples):
gen_string = string_util.create_random_string(probs, depth)
root = merge_alg.aho_merge(gen_string, alphabet).get_root()
# Iterate through to find first state at appropriate depth.
state = root
for child in state.goto.values():
if child is not state:
state = child
break
while state.depth < depth:
state = state.goto.values()[0]
chain_length = _get_failure_chain_length(state)
while len(observed) < chain_length:
observed.append(0)
observed[chain_length - 1] += 1
return [o / num_samples for o in observed]
def sample_state_has_new_thread(num_samples, probs, depth,
failure_length_conditions=None):
"""Samples whether a state at specified conditions has a new thread.
Args:
num_samples: The number of samples to draw.
probs: The probabilities of seeing each letter in the set.
depth: The depth that we should look for states at.
failure_length_conditions: List of conditions for how long the failure
chain should be for the state we are looking at. If None, there is
no condition.
Returns: Pandas DataFrame with 'failure_chain_length' and 'has_new_thread'.
"""
alphabet = string_util.get_default_alphabet(len(probs))
chain_lengths = []
has_new_threads = []
for _ in xrange(num_samples):
# Assemble DFA.
gen_string = string_util.create_random_string(probs, depth + 1)
dfa = merge_alg.aho_merge(gen_string, alphabet)
root = dfa.get_root()
states = _get_states_at_depth(root, depth)
# Look at DFA and add samples to lists.
if failure_length_conditions is None:
selected_state = states[0]
chain_lengths.append(_get_failure_chain_length(selected_state))
has_new_threads.append(_has_new_thread(selected_state))
else:
curr_chain_lengths = [_get_failure_chain_length(state)
for state in states]
for length in failure_length_conditions:
try:
state_index = curr_chain_lengths.index(length)
chain_lengths.append(length)
has_new_threads.append(_has_new_thread(states[state_index]))
except ValueError:
# There was no state with conditions.
pass
# Return lists as a pandas dataframe.
return pd.DataFrame({'failure_chain_length': chain_lengths,
'has_new_thread': has_new_threads})
def sample_failure_depth_size_trend(num_samples, probs, depth1, depth2):
"""Sample whether depth2 has a failure to depth1, how many states depth
1 has, and if G[1:depth1] == G[depth2-depth1:depth1]
Args:
num_samples: The number of samples to look at.
probs: The probability of seeing each letter in a set.
depth1: The depth where we count number of states.
depth2: Want to see if depth2 has failure to depth1.
Returns: DataFrame with 'has_failure' (0 or 1), 'depth_size', and
'string_match' (0 or 1).
"""
has_failure = []
depth_size = []
string_match = []
alphabet = string_util.get_default_alphabet(len(probs))
for _ in xrange(num_samples):
gen_string = string_util.create_random_string(probs, depth2)
# Check the string first.
string_match.append(prefix_substring_match(gen_string,
depth2 - depth1, depth1))
dfa = merge_alg.aho_merge(gen_string, alphabet)
root = dfa.get_root()
# Iterate over graph once instead of helper functions.
curr_depth_size = 0
curr_has_failure = 0
queue = deque()
queue.appendleft(root)
seen = set()
seen.add(root.sid)
while len(queue) > 0:
curr = queue.pop()
if curr.depth == depth1:
curr_depth_size += 1
elif curr.depth == depth2 and curr.failure.depth == depth1:
curr_has_failure = 1
break # Can break here since doing BFS
for child in curr.goto.values():
if child.sid not in seen:
queue.appendleft(child)
seen.add(child.sid)
has_failure.append(curr_has_failure)
depth_size.append(curr_depth_size)
# Return lists as a pandas dataframe.
return pd.DataFrame({'string_match': string_match,
'has_failure': has_failure,
'depth_size': depth_size})
def sample_total_states(num_samples, probs, length):
"""Sample the total number of states in the DFA.
Args:
num_samples: The number of samples to take.
probs: The probabilities of seeing each letter.
length: The length of the generalized string to test.
Returns: Tuple with first thing being average, second being max, and third
the standard deviation.
"""
max_states = 0
avg_states = 0
states = []
alphabet = string_util.get_default_alphabet(len(probs))
for _ in xrange(num_samples):
gen_string = string_util.create_random_string(probs, length)
dfa = merge_alg.aho_merge(gen_string, alphabet)
num_states = dfa.get_num_states()
states.append(num_states)
return (np.mean(states), max(states), np.std(states))
def sample_signature_size(num_samples, probs, depth):
"""Sample the signature size at the depth.
Args:
num_samples: Number of samples to do.
probs: The probabilities of seeing a letter.
depth: The depth at which to sample at.
Returns: List of the samples drawn.
"""
alph = string_util.get_default_alphabet(len(probs))
samples = []
for _ in xrange(num_samples):
gen_string = string_util.create_random_string(probs, depth)
dfa = merge_alg.aho_merge(gen_string, alph)
# Do DFS until we see state of appropriate depth.
root = dfa.get_root()
stack = deque()
stack.append(root)
seen = set([root.sid])
while len(stack) > 0:
curr = stack.pop()
if curr.depth == depth:
samples.append(_get_failure_chain_length(curr) + 1)
break
for child in curr.goto.values():
if child.sid not in seen:
stack.append(child)
seen.add(child.sid)
return samples
def _get_states_at_depth(root, depth):
"""Assemble all states at the specified depth."""
queue = deque()
queue.appendleft(root)
seen = set()
seen.add(root.sid)
states = []
while len(queue) > 0:
curr = queue.pop()
if curr.depth == depth:
states.append(curr)
else:
for child in curr.goto.values():
if child.sid not in seen:
queue.appendleft(child)
seen.add(child.sid)
return states
def _has_new_thread(state):
"""Returns 1 if state has a new thread, 0 otherwise."""
for child in state.goto.values():
if child.failure is not None and child.failure.depth == 1:
return 1
return 0
def _get_failure_chain_length(state):
"""Returns the number of failure transitions in the failure chain."""
count = 0
curr = state
while curr.failure is not None:
curr = curr.failure
count += 1
return count
if __name__ == '__main__':
# print sample_state_has_new_thread(10, [0.5 for _ in range(4)], 15)
# print sample_state_has_new_thread(10, [0.5 for _ in range(4)], 15, [2])
# print sample_stat_failure_chain_dist(10, [0.5 for _ in range(4)], 1)
print sample_failure_depth_size_trend(10, [0.5 for _ in range(4)], 3, 7)