1 | |
---|
2 | import sys |
---|
3 | import os |
---|
4 | import functools |
---|
5 | import math |
---|
6 | import statistics |
---|
7 | import gzip |
---|
8 | import random |
---|
9 | import scipy |
---|
10 | import scipy.stats |
---|
11 | import numpy |
---|
12 | |
---|
13 | # Don't trust numpy's seeding |
---|
14 | numpy.random.seed(random.SystemRandom().randint(0,2**32-1)) |
---|
15 | |
---|
16 | |
---|
17 | def mad(arr): |
---|
18 | """ Median Absolute Deviation: a "Robust" version of standard deviation. |
---|
19 | Indices variabililty of the sample. |
---|
20 | https://en.wikipedia.org/wiki/Median_absolute_deviation |
---|
21 | """ |
---|
22 | arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays. |
---|
23 | med = numpy.median(arr) |
---|
24 | return numpy.median(numpy.abs(arr - med)) |
---|
25 | |
---|
26 | |
---|
27 | def cov(x,y): |
---|
28 | mx = statistics.mean(x) |
---|
29 | my = statistics.mean(y) |
---|
30 | products = [] |
---|
31 | for i in range(0,len(x)): |
---|
32 | products.append((x[i] - mx)*(y[i] - my)) |
---|
33 | |
---|
34 | return statistics.mean(products) |
---|
35 | |
---|
36 | |
---|
37 | def difference(ls): |
---|
38 | return ls[0]-ls[1] |
---|
39 | |
---|
40 | def product(ls): |
---|
41 | return ls[0]*ls[1] |
---|
42 | |
---|
43 | def hypotenuse(ls): |
---|
44 | return math.hypot(ls[0],ls[1]) |
---|
45 | |
---|
46 | def trustValues(derived, trustFunc): |
---|
47 | ret_val = [] |
---|
48 | for k,v in derived.items(): |
---|
49 | ret_val.append((trustFunc((v['long'],v['short'])), k)) |
---|
50 | |
---|
51 | ret_val.sort() |
---|
52 | return ret_val |
---|
53 | |
---|
54 | |
---|
55 | def prunedWeights(derived, trust, alpha): |
---|
56 | weights = {} |
---|
57 | |
---|
58 | threshold = len(trust)*(1.0-alpha) |
---|
59 | for i in range(0,len(trust)): |
---|
60 | if i < threshold: |
---|
61 | weights[trust[i][1]] = 1.0 |
---|
62 | else: |
---|
63 | weights[trust[i][1]] = 0.0 |
---|
64 | |
---|
65 | return weights |
---|
66 | |
---|
67 | |
---|
68 | def linearWeights(derived, trust, alpha): |
---|
69 | x1 = trust[0][0] |
---|
70 | y1 = 1.0 + (alpha*10) |
---|
71 | x2 = trust[(len(trust)-1)//3][0] |
---|
72 | y2 = 1.0 |
---|
73 | m = (y1-y2)/(x1-x2) |
---|
74 | b = y1 - m*x1 |
---|
75 | |
---|
76 | weights = {} |
---|
77 | for t,k in trust: |
---|
78 | weights[k] = m*t+b |
---|
79 | if weights[k] < 0.0: |
---|
80 | weights[k] = 0.0 |
---|
81 | |
---|
82 | return weights |
---|
83 | |
---|
84 | |
---|
85 | def invertedWeights(derived,trust,alpha): |
---|
86 | # (x+1-first_sample)^(-alpha) |
---|
87 | #scale = trust[0][0] |
---|
88 | |
---|
89 | #weights = {} |
---|
90 | #for t,k in trust: |
---|
91 | # weights[k] = (t+1-scale)**(-1.0*alpha) |
---|
92 | # if weights[k] < 0.0: |
---|
93 | # weights[k] = 0.0 |
---|
94 | |
---|
95 | weights = {} |
---|
96 | for i in range(len(trust)): |
---|
97 | w = 10.0/(i+2.0)-0.2 |
---|
98 | if w < 0.0: |
---|
99 | w = 0.0 |
---|
100 | weights[trust[i][1]] = w |
---|
101 | |
---|
102 | |
---|
103 | return weights |
---|
104 | |
---|
105 | |
---|
106 | |
---|
107 | def arctanWeights(derived,trust,alpha): |
---|
108 | shift = trust[int((len(trust)-1)*(1.0-alpha))][0] |
---|
109 | minimum = trust[0][0] |
---|
110 | |
---|
111 | weights = {} |
---|
112 | for i in range(len(trust)): |
---|
113 | w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum)) |
---|
114 | if w < 0.0: |
---|
115 | w = 0.0 |
---|
116 | weights[trust[i][1]] = w |
---|
117 | |
---|
118 | return weights |
---|
119 | |
---|
120 | |
---|
121 | def arctanWeights2(derived,trust,alpha): |
---|
122 | shift = trust[int((len(trust)-1)*(1.0-alpha))][0] |
---|
123 | minimum = trust[0][0] |
---|
124 | stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median |
---|
125 | |
---|
126 | weights = {} |
---|
127 | for i in range(len(trust)): |
---|
128 | w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum)) |
---|
129 | if w < 0.0: |
---|
130 | w = 0.0 |
---|
131 | weights[trust[i][1]] = w |
---|
132 | |
---|
133 | return weights |
---|
134 | |
---|
135 | |
---|
136 | def midsummary(values, distance=25): |
---|
137 | #return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0 |
---|
138 | l,h = numpy.percentile(values, (50-distance,50+distance)) |
---|
139 | return (l+h)/2.0 |
---|
140 | |
---|
141 | def trimean(values, distance=25): |
---|
142 | return (midsummary(values, distance) + statistics.median(values))/2 |
---|
143 | |
---|
144 | def ubersummary(values, distance=25): |
---|
145 | left2 = 50-distance |
---|
146 | left3 = 50-(distance/2.0) |
---|
147 | left1 = left2/2.0 |
---|
148 | right2 = 50+distance |
---|
149 | right3 = 50+(distance/2.0) |
---|
150 | right1 = (right2+100)/2.0 |
---|
151 | l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1)) |
---|
152 | #print(l1,l2,l3,m,r3,r2,r1) |
---|
153 | return (l1+l2*4+l3+r3+r2*4+r1)/12.0 |
---|
154 | #return statistics.mean((l1,l2,l3,m,r3,r2,r1)) |
---|
155 | |
---|
156 | |
---|
157 | def quadsummary(values, distance=25): |
---|
158 | left1 = 50-distance |
---|
159 | left2 = (left1+50)/2.0 |
---|
160 | right1 = 50+distance |
---|
161 | right2 = (right1+50)/2.0 |
---|
162 | l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1)) |
---|
163 | #print(left1,left2,left3,50,right3,right2,right1) |
---|
164 | #print(l1,l2,l3,m,r3,r2,r1) |
---|
165 | return (l1+l2+r2+r1)/4.0 |
---|
166 | #return statistics.mean((l1,l2,l3,m,r3,r2,r1)) |
---|
167 | |
---|
168 | |
---|
169 | def tsvalwmean(subseries): |
---|
170 | weights = [(s['unusual_packet']+s['other_packet'])**2 for s in subseries] |
---|
171 | normalizer = sum(weights)/len(weights) |
---|
172 | return numpy.mean([weights[i]*(subseries[i]['unusual_tsval']-subseries[i]['other_tsval'])/normalizer |
---|
173 | for i in range(len(weights))]) |
---|
174 | |
---|
175 | #def tsvalwmean(subseries): |
---|
176 | # return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries]) |
---|
177 | |
---|
178 | |
---|
179 | def weightedMean(derived, weights): |
---|
180 | normalizer = sum(weights.values())/len(weights) |
---|
181 | return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()]) |
---|
182 | |
---|
183 | def weightedMeanTsval(derived, weights): |
---|
184 | normalizer = sum(weights.values())/len(weights) |
---|
185 | return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()]) |
---|
186 | |
---|
187 | |
---|
188 | |
---|
189 | |
---|
190 | def estimateMean(trustFunc, weightFunc, alpha, derived): |
---|
191 | trust = trustValues(derived, trustFunc) |
---|
192 | weights = weightFunc(derived, trust, alpha) |
---|
193 | return weightedMean(derived, weights) |
---|
194 | |
---|
195 | |
---|
196 | def estimateMeanTsval(trustFunc, weightFunc, alpha, derived): |
---|
197 | trust = trustValues(derived, trustFunc) |
---|
198 | weights = weightFunc(derived, trust, alpha) |
---|
199 | return weightedMeanTsval(derived, weights) |
---|
200 | |
---|
201 | |
---|
202 | def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials): |
---|
203 | ret_val = [] |
---|
204 | for t in range(num_trials): |
---|
205 | ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size))) |
---|
206 | |
---|
207 | return ret_val |
---|
208 | |
---|
209 | |
---|
210 | # Returns 1 if unusual_case is unusual in the expected direction |
---|
211 | # 0 if it isn't unusual |
---|
212 | # -1 if it is unusual in the wrong direction |
---|
213 | def multiBoxTest(params, greater, samples): |
---|
214 | uc = [s['unusual_packet'] for s in samples] |
---|
215 | rest = [s['other_packet'] for s in samples] |
---|
216 | |
---|
217 | uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low'])) |
---|
218 | rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low'])) |
---|
219 | if uc_high < rest_low: |
---|
220 | if greater: |
---|
221 | return -1 |
---|
222 | else: |
---|
223 | return 1 |
---|
224 | |
---|
225 | if rest_high < uc_low: |
---|
226 | if greater: |
---|
227 | return 1 |
---|
228 | else: |
---|
229 | return -1 |
---|
230 | |
---|
231 | return 0 |
---|
232 | |
---|
233 | |
---|
234 | # Returns 1 if unusual_case is unusual in the expected direction |
---|
235 | # 0 otherwise |
---|
236 | def summaryTest(f, params, greater, samples): |
---|
237 | diffs = [s['unusual_packet']-s['other_packet'] for s in samples] |
---|
238 | |
---|
239 | mh = f(diffs, params['distance']) |
---|
240 | if greater: |
---|
241 | if mh > params['threshold']: |
---|
242 | return 1 |
---|
243 | else: |
---|
244 | return 0 |
---|
245 | else: |
---|
246 | if mh < params['threshold']: |
---|
247 | return 1 |
---|
248 | else: |
---|
249 | return 0 |
---|
250 | |
---|
251 | |
---|
252 | midsummaryTest = functools.partial(summaryTest, midsummary) |
---|
253 | trimeanTest = functools.partial(summaryTest, trimean) |
---|
254 | ubersummaryTest = functools.partial(summaryTest, ubersummary) |
---|
255 | quadsummaryTest = functools.partial(summaryTest, quadsummary) |
---|
256 | |
---|
257 | def rmse(expected, measurements): |
---|
258 | s = sum([(expected-m)**2 for m in measurements])/len(measurements) |
---|
259 | return math.sqrt(s) |
---|
260 | |
---|
261 | def nrmse(expected, measurements): |
---|
262 | return rmse(expected, measurements)/(max(measurements)-min(measurements)) |
---|
263 | |
---|
264 | |
---|
265 | class KalmanFilter1D: |
---|
266 | def __init__(self, x0, P, R, Q): |
---|
267 | self.x = x0 |
---|
268 | self.P = P |
---|
269 | self.R = R |
---|
270 | self.Q = Q |
---|
271 | |
---|
272 | def update(self, z): |
---|
273 | self.x = (self.P * z + self.x * self.R) / (self.P + self.R) |
---|
274 | self.P = 1. / (1./self.P + 1./self.R) |
---|
275 | |
---|
276 | def predict(self, u=0.0): |
---|
277 | self.x += u |
---|
278 | self.P += self.Q |
---|
279 | |
---|
280 | |
---|
281 | def kfilter(params, observations): |
---|
282 | x = numpy.array(observations) |
---|
283 | movement = 0 |
---|
284 | est = [] |
---|
285 | var = [] |
---|
286 | kf = KalmanFilter1D(x0 = quadsummary(x), # initial state |
---|
287 | #P = 10000, # initial variance |
---|
288 | P = 10, # initial variance |
---|
289 | R = numpy.std(x), # msensor noise |
---|
290 | Q = 0) # movement noise |
---|
291 | for round in range(1): |
---|
292 | for d in x: |
---|
293 | kf.predict(movement) |
---|
294 | kf.update(d) |
---|
295 | est.append(kf.x) |
---|
296 | var.append(kf.P) |
---|
297 | |
---|
298 | return({'est':est, 'var':var}) |
---|
299 | |
---|
300 | |
---|
301 | def kalmanTest(params, greater, samples): |
---|
302 | diffs = [s['unusual_packet']-s['other_packet'] for s in samples] |
---|
303 | |
---|
304 | m = kfilter(params, diffs)['est'][-1] |
---|
305 | if greater: |
---|
306 | if m > params['threshold']: |
---|
307 | return 1 |
---|
308 | else: |
---|
309 | return 0 |
---|
310 | else: |
---|
311 | if m < params['threshold']: |
---|
312 | return 1 |
---|
313 | else: |
---|
314 | return 0 |
---|
315 | |
---|
316 | |
---|
317 | def tsvalwmeanTest(params, greater, samples): |
---|
318 | m = tsvalwmean(samples) |
---|
319 | if greater: |
---|
320 | if m > params['threshold']: |
---|
321 | return 1 |
---|
322 | else: |
---|
323 | return 0 |
---|
324 | else: |
---|
325 | if m < params['threshold']: |
---|
326 | return 1 |
---|
327 | else: |
---|
328 | return 0 |
---|