import sys
import os
import functools
import math
import statistics
import gzip
import random
try:
    import numpy
except ImportError:
    sys.stderr.write('ERROR: Could not import numpy module. Ensure it is installed.\n')
    sys.stderr.write('       Under Debian, the package name is "python3-numpy".\n')
    sys.exit(1)

# Don't trust numpy's seeding
numpy.random.seed(random.SystemRandom().randint(0,2**32-1))

def mad(arr):
    """ Median Absolute Deviation: a "Robust" version of standard deviation.
        Indicates variability of the sample.
        https://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
    med = numpy.median(arr)
    return numpy.median(numpy.abs(arr - med))

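# Illustrative addition, not part of the original module: a minimal sketch of
# why mad() is used instead of the standard deviation.  A single gross outlier
# barely moves the MAD but inflates the stdev.  The _demo_mad name is ours.
def _demo_mad():
    clean = [10.0, 11.0, 12.0, 11.0, 10.0]
    dirty = clean + [500.0]                      # one gross outlier
    print('MAD   clean/dirty:', mad(clean), mad(dirty))                            # 1.0 / 1.0
    print('stdev clean/dirty:', statistics.stdev(clean), statistics.stdev(dirty))  # small / huge
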
def cov(x,y):
    mx = statistics.mean(x)
    my = statistics.mean(y)
    products = []
    for i in range(0,len(x)):
        products.append((x[i] - mx)*(y[i] - my))

    return statistics.mean(products)

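# Illustrative addition, not part of the original module: a small consistency
# check of cov() against numpy.  With ddof=0, numpy.cov computes the same
# population covariance that cov() computes as the mean of the cross-products.
def _demo_cov():
    x = [1.0, 2.0, 3.0, 4.0]
    y = [2.0, 4.0, 6.0, 8.0]
    print('cov()      :', cov(x, y))                        # 2.5
    print('numpy.cov():', numpy.cov(x, y, ddof=0)[0][1])    # 2.5
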
def OLSRegression(x,y):
    #print(x,y)
    x = numpy.array(x)
    y = numpy.array(y)
    #A = numpy.vstack([x, numpy.ones(len(x))]).T
    #m, c = numpy.linalg.lstsq(A, y)[0] # broken
    #c,m = numpy.polynomial.polynomial.polyfit(x, y, 1) # less accurate
    c,m = numpy.polynomial.Polynomial.fit(x,y,1).convert().coef

    #print(m,c)

    #import matplotlib.pyplot as plt
    #plt.clf()
    #plt.scatter(x, y)
    #plt.plot(x, m*x + c, 'r', label='Fitted line')
    #plt.show()

    return (m,c)

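# Illustrative addition, not part of the original module: a sanity check of
# OLSRegression() on points from a known line; the recovered slope and
# intercept should match the generating values to within floating-point error.
def _demo_ols():
    x = [0.0, 1.0, 2.0, 3.0, 4.0]
    y = [3.0*xi + 2.0 for xi in x]       # exact line y = 3x + 2, no noise
    m, c = OLSRegression(x, y)
    print('slope ~3:', m, ' intercept ~2:', c)
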
def difference(ls):
    return ls[0]-ls[1]

def product(ls):
    return ls[0]*ls[1]

def hypotenuse(ls):
    return math.hypot(ls[0],ls[1])

def trustValues(derived, trustFunc):
    ret_val = []
    for k,v in derived.items():
        ret_val.append((trustFunc((v['long'],v['short'])), k))

    ret_val.sort()
    return ret_val

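# Illustrative addition, not part of the original module: a sketch of the data
# flow into trustValues().  'derived' maps a sample key to its 'long' and
# 'short' components; the trust function collapses that pair into one score,
# and the result is sorted so the most trusted (lowest-score) samples come first.
def _demo_trust_values():
    derived = {'a': {'long': 3.0, 'short': 4.0},     # hypot = 5.0
               'b': {'long': 1.0, 'short': 1.0},     # hypot ~ 1.41
               'c': {'long': 6.0, 'short': 8.0}}     # hypot = 10.0
    print(trustValues(derived, hypotenuse))          # [(~1.41, 'b'), (5.0, 'a'), (10.0, 'c')]
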
def prunedWeights(derived, trust, alpha):
    weights = {}

    threshold = len(trust)*(1.0-alpha)
    for i in range(0,len(trust)):
        if i < threshold:
            weights[trust[i][1]] = 1.0
        else:
            weights[trust[i][1]] = 0.0

    return weights

def linearWeights(derived, trust, alpha):
    x1 = trust[0][0]
    y1 = 1.0 + (alpha*10)
    x2 = trust[(len(trust)-1)//3][0]
    y2 = 1.0
    m = (y1-y2)/(x1-x2)
    b = y1 - m*x1

    weights = {}
    for t,k in trust:
        weights[k] = m*t+b
        if weights[k] < 0.0:
            weights[k] = 0.0

    return weights

def invertedWeights(derived,trust,alpha):
    # (x+1-first_sample)^(-alpha)
    #scale = trust[0][0]

    #weights = {}
    #for t,k in trust:
    #    weights[k] = (t+1-scale)**(-1.0*alpha)
    #    if weights[k] < 0.0:
    #        weights[k] = 0.0

    weights = {}
    for i in range(len(trust)):
        w = 10.0/(i+2.0)-0.2
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights

def arctanWeights(derived,trust,alpha):
    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
    minimum = trust[0][0]

    weights = {}
    for i in range(len(trust)):
        w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum))
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights

def arctanWeights2(derived,trust,alpha):
    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
    minimum = trust[0][0]
    stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median; NOTE: computed but unused below

    weights = {}
    for i in range(len(trust)):
        w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum))
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights

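# Illustrative addition, not part of the original module: a sketch comparing
# the weighting schemes above on a toy trust list.  Each scheme maps trust
# scores to per-sample weights; all favor the most trusted (lowest-score)
# samples, just with different falloffs.  The _demo name and toy data are ours.
def _demo_weight_schemes():
    derived = {k: {'long': float(k), 'short': float(k)} for k in range(10)}
    trust = trustValues(derived, hypotenuse)
    for wf in (prunedWeights, linearWeights, invertedWeights, arctanWeights, arctanWeights2):
        weights = wf(derived, trust, alpha=0.1)
        print(wf.__name__, [round(weights[k], 2) for k in sorted(weights)])
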
def midsummary(values, distance=25):
    #return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
    l,h = numpy.percentile(values, (50-distance,50+distance))
    return (l+h)/2.0

def trimean(values, distance=25):
    return (midsummary(values, distance) + statistics.median(values))/2

def ubersummary(values, distance=25):
    left2 = 50-distance
    left3 = 50-(distance/2.0)
    left1 = left2/2.0
    right2 = 50+distance
    right3 = 50+(distance/2.0)
    right1 = (right2+100)/2.0
    l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
    #print(l1,l2,l3,m,r3,r2,r1)
    return (l1+l2*4+l3+r3+r2*4+r1)/12.0
    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))

def quadsummary(values, distance=25):
    left1 = 50-distance
    left2 = (left1+50)/2.0
    right1 = 50+distance
    right2 = (right1+50)/2.0
    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
    #print(left1,left2,left3,50,right3,right2,right1)
    #print(l1,l2,l3,m,r3,r2,r1)
    return (l1+l2+r2+r1)/4.0
    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))

def septasummary(values, distance=25):
    left2 = 50-distance
    left3 = 50-(distance/2.0)
    left1 = left2/2.0
    right2 = 50+distance
    right3 = 50+(distance/2.0)
    right1 = (right2+100)/2.0
    l1,l2,l3,m,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,50,right3,right2,right1))
    return (l1+l2+l3+m+r3+r2+r1)/7.0

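# Illustrative addition, not part of the original module: a sketch comparing
# the percentile-based location estimators above against the plain mean on
# data with an outlier.  The robust summaries stay near the bulk of the data.
def _demo_summaries():
    data = [10.0]*40 + [11.0]*40 + [12.0]*19 + [900.0]
    print('mean        :', statistics.mean(data))    # dragged up by the outlier
    print('midsummary  :', midsummary(data))
    print('trimean     :', trimean(data))
    print('ubersummary :', ubersummary(data))
    print('quadsummary :', quadsummary(data))
    print('septasummary:', septasummary(data))
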
def tsvalwmean(subseries):
    weights = [(s['unusual_packet']+s['other_packet'])**2 for s in subseries]
    normalizer = sum(weights)/len(weights)
    return numpy.mean([weights[i]*(subseries[i]['unusual_tsval']-subseries[i]['other_tsval'])/normalizer
                       for i in range(len(weights))])

#def tsvalwmean(subseries):
#    return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries])

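# Illustrative addition, not part of the original module: a sketch of the
# record format tsvalwmean() expects.  Each subseries entry carries packet
# counts (whose sum, squared, is the weight) and the two TCP timestamp values
# whose difference is averaged.  The toy values below are ours.
def _demo_tsvalwmean():
    subseries = [{'unusual_packet': 5, 'other_packet': 5,
                  'unusual_tsval': 1010, 'other_tsval': 1000},
                 {'unusual_packet': 1, 'other_packet': 1,
                  'unusual_tsval': 1200, 'other_tsval': 1000}]
    # about 17.3: the heavily weighted first entry pulls the estimate toward
    # its small difference (10) and away from the unweighted mean (105)
    print(tsvalwmean(subseries))
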
def weightedMean(derived, weights):
    normalizer = sum(weights.values())/len(weights)
    return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()])

def weightedMeanTsval(derived, weights):
    normalizer = sum(weights.values())/len(weights)
    return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])

def estimateMean(trustFunc, weightFunc, alpha, derived):
    trust = trustValues(derived, trustFunc)
    weights = weightFunc(derived, trust, alpha)
    return weightedMean(derived, weights)


def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
    trust = trustValues(derived, trustFunc)
    weights = weightFunc(derived, trust, alpha)
    return weightedMeanTsval(derived, weights)

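# Illustrative addition, not part of the original module: a sketch wiring the
# pieces together.  The trust function ranks samples, the weight function
# converts that ranking into weights, and weightedMean() averages the
# long/short differences under them.  The _demo name and toy data are ours.
def _demo_estimate_mean():
    derived = {k: {'long': 100.0 + k, 'short': 100.0} for k in range(20)}
    est = estimateMean(hypotenuse, prunedWeights, 0.1, derived)
    print('estimated long-short difference:', est)
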
def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
    ret_val = []
    for t in range(num_trials):
        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))

    return ret_val

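# Illustrative addition, not part of the original module: bootstrap3() assumes
# only that 'db' exposes a subseries(probe_type, unusual_case, subseries_size)
# method returning one resampled subseries per trial.  A hypothetical in-memory
# stub for local testing (the _FakeDB name and the call below are ours, not the
# project's API):
class _FakeDB:
    def __init__(self, samples):
        self.samples = samples
    def subseries(self, probe_type, unusual_case, subseries_size):
        # ignore probe_type/unusual_case and simply resample without replacement
        return random.sample(self.samples, subseries_size)
# e.g.: bootstrap3(tsvalwmean, _FakeDB(all_samples), 'train', 'example-case', 50, 100)
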
# Returns 1 if unusual_case is unusual in the expected direction
#         0 if it isn't unusual
#        -1 if it is unusual in the wrong direction
def multiBoxTest(params, greater, samples):
    uc = [s['unusual_packet'] for s in samples]
    rest = [s['other_packet'] for s in samples]

    uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
    rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low']))
    if uc_high < rest_low:
        if greater:
            return -1
        else:
            return 1

    if rest_high < uc_low:
        if greater:
            return 1
        else:
            return -1

    return 0

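# Illustrative addition, not part of the original module: multiBoxTest() on toy
# samples where the unusual case is clearly larger.  With non-overlapping
# percentile boxes and greater=True it reports 1; the opposite expectation
# yields -1.  The _demo name and toy data are ours.
def _demo_multibox():
    samples = [{'unusual_packet': 100 + i, 'other_packet': 10 + i} for i in range(50)]
    params = {'high': 90, 'low': 10}
    print(multiBoxTest(params, True, samples))    # 1
    print(multiBoxTest(params, False, samples))   # -1
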
# Returns 1 if unusual_case is unusual in the expected direction
#         0 otherwise
def summaryTest(f, params, greater, samples):
    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]

    mh = f(diffs, params['distance'])
    #print("estimate:", mh)
    if greater:
        if mh > params['threshold']:
            return 1
        else:
            return 0
    else:
        if mh < params['threshold']:
            return 1
        else:
            return 0

midsummaryTest = functools.partial(summaryTest, midsummary)
trimeanTest = functools.partial(summaryTest, trimean)
ubersummaryTest = functools.partial(summaryTest, ubersummary)
quadsummaryTest = functools.partial(summaryTest, quadsummary)
septasummaryTest = functools.partial(summaryTest, septasummary)

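# Illustrative addition, not part of the original module: the partials above
# pre-bind the summary statistic, so each *Test shares summaryTest's
# (params, greater, samples) interface.  A hypothetical call:
def _demo_summary_test():
    samples = [{'unusual_packet': 20 + random.random(), 'other_packet': 10 + random.random()}
               for _ in range(100)]
    params = {'distance': 25, 'threshold': 5.0}
    print(midsummaryTest(params, True, samples))   # 1: the ~10 unit gap exceeds the threshold
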
def rmse(expected, measurements):
    s = sum([(expected-m)**2 for m in measurements])/len(measurements)
    return math.sqrt(s)

def nrmse(expected, measurements):
    return rmse(expected, measurements)/(max(measurements)-min(measurements))

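# Illustrative addition, not part of the original module: a worked example of
# the error metrics.  For measurements [9, 11] against an expected value of 10,
# both squared errors are 1, so rmse is 1.0 and nrmse divides that by the
# measurement range (2).
def _demo_rmse():
    print(rmse(10.0, [9.0, 11.0]))    # 1.0
    print(nrmse(10.0, [9.0, 11.0]))   # 0.5
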
class KalmanFilter1D:
    def __init__(self, x0, P, R, Q):
        self.x = x0
        self.P = P
        self.R = R
        self.Q = Q

    def update(self, z):
        self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
        self.P = 1. / (1./self.P + 1./self.R)

    def predict(self, u=0.0):
        self.x += u
        self.P += self.Q

def kfilter(params, observations):
    x = numpy.array(observations)
    movement = 0
    est = []
    var = []
    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
                        #P = 10000,          # initial variance
                        P = 10,              # initial variance
                        R = numpy.std(x),    # sensor noise
                        Q = 0)               # movement noise
    for round in range(1):
        for d in x:
            kf.predict(movement)
            kf.update(d)
            est.append(kf.x)
            var.append(kf.P)

    return({'est':est, 'var':var})

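# Illustrative addition, not part of the original module: kfilter() on noisy
# observations of a constant.  The running estimate ('est') converges toward
# the true value while the state variance ('var') shrinks.  The params argument
# is not used inside kfilter itself, so an empty dict suffices here.
def _demo_kfilter():
    observations = [5.0 + random.gauss(0, 1.0) for _ in range(200)]
    result = kfilter({}, observations)
    print('final estimate:', result['est'][-1])   # close to 5.0
    print('final variance:', result['var'][-1])
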
def kalmanTest(params, greater, samples):
    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]

    m = kfilter(params, diffs)['est'][-1]
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0

def tsvalwmeanTest(params, greater, samples):
    m = tsvalwmean(samples)
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0

#from pykalman import KalmanFilter
# NOTE: pyKalman4DTest() relies on the KalmanFilter class from the pykalman
# package; it will raise NameError unless the import above is re-enabled and
# pykalman is installed.
def pyKalman4DTest(params, greater, samples):
    kp = params['kparams']
    #kp['initial_state_mean']=[quadsummary([s['unusual_packet'] for s in samples]),
    #                          quadsummary([s['other_packet'] for s in samples]),
    #                          numpy.mean([s['unusual_tsval'] for s in samples]),
    #                          numpy.mean([s['other_tsval'] for s in samples])]
    kf = KalmanFilter(n_dim_obs=4, n_dim_state=4, **kp)
    smooth,covariance = kf.smooth([(s['unusual_packet'],s['other_packet'],s['unusual_tsval'],s['other_tsval'])
                                   for s in samples])
    m = numpy.mean(smooth)
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0