source: trunk/lib/nanownlib/stats.py @ 14

Last change on this file since 14 was 13, checked in by tim, 9 years ago

.

File size: 9.9 KB
RevLine 
[4]1
2import sys
3import os
[10]4import functools
[4]5import math
6import statistics
7import gzip
8import random
9import scipy
10import scipy.stats
11import numpy
12
13# Don't trust numpy's seeding
14numpy.random.seed(random.SystemRandom().randint(0,2**32-1))
15
16
17def mad(arr):
18    """ Median Absolute Deviation: a "Robust" version of standard deviation.
19        Indices variabililty of the sample.
20        https://en.wikipedia.org/wiki/Median_absolute_deviation
21    """
22    arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
23    med = numpy.median(arr)
24    return numpy.median(numpy.abs(arr - med))
25
26
27def cov(x,y):
28    mx = statistics.mean(x)
29    my = statistics.mean(y)
30    products = []
31    for i in range(0,len(x)):
32        products.append((x[i] - mx)*(y[i] - my))
33
34    return statistics.mean(products)
35
36
37def difference(ls):
38    return ls[0]-ls[1]
39
40def product(ls):
41    return ls[0]*ls[1]
42
43def hypotenuse(ls):
44    return math.hypot(ls[0],ls[1])
45   
46def trustValues(derived, trustFunc):
47    ret_val = []
48    for k,v in derived.items():
49        ret_val.append((trustFunc((v['long'],v['short'])), k))
50
51    ret_val.sort()
52    return ret_val
53
54
55def prunedWeights(derived, trust, alpha):
56    weights = {}
57
58    threshold = len(trust)*(1.0-alpha)
59    for i in range(0,len(trust)):
60        if i < threshold:
61            weights[trust[i][1]] = 1.0
62        else:
63            weights[trust[i][1]] = 0.0
64
65    return weights
66
67
68def linearWeights(derived, trust, alpha):
69    x1 = trust[0][0]
70    y1 = 1.0 + (alpha*10)
71    x2 = trust[(len(trust)-1)//3][0]
72    y2 = 1.0
73    m = (y1-y2)/(x1-x2)
74    b = y1 - m*x1
75
76    weights = {}
77    for t,k in trust:
78        weights[k] = m*t+b
79        if weights[k] < 0.0:
80            weights[k] = 0.0
81
82    return weights
83
84
85def invertedWeights(derived,trust,alpha):
86    # (x+1-first_sample)^(-alpha)
87    #scale = trust[0][0]
88
89    #weights = {}
90    #for t,k in trust:
91    #    weights[k] = (t+1-scale)**(-1.0*alpha)
92    #    if weights[k] < 0.0:
93    #        weights[k] = 0.0
94
95    weights = {}
96    for i in range(len(trust)):
97        w = 10.0/(i+2.0)-0.2
98        if w < 0.0:
99            w = 0.0
100        weights[trust[i][1]] = w
101       
102   
103    return weights
104
105
106
107def arctanWeights(derived,trust,alpha):
108    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
109    minimum = trust[0][0]
110   
111    weights = {}
112    for i in range(len(trust)):
113        w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum))
114        if w < 0.0:
115            w = 0.0
116        weights[trust[i][1]] = w
117
118    return weights
119
120
121def arctanWeights2(derived,trust,alpha):
122    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
123    minimum = trust[0][0]
124    stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median
125   
126    weights = {}
127    for i in range(len(trust)):
128        w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum))
129        if w < 0.0:
130            w = 0.0
131        weights[trust[i][1]] = w
132
133    return weights
134
135
[10]136def midsummary(values, distance=25):
137    #return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
138    l,h = numpy.percentile(values, (50-distance,50+distance))
139    return (l+h)/2.0
[4]140
141def trimean(values, distance=25):
[10]142    return (midsummary(values, distance) + statistics.median(values))/2
[4]143
[10]144def ubersummary(values, distance=25):
145    left2 = 50-distance
[11]146    left3 = 50-(distance/2.0)
[10]147    left1 = left2/2.0
148    right2 = 50+distance
[11]149    right3 = 50+(distance/2.0)
[10]150    right1 = (right2+100)/2.0
151    l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
152    #print(l1,l2,l3,m,r3,r2,r1)
153    return (l1+l2*4+l3+r3+r2*4+r1)/12.0
154    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
155
[11]156   
[10]157def quadsummary(values, distance=25):
158    left1 = 50-distance
159    left2 = (left1+50)/2.0
160    right1 = 50+distance
161    right2 = (right1+50)/2.0
162    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
163    #print(left1,left2,left3,50,right3,right2,right1)
164    #print(l1,l2,l3,m,r3,r2,r1)
165    return (l1+l2+r2+r1)/4.0
166    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
167
[13]168   
169def septasummary(values, distance=25):
170    left2 = 50-distance
171    left3 = 50-(distance/2.0)
172    left1 = left2/2.0
173    right2 = 50+distance
174    right3 = 50+(distance/2.0)
175    right1 = (right2+100)/2.0
176    l1,l2,l3,m,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,50,right3,right2,right1))
177    return (l1+l2+l3+m+r3+r2+r1)/7.0
[11]178
[13]179
[11]180def tsvalwmean(subseries):
181    weights = [(s['unusual_packet']+s['other_packet'])**2 for s in subseries]
182    normalizer = sum(weights)/len(weights)
183    return numpy.mean([weights[i]*(subseries[i]['unusual_tsval']-subseries[i]['other_tsval'])/normalizer
184                       for i in range(len(weights))])
185
186#def tsvalwmean(subseries):
187#    return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries])
188
189
[4]190def weightedMean(derived, weights):
191    normalizer = sum(weights.values())/len(weights)
192    return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()])
193
194def weightedMeanTsval(derived, weights):
195    normalizer = sum(weights.values())/len(weights)
196    return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])
197
198
[11]199   
200
[4]201def estimateMean(trustFunc, weightFunc, alpha, derived):
202    trust = trustValues(derived, trustFunc)
203    weights = weightFunc(derived, trust, alpha)
204    return weightedMean(derived, weights)
205
206
207def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
208    trust = trustValues(derived, trustFunc)
209    weights = weightFunc(derived, trust, alpha)
210    return weightedMeanTsval(derived, weights)
211
212
[6]213def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
214    ret_val = []
215    for t in range(num_trials):
[8]216        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))
[6]217
218    return ret_val
219
220
[4]221# Returns 1 if unusual_case is unusual in the expected direction
222#         0 if it isn't unusual
223#        -1 if it is unusual in the wrong direction
[8]224def multiBoxTest(params, greater, samples):
[11]225    uc = [s['unusual_packet'] for s in samples]
226    rest = [s['other_packet'] for s in samples]
[4]227   
[10]228    uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
229    rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low']))
[4]230    if uc_high < rest_low:
231        if greater:
232            return -1
233        else:
234            return 1
235
236    if rest_high < uc_low:
237        if greater:
238            return 1
239        else:
240            return -1
241       
242    return 0
243
244
245# Returns 1 if unusual_case is unusual in the expected direction
246#         0 otherwise
[10]247def summaryTest(f, params, greater, samples):
[11]248    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]
[4]249
[10]250    mh = f(diffs, params['distance'])
[4]251    if greater:
252        if mh > params['threshold']:
253            return 1
254        else:
255            return 0
256    else:
257        if mh < params['threshold']:
258            return 1
259        else:
260            return 0
261
[11]262
[10]263midsummaryTest = functools.partial(summaryTest, midsummary)
264trimeanTest = functools.partial(summaryTest, trimean)
265ubersummaryTest = functools.partial(summaryTest, ubersummary)
266quadsummaryTest = functools.partial(summaryTest, quadsummary)
[13]267septasummaryTest = functools.partial(summaryTest, septasummary)
[4]268
269def rmse(expected, measurements):
270    s = sum([(expected-m)**2 for m in measurements])/len(measurements)
271    return math.sqrt(s)
272
273def nrmse(expected, measurements):
274    return rmse(expected, measurements)/(max(measurements)-min(measurements))
[10]275
276
277class KalmanFilter1D:
278    def __init__(self, x0, P, R, Q):
279        self.x = x0
280        self.P = P
281        self.R = R
282        self.Q = Q
283
284    def update(self, z):
285        self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
286        self.P = 1. / (1./self.P + 1./self.R)
287
288    def predict(self, u=0.0):
289        self.x += u
290        self.P += self.Q
291
292
293def kfilter(params, observations):
[11]294    x = numpy.array(observations)   
[10]295    movement = 0
[11]296    est = []
[10]297    var = []
298    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
[11]299                        #P  = 10000,         # initial variance
300                        P  = 10,            # initial variance
[10]301                        R  = numpy.std(x),   # msensor noise
302                        Q  = 0)              # movement noise
303    for round in range(1):
304        for d in x:
305            kf.predict(movement)
306            kf.update(d)
307            est.append(kf.x)
308            var.append(kf.P)
309
310    return({'est':est, 'var':var})
311
312
313def kalmanTest(params, greater, samples):
[11]314    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]
[10]315
316    m = kfilter(params, diffs)['est'][-1]
317    if greater:
318        if m > params['threshold']:
319            return 1
320        else:
321            return 0
322    else:
323        if m < params['threshold']:
324            return 1
325        else:
326            return 0
327
328
[11]329def tsvalwmeanTest(params, greater, samples):
330    m = tsvalwmean(samples)
[10]331    if greater:
332        if m > params['threshold']:
333            return 1
334        else:
335            return 0
336    else:
337        if m < params['threshold']:
338            return 1
339        else:
340            return 0
[13]341
342
343from pykalman import KalmanFilter
344def pyKalman4DTest(params, greater, samples):
345    kp = params['kparams']
346    #kp['initial_state_mean']=[quadsummary([s['unusual_packet'] for s in samples]),
347    #                          quadsummary([s['other_packet'] for s in samples]),
348    #                          numpy.mean([s['unusual_tsval'] for s in samples]),
349    #                          numpy.mean([s['other_tsval'] for s in samples])]
350    kf = KalmanFilter(n_dim_obs=4, n_dim_state=4, **kp)
351    smooth,covariance = kf.smooth([(s['unusual_packet'],s['other_packet'],s['unusual_tsval'],s['other_tsval'])
352                                   for s in samples])
353    m = numpy.mean(smooth)
354    if greater:
355        if m > params['threshold']:
356            return 1
357        else:
358            return 0
359    else:
360        if m < params['threshold']:
361            return 1
362        else:
363            return 0
364   
Note: See TracBrowser for help on using the repository browser.