source: trunk/lib/nanownlib/stats.py @ 12

Last change on this file since 12 was 11, checked in by tim, 9 years ago

.

File size: 8.6 KB
RevLine 
[4]1
2import sys
3import os
[10]4import functools
[4]5import math
6import statistics
7import gzip
8import random
9import scipy
10import scipy.stats
11import numpy
12
# Seed numpy's PRNG from the OS CSPRNG rather than relying on numpy's
# default time-based seeding ("Don't trust numpy's seeding").
numpy.random.seed(random.SystemRandom().randint(0,2**32-1))
15
16
def mad(arr):
    """ Median Absolute Deviation: a "Robust" version of standard deviation.
        Indicates variability of the sample.
        https://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    data = numpy.ma.array(arr).compressed()  # drop masked entries, if any
    center = numpy.median(data)
    return numpy.median(numpy.abs(data - center))
25
26
def cov(x, y):
    """Population covariance of the paired samples x and y."""
    mx = statistics.mean(x)
    my = statistics.mean(y)
    # Index by x's length, as the original did (raises if y is shorter).
    return statistics.mean([(x[i] - mx) * (y[i] - my) for i in range(len(x))])
35
36
def difference(ls):
    """Return the first element of ls minus the second."""
    first, second = ls[0], ls[1]
    return first - second
39
def product(ls):
    """Return the product of the first two elements of ls."""
    first, second = ls[0], ls[1]
    return first * second
42
def hypotenuse(ls):
    """Return the Euclidean norm of the first two elements of ls."""
    return math.hypot(ls[0], ls[1])
45   
def trustValues(derived, trustFunc):
    """Score each key of `derived` by trustFunc((long, short)) and return
    the (score, key) pairs sorted ascending by score."""
    scored = [(trustFunc((v['long'], v['short'])), k)
              for k, v in derived.items()]
    return sorted(scored)
53
54
def prunedWeights(derived, trust, alpha):
    """Binary weighting: the most-trusted (1-alpha) fraction of samples get
    weight 1.0, the remaining alpha fraction get 0.0.

    `derived` is unused; kept for signature parity with the other
    weight functions."""
    cutoff = len(trust) * (1.0 - alpha)
    return {key: (1.0 if rank < cutoff else 0.0)
            for rank, (_, key) in enumerate(trust)}
66
67
def linearWeights(derived, trust, alpha):
    """Linearly decaying weights over the trust ranking.

    The line passes through (best trust score, 1 + 10*alpha) and
    (score at the 1/3 position, 1.0); weights are clamped at 0.
    `derived` is unused; kept for signature parity."""
    best_score = trust[0][0]
    best_weight = 1.0 + (alpha * 10)
    anchor_score = trust[(len(trust) - 1) // 3][0]
    anchor_weight = 1.0
    slope = (best_weight - anchor_weight) / (best_score - anchor_score)
    intercept = best_weight - slope * best_score

    return {key: max(slope * score + intercept, 0.0)
            for score, key in trust}
83
84
def invertedWeights(derived, trust, alpha):
    """Harmonic-decay weights by trust rank: weight = 10/(rank+2) - 0.2,
    clamped at 0 (zero from rank 48 onward).

    `derived` and `alpha` are unused in the current formula; they are kept
    so the signature matches the other weight functions.  An earlier
    (x+1-first_sample)^(-alpha) variant was removed as dead code."""
    weights = {}
    for rank, (_, key) in enumerate(trust):
        weights[key] = max(10.0 / (rank + 2.0) - 0.2, 0.0)
    return weights
104
105
106
def arctanWeights(derived, trust, alpha):
    """Arctan-shaped weights: near-flat for trusted samples, decaying past
    the (1-alpha) quantile of the trust scores, clamped at 0.
    `derived` is unused; kept for signature parity."""
    shift = trust[int((len(trust) - 1) * (1.0 - alpha))][0]
    minimum = trust[0][0]

    weights = {}
    for score, key in trust:
        w = math.pi / 2.0 - math.atan(2 * (score - shift) / (shift - minimum))
        weights[key] = max(w, 0.0)
    return weights
119
120
def arctanWeights2(derived, trust, alpha):
    """Variant of arctanWeights with a steeper decay (factor 3 instead of 2),
    clamped at 0.  `derived` is unused; kept for signature parity.

    The original also computed a `stretch` value (distance from the minimum
    to the median trust score) that was never used; it has been removed."""
    shift = trust[int((len(trust) - 1) * (1.0 - alpha))][0]
    minimum = trust[0][0]

    weights = {}
    for score, key in trust:
        w = math.pi / 2.0 - math.atan(3 * (score - shift) / (shift - minimum))
        weights[key] = max(w, 0.0)
    return weights
134
135
def midsummary(values, distance=25):
    """Midsummary: mean of the two percentiles symmetric about the median
    (the midhinge when distance=25)."""
    lo, hi = numpy.percentile(values, (50 - distance, 50 + distance))
    return (lo + hi) / 2.0
[4]140
def trimean(values, distance=25):
    """Trimean: average of the midsummary and the median."""
    mid = midsummary(values, distance)
    med = statistics.median(values)
    return (mid + med) / 2
[4]143
def ubersummary(values, distance=25):
    """Weighted average of six percentiles around the median: the two at
    50±distance count four times each, the inner (50±distance/2) and outer
    ((0+left)/2, (right+100)/2) pairs once each, for a divisor of 12."""
    left = 50 - distance
    right = 50 + distance
    points = (left / 2.0,             # outer left
              left,                   # left
              50 - (distance / 2.0),  # inner left
              50 + (distance / 2.0),  # inner right
              right,                  # right
              (right + 100) / 2.0)    # outer right
    l1, l2, l3, r3, r2, r1 = numpy.percentile(values, points)
    return (l1 + l2 * 4 + l3 + r3 + r2 * 4 + r1) / 12.0
155
[11]156   
def quadsummary(values, distance=25):
    """Mean of four percentiles symmetric about the median: 50±distance and
    the midpoints between those and the median."""
    outer_lo = 50 - distance
    outer_hi = 50 + distance
    inner_lo = (outer_lo + 50) / 2.0
    inner_hi = (outer_hi + 50) / 2.0
    q1, q2, q3, q4 = numpy.percentile(values,
                                      (outer_lo, inner_lo, inner_hi, outer_hi))
    return (q1 + q2 + q3 + q4) / 4.0
167
[11]168
def tsvalwmean(subseries):
    """Weighted mean of the TCP-timestamp differences, where each sample is
    weighted by the square of its total packet count (normalized so the
    weights average to 1)."""
    wts = [(s['unusual_packet'] + s['other_packet']) ** 2 for s in subseries]
    normalizer = sum(wts) / len(wts)
    scaled = [w * (s['unusual_tsval'] - s['other_tsval']) / normalizer
              for w, s in zip(wts, subseries)]
    return numpy.mean(scaled)
174
175#def tsvalwmean(subseries):
176#    return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries])
177
178
def weightedMean(derived, weights):
    """Weighted mean of the long-short differences in `derived`, with the
    weights normalized so they average to 1."""
    normalizer = sum(weights.values()) / len(weights)
    terms = [w * (derived[k]['long'] - derived[k]['short']) / normalizer
             for k, w in weights.items()]
    return statistics.mean(terms)
182
def weightedMeanTsval(derived, weights):
    """Weighted mean of the long/short TCP-timestamp differences in
    `derived`, with the weights normalized so they average to 1."""
    normalizer = sum(weights.values()) / len(weights)
    terms = [w * (derived[k]['long_tsval'] - derived[k]['short_tsval']) / normalizer
             for k, w in weights.items()]
    return statistics.mean(terms)
186
187
[11]188   
189
def estimateMean(trustFunc, weightFunc, alpha, derived):
    """Rank the samples by trustFunc, weight them via weightFunc(alpha),
    and return the resulting weighted mean of the long-short differences."""
    ranked = trustValues(derived, trustFunc)
    return weightedMean(derived, weightFunc(derived, ranked, alpha))
194
195
def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
    """Same pipeline as estimateMean, but over the TCP-timestamp
    (tsval) differences."""
    ranked = trustValues(derived, trustFunc)
    return weightedMeanTsval(derived, weightFunc(derived, ranked, alpha))
200
201
def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
    """Draw num_trials subseries from `db` and apply `estimator` to each,
    returning the list of estimates."""
    return [estimator(db.subseries(probe_type, unusual_case, subseries_size))
            for _ in range(num_trials)]
208
209
def multiBoxTest(params, greater, samples):
    """Box-overlap test between the unusual-case and other-case packet times.

    Compares the [params['low'], params['high']] percentile boxes of the two
    populations.  Returns 1 if unusual_case is unusual in the expected
    direction (per `greater`), 0 if the boxes overlap (not unusual), and
    -1 if it is unusual in the wrong direction."""
    unusual = [s['unusual_packet'] for s in samples]
    others = [s['other_packet'] for s in samples]

    bounds = (params['high'], params['low'])
    u_hi, u_lo = numpy.percentile(unusual, bounds)
    o_hi, o_lo = numpy.percentile(others, bounds)

    if u_hi < o_lo:
        # Unusual case sits entirely below the rest.
        return -1 if greater else 1
    if o_hi < u_lo:
        # Unusual case sits entirely above the rest.
        return 1 if greater else -1
    return 0
232
233
def summaryTest(f, params, greater, samples):
    """Threshold test on a summary statistic of the per-sample differences.

    `f` is a location estimator called as f(diffs, params['distance']).
    Returns 1 if the statistic falls on the expected side of
    params['threshold'] (above when `greater`, below otherwise),
    0 otherwise."""
    diffs = [s['unusual_packet'] - s['other_packet'] for s in samples]

    stat = f(diffs, params['distance'])
    if greater:
        return 1 if stat > params['threshold'] else 0
    return 1 if stat < params['threshold'] else 0
250
[11]251
# Concrete summaryTest variants: each binds one location estimator
# (midsummary / trimean / ubersummary / quadsummary) as summaryTest's `f`.
midsummaryTest = functools.partial(summaryTest, midsummary)
trimeanTest = functools.partial(summaryTest, trimean)
ubersummaryTest = functools.partial(summaryTest, ubersummary)
quadsummaryTest = functools.partial(summaryTest, quadsummary)
[4]256
def rmse(expected, measurements):
    """Root-mean-square error of `measurements` against a single
    expected value."""
    mse = sum((expected - m) ** 2 for m in measurements) / len(measurements)
    return math.sqrt(mse)
260
def nrmse(expected, measurements):
    """RMSE normalized by the observed range of the measurements."""
    spread = max(measurements) - min(measurements)
    return rmse(expected, measurements) / spread
[10]263
264
class KalmanFilter1D:
    """Minimal scalar Kalman filter.

    x0 -- initial state estimate
    P  -- initial estimate variance
    R  -- measurement (sensor) noise variance
    Q  -- process (movement) noise variance
    """

    def __init__(self, x0, P, R, Q):
        self.x = x0  # current state estimate
        self.P = P   # current estimate variance
        self.R = R   # measurement noise variance
        self.Q = Q   # process noise variance

    def update(self, z):
        # Blend measurement z with the current estimate, weighted by
        # their variances, then shrink the estimate variance.
        self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
        self.P = 1. / (1./self.P + 1./self.R)

    def predict(self, u=0.0):
        # Apply known movement u and grow uncertainty by process noise.
        self.x += u
        self.P += self.Q
279
280
def kfilter(params, observations):
    """Run a 1-D Kalman filter over `observations` and return the per-step
    state estimates and variances as {'est': [...], 'var': [...]}.

    `params` is currently unused (kept for signature parity with the other
    test helpers).  The filter is seeded with the quadsummary of the data,
    a fixed initial variance, and the sample standard deviation as the
    sensor noise."""
    x = numpy.array(observations)
    movement = 0
    est = []
    var = []
    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
                        P  = 10,             # initial variance
                        R  = numpy.std(x),   # sensor noise
                        Q  = 0)              # movement noise
    # Single smoothing pass; loop variable renamed from `round`, which
    # shadowed the builtin.
    for _ in range(1):
        for d in x:
            kf.predict(movement)
            kf.update(d)
            est.append(kf.x)
            var.append(kf.P)

    return {'est': est, 'var': var}
299
300
def kalmanTest(params, greater, samples):
    """Threshold test on the final Kalman-filter estimate of the
    per-sample packet-time differences.

    Returns 1 if the estimate falls on the expected side of
    params['threshold'] (above when `greater`, below otherwise), else 0."""
    diffs = [s['unusual_packet'] - s['other_packet'] for s in samples]

    estimate = kfilter(params, diffs)['est'][-1]
    if greater:
        return 1 if estimate > params['threshold'] else 0
    return 1 if estimate < params['threshold'] else 0
315
316
def tsvalwmeanTest(params, greater, samples):
    """Threshold test on the weighted-mean TCP-timestamp difference.

    Returns 1 if the statistic falls on the expected side of
    params['threshold'] (above when `greater`, below otherwise), else 0."""
    stat = tsvalwmean(samples)
    if greater:
        return 1 if stat > params['threshold'] else 0
    return 1 if stat < params['threshold'] else 0
Note: See TracBrowser for help on using the repository browser.