source: trunk/lib/nanownlib/stats.py @ 20

Last change on this file since 20 was 20, checked in by tim, 9 years ago

major code refactoring, better organizing location of library functions

File size: 10.6 KB
Line 
1
2import sys
3import os
4import functools
5import math
6import statistics
7import gzip
8import random
9try:
10    import numpy
11except:
12    sys.stderr.write('ERROR: Could not import numpy module.  Ensure it is installed.\n')
13    sys.stderr.write('       Under Debian, the package name is "python3-numpy"\n.')
14    sys.exit(1)
15
16# Don't trust numpy's seeding
17numpy.random.seed(random.SystemRandom().randint(0,2**32-1))
18
19
20def mad(arr):
21    """ Median Absolute Deviation: a "Robust" version of standard deviation.
22        Indices variabililty of the sample.
23        https://en.wikipedia.org/wiki/Median_absolute_deviation
24    """
25    arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
26    med = numpy.median(arr)
27    return numpy.median(numpy.abs(arr - med))
28
29
30def cov(x,y):
31    mx = statistics.mean(x)
32    my = statistics.mean(y)
33    products = []
34    for i in range(0,len(x)):
35        products.append((x[i] - mx)*(y[i] - my))
36
37    return statistics.mean(products)
38
39   
40def OLSRegression(x,y):
41    #print(x,y)
42    x = numpy.array(x)
43    y = numpy.array(y)
44    #A = numpy.vstack([x, numpy.ones(len(x))]).T
45    #m, c = numpy.linalg.lstsq(A, y)[0] # broken
46    #c,m = numpy.polynomial.polynomial.polyfit(x, y, 1) # less accurate
47    c,m = numpy.polynomial.Polynomial.fit(x,y,1).convert().coef
48
49    #print(m,c)
50
51    #import matplotlib.pyplot as plt
52    #plt.clf()
53    #plt.scatter(x, y)
54    #plt.plot(x, m*x + c, 'r', label='Fitted line')
55    #plt.show()
56   
57    return (m,c)
58
59
60def difference(ls):
61    return ls[0]-ls[1]
62
63def product(ls):
64    return ls[0]*ls[1]
65
66def hypotenuse(ls):
67    return math.hypot(ls[0],ls[1])
68   
69def trustValues(derived, trustFunc):
70    ret_val = []
71    for k,v in derived.items():
72        ret_val.append((trustFunc((v['long'],v['short'])), k))
73
74    ret_val.sort()
75    return ret_val
76
77
78def prunedWeights(derived, trust, alpha):
79    weights = {}
80
81    threshold = len(trust)*(1.0-alpha)
82    for i in range(0,len(trust)):
83        if i < threshold:
84            weights[trust[i][1]] = 1.0
85        else:
86            weights[trust[i][1]] = 0.0
87
88    return weights
89
90
91def linearWeights(derived, trust, alpha):
92    x1 = trust[0][0]
93    y1 = 1.0 + (alpha*10)
94    x2 = trust[(len(trust)-1)//3][0]
95    y2 = 1.0
96    m = (y1-y2)/(x1-x2)
97    b = y1 - m*x1
98
99    weights = {}
100    for t,k in trust:
101        weights[k] = m*t+b
102        if weights[k] < 0.0:
103            weights[k] = 0.0
104
105    return weights
106
107
108def invertedWeights(derived,trust,alpha):
109    # (x+1-first_sample)^(-alpha)
110    #scale = trust[0][0]
111
112    #weights = {}
113    #for t,k in trust:
114    #    weights[k] = (t+1-scale)**(-1.0*alpha)
115    #    if weights[k] < 0.0:
116    #        weights[k] = 0.0
117
118    weights = {}
119    for i in range(len(trust)):
120        w = 10.0/(i+2.0)-0.2
121        if w < 0.0:
122            w = 0.0
123        weights[trust[i][1]] = w
124       
125   
126    return weights
127
128
129
130def arctanWeights(derived,trust,alpha):
131    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
132    minimum = trust[0][0]
133   
134    weights = {}
135    for i in range(len(trust)):
136        w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum))
137        if w < 0.0:
138            w = 0.0
139        weights[trust[i][1]] = w
140
141    return weights
142
143
144def arctanWeights2(derived,trust,alpha):
145    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
146    minimum = trust[0][0]
147    stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median
148   
149    weights = {}
150    for i in range(len(trust)):
151        w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum))
152        if w < 0.0:
153            w = 0.0
154        weights[trust[i][1]] = w
155
156    return weights
157
158
159def midsummary(values, distance=25):
160    #return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
161    l,h = numpy.percentile(values, (50-distance,50+distance))
162    return (l+h)/2.0
163
164def trimean(values, distance=25):
165    return (midsummary(values, distance) + statistics.median(values))/2
166
167def ubersummary(values, distance=25):
168    left2 = 50-distance
169    left3 = 50-(distance/2.0)
170    left1 = left2/2.0
171    right2 = 50+distance
172    right3 = 50+(distance/2.0)
173    right1 = (right2+100)/2.0
174    l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
175    #print(l1,l2,l3,m,r3,r2,r1)
176    return (l1+l2*4+l3+r3+r2*4+r1)/12.0
177    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
178
179   
180def quadsummary(values, distance=25):
181    left1 = 50-distance
182    left2 = (left1+50)/2.0
183    right1 = 50+distance
184    right2 = (right1+50)/2.0
185    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
186    #print(left1,left2,left3,50,right3,right2,right1)
187    #print(l1,l2,l3,m,r3,r2,r1)
188    return (l1+l2+r2+r1)/4.0
189    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
190
191   
192def septasummary(values, distance=25):
193    left2 = 50-distance
194    left3 = 50-(distance/2.0)
195    left1 = left2/2.0
196    right2 = 50+distance
197    right3 = 50+(distance/2.0)
198    right1 = (right2+100)/2.0
199    l1,l2,l3,m,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,50,right3,right2,right1))
200    return (l1+l2+l3+m+r3+r2+r1)/7.0
201
202
203def tsvalwmean(subseries):
204    weights = [(s['unusual_packet']+s['other_packet'])**2 for s in subseries]
205    normalizer = sum(weights)/len(weights)
206    return numpy.mean([weights[i]*(subseries[i]['unusual_tsval']-subseries[i]['other_tsval'])/normalizer
207                       for i in range(len(weights))])
208
209#def tsvalwmean(subseries):
210#    return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries])
211
212
213def weightedMean(derived, weights):
214    normalizer = sum(weights.values())/len(weights)
215    return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()])
216
217def weightedMeanTsval(derived, weights):
218    normalizer = sum(weights.values())/len(weights)
219    return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])
220
221
222   
223
224def estimateMean(trustFunc, weightFunc, alpha, derived):
225    trust = trustValues(derived, trustFunc)
226    weights = weightFunc(derived, trust, alpha)
227    return weightedMean(derived, weights)
228
229
230def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
231    trust = trustValues(derived, trustFunc)
232    weights = weightFunc(derived, trust, alpha)
233    return weightedMeanTsval(derived, weights)
234
235
236def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
237    ret_val = []
238    for t in range(num_trials):
239        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))
240
241    return ret_val
242
243
244# Returns 1 if unusual_case is unusual in the expected direction
245#         0 if it isn't unusual
246#        -1 if it is unusual in the wrong direction
247def multiBoxTest(params, greater, samples):
248    uc = [s['unusual_packet'] for s in samples]
249    rest = [s['other_packet'] for s in samples]
250   
251    uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
252    rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low']))
253    if uc_high < rest_low:
254        if greater:
255            return -1
256        else:
257            return 1
258
259    if rest_high < uc_low:
260        if greater:
261            return 1
262        else:
263            return -1
264       
265    return 0
266
267
268# Returns 1 if unusual_case is unusual in the expected direction
269#         0 otherwise
270def summaryTest(f, params, greater, samples):
271    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]
272
273    mh = f(diffs, params['distance'])
274    #print("estimate:", mh)
275    if greater:
276        if mh > params['threshold']:
277            return 1
278        else:
279            return 0
280    else:
281        if mh < params['threshold']:
282            return 1
283        else:
284            return 0
285
286
287midsummaryTest = functools.partial(summaryTest, midsummary)
288trimeanTest = functools.partial(summaryTest, trimean)
289ubersummaryTest = functools.partial(summaryTest, ubersummary)
290quadsummaryTest = functools.partial(summaryTest, quadsummary)
291septasummaryTest = functools.partial(summaryTest, septasummary)
292
293def rmse(expected, measurements):
294    s = sum([(expected-m)**2 for m in measurements])/len(measurements)
295    return math.sqrt(s)
296
297def nrmse(expected, measurements):
298    return rmse(expected, measurements)/(max(measurements)-min(measurements))
299
300
301class KalmanFilter1D:
302    def __init__(self, x0, P, R, Q):
303        self.x = x0
304        self.P = P
305        self.R = R
306        self.Q = Q
307
308    def update(self, z):
309        self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
310        self.P = 1. / (1./self.P + 1./self.R)
311
312    def predict(self, u=0.0):
313        self.x += u
314        self.P += self.Q
315
316
317def kfilter(params, observations):
318    x = numpy.array(observations)   
319    movement = 0
320    est = []
321    var = []
322    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
323                        #P  = 10000,         # initial variance
324                        P  = 10,            # initial variance
325                        R  = numpy.std(x),   # msensor noise
326                        Q  = 0)              # movement noise
327    for round in range(1):
328        for d in x:
329            kf.predict(movement)
330            kf.update(d)
331            est.append(kf.x)
332            var.append(kf.P)
333
334    return({'est':est, 'var':var})
335
336
337def kalmanTest(params, greater, samples):
338    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]
339
340    m = kfilter(params, diffs)['est'][-1]
341    if greater:
342        if m > params['threshold']:
343            return 1
344        else:
345            return 0
346    else:
347        if m < params['threshold']:
348            return 1
349        else:
350            return 0
351
352
353def tsvalwmeanTest(params, greater, samples):
354    m = tsvalwmean(samples)
355    if greater:
356        if m > params['threshold']:
357            return 1
358        else:
359            return 0
360    else:
361        if m < params['threshold']:
362            return 1
363        else:
364            return 0
365
366
367from pykalman import KalmanFilter
368def pyKalman4DTest(params, greater, samples):
369    kp = params['kparams']
370    #kp['initial_state_mean']=[quadsummary([s['unusual_packet'] for s in samples]),
371    #                          quadsummary([s['other_packet'] for s in samples]),
372    #                          numpy.mean([s['unusual_tsval'] for s in samples]),
373    #                          numpy.mean([s['other_tsval'] for s in samples])]
374    kf = KalmanFilter(n_dim_obs=4, n_dim_state=4, **kp)
375    smooth,covariance = kf.smooth([(s['unusual_packet'],s['other_packet'],s['unusual_tsval'],s['other_tsval'])
376                                   for s in samples])
377    m = numpy.mean(smooth)
378    if greater:
379        if m > params['threshold']:
380            return 1
381        else:
382            return 0
383    else:
384        if m < params['threshold']:
385            return 1
386        else:
387            return 0
388   
Note: See TracBrowser for help on using the repository browser.