# source: trunk/lib/nanownlib/stats.py @ 16

import sys
import os
import functools
import math
import statistics
import gzip
import random
import numpy

# Don't trust numpy's seeding
numpy.random.seed(random.SystemRandom().randint(0,2**32-1))

def mad(arr):
    """ Median Absolute Deviation: a "Robust" version of standard deviation.
        Indicates the variability of the sample.
        https://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
    med = numpy.median(arr)
    return numpy.median(numpy.abs(arr - med))

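# A small illustration of mad() with made-up numbers (not project data):
# the median of [1, 1, 2, 2, 4, 6, 9] is 2, the absolute deviations from it
# are [1, 1, 0, 0, 2, 4, 7], and their median is 1, so:
#
#   >>> mad([1, 1, 2, 2, 4, 6, 9])
#   1.0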

def cov(x,y):
    mx = statistics.mean(x)
    my = statistics.mean(y)
    products = []
    for i in range(0,len(x)):
        products.append((x[i] - mx)*(y[i] - my))

    return statistics.mean(products)

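# cov() is a population covariance (mean of the products of deviations).
# Hedged example with hypothetical values: for x = [1, 2, 3] and y = [2, 4, 6]
# the deviation products are [2, 0, 2], so cov(x, y) evaluates to 4/3 (~1.333).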

def difference(ls):
    return ls[0]-ls[1]

def product(ls):
    return ls[0]*ls[1]

def hypotenuse(ls):
    return math.hypot(ls[0],ls[1])

def trustValues(derived, trustFunc):
    ret_val = []
    for k,v in derived.items():
        ret_val.append((trustFunc((v['long'],v['short'])), k))

    ret_val.sort()
    return ret_val

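# trustValues() returns a list of (trust_score, key) pairs sorted by score.
# A hedged sketch with hypothetical inputs, using difference() as the trust
# function (assuming 'derived' maps keys to {'long': ..., 'short': ...} dicts):
#
#   derived = {0: {'long': 12, 'short': 10}, 1: {'long': 15, 'short': 9}}
#   trustValues(derived, difference)   # -> [(2, 0), (6, 1)]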

def prunedWeights(derived, trust, alpha):
    weights = {}

    threshold = len(trust)*(1.0-alpha)
    for i in range(0,len(trust)):
        if i < threshold:
            weights[trust[i][1]] = 1.0
        else:
            weights[trust[i][1]] = 0.0

    return weights

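# prunedWeights() gives weight 1.0 to the first (1-alpha) fraction of the
# sorted trust list and 0.0 to the rest, i.e. it prunes the entries with the
# largest trust scores.  Hedged example with made-up scores:
#
#   trust = [(2,'a'), (5,'b'), (9,'c'), (14,'d')]
#   prunedWeights({}, trust, 0.25)
#   # -> {'a': 1.0, 'b': 1.0, 'c': 1.0, 'd': 0.0}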

def linearWeights(derived, trust, alpha):
    x1 = trust[0][0]
    y1 = 1.0 + (alpha*10)
    x2 = trust[(len(trust)-1)//3][0]
    y2 = 1.0
    m = (y1-y2)/(x1-x2)
    b = y1 - m*x1

    weights = {}
    for t,k in trust:
        weights[k] = m*t+b
        if weights[k] < 0.0:
            weights[k] = 0.0

    return weights


def invertedWeights(derived,trust,alpha):
    # (x+1-first_sample)^(-alpha)
    #scale = trust[0][0]

    #weights = {}
    #for t,k in trust:
    #    weights[k] = (t+1-scale)**(-1.0*alpha)
    #    if weights[k] < 0.0:
    #        weights[k] = 0.0

    # NOTE: unlike the commented-out version above, the active implementation
    # below depends only on the rank i of each entry and ignores alpha.
    weights = {}
    for i in range(len(trust)):
        w = 10.0/(i+2.0)-0.2
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights


def arctanWeights(derived,trust,alpha):
    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
    minimum = trust[0][0]

    weights = {}
    for i in range(len(trust)):
        w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum))
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights


def arctanWeights2(derived,trust,alpha):
    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
    minimum = trust[0][0]
    stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median; currently unused below

    weights = {}
    for i in range(len(trust)):
        w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum))
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights

def midsummary(values, distance=25):
    #return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
    l,h = numpy.percentile(values, (50-distance,50+distance))
    return (l+h)/2.0

def trimean(values, distance=25):
    return (midsummary(values, distance) + statistics.median(values))/2

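# Hedged illustration with hypothetical, deliberately skewed data: for
# values = [1, 2, 3, 4, 100], numpy's (linearly interpolated) 25th and 75th
# percentiles are 2 and 4, so midsummary() returns 3.0 and trimean() also
# returns 3.0, while the plain mean would be 22; the outlier is largely ignored.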
def ubersummary(values, distance=25):
    left2 = 50-distance
    left3 = 50-(distance/2.0)
    left1 = left2/2.0
    right2 = 50+distance
    right3 = 50+(distance/2.0)
    right1 = (right2+100)/2.0
    l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
    #print(l1,l2,l3,m,r3,r2,r1)
    return (l1+l2*4+l3+r3+r2*4+r1)/12.0
    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))


def quadsummary(values, distance=25):
    left1 = 50-distance
    left2 = (left1+50)/2.0
    right1 = 50+distance
    right2 = (right1+50)/2.0
    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
    #print(left1,left2,left3,50,right3,right2,right1)
    #print(l1,l2,l3,m,r3,r2,r1)
    return (l1+l2+r2+r1)/4.0
    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))


def septasummary(values, distance=25):
    left2 = 50-distance
    left3 = 50-(distance/2.0)
    left1 = left2/2.0
    right2 = 50+distance
    right3 = 50+(distance/2.0)
    right1 = (right2+100)/2.0
    l1,l2,l3,m,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,50,right3,right2,right1))
    return (l1+l2+l3+m+r3+r2+r1)/7.0

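# ubersummary(), quadsummary(), and septasummary() are all robust location
# estimates built from a handful of percentiles.  A hedged example for
# quadsummary() with the same hypothetical data as above: for
# values = [1, 2, 3, 4, 100] and distance=25 it averages the 25th, 37.5th,
# 62.5th and 75th percentiles (2, 2.5, 3.5, 4), giving 3.0.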

def tsvalwmean(subseries):
    weights = [(s['unusual_packet']+s['other_packet'])**2 for s in subseries]
    normalizer = sum(weights)/len(weights)
    return numpy.mean([weights[i]*(subseries[i]['unusual_tsval']-subseries[i]['other_tsval'])/normalizer
                       for i in range(len(weights))])

#def tsvalwmean(subseries):
#    return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries])


def weightedMean(derived, weights):
    normalizer = sum(weights.values())/len(weights)
    return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()])

def weightedMeanTsval(derived, weights):
    normalizer = sum(weights.values())/len(weights)
    return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])


def estimateMean(trustFunc, weightFunc, alpha, derived):
    trust = trustValues(derived, trustFunc)
    weights = weightFunc(derived, trust, alpha)
    return weightedMean(derived, weights)


def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
    trust = trustValues(derived, trustFunc)
    weights = weightFunc(derived, trust, alpha)
    return weightedMeanTsval(derived, weights)

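# A hedged end-to-end sketch of the estimation pipeline, using hypothetical
# data rather than anything produced by this project:
#
#   derived = {0: {'long': 12.0, 'short': 10.0},
#              1: {'long': 15.0, 'short':  9.0},
#              2: {'long': 11.0, 'short': 10.5}}
#   estimateMean(difference, prunedWeights, 0.5, derived)   # -> 1.25
#
# trustValues() scores each key with difference(), prunedWeights() zeroes
# roughly the top alpha fraction of trust scores (here the single largest),
# and weightedMean() averages the weighted long-short differences.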

def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
    ret_val = []
    for t in range(num_trials):
        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))

    return ret_val


# Returns 1 if unusual_case is unusual in the expected direction
#         0 if it isn't unusual
#        -1 if it is unusual in the wrong direction
def multiBoxTest(params, greater, samples):
    uc = [s['unusual_packet'] for s in samples]
    rest = [s['other_packet'] for s in samples]

    uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
    rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low']))
    if uc_high < rest_low:
        if greater:
            return -1
        else:
            return 1

    if rest_high < uc_low:
        if greater:
            return 1
        else:
            return -1

    return 0

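# Hedged reading of multiBoxTest(): with hypothetical params
# {'high': 90, 'low': 10}, the two distributions are declared distinct only
# when their 10th-90th percentile boxes do not overlap at all, and the sign of
# the return value says whether the separation matches the expected direction.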

# Returns 1 if unusual_case is unusual in the expected direction
#         0 otherwise
def summaryTest(f, params, greater, samples):
    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]

    mh = f(diffs, params['distance'])
    #print("estimate:", mh)
    if greater:
        if mh > params['threshold']:
            return 1
        else:
            return 0
    else:
        if mh < params['threshold']:
            return 1
        else:
            return 0


midsummaryTest = functools.partial(summaryTest, midsummary)
trimeanTest = functools.partial(summaryTest, trimean)
ubersummaryTest = functools.partial(summaryTest, ubersummary)
quadsummaryTest = functools.partial(summaryTest, quadsummary)
septasummaryTest = functools.partial(summaryTest, septasummary)

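# Hedged usage sketch for the specialized tests (parameter values are
# illustrative, not tuned values from this project):
#
#   params = {'distance': 25, 'threshold': 0.0}
#   samples = [{'unusual_packet': 12, 'other_packet': 10},
#              {'unusual_packet': 11, 'other_packet': 10}]
#   midsummaryTest(params, True, samples)   # -> 1, since the midsummary of
#                                           #    the differences exceeds 0.0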
def rmse(expected, measurements):
    s = sum([(expected-m)**2 for m in measurements])/len(measurements)
    return math.sqrt(s)

def nrmse(expected, measurements):
    return rmse(expected, measurements)/(max(measurements)-min(measurements))

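# Hedged example with made-up measurements: rmse(10, [9, 11, 10]) is
# sqrt((1 + 1 + 0)/3), roughly 0.816, and nrmse() divides that by the
# measurement range (11 - 9 = 2), giving roughly 0.408.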

class KalmanFilter1D:
    def __init__(self, x0, P, R, Q):
        self.x = x0
        self.P = P
        self.R = R
        self.Q = Q

    def update(self, z):
        self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
        self.P = 1. / (1./self.P + 1./self.R)

    def predict(self, u=0.0):
        self.x += u
        self.P += self.Q

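# KalmanFilter1D is a scalar filter: x is the state estimate, P its variance,
# R the measurement noise, and Q the process (movement) noise.  A brief hedged
# example with arbitrary numbers:
#
#   kf = KalmanFilter1D(x0=0.0, P=10.0, R=1.0, Q=0.0)
#   kf.predict()      # P stays 10.0 since Q is 0
#   kf.update(5.0)    # x becomes 50/11 (close to 5.0) and P drops to 10/11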

def kfilter(params, observations):
    # NOTE: params is not currently used by this function.
    x = numpy.array(observations)
    movement = 0
    est = []
    var = []
    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
                        #P  = 10000,         # initial variance
                        P  = 10,             # initial variance
                        R  = numpy.std(x),   # sensor noise
                        Q  = 0)              # movement noise
    # single filtering pass over the observations
    for _ in range(1):
        for d in x:
            kf.predict(movement)
            kf.update(d)
            est.append(kf.x)
            var.append(kf.P)

    return({'est':est, 'var':var})

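# Hedged usage sketch (hypothetical difference samples):
#
#   result = kfilter({}, [10.2, 9.8, 10.5, 10.1])
#   result['est'][-1]   # final filtered estimate, as used by kalmanTest() below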

def kalmanTest(params, greater, samples):
    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]

    m = kfilter(params, diffs)['est'][-1]
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0


def tsvalwmeanTest(params, greater, samples):
    m = tsvalwmean(samples)
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0


from pykalman import KalmanFilter
def pyKalman4DTest(params, greater, samples):
    kp = params['kparams']
    #kp['initial_state_mean']=[quadsummary([s['unusual_packet'] for s in samples]),
    #                          quadsummary([s['other_packet'] for s in samples]),
    #                          numpy.mean([s['unusual_tsval'] for s in samples]),
    #                          numpy.mean([s['other_tsval'] for s in samples])]
    kf = KalmanFilter(n_dim_obs=4, n_dim_state=4, **kp)
    smooth,covariance = kf.smooth([(s['unusual_packet'],s['other_packet'],s['unusual_tsval'],s['other_tsval'])
                                   for s in samples])
    m = numpy.mean(smooth)
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0