source: trunk/lib/nanownlib/stats.py @ 10

Last change on this file since 10 was 10, checked in by tim, 9 years ago

.

File size: 15.0 KB
Line 
1
2import sys
3import os
4import functools
5import math
6import statistics
7import gzip
8import random
9import scipy
10import scipy.stats
11import numpy
12
13# Don't trust numpy's seeding
14numpy.random.seed(random.SystemRandom().randint(0,2**32-1))
15
16
17def mad(arr):
18    """ Median Absolute Deviation: a "Robust" version of standard deviation.
19        Indices variabililty of the sample.
20        https://en.wikipedia.org/wiki/Median_absolute_deviation
21    """
22    arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
23    med = numpy.median(arr)
24    return numpy.median(numpy.abs(arr - med))
25
26
27def cov(x,y):
28    mx = statistics.mean(x)
29    my = statistics.mean(y)
30    products = []
31    for i in range(0,len(x)):
32        products.append((x[i] - mx)*(y[i] - my))
33
34    return statistics.mean(products)
35
36
37def difference(ls):
38    return ls[0]-ls[1]
39
40def product(ls):
41    return ls[0]*ls[1]
42
43def hypotenuse(ls):
44    return math.hypot(ls[0],ls[1])
45   
46def trustValues(derived, trustFunc):
47    ret_val = []
48    for k,v in derived.items():
49        ret_val.append((trustFunc((v['long'],v['short'])), k))
50
51    ret_val.sort()
52    return ret_val
53
54
55def prunedWeights(derived, trust, alpha):
56    weights = {}
57
58    threshold = len(trust)*(1.0-alpha)
59    for i in range(0,len(trust)):
60        if i < threshold:
61            weights[trust[i][1]] = 1.0
62        else:
63            weights[trust[i][1]] = 0.0
64
65    return weights
66
67
68def linearWeights(derived, trust, alpha):
69    x1 = trust[0][0]
70    y1 = 1.0 + (alpha*10)
71    x2 = trust[(len(trust)-1)//3][0]
72    y2 = 1.0
73    m = (y1-y2)/(x1-x2)
74    b = y1 - m*x1
75
76    weights = {}
77    for t,k in trust:
78        weights[k] = m*t+b
79        if weights[k] < 0.0:
80            weights[k] = 0.0
81
82    return weights
83
84
85def invertedWeights(derived,trust,alpha):
86    # (x+1-first_sample)^(-alpha)
87    #scale = trust[0][0]
88
89    #weights = {}
90    #for t,k in trust:
91    #    weights[k] = (t+1-scale)**(-1.0*alpha)
92    #    if weights[k] < 0.0:
93    #        weights[k] = 0.0
94
95    weights = {}
96    for i in range(len(trust)):
97        w = 10.0/(i+2.0)-0.2
98        if w < 0.0:
99            w = 0.0
100        weights[trust[i][1]] = w
101       
102   
103    return weights
104
105
106
107def arctanWeights(derived,trust,alpha):
108    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
109    minimum = trust[0][0]
110   
111    weights = {}
112    for i in range(len(trust)):
113        w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum))
114        if w < 0.0:
115            w = 0.0
116        weights[trust[i][1]] = w
117
118    return weights
119
120
121def arctanWeights2(derived,trust,alpha):
122    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
123    minimum = trust[0][0]
124    stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median
125   
126    weights = {}
127    for i in range(len(trust)):
128        w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum))
129        if w < 0.0:
130            w = 0.0
131        weights[trust[i][1]] = w
132
133    return weights
134
135
136def midsummary(values, distance=25):
137    #return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
138    l,h = numpy.percentile(values, (50-distance,50+distance))
139    return (l+h)/2.0
140
141def trimean(values, distance=25):
142    return (midsummary(values, distance) + statistics.median(values))/2
143
144def ubersummary(values, distance=25):
145    left2 = 50-distance
146    left1 = left2/2.0
147    left3 = (left2+50)/2.0
148    right2 = 50+distance
149    right3 = (right2+50)/2.0
150    right1 = (right2+100)/2.0
151    l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
152    #print(left1,left2,left3,50,right3,right2,right1)
153    #print(l1,l2,l3,m,r3,r2,r1)
154    return (l1+l2*4+l3+r3+r2*4+r1)/12.0
155    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
156
157def quadsummary(values, distance=25):
158    left2 = 50-distance
159    left1 = left2/2.0
160    right2 = 50+distance
161    right1 = (right2+100)/2.0
162    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
163    #print(left1,left2,left3,50,right3,right2,right1)
164    #print(l1,l2,l3,m,r3,r2,r1)
165    return (l1+l2+r2+r1)/4.0
166    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
167
168def quadsummary(values, distance=25):
169    left1 = 50-distance
170    left2 = (left1+50)/2.0
171    right1 = 50+distance
172    right2 = (right1+50)/2.0
173    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
174    #print(left1,left2,left3,50,right3,right2,right1)
175    #print(l1,l2,l3,m,r3,r2,r1)
176    return (l1+l2+r2+r1)/4.0
177    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
178
179   
180def weightedMean(derived, weights):
181    normalizer = sum(weights.values())/len(weights)
182    return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()])
183
184def weightedMeanTsval(derived, weights):
185    normalizer = sum(weights.values())/len(weights)
186    return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])
187
188
189def estimateMean(trustFunc, weightFunc, alpha, derived):
190    trust = trustValues(derived, trustFunc)
191    weights = weightFunc(derived, trust, alpha)
192    return weightedMean(derived, weights)
193
194
195def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
196    trust = trustValues(derived, trustFunc)
197    weights = weightFunc(derived, trust, alpha)
198    return weightedMeanTsval(derived, weights)
199
200
201#def estimateMedian(trustFunc, weightFunc, alpha, derived):
202#    trust = trustValues(derived, trustFunc)
203#    weights = weightFunc(derived, trust, alpha)
204
205#    return statistics.median([(derived[k]['long']-derived[k]['short']) for k,w in weights.items() if w > 0.0])
206
207def estimateMedian(derived):
208    return statistics.median([(d['long']-d['short']) for d in derived.values()])
209
210
211def estimateMidsummary(derived):
212    return midsummary([(d['long']-d['short']) for d in derived.values()])
213
214
215def estimateTrimean(derived):
216    return trimean([(d['long']-d['short']) for d in derived.values()])
217
218
219def tTest(expected_mean, derived):
220    diffs = [(d['long']-d['short']) for d in derived.values()]
221    null_tval, null_pval = scipy.stats.ttest_1samp(diffs, 0.0)
222    tval, pval = scipy.stats.ttest_1samp(diffs, expected_mean)
223
224    if pval < null_pval:
225        return 1
226    else:
227        return 0
228
229   
230def diffMedian(derived):
231    l = [tc['long']-tc['short'] for s,tc in derived.items()]
232    return statistics.median(l)
233
234
235def subsample_ids(db, probe_type, subsample_size=None):
236    cursor = db.conn.cursor()
237    cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,))
238    population_size = cursor.fetchone()[0]
239    #print("population_size:", population_size)
240    if subsample_size == None or subsample_size > population_size:
241        subsample_size = population_size
242   
243    start = numpy.random.random_integers(0,population_size-1)
244    cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ? OFFSET ?", (probe_type,subsample_size,start))
245    for row in cursor:
246        subsample_size -= 1
247        yield row['sample']
248
249    if subsample_size > 0:
250        cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ?", (probe_type,subsample_size))
251        for row in cursor:
252            yield row['sample']
253   
254
255def subsample(db, probe_type, subsample_size=None):
256    cursor = db.conn.cursor()
257    cursor.execute("SELECT count(test_case) FROM (SELECT test_case FROM probes GROUP BY test_case)")
258    num_test_cases = cursor.fetchone()[0]
259   
260    for sid in subsample_ids(db, probe_type, subsample_size):
261        cursor.execute("SELECT test_case,tc_order,time_of_day,reported,userspace_rtt,suspect,packet_rtt,tsval_rtt FROM probes p,analysis a WHERE p.sample=? and a.probe_id=p.id", (sid,))
262        probes = cursor.fetchall()
263        if len(probes) != num_test_cases:
264            sys.stderr.write("WARN: sample %d had %d probes, but %d expected!  Discarding...\n" % (sid, len(probes), num_test_cases))
265            continue
266        yield (sid,[dict(r) for r in probes])
267
268
269def subseries(db, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
270    population_size = db.populationSize(probe_type)
271
272    if size == None or size > population_size:
273        size = population_size
274    if offset == None or offset >= population_size or offset < 0:
275        offset = numpy.random.random_integers(0,population_size-1)
276
277    query="""
278      SELECT %(field)s AS unusual_case,
279             (SELECT avg(%(field)s) FROM probes,analysis
280              WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
281      FROM   (SELECT probes.sample,%(field)s FROM probes,analysis
282              WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u
283      LIMIT :size OFFSET :offset
284    """ % {"field":field}
285   
286    params = {"probe_type":probe_type, "unusual_case":unusual_case, "offset":offset, "size":size}
287    cursor = db.conn.cursor()
288    cursor.execute(query, params)
289    ret_val = [dict(row) for row in cursor.fetchall()]
290    #for row in cursor:
291    #    size -= 1
292    #    yield dict(row)
293
294    size -= len(ret_val)
295    if size > 0:
296        params['offset'] = 0
297        params['size'] = size
298        cursor.execute(query, params)
299        ret_val += [dict(row) for row in cursor.fetchall()]
300        #for row in cursor:
301        #    yield dict(row)
302   
303    return ret_val
304
305
306# if test_cases=None, include all of them.  Otherwise, include only the specified test cases.
307def samples2Distributions(samples, field, test_cases=None):
308    ret_val = {}
309   
310    for sid,probes in samples:
311        for p in probes:
312            if p['test_case'] in ret_val:
313                ret_val[p['test_case']].append(p[field])
314            elif test_cases == None or p['test_case'] in test_cases:
315                ret_val[p['test_case']] = [p[field]]
316               
317               
318    return ret_val
319
320
321def samples2MeanDiffs(samples, field, unusual_case):
322    ret_val = {}
323   
324    for sid,probes in samples:
325        unusual_value = None
326        for p in probes:
327            if p['test_case'] == unusual_case:
328                unusual_value = p[field]
329                break
330        yield statistics.mean([unusual_value-p[field] for p in probes if p['test_case'] != unusual_case])
331
332
333def bootstrap(estimator, db, probe_type, test_cases, subsample_size, num_trials):
334    ret_val = []
335    for t in range(num_trials):
336        ret_val.append(estimator(test_cases, subsample(db, probe_type, subsample_size)))
337
338    return ret_val
339
340
341def bootstrap2(estimator, db, probe_type, subsample_size, num_trials):
342    ret_val = []
343    for t in range(num_trials):
344        ret_val.append(estimator(subsample(db, probe_type, subsample_size)))
345
346    return ret_val
347
348
349def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
350    ret_val = []
351    for t in range(num_trials):
352        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))
353
354    return ret_val
355
356
357# Returns the test case name that clearly has higher RTT; otherwise, returns None
358def boxTest(params, test_cases, samples):
359    if len(test_cases) != 2:
360        # XXX: somehow generalize the box test to handle more than 2 cases
361        raise Exception()
362    dists = samples2Distributions(samples,'packet_rtt', test_cases) #XXX: field from params
363
364    tmp1,tmp2 = dists.items()
365    test_case1,dist1 = tmp1
366    test_case2,dist2 = tmp2
367   
368    d1_high = numpy.percentile(dist1, params['high'])
369    d2_low = numpy.percentile(dist2, params['low'])
370    if d1_high < d2_low:
371        return test_case2
372
373    d1_low = numpy.percentile(dist1, params['low'])
374    d2_high = numpy.percentile(dist2, params['high'])
375
376    if d2_high < d1_low:
377        return test_case1
378   
379    return None
380
381
382# Returns 1 if unusual_case is unusual in the expected direction
383#         0 if it isn't unusual
384#        -1 if it is unusual in the wrong direction
385def multiBoxTest(params, greater, samples):
386    uc = [s['unusual_case'] for s in samples]
387    rest = [s['other_cases'] for s in samples]
388   
389    uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
390    rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low']))
391    if uc_high < rest_low:
392        if greater:
393            return -1
394        else:
395            return 1
396
397    if rest_high < uc_low:
398        if greater:
399            return 1
400        else:
401            return -1
402       
403    return 0
404
405
406# Returns 1 if unusual_case is unusual in the expected direction
407#         0 otherwise
408def summaryTest(f, params, greater, samples):
409    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
410
411    mh = f(diffs, params['distance'])
412    if greater:
413        if mh > params['threshold']:
414            return 1
415        else:
416            return 0
417    else:
418        if mh < params['threshold']:
419            return 1
420        else:
421            return 0
422
423midsummaryTest = functools.partial(summaryTest, midsummary)
424trimeanTest = functools.partial(summaryTest, trimean)
425ubersummaryTest = functools.partial(summaryTest, ubersummary)
426quadsummaryTest = functools.partial(summaryTest, quadsummary)
427
428def rmse(expected, measurements):
429    s = sum([(expected-m)**2 for m in measurements])/len(measurements)
430    return math.sqrt(s)
431
432def nrmse(expected, measurements):
433    return rmse(expected, measurements)/(max(measurements)-min(measurements))
434
435
436class KalmanFilter1D:
437    def __init__(self, x0, P, R, Q):
438        self.x = x0
439        self.P = P
440        self.R = R
441        self.Q = Q
442
443    def update(self, z):
444        self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
445        self.P = 1. / (1./self.P + 1./self.R)
446
447    def predict(self, u=0.0):
448        self.x += u
449        self.P += self.Q
450
451
452def kfilter(params, observations):
453    x = numpy.array(observations)
454    movement = 0
455    est = []   
456    var = []
457    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
458                        #P  = 10000,          # initial variance
459                        P  = 10,          # initial variance
460                        R  = numpy.std(x),   # msensor noise
461                        Q  = 0)              # movement noise
462    for round in range(1):
463        for d in x:
464            kf.predict(movement)
465            kf.update(d)
466            est.append(kf.x)
467            var.append(kf.P)
468
469    return({'est':est, 'var':var})
470
471
472def kalmanTest(params, greater, samples):
473    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
474
475    m = kfilter(params, diffs)['est'][-1]
476    if greater:
477        if m > params['threshold']:
478            return 1
479        else:
480            return 0
481    else:
482        if m < params['threshold']:
483            return 1
484        else:
485            return 0
486
487
488def kalmanTest2(params, greater, samples):
489    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
490
491    estimates = []
492    size = 500
493    for i in range(100):
494        off = random.randrange(0,len(diffs))
495        sub = diffs[off:size]
496        if len(sub) < size:
497            sub += diffs[0:size-len(sub)]
498        estimates.append(kfilter(params, sub)['est'][-1])
499           
500    m = quadsummary(estimates)
501    if greater:
502        if m > params['threshold']:
503            return 1
504        else:
505            return 0
506    else:
507        if m < params['threshold']:
508            return 1
509        else:
510            return 0
511
Note: See TracBrowser for help on using the repository browser.