source: trunk/lib/nanownlib/stats.py @ 8

Last change on this file since 8 was 8, checked in by tim, 9 years ago

.

File size: 11.7 KB
Line 
1
2import sys
3import os
4import math
5import statistics
6import gzip
7import random
8import scipy
9import scipy.stats
10import numpy
11
12# Don't trust numpy's seeding
13numpy.random.seed(random.SystemRandom().randint(0,2**32-1))
14
15
16def mad(arr):
17    """ Median Absolute Deviation: a "Robust" version of standard deviation.
18        Indices variabililty of the sample.
19        https://en.wikipedia.org/wiki/Median_absolute_deviation
20    """
21    arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
22    med = numpy.median(arr)
23    return numpy.median(numpy.abs(arr - med))
24
25
26def cov(x,y):
27    mx = statistics.mean(x)
28    my = statistics.mean(y)
29    products = []
30    for i in range(0,len(x)):
31        products.append((x[i] - mx)*(y[i] - my))
32
33    return statistics.mean(products)
34
35
36def difference(ls):
37    return ls[0]-ls[1]
38
39def product(ls):
40    return ls[0]*ls[1]
41
42def hypotenuse(ls):
43    return math.hypot(ls[0],ls[1])
44   
45def trustValues(derived, trustFunc):
46    ret_val = []
47    for k,v in derived.items():
48        ret_val.append((trustFunc((v['long'],v['short'])), k))
49
50    ret_val.sort()
51    return ret_val
52
53
54def prunedWeights(derived, trust, alpha):
55    weights = {}
56
57    threshold = len(trust)*(1.0-alpha)
58    for i in range(0,len(trust)):
59        if i < threshold:
60            weights[trust[i][1]] = 1.0
61        else:
62            weights[trust[i][1]] = 0.0
63
64    return weights
65
66
67def linearWeights(derived, trust, alpha):
68    x1 = trust[0][0]
69    y1 = 1.0 + (alpha*10)
70    x2 = trust[(len(trust)-1)//3][0]
71    y2 = 1.0
72    m = (y1-y2)/(x1-x2)
73    b = y1 - m*x1
74
75    weights = {}
76    for t,k in trust:
77        weights[k] = m*t+b
78        if weights[k] < 0.0:
79            weights[k] = 0.0
80
81    return weights
82
83
84def invertedWeights(derived,trust,alpha):
85    # (x+1-first_sample)^(-alpha)
86    #scale = trust[0][0]
87
88    #weights = {}
89    #for t,k in trust:
90    #    weights[k] = (t+1-scale)**(-1.0*alpha)
91    #    if weights[k] < 0.0:
92    #        weights[k] = 0.0
93
94    weights = {}
95    for i in range(len(trust)):
96        w = 10.0/(i+2.0)-0.2
97        if w < 0.0:
98            w = 0.0
99        weights[trust[i][1]] = w
100       
101   
102    return weights
103
104
105
106def arctanWeights(derived,trust,alpha):
107    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
108    minimum = trust[0][0]
109   
110    weights = {}
111    for i in range(len(trust)):
112        w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum))
113        if w < 0.0:
114            w = 0.0
115        weights[trust[i][1]] = w
116
117    return weights
118
119
120def arctanWeights2(derived,trust,alpha):
121    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
122    minimum = trust[0][0]
123    stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median
124   
125    weights = {}
126    for i in range(len(trust)):
127        w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum))
128        if w < 0.0:
129            w = 0.0
130        weights[trust[i][1]] = w
131
132    return weights
133
134
135def midhinge(values, distance=25):
136    return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
137
138def trimean(values, distance=25):
139    return (midhinge(values, distance) + statistics.median(values))/2
140
141def weightedMean(derived, weights):
142    normalizer = sum(weights.values())/len(weights)
143    return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()])
144
145def weightedMeanTsval(derived, weights):
146    normalizer = sum(weights.values())/len(weights)
147    return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])
148
149
150def estimateMean(trustFunc, weightFunc, alpha, derived):
151    trust = trustValues(derived, trustFunc)
152    weights = weightFunc(derived, trust, alpha)
153    return weightedMean(derived, weights)
154
155
156def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
157    trust = trustValues(derived, trustFunc)
158    weights = weightFunc(derived, trust, alpha)
159    return weightedMeanTsval(derived, weights)
160
161
162#def estimateMedian(trustFunc, weightFunc, alpha, derived):
163#    trust = trustValues(derived, trustFunc)
164#    weights = weightFunc(derived, trust, alpha)
165
166#    return statistics.median([(derived[k]['long']-derived[k]['short']) for k,w in weights.items() if w > 0.0])
167
168def estimateMedian(derived):
169    return statistics.median([(d['long']-d['short']) for d in derived.values()])
170
171
172def estimateMidhinge(derived):
173    return midhinge([(d['long']-d['short']) for d in derived.values()])
174
175
176def estimateTrimean(derived):
177    return trimean([(d['long']-d['short']) for d in derived.values()])
178
179
180def tTest(expected_mean, derived):
181    diffs = [(d['long']-d['short']) for d in derived.values()]
182    null_tval, null_pval = scipy.stats.ttest_1samp(diffs, 0.0)
183    tval, pval = scipy.stats.ttest_1samp(diffs, expected_mean)
184
185    if pval < null_pval:
186        return 1
187    else:
188        return 0
189
190   
191def diffMedian(derived):
192    l = [tc['long']-tc['short'] for s,tc in derived.items()]
193    return statistics.median(l)
194
195
196def subsample_ids(db, probe_type, subsample_size=None):
197    cursor = db.conn.cursor()
198    cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,))
199    population_size = cursor.fetchone()[0]
200    #print("population_size:", population_size)
201    if subsample_size == None or subsample_size > population_size:
202        subsample_size = population_size
203   
204    start = numpy.random.random_integers(0,population_size-1)
205    cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ? OFFSET ?", (probe_type,subsample_size,start))
206    for row in cursor:
207        subsample_size -= 1
208        yield row['sample']
209
210    if subsample_size > 0:
211        cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ?", (probe_type,subsample_size))
212        for row in cursor:
213            yield row['sample']
214   
215
216def subsample(db, probe_type, subsample_size=None):
217    cursor = db.conn.cursor()
218    cursor.execute("SELECT count(test_case) FROM (SELECT test_case FROM probes GROUP BY test_case)")
219    num_test_cases = cursor.fetchone()[0]
220   
221    for sid in subsample_ids(db, probe_type, subsample_size):
222        cursor.execute("SELECT test_case,tc_order,time_of_day,reported,userspace_rtt,suspect,packet_rtt,tsval_rtt FROM probes p,analysis a WHERE p.sample=? and a.probe_id=p.id", (sid,))
223        probes = cursor.fetchall()
224        if len(probes) != num_test_cases:
225            sys.stderr.write("WARN: sample %d had %d probes, but %d expected!  Discarding...\n" % (sid, len(probes), num_test_cases))
226            continue
227        yield (sid,[dict(r) for r in probes])
228
229
230def subseries(db, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
231    population_size = db.populationSize(probe_type)
232
233    if size == None or size > population_size:
234        size = population_size
235    if offset == None or offset >= population_size or offset < 0:
236        offset = numpy.random.random_integers(0,population_size-1)
237
238    query="""
239      SELECT %(field)s AS unusual_case,
240             (SELECT avg(%(field)s) FROM probes,analysis
241              WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
242      FROM   (SELECT probes.sample,%(field)s FROM probes,analysis
243              WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u
244      LIMIT :size OFFSET :offset
245    """ % {"field":field}
246   
247    params = {"probe_type":probe_type, "unusual_case":unusual_case, "offset":offset, "size":size}
248    cursor = db.conn.cursor()
249    cursor.execute(query, params)
250    ret_val = [dict(row) for row in cursor.fetchall()]
251    #for row in cursor:
252    #    size -= 1
253    #    yield dict(row)
254
255    size -= len(ret_val)
256    if size > 0:
257        params['offset'] = 0
258        params['size'] = size
259        cursor.execute(query, params)
260        ret_val += [dict(row) for row in cursor.fetchall()]
261        #for row in cursor:
262        #    yield dict(row)
263   
264    return ret_val
265
266
267# if test_cases=None, include all of them.  Otherwise, include only the specified test cases.
268def samples2Distributions(samples, field, test_cases=None):
269    ret_val = {}
270   
271    for sid,probes in samples:
272        for p in probes:
273            if p['test_case'] in ret_val:
274                ret_val[p['test_case']].append(p[field])
275            elif test_cases == None or p['test_case'] in test_cases:
276                ret_val[p['test_case']] = [p[field]]
277               
278               
279    return ret_val
280
281
282def samples2MeanDiffs(samples, field, unusual_case):
283    ret_val = {}
284   
285    for sid,probes in samples:
286        unusual_value = None
287        for p in probes:
288            if p['test_case'] == unusual_case:
289                unusual_value = p[field]
290                break
291        yield statistics.mean([unusual_value-p[field] for p in probes if p['test_case'] != unusual_case])
292
293
294def bootstrap(estimator, db, probe_type, test_cases, subsample_size, num_trials):
295    ret_val = []
296    for t in range(num_trials):
297        ret_val.append(estimator(test_cases, subsample(db, probe_type, subsample_size)))
298
299    return ret_val
300
301
302def bootstrap2(estimator, db, probe_type, subsample_size, num_trials):
303    ret_val = []
304    for t in range(num_trials):
305        ret_val.append(estimator(subsample(db, probe_type, subsample_size)))
306
307    return ret_val
308
309
310def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
311    ret_val = []
312    for t in range(num_trials):
313        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))
314
315    return ret_val
316
317
318# Returns the test case name that clearly has higher RTT; otherwise, returns None
319def boxTest(params, test_cases, samples):
320    if len(test_cases) != 2:
321        # XXX: somehow generalize the box test to handle more than 2 cases
322        raise Exception()
323    dists = samples2Distributions(samples,'packet_rtt', test_cases) #XXX: field from params
324
325    tmp1,tmp2 = dists.items()
326    test_case1,dist1 = tmp1
327    test_case2,dist2 = tmp2
328   
329    d1_high = numpy.percentile(dist1, params['high'])
330    d2_low = numpy.percentile(dist2, params['low'])
331    if d1_high < d2_low:
332        return test_case2
333
334    d1_low = numpy.percentile(dist1, params['low'])
335    d2_high = numpy.percentile(dist2, params['high'])
336
337    if d2_high < d1_low:
338        return test_case1
339   
340    return None
341
342
343# Returns 1 if unusual_case is unusual in the expected direction
344#         0 if it isn't unusual
345#        -1 if it is unusual in the wrong direction
346def multiBoxTest(params, greater, samples):
347    uc = [s['unusual_case'] for s in samples]
348    rest = [s['other_cases'] for s in samples]
349   
350    uc_high = numpy.percentile(uc, params['high'])
351    rest_low = numpy.percentile(rest, params['low'])
352    if uc_high < rest_low:
353        if greater:
354            return -1
355        else:
356            return 1
357
358    uc_low = numpy.percentile(uc, params['low'])
359    rest_high = numpy.percentile(rest, params['high'])
360    if rest_high < uc_low:
361        if greater:
362            return 1
363        else:
364            return -1
365       
366    return 0
367
368
369# Returns 1 if unusual_case is unusual in the expected direction
370#         0 otherwise
371def midhingeTest(params, greater, samples):
372    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
373
374    mh = midhinge(diffs, params['distance'])
375    #mh = trimean(diffs, params['distance'])
376    if greater:
377        if mh > params['threshold']:
378            return 1
379        else:
380            return 0
381    else:
382        if mh < params['threshold']:
383            return 1
384        else:
385            return 0
386
387
388def rmse(expected, measurements):
389    s = sum([(expected-m)**2 for m in measurements])/len(measurements)
390    return math.sqrt(s)
391
392def nrmse(expected, measurements):
393    return rmse(expected, measurements)/(max(measurements)-min(measurements))
Note: See TracBrowser for help on using the repository browser.