Context Navigation

source: trunk/lib/nanownlib/stats.py @ 10

Last change on this file since 10 was 10, checked in by tim, 9 years ago
.
File size: 15.0 KB

Line
1
2	import sys
3	import os
4	import functools
5	import math
6	import statistics
7	import gzip
8	import random
9	import scipy
10	import scipy.stats
11	import numpy
12
13	# Don't trust numpy's seeding
14	numpy.random.seed(random.SystemRandom().randint(0,2**32-1))
15
16
17	def mad(arr):
18	""" Median Absolute Deviation: a "Robust" version of standard deviation.
19	Indices variabililty of the sample.
20	https://en.wikipedia.org/wiki/Median_absolute_deviation
21	"""
22	arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
23	med = numpy.median(arr)
24	return numpy.median(numpy.abs(arr - med))
25
26
27	def cov(x,y):
28	mx = statistics.mean(x)
29	my = statistics.mean(y)
30	products = []
31	for i in range(0,len(x)):
32	products.append((x[i] - mx)*(y[i] - my))
33
34	return statistics.mean(products)
35
36
37	def difference(ls):
38	return ls[0]-ls[1]
39
40	def product(ls):
41	return ls[0]*ls[1]
42
43	def hypotenuse(ls):
44	return math.hypot(ls[0],ls[1])
45
46	def trustValues(derived, trustFunc):
47	ret_val = []
48	for k,v in derived.items():
49	ret_val.append((trustFunc((v['long'],v['short'])), k))
50
51	ret_val.sort()
52	return ret_val
53
54
55	def prunedWeights(derived, trust, alpha):
56	weights = {}
57
58	threshold = len(trust)*(1.0-alpha)
59	for i in range(0,len(trust)):
60	if i < threshold:
61	weights[trust[i][1]] = 1.0
62	else:
63	weights[trust[i][1]] = 0.0
64
65	return weights
66
67
68	def linearWeights(derived, trust, alpha):
69	x1 = trust[0][0]
70	y1 = 1.0 + (alpha*10)
71	x2 = trust[(len(trust)-1)//3][0]
72	y2 = 1.0
73	m = (y1-y2)/(x1-x2)
74	b = y1 - m*x1
75
76	weights = {}
77	for t,k in trust:
78	weights[k] = m*t+b
79	if weights[k] < 0.0:
80	weights[k] = 0.0
81
82	return weights
83
84
85	def invertedWeights(derived,trust,alpha):
86	# (x+1-first_sample)^(-alpha)
87	#scale = trust[0][0]
88
89	#weights = {}
90	#for t,k in trust:
91	# weights[k] = (t+1-scale)*(-1.0alpha)
92	# if weights[k] < 0.0:
93	# weights[k] = 0.0
94
95	weights = {}
96	for i in range(len(trust)):
97	w = 10.0/(i+2.0)-0.2
98	if w < 0.0:
99	w = 0.0
100	weights[trust[i][1]] = w
101
102
103	return weights
104
105
106
107	def arctanWeights(derived,trust,alpha):
108	shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
109	minimum = trust[0][0]
110
111	weights = {}
112	for i in range(len(trust)):
113	w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum))
114	if w < 0.0:
115	w = 0.0
116	weights[trust[i][1]] = w
117
118	return weights
119
120
121	def arctanWeights2(derived,trust,alpha):
122	shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
123	minimum = trust[0][0]
124	stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median
125
126	weights = {}
127	for i in range(len(trust)):
128	w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum))
129	if w < 0.0:
130	w = 0.0
131	weights[trust[i][1]] = w
132
133	return weights
134
135
136	def midsummary(values, distance=25):
137	#return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
138	l,h = numpy.percentile(values, (50-distance,50+distance))
139	return (l+h)/2.0
140
141	def trimean(values, distance=25):
142	return (midsummary(values, distance) + statistics.median(values))/2
143
144	def ubersummary(values, distance=25):
145	left2 = 50-distance
146	left1 = left2/2.0
147	left3 = (left2+50)/2.0
148	right2 = 50+distance
149	right3 = (right2+50)/2.0
150	right1 = (right2+100)/2.0
151	l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
152	#print(left1,left2,left3,50,right3,right2,right1)
153	#print(l1,l2,l3,m,r3,r2,r1)
154	return (l1+l24+l3+r3+r24+r1)/12.0
155	#return statistics.mean((l1,l2,l3,m,r3,r2,r1))
156
157	def quadsummary(values, distance=25):
158	left2 = 50-distance
159	left1 = left2/2.0
160	right2 = 50+distance
161	right1 = (right2+100)/2.0
162	l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
163	#print(left1,left2,left3,50,right3,right2,right1)
164	#print(l1,l2,l3,m,r3,r2,r1)
165	return (l1+l2+r2+r1)/4.0
166	#return statistics.mean((l1,l2,l3,m,r3,r2,r1))
167
168	def quadsummary(values, distance=25):
169	left1 = 50-distance
170	left2 = (left1+50)/2.0
171	right1 = 50+distance
172	right2 = (right1+50)/2.0
173	l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
174	#print(left1,left2,left3,50,right3,right2,right1)
175	#print(l1,l2,l3,m,r3,r2,r1)
176	return (l1+l2+r2+r1)/4.0
177	#return statistics.mean((l1,l2,l3,m,r3,r2,r1))
178
179
180	def weightedMean(derived, weights):
181	normalizer = sum(weights.values())/len(weights)
182	return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()])
183
184	def weightedMeanTsval(derived, weights):
185	normalizer = sum(weights.values())/len(weights)
186	return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])
187
188
189	def estimateMean(trustFunc, weightFunc, alpha, derived):
190	trust = trustValues(derived, trustFunc)
191	weights = weightFunc(derived, trust, alpha)
192	return weightedMean(derived, weights)
193
194
195	def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
196	trust = trustValues(derived, trustFunc)
197	weights = weightFunc(derived, trust, alpha)
198	return weightedMeanTsval(derived, weights)
199
200
201	#def estimateMedian(trustFunc, weightFunc, alpha, derived):
202	# trust = trustValues(derived, trustFunc)
203	# weights = weightFunc(derived, trust, alpha)
204
205	# return statistics.median([(derived[k]['long']-derived[k]['short']) for k,w in weights.items() if w > 0.0])
206
207	def estimateMedian(derived):
208	return statistics.median([(d['long']-d['short']) for d in derived.values()])
209
210
211	def estimateMidsummary(derived):
212	return midsummary([(d['long']-d['short']) for d in derived.values()])
213
214
215	def estimateTrimean(derived):
216	return trimean([(d['long']-d['short']) for d in derived.values()])
217
218
219	def tTest(expected_mean, derived):
220	diffs = [(d['long']-d['short']) for d in derived.values()]
221	null_tval, null_pval = scipy.stats.ttest_1samp(diffs, 0.0)
222	tval, pval = scipy.stats.ttest_1samp(diffs, expected_mean)
223
224	if pval < null_pval:
225	return 1
226	else:
227	return 0
228
229
230	def diffMedian(derived):
231	l = [tc['long']-tc['short'] for s,tc in derived.items()]
232	return statistics.median(l)
233
234
235	def subsample_ids(db, probe_type, subsample_size=None):
236	cursor = db.conn.cursor()
237	cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,))
238	population_size = cursor.fetchone()[0]
239	#print("population_size:", population_size)
240	if subsample_size == None or subsample_size > population_size:
241	subsample_size = population_size
242
243	start = numpy.random.random_integers(0,population_size-1)
244	cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ? OFFSET ?", (probe_type,subsample_size,start))
245	for row in cursor:
246	subsample_size -= 1
247	yield row['sample']
248
249	if subsample_size > 0:
250	cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ?", (probe_type,subsample_size))
251	for row in cursor:
252	yield row['sample']
253
254
255	def subsample(db, probe_type, subsample_size=None):
256	cursor = db.conn.cursor()
257	cursor.execute("SELECT count(test_case) FROM (SELECT test_case FROM probes GROUP BY test_case)")
258	num_test_cases = cursor.fetchone()[0]
259
260	for sid in subsample_ids(db, probe_type, subsample_size):
261	cursor.execute("SELECT test_case,tc_order,time_of_day,reported,userspace_rtt,suspect,packet_rtt,tsval_rtt FROM probes p,analysis a WHERE p.sample=? and a.probe_id=p.id", (sid,))
262	probes = cursor.fetchall()
263	if len(probes) != num_test_cases:
264	sys.stderr.write("WARN: sample %d had %d probes, but %d expected! Discarding...\n" % (sid, len(probes), num_test_cases))
265	continue
266	yield (sid,[dict(r) for r in probes])
267
268
269	def subseries(db, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
270	population_size = db.populationSize(probe_type)
271
272	if size == None or size > population_size:
273	size = population_size
274	if offset == None or offset >= population_size or offset < 0:
275	offset = numpy.random.random_integers(0,population_size-1)
276
277	query="""
278	SELECT %(field)s AS unusual_case,
279	(SELECT avg(%(field)s) FROM probes,analysis
280	WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
281	FROM (SELECT probes.sample,%(field)s FROM probes,analysis
282	WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u
283	LIMIT :size OFFSET :offset
284	""" % {"field":field}
285
286	params = {"probe_type":probe_type, "unusual_case":unusual_case, "offset":offset, "size":size}
287	cursor = db.conn.cursor()
288	cursor.execute(query, params)
289	ret_val = [dict(row) for row in cursor.fetchall()]
290	#for row in cursor:
291	# size -= 1
292	# yield dict(row)
293
294	size -= len(ret_val)
295	if size > 0:
296	params['offset'] = 0
297	params['size'] = size
298	cursor.execute(query, params)
299	ret_val += [dict(row) for row in cursor.fetchall()]
300	#for row in cursor:
301	# yield dict(row)
302
303	return ret_val
304
305
306	# if test_cases=None, include all of them. Otherwise, include only the specified test cases.
307	def samples2Distributions(samples, field, test_cases=None):
308	ret_val = {}
309
310	for sid,probes in samples:
311	for p in probes:
312	if p['test_case'] in ret_val:
313	ret_val[p['test_case']].append(p[field])
314	elif test_cases == None or p['test_case'] in test_cases:
315	ret_val[p['test_case']] = [p[field]]
316
317
318	return ret_val
319
320
321	def samples2MeanDiffs(samples, field, unusual_case):
322	ret_val = {}
323
324	for sid,probes in samples:
325	unusual_value = None
326	for p in probes:
327	if p['test_case'] == unusual_case:
328	unusual_value = p[field]
329	break
330	yield statistics.mean([unusual_value-p[field] for p in probes if p['test_case'] != unusual_case])
331
332
333	def bootstrap(estimator, db, probe_type, test_cases, subsample_size, num_trials):
334	ret_val = []
335	for t in range(num_trials):
336	ret_val.append(estimator(test_cases, subsample(db, probe_type, subsample_size)))
337
338	return ret_val
339
340
341	def bootstrap2(estimator, db, probe_type, subsample_size, num_trials):
342	ret_val = []
343	for t in range(num_trials):
344	ret_val.append(estimator(subsample(db, probe_type, subsample_size)))
345
346	return ret_val
347
348
349	def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
350	ret_val = []
351	for t in range(num_trials):
352	ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))
353
354	return ret_val
355
356
357	# Returns the test case name that clearly has higher RTT; otherwise, returns None
358	def boxTest(params, test_cases, samples):
359	if len(test_cases) != 2:
360	# XXX: somehow generalize the box test to handle more than 2 cases
361	raise Exception()
362	dists = samples2Distributions(samples,'packet_rtt', test_cases) #XXX: field from params
363
364	tmp1,tmp2 = dists.items()
365	test_case1,dist1 = tmp1
366	test_case2,dist2 = tmp2
367
368	d1_high = numpy.percentile(dist1, params['high'])
369	d2_low = numpy.percentile(dist2, params['low'])
370	if d1_high < d2_low:
371	return test_case2
372
373	d1_low = numpy.percentile(dist1, params['low'])
374	d2_high = numpy.percentile(dist2, params['high'])
375
376	if d2_high < d1_low:
377	return test_case1
378
379	return None
380
381
382	# Returns 1 if unusual_case is unusual in the expected direction
383	# 0 if it isn't unusual
384	# -1 if it is unusual in the wrong direction
385	def multiBoxTest(params, greater, samples):
386	uc = [s['unusual_case'] for s in samples]
387	rest = [s['other_cases'] for s in samples]
388
389	uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
390	rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low']))
391	if uc_high < rest_low:
392	if greater:
393	return -1
394	else:
395	return 1
396
397	if rest_high < uc_low:
398	if greater:
399	return 1
400	else:
401	return -1
402
403	return 0
404
405
406	# Returns 1 if unusual_case is unusual in the expected direction
407	# 0 otherwise
408	def summaryTest(f, params, greater, samples):
409	diffs = [s['unusual_case']-s['other_cases'] for s in samples]
410
411	mh = f(diffs, params['distance'])
412	if greater:
413	if mh > params['threshold']:
414	return 1
415	else:
416	return 0
417	else:
418	if mh < params['threshold']:
419	return 1
420	else:
421	return 0
422
423	midsummaryTest = functools.partial(summaryTest, midsummary)
424	trimeanTest = functools.partial(summaryTest, trimean)
425	ubersummaryTest = functools.partial(summaryTest, ubersummary)
426	quadsummaryTest = functools.partial(summaryTest, quadsummary)
427
428	def rmse(expected, measurements):
429	s = sum([(expected-m)**2 for m in measurements])/len(measurements)
430	return math.sqrt(s)
431
432	def nrmse(expected, measurements):
433	return rmse(expected, measurements)/(max(measurements)-min(measurements))
434
435
436	class KalmanFilter1D:
437	def __init__(self, x0, P, R, Q):
438	self.x = x0
439	self.P = P
440	self.R = R
441	self.Q = Q
442
443	def update(self, z):
444	self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
445	self.P = 1. / (1./self.P + 1./self.R)
446
447	def predict(self, u=0.0):
448	self.x += u
449	self.P += self.Q
450
451
452	def kfilter(params, observations):
453	x = numpy.array(observations)
454	movement = 0
455	est = []
456	var = []
457	kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
458	#P = 10000, # initial variance
459	P = 10, # initial variance
460	R = numpy.std(x), # msensor noise
461	Q = 0) # movement noise
462	for round in range(1):
463	for d in x:
464	kf.predict(movement)
465	kf.update(d)
466	est.append(kf.x)
467	var.append(kf.P)
468
469	return({'est':est, 'var':var})
470
471
472	def kalmanTest(params, greater, samples):
473	diffs = [s['unusual_case']-s['other_cases'] for s in samples]
474
475	m = kfilter(params, diffs)['est'][-1]
476	if greater:
477	if m > params['threshold']:
478	return 1
479	else:
480	return 0
481	else:
482	if m < params['threshold']:
483	return 1
484	else:
485	return 0
486
487
488	def kalmanTest2(params, greater, samples):
489	diffs = [s['unusual_case']-s['other_cases'] for s in samples]
490
491	estimates = []
492	size = 500
493	for i in range(100):
494	off = random.randrange(0,len(diffs))
495	sub = diffs[off:size]
496	if len(sub) < size:
497	sub += diffs[0:size-len(sub)]
498	estimates.append(kfilter(params, sub)['est'][-1])
499
500	m = quadsummary(estimates)
501	if greater:
502	if m > params['threshold']:
503	return 1
504	else:
505	return 0
506	else:
507	if m < params['threshold']:
508	return 1
509	else:
510	return 0
511

Note: See TracBrowser for help on using the repository browser.

Download in other formats: