source: trunk/lib/nanownlib/stats.py @ 6

Last change on this file since 6 was 6, checked in by tim, 9 years ago

.

File size: 11.7 KB
Line 
1
2import sys
3import os
4import math
5import statistics
6import gzip
7import random
8import scipy
9import scipy.stats
10import numpy
11
# Don't trust numpy's seeding: seed numpy's global RNG from the OS CSPRNG
# (random.SystemRandom) so the module doesn't rely on numpy's default seed.
numpy.random.seed(random.SystemRandom().randint(0,2**32-1))
14
15
def mad(arr):
    """Median Absolute Deviation: a "robust" analogue of standard deviation.

    Indicates variability of the sample.
    https://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    # Drop masked entries first (comment in original notes plain arrays
    # would be faster than masked arrays).
    data = numpy.ma.array(arr).compressed()
    center = numpy.median(data)
    return numpy.median(numpy.abs(data - center))
24
25
def cov(x, y):
    """Population covariance of two equal-length sequences.

    Returns the mean of (x[i]-mean(x)) * (y[i]-mean(y)); divides by n,
    not n-1, matching the original implementation.
    """
    mx = statistics.mean(x)
    my = statistics.mean(y)
    # zip replaces the original index loop; behavior is identical for
    # equal-length inputs (callers here pass paired samples).
    return statistics.mean((a - mx) * (b - my) for a, b in zip(x, y))
34
35
def difference(ls):
    """Return the first element of ls minus the second."""
    first, second = ls[0], ls[1]
    return first - second
38
def product(ls):
    """Return the product of the first two elements of ls."""
    first, second = ls[0], ls[1]
    return first * second
41
def hypotenuse(ls):
    """Euclidean length of the 2-vector (ls[0], ls[1])."""
    return math.hypot(*ls[:2])
44   
def trustValues(derived, trustFunc):
    """Score each key of derived with trustFunc((v['long'], v['short'])).

    Returns a list of (score, key) pairs sorted ascending by score.
    """
    scored = [(trustFunc((v['long'], v['short'])), k)
              for k, v in derived.items()]
    return sorted(scored)
52
53
def prunedWeights(derived, trust, alpha):
    """Binary weights: keep the best-trusted (1-alpha) fraction.

    trust is a list of (score, key) pairs sorted ascending (see
    trustValues); entries ranked below len(trust)*(1-alpha) get weight
    1.0, the rest 0.0.  `derived` is unused, kept for signature parity
    with the other *Weights functions.
    """
    threshold = len(trust) * (1.0 - alpha)
    # enumerate replaces the original range(len(...)) index loop.
    return {key: (1.0 if rank < threshold else 0.0)
            for rank, (_, key) in enumerate(trust)}
65
66
def linearWeights(derived, trust, alpha):
    """Linearly decreasing weights over trust scores, clipped at 0.

    The line passes through (best score, 1 + alpha*10) and (score at
    one third of the ranking, 1.0).  `derived` is unused, kept for
    signature parity with the other *Weights functions.
    """
    x1, y1 = trust[0][0], 1.0 + (alpha * 10)
    x2, y2 = trust[(len(trust) - 1) // 3][0], 1.0
    slope = (y1 - y2) / (x1 - x2)
    intercept = y1 - slope * x1
    # max(..., 0.0) replaces the original post-hoc negative clamp.
    return {key: max(slope * score + intercept, 0.0) for score, key in trust}
82
83
def invertedWeights(derived, trust, alpha):
    """Rank-based hyperbolic weights: w = 10/(rank+2) - 0.2, clipped at 0.

    Only the position in the trust ranking matters, not the score.
    Weight hits 0 from rank 48 onward.  `derived` and `alpha` are
    unused, kept for signature parity with the other *Weights
    functions.  (An earlier score-based variant, (t+1-scale)^-alpha,
    was dead commented-out code and has been removed.)
    """
    weights = {}
    for rank, (_, key) in enumerate(trust):
        weights[key] = max(10.0 / (rank + 2.0) - 0.2, 0.0)
    return weights
103
104
105
def arctanWeights(derived, trust, alpha):
    """Arctan-shaped weights over trust scores.

    Scores at the (1-alpha) quantile of the ranking get pi/2; better
    scores get more, worse get less, clipped at 0.  `derived` is
    unused, kept for signature parity with the other *Weights
    functions.
    """
    pivot = trust[int((len(trust) - 1) * (1.0 - alpha))][0]
    lowest = trust[0][0]

    weights = {}
    for score, key in trust:
        w = math.pi / 2.0 - math.atan(2 * (score - pivot) / (pivot - lowest))
        weights[key] = w if w >= 0.0 else 0.0
    return weights
118
119
def arctanWeights2(derived, trust, alpha):
    """Arctan-shaped weights, steeper (factor 3) than arctanWeights.

    Scores at the (1-alpha) quantile of the ranking get pi/2; better
    scores get more, worse get less, clipped at 0.  `derived` is
    unused, kept for signature parity.  NOTE(review): the original
    also computed a near-median 'stretch' spread but never used it;
    that dead local has been removed (no behavior change).
    """
    shift = trust[int((len(trust) - 1) * (1.0 - alpha))][0]
    minimum = trust[0][0]

    weights = {}
    for score, key in trust:
        w = math.pi / 2.0 - math.atan(3 * (score - shift) / (shift - minimum))
        weights[key] = max(w, 0.0)
    return weights
133
134
def midhinge(values, distance=25):
    """Average of the two percentiles symmetric about the median.

    With the default distance=25 this is the classic midhinge
    (mean of the first and third quartiles).
    """
    lower = numpy.percentile(values, 50 - distance)
    upper = numpy.percentile(values, 50 + distance)
    return (lower + upper) / 2.0
137
def trimean(values, distance=25):
    """Trimean: average of the midhinge and the median."""
    center = statistics.median(values)
    return (midhinge(values, distance) + center) / 2
140
def weightedMean(derived, weights):
    """Weighted mean of the per-key (long - short) differences.

    Weights are normalized so they average to 1 before applying.
    """
    normalizer = sum(weights.values()) / len(weights)
    terms = [w * (derived[k]['long'] - derived[k]['short']) / normalizer
             for k, w in weights.items()]
    return statistics.mean(terms)
144
def weightedMeanTsval(derived, weights):
    """Weighted mean of the per-key (long_tsval - short_tsval) differences.

    Weights are normalized so they average to 1 before applying.
    """
    normalizer = sum(weights.values()) / len(weights)
    terms = [w * (derived[k]['long_tsval'] - derived[k]['short_tsval']) / normalizer
             for k, w in weights.items()]
    return statistics.mean(terms)
148
149
def estimateMean(trustFunc, weightFunc, alpha, derived):
    """Estimate the mean (long - short) difference using trust-ranked weights."""
    ranked = trustValues(derived, trustFunc)
    return weightedMean(derived, weightFunc(derived, ranked, alpha))
154
155
def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
    """Estimate the mean tsval difference using trust-ranked weights."""
    ranked = trustValues(derived, trustFunc)
    return weightedMeanTsval(derived, weightFunc(derived, ranked, alpha))
160
161
162#def estimateMedian(trustFunc, weightFunc, alpha, derived):
163#    trust = trustValues(derived, trustFunc)
164#    weights = weightFunc(derived, trust, alpha)
165
166#    return statistics.median([(derived[k]['long']-derived[k]['short']) for k,w in weights.items() if w > 0.0])
167
def estimateMedian(derived):
    """Median of the per-key (long - short) differences."""
    return statistics.median(d['long'] - d['short'] for d in derived.values())
170
171
def estimateMidhinge(derived):
    """Midhinge of the per-key (long - short) differences."""
    diffs = [d['long'] - d['short'] for d in derived.values()]
    return midhinge(diffs)
174
175
def estimateTrimean(derived):
    """Trimean of the per-key (long - short) differences."""
    diffs = [d['long'] - d['short'] for d in derived.values()]
    return trimean(diffs)
178
179
def tTest(expected_mean, derived):
    """One-sample t-test comparison of two candidate means.

    Returns 1 when expected_mean fits the (long - short) differences
    worse than a zero mean does (i.e. its p-value is smaller than the
    zero-mean p-value), else 0.
    """
    diffs = [d['long'] - d['short'] for d in derived.values()]
    _, null_pval = scipy.stats.ttest_1samp(diffs, 0.0)
    _, pval = scipy.stats.ttest_1samp(diffs, expected_mean)
    return 1 if pval < null_pval else 0
189
190   
def diffMedian(derived):
    """Median of the per-test-case (long - short) differences."""
    diffs = [entry['long'] - entry['short'] for entry in derived.values()]
    return statistics.median(diffs)
194
195
def subsample_ids(db, probe_type, subsample_size=None):
    """Yield up to subsample_size sample ids of the given probe type.

    Starts at a random offset in the ordered sample list and wraps
    around to the beginning if the tail holds too few samples.  If
    subsample_size is None (or too large) the whole population is
    yielded.  `db` must expose a sqlite-style `.conn` whose rows
    support name indexing (row['sample']).
    """
    cursor = db.conn.cursor()
    cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,))
    population_size = cursor.fetchone()[0]
    if subsample_size is None or subsample_size > population_size:
        subsample_size = population_size

    # numpy.random.random_integers() is deprecated (removed in modern
    # NumPy); randint's upper bound is exclusive, so this draws from
    # the same inclusive [0, population_size-1] range.
    start = numpy.random.randint(0, population_size)
    cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ? OFFSET ?", (probe_type,subsample_size,start))
    for row in cursor:
        subsample_size -= 1
        yield row['sample']

    # Wrap around: the tail starting at `start` was shorter than requested.
    if subsample_size > 0:
        cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ?", (probe_type,subsample_size))
        for row in cursor:
            yield row['sample']
214   
215
def subsample(db, probe_type, subsample_size=None):
    """Yield (sample_id, probe_dicts) for a random subsample of samples.

    Samples whose probe count doesn't match the number of test cases
    are reported on stderr and skipped.
    """
    cur = db.conn.cursor()
    cur.execute("SELECT count(test_case) FROM (SELECT test_case FROM probes GROUP BY test_case)")
    expected = cur.fetchone()[0]

    for sample_id in subsample_ids(db, probe_type, subsample_size):
        cur.execute("SELECT test_case,tc_order,time_of_day,reported,userspace_rtt,suspect,packet_rtt,tsval_rtt FROM probes p,analysis a WHERE p.sample=? and a.probe_id=p.id", (sample_id,))
        rows = cur.fetchall()
        if len(rows) == expected:
            yield (sample_id, [dict(r) for r in rows])
        else:
            sys.stderr.write("WARN: sample %d had %d probes, but %d expected!  Discarding...\n" % (sample_id, len(rows), expected))
228
229
def subseries(db, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
    """Yield dicts {'unusual_case': value, 'other_cases': avg} per sample.

    For each sample, pairs the unusual test case's `field` value with
    the average of the other test cases' values.  Starts at `offset`
    (random if None/out of range) and wraps around to the start if
    fewer than `size` rows remain.  `field` is interpolated into the
    SQL, so it must be a trusted column name, never user input.
    """
    cursor = db.conn.cursor()
    cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,))
    population_size = cursor.fetchone()[0]

    if size is None or size > population_size:
        size = population_size
    if offset is None or offset >= population_size or offset < 0:
        # numpy.random.random_integers() is deprecated (removed in modern
        # NumPy); randint's exclusive upper bound gives the same
        # inclusive [0, population_size-1] range.
        offset = numpy.random.randint(0, population_size)

    query="""
      SELECT %(field)s AS unusual_case,
             (SELECT avg(%(field)s) FROM probes,analysis
              WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
      FROM   (SELECT probes.sample,%(field)s FROM probes,analysis
              WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u
      LIMIT :size OFFSET :offset
    """ % {"field":field}

    params = {"probe_type":probe_type, "unusual_case":unusual_case, "offset":offset, "size":size}
    cursor.execute(query, params)
    for row in cursor:
        size -= 1
        yield dict(row)

    # Wrap around: the window starting at `offset` was shorter than `size`.
    if size > 0:
        params['offset'] = 0
        params['size'] = size
        cursor.execute(query, params)
        for row in cursor:
            yield dict(row)
261   
262
263# if test_cases=None, include all of them.  Otherwise, include only the specified test cases.
def samples2Distributions(samples, field, test_cases=None):
    """Group `field` values by test case across all samples.

    samples is an iterable of (sample_id, probe_dicts).  If test_cases
    is None every test case is included; otherwise only the listed
    ones.  Returns {test_case: [values...]}.
    """
    ret_val = {}
    for sid, probes in samples:
        for p in probes:
            tc = p['test_case']
            if tc in ret_val:
                ret_val[tc].append(p[field])
            elif test_cases is None or tc in test_cases:  # 'is None', not '== None'
                ret_val[tc] = [p[field]]
    return ret_val
276
277
def samples2MeanDiffs(samples, field, unusual_case):
    """Generator: per sample, yield mean(unusual value - other values).

    For each (sample_id, probes) pair, finds the probe whose test_case
    equals unusual_case and yields the mean difference between its
    `field` value and every other probe's.  As before, a sample with
    no unusual-case probe raises TypeError (None arithmetic).  The
    original's unused ret_val accumulator (dead code in a generator)
    has been removed.
    """
    for sid, probes in samples:
        unusual_value = None
        for p in probes:
            if p['test_case'] == unusual_case:
                unusual_value = p[field]
                break
        yield statistics.mean(unusual_value - p[field]
                              for p in probes if p['test_case'] != unusual_case)
288
289
def bootstrap(estimator, db, probe_type, test_cases, subsample_size, num_trials):
    """Run estimator(test_cases, fresh subsample) num_trials times."""
    return [estimator(test_cases, subsample(db, probe_type, subsample_size))
            for _ in range(num_trials)]
296
297
def bootstrap2(estimator, db, probe_type, subsample_size, num_trials):
    """Run estimator(fresh subsample) num_trials times."""
    return [estimator(subsample(db, probe_type, subsample_size))
            for _ in range(num_trials)]
304
305
def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
    """Run estimator(fresh subseries) num_trials times."""
    return [estimator(subseries(db, probe_type, unusual_case, subseries_size))
            for _ in range(num_trials)]
312
313
# Returns the test case name that clearly has higher RTT; otherwise, returns None
def boxTest(params, test_cases, samples):
    """Box test: return the test case whose RTT box is clearly higher.

    Two distributions are 'clearly' separated when one's high
    percentile falls below the other's low percentile.  Returns None
    when the boxes overlap.
    """
    if len(test_cases) != 2:
        # XXX: somehow generalize the box test to handle more than 2 cases
        raise Exception()
    dists = samples2Distributions(samples,'packet_rtt', test_cases) #XXX: field from params

    (case_a, dist_a), (case_b, dist_b) = dists.items()

    if numpy.percentile(dist_a, params['high']) < numpy.percentile(dist_b, params['low']):
        return case_b

    if numpy.percentile(dist_b, params['high']) < numpy.percentile(dist_a, params['low']):
        return case_a

    return None
337
338
# Returns 1 if unusual_case is unusual in the expected direction
#         0 if it isn't unusual
#        -1 if it is unusual in the wrong direction
def multiBoxTest(params, unusual_case, greater, samples):
    """Box test of the unusual case against all other cases pooled."""
    #XXX: packet_rtt field from params
    dists = samples2Distributions(samples, 'packet_rtt')

    uc = dists[unusual_case]
    rest = []
    for case_name, values in dists.items():
        if case_name != unusual_case:
            rest.extend(values)

    # Unusual case's box sits entirely below the rest.
    if numpy.percentile(uc, params['high']) < numpy.percentile(rest, params['low']):
        return -1 if greater else 1

    # Unusual case's box sits entirely above the rest.
    if numpy.percentile(rest, params['high']) < numpy.percentile(uc, params['low']):
        return 1 if greater else -1

    return 0
369
370
# Returns 1 if unusual_case is unusual in the expected direction
#         0 otherwise
def midhingeTest(params, greater, samples):
    """Threshold test on the midhinge of unusual-vs-others differences."""
    diffs = [s['unusual_case'] - s['other_cases'] for s in samples]
    mh = midhinge(diffs, params['distance'])
    if greater:
        return 1 if mh > params['threshold'] else 0
    return 1 if mh < params['threshold'] else 0
387
388
def rmse(expected, measurements):
    """Root-mean-square error of measurements around expected."""
    mean_sq = sum((expected - m) ** 2 for m in measurements) / len(measurements)
    return math.sqrt(mean_sq)
392
def nrmse(expected, measurements):
    """RMSE normalized by the range of the measurements."""
    spread = max(measurements) - min(measurements)
    return rmse(expected, measurements) / spread
Note: See TracBrowser for help on using the repository browser.