Context Navigation

source: trunk/lib/nanownlib/stats.py @ 7

Last change on this file since 7 was 7, checked in by tim, 9 years ago
.
File size: 11.8 KB

Rev	Line
[4]	1
	2	import sys
	3	import os
	4	import math
	5	import statistics
	6	import gzip
	7	import random
	8	import scipy
	9	import scipy.stats
	10	import numpy
	11
	12	# Don't trust numpy's seeding
	13	numpy.random.seed(random.SystemRandom().randint(0,2**32-1))
	14
	15
	16	def mad(arr):
	17	""" Median Absolute Deviation: a "Robust" version of standard deviation.
	18	Indices variabililty of the sample.
	19	https://en.wikipedia.org/wiki/Median_absolute_deviation
	20	"""
	21	arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
	22	med = numpy.median(arr)
	23	return numpy.median(numpy.abs(arr - med))
	24
	25
	26	def cov(x,y):
	27	mx = statistics.mean(x)
	28	my = statistics.mean(y)
	29	products = []
	30	for i in range(0,len(x)):
	31	products.append((x[i] - mx)*(y[i] - my))
	32
	33	return statistics.mean(products)
	34
	35
	36	def difference(ls):
	37	return ls[0]-ls[1]
	38
	39	def product(ls):
	40	return ls[0]*ls[1]
	41
	42	def hypotenuse(ls):
	43	return math.hypot(ls[0],ls[1])
	44
	45	def trustValues(derived, trustFunc):
	46	ret_val = []
	47	for k,v in derived.items():
	48	ret_val.append((trustFunc((v['long'],v['short'])), k))
	49
	50	ret_val.sort()
	51	return ret_val
	52
	53
	54	def prunedWeights(derived, trust, alpha):
	55	weights = {}
	56
	57	threshold = len(trust)*(1.0-alpha)
	58	for i in range(0,len(trust)):
	59	if i < threshold:
	60	weights[trust[i][1]] = 1.0
	61	else:
	62	weights[trust[i][1]] = 0.0
	63
	64	return weights
	65
	66
	67	def linearWeights(derived, trust, alpha):
	68	x1 = trust[0][0]
	69	y1 = 1.0 + (alpha*10)
	70	x2 = trust[(len(trust)-1)//3][0]
	71	y2 = 1.0
	72	m = (y1-y2)/(x1-x2)
	73	b = y1 - m*x1
	74
	75	weights = {}
	76	for t,k in trust:
	77	weights[k] = m*t+b
	78	if weights[k] < 0.0:
	79	weights[k] = 0.0
	80
	81	return weights
	82
	83
	84	def invertedWeights(derived,trust,alpha):
	85	# (x+1-first_sample)^(-alpha)
	86	#scale = trust[0][0]
	87
	88	#weights = {}
	89	#for t,k in trust:
	90	# weights[k] = (t+1-scale)*(-1.0alpha)
	91	# if weights[k] < 0.0:
	92	# weights[k] = 0.0
	93
	94	weights = {}
	95	for i in range(len(trust)):
	96	w = 10.0/(i+2.0)-0.2
	97	if w < 0.0:
	98	w = 0.0
	99	weights[trust[i][1]] = w
	100
	101
	102	return weights
	103
	104
	105
	106	def arctanWeights(derived,trust,alpha):
	107	shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
	108	minimum = trust[0][0]
	109
	110	weights = {}
	111	for i in range(len(trust)):
	112	w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum))
	113	if w < 0.0:
	114	w = 0.0
	115	weights[trust[i][1]] = w
	116
	117	return weights
	118
	119
	120	def arctanWeights2(derived,trust,alpha):
	121	shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
	122	minimum = trust[0][0]
	123	stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median
	124
	125	weights = {}
	126	for i in range(len(trust)):
	127	w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum))
	128	if w < 0.0:
	129	w = 0.0
	130	weights[trust[i][1]] = w
	131
	132	return weights
	133
	134
	135	def midhinge(values, distance=25):
	136	return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
	137
	138	def trimean(values, distance=25):
	139	return (midhinge(values, distance) + statistics.median(values))/2
	140
	141	def weightedMean(derived, weights):
	142	normalizer = sum(weights.values())/len(weights)
	143	return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()])
	144
	145	def weightedMeanTsval(derived, weights):
	146	normalizer = sum(weights.values())/len(weights)
	147	return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])
	148
	149
	150	def estimateMean(trustFunc, weightFunc, alpha, derived):
	151	trust = trustValues(derived, trustFunc)
	152	weights = weightFunc(derived, trust, alpha)
	153	return weightedMean(derived, weights)
	154
	155
	156	def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
	157	trust = trustValues(derived, trustFunc)
	158	weights = weightFunc(derived, trust, alpha)
	159	return weightedMeanTsval(derived, weights)
	160
	161
	162	#def estimateMedian(trustFunc, weightFunc, alpha, derived):
	163	# trust = trustValues(derived, trustFunc)
	164	# weights = weightFunc(derived, trust, alpha)
	165
	166	# return statistics.median([(derived[k]['long']-derived[k]['short']) for k,w in weights.items() if w > 0.0])
	167
	168	def estimateMedian(derived):
	169	return statistics.median([(d['long']-d['short']) for d in derived.values()])
	170
	171
	172	def estimateMidhinge(derived):
	173	return midhinge([(d['long']-d['short']) for d in derived.values()])
	174
	175
	176	def estimateTrimean(derived):
	177	return trimean([(d['long']-d['short']) for d in derived.values()])
	178
	179
	180	def tTest(expected_mean, derived):
	181	diffs = [(d['long']-d['short']) for d in derived.values()]
	182	null_tval, null_pval = scipy.stats.ttest_1samp(diffs, 0.0)
	183	tval, pval = scipy.stats.ttest_1samp(diffs, expected_mean)
	184
	185	if pval < null_pval:
	186	return 1
	187	else:
	188	return 0
	189
	190
	191	def diffMedian(derived):
	192	l = [tc['long']-tc['short'] for s,tc in derived.items()]
	193	return statistics.median(l)
	194
	195
	196	def subsample_ids(db, probe_type, subsample_size=None):
	197	cursor = db.conn.cursor()
	198	cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,))
	199	population_size = cursor.fetchone()[0]
	200	#print("population_size:", population_size)
	201	if subsample_size == None or subsample_size > population_size:
	202	subsample_size = population_size
	203
	204	start = numpy.random.random_integers(0,population_size-1)
	205	cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ? OFFSET ?", (probe_type,subsample_size,start))
	206	for row in cursor:
	207	subsample_size -= 1
	208	yield row['sample']
	209
	210	if subsample_size > 0:
	211	cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ?", (probe_type,subsample_size))
	212	for row in cursor:
	213	yield row['sample']
	214
	215
	216	def subsample(db, probe_type, subsample_size=None):
	217	cursor = db.conn.cursor()
	218	cursor.execute("SELECT count(test_case) FROM (SELECT test_case FROM probes GROUP BY test_case)")
	219	num_test_cases = cursor.fetchone()[0]
	220
	221	for sid in subsample_ids(db, probe_type, subsample_size):
	222	cursor.execute("SELECT test_case,tc_order,time_of_day,reported,userspace_rtt,suspect,packet_rtt,tsval_rtt FROM probes p,analysis a WHERE p.sample=? and a.probe_id=p.id", (sid,))
	223	probes = cursor.fetchall()
	224	if len(probes) != num_test_cases:
	225	sys.stderr.write("WARN: sample %d had %d probes, but %d expected! Discarding...\n" % (sid, len(probes), num_test_cases))
	226	continue
	227	yield (sid,[dict(r) for r in probes])
	228
[6]	229
	230	def subseries(db, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
[7]	231	population_size = db.populationSize(probe_type)
[6]	232
	233	if size == None or size > population_size:
	234	size = population_size
	235	if offset == None or offset >= population_size or offset < 0:
	236	offset = numpy.random.random_integers(0,population_size-1)
	237
	238	query="""
	239	SELECT %(field)s AS unusual_case,
	240	(SELECT avg(%(field)s) FROM probes,analysis
	241	WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
	242	FROM (SELECT probes.sample,%(field)s FROM probes,analysis
	243	WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u
	244	LIMIT :size OFFSET :offset
	245	""" % {"field":field}
	246
	247	params = {"probe_type":probe_type, "unusual_case":unusual_case, "offset":offset, "size":size}
[7]	248	cursor = db.conn.cursor()
[6]	249	cursor.execute(query, params)
[7]	250	ret_val = [dict(row) for row in cursor.fetchall()]
	251	#for row in cursor:
	252	# size -= 1
	253	# yield dict(row)
[6]	254
[7]	255	size -= len(ret_val)
[6]	256	if size > 0:
	257	params['offset'] = 0
	258	params['size'] = size
	259	cursor.execute(query, params)
[7]	260	ret_val += [dict(row) for row in cursor.fetchall()]
	261	#for row in cursor:
	262	# yield dict(row)
[6]	263
[7]	264	return ret_val
[6]	265
[7]	266
[4]	267	# if test_cases=None, include all of them. Otherwise, include only the specified test cases.
	268	def samples2Distributions(samples, field, test_cases=None):
	269	ret_val = {}
	270
	271	for sid,probes in samples:
	272	for p in probes:
	273	if p['test_case'] in ret_val:
	274	ret_val[p['test_case']].append(p[field])
	275	elif test_cases == None or p['test_case'] in test_cases:
	276	ret_val[p['test_case']] = [p[field]]
	277
	278
	279	return ret_val
	280
	281
	282	def samples2MeanDiffs(samples, field, unusual_case):
	283	ret_val = {}
	284
	285	for sid,probes in samples:
	286	unusual_value = None
	287	for p in probes:
	288	if p['test_case'] == unusual_case:
	289	unusual_value = p[field]
	290	break
	291	yield statistics.mean([unusual_value-p[field] for p in probes if p['test_case'] != unusual_case])
	292
	293
	294	def bootstrap(estimator, db, probe_type, test_cases, subsample_size, num_trials):
	295	ret_val = []
	296	for t in range(num_trials):
	297	ret_val.append(estimator(test_cases, subsample(db, probe_type, subsample_size)))
	298
	299	return ret_val
	300
	301
	302	def bootstrap2(estimator, db, probe_type, subsample_size, num_trials):
	303	ret_val = []
	304	for t in range(num_trials):
	305	ret_val.append(estimator(subsample(db, probe_type, subsample_size)))
	306
	307	return ret_val
	308
	309
[6]	310	def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
	311	ret_val = []
	312	for t in range(num_trials):
	313	ret_val.append(estimator(subseries(db, probe_type, unusual_case, subseries_size)))
	314
	315	return ret_val
	316
	317
[4]	318	# Returns the test case name that clearly has higher RTT; otherwise, returns None
	319	def boxTest(params, test_cases, samples):
	320	if len(test_cases) != 2:
	321	# XXX: somehow generalize the box test to handle more than 2 cases
	322	raise Exception()
	323	dists = samples2Distributions(samples,'packet_rtt', test_cases) #XXX: field from params
	324
	325	tmp1,tmp2 = dists.items()
	326	test_case1,dist1 = tmp1
	327	test_case2,dist2 = tmp2
	328
	329	d1_high = numpy.percentile(dist1, params['high'])
	330	d2_low = numpy.percentile(dist2, params['low'])
	331	if d1_high < d2_low:
	332	return test_case2
	333
	334	d1_low = numpy.percentile(dist1, params['low'])
	335	d2_high = numpy.percentile(dist2, params['high'])
	336
	337	if d2_high < d1_low:
	338	return test_case1
	339
	340	return None
	341
	342
	343	# Returns 1 if unusual_case is unusual in the expected direction
	344	# 0 if it isn't unusual
	345	# -1 if it is unusual in the wrong direction
	346	def multiBoxTest(params, unusual_case, greater, samples):
	347	#XXX: packet_rtt field from params
	348	dists = samples2Distributions(samples, 'packet_rtt')
	349
	350	uc = dists[unusual_case]
	351	rest = []
	352	for tc,d in dists.items():
	353	if tc != unusual_case:
	354	rest.extend(d)
	355
	356	uc_high = numpy.percentile(uc, params['high'])
	357	rest_low = numpy.percentile(rest, params['low'])
	358	if uc_high < rest_low:
	359	if greater:
	360	return -1
	361	else:
	362	return 1
	363
	364	uc_low = numpy.percentile(uc, params['low'])
	365	rest_high = numpy.percentile(rest, params['high'])
	366	if rest_high < uc_low:
	367	if greater:
	368	return 1
	369	else:
	370	return -1
	371
	372	return 0
	373
	374
	375	# Returns 1 if unusual_case is unusual in the expected direction
	376	# 0 otherwise
[6]	377	def midhingeTest(params, greater, samples):
	378	diffs = [s['unusual_case']-s['other_cases'] for s in samples]
[4]	379
	380	mh = midhinge(diffs, params['distance'])
	381	if greater:
	382	if mh > params['threshold']:
	383	return 1
	384	else:
	385	return 0
	386	else:
	387	if mh < params['threshold']:
	388	return 1
	389	else:
	390	return 0
	391
	392
	393	def rmse(expected, measurements):
	394	s = sum([(expected-m)**2 for m in measurements])/len(measurements)
	395	return math.sqrt(s)
	396
	397	def nrmse(expected, measurements):
	398	return rmse(expected, measurements)/(max(measurements)-min(measurements))

Note: See TracBrowser for help on using the repository browser.

Download in other formats: