source: trunk/lib/nanownlib/stats.py @ 10


import sys
import os
import functools
import math
import statistics
import gzip
import random
import scipy
import scipy.stats
import numpy

# Don't trust numpy's seeding
numpy.random.seed(random.SystemRandom().randint(0,2**32-1))


def mad(arr):
    """ Median Absolute Deviation: a "robust" version of standard deviation.
        Indicates variability of the sample.
        https://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
    med = numpy.median(arr)
    return numpy.median(numpy.abs(arr - med))

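# Worked example: for [1, 1, 2, 2, 4, 6, 9] the median is 2, the absolute
# deviations from it are [1, 1, 0, 0, 2, 4, 7], and the median of those is 1,
# so mad() returns 1.0.
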

def cov(x,y):
    """ Sample covariance (normalized by N) of two equal-length sequences. """
    mx = statistics.mean(x)
    my = statistics.mean(y)
    products = [(xi - mx)*(yi - my) for xi,yi in zip(x,y)]
    return statistics.mean(products)

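# Worked example: cov([1,2,3], [1,2,3]) centers both sequences to [-1,0,1],
# giving products [1,0,1] and a covariance of 2/3.
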

def difference(ls):
    return ls[0]-ls[1]

def product(ls):
    return ls[0]*ls[1]

def hypotenuse(ls):
    return math.hypot(ls[0],ls[1])

def trustValues(derived, trustFunc):
    ret_val = []
    for k,v in derived.items():
        ret_val.append((trustFunc((v['long'],v['short'])), k))

    ret_val.sort()
    return ret_val


def prunedWeights(derived, trust, alpha):
    weights = {}

    threshold = len(trust)*(1.0-alpha)
    for i in range(len(trust)):
        if i < threshold:
            weights[trust[i][1]] = 1.0
        else:
            weights[trust[i][1]] = 0.0

    return weights


def linearWeights(derived, trust, alpha):
    x1 = trust[0][0]
    y1 = 1.0 + (alpha*10)
    x2 = trust[(len(trust)-1)//3][0]
    y2 = 1.0
    m = (y1-y2)/(x1-x2)
    b = y1 - m*x1

    weights = {}
    for t,k in trust:
        weights[k] = m*t+b
        if weights[k] < 0.0:
            weights[k] = 0.0

    return weights


def invertedWeights(derived,trust,alpha):
    # (x+1-first_sample)^(-alpha)
    #scale = trust[0][0]

    #weights = {}
    #for t,k in trust:
    #    weights[k] = (t+1-scale)**(-1.0*alpha)
    #    if weights[k] < 0.0:
    #        weights[k] = 0.0

    weights = {}
    for i in range(len(trust)):
        w = 10.0/(i+2.0)-0.2
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights


def arctanWeights(derived,trust,alpha):
    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
    minimum = trust[0][0]

    weights = {}
    for i in range(len(trust)):
        w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum))
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights


def arctanWeights2(derived,trust,alpha):
    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
    minimum = trust[0][0]
    stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median; note: currently unused

    weights = {}
    for i in range(len(trust)):
        w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum))
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights


def midsummary(values, distance=25):
    #return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
    l,h = numpy.percentile(values, (50-distance,50+distance))
    return (l+h)/2.0

def trimean(values, distance=25):
    return (midsummary(values, distance) + statistics.median(values))/2

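# With the default distance=25, midsummary() is the midhinge, (P25+P75)/2,
# and trimean() reduces to Tukey's trimean, (P25 + 2*P50 + P75)/4.
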
def ubersummary(values, distance=25):
    left2 = 50-distance
    left1 = left2/2.0
    left3 = (left2+50)/2.0
    right2 = 50+distance
    right3 = (right2+50)/2.0
    right1 = (right2+100)/2.0
    l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
    #print(left1,left2,left3,50,right3,right2,right1)
    #print(l1,l2,l3,m,r3,r2,r1)
    return (l1+l2*4+l3+r3+r2*4+r1)/12.0
    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))

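# With the default distance=25, ubersummary() is the weighted percentile
# average (P12.5 + 4*P25 + P37.5 + P62.5 + 4*P75 + P87.5)/12, which leans
# most heavily on the quartiles.
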
def quadsummary(values, distance=25):
    left1 = 50-distance
    left2 = (left1+50)/2.0
    right1 = 50+distance
    right2 = (right1+50)/2.0
    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
    return (l1+l2+r2+r1)/4.0

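# With the default distance=25, quadsummary() averages the four percentiles
# P25, P37.5, P62.5, and P75.
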

def weightedMean(derived, weights):
    # normalizer rescales the weights so that they average to 1.0
    normalizer = sum(weights.values())/len(weights)
    return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()])

def weightedMeanTsval(derived, weights):
    normalizer = sum(weights.values())/len(weights)
    return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])


def estimateMean(trustFunc, weightFunc, alpha, derived):
    trust = trustValues(derived, trustFunc)
    weights = weightFunc(derived, trust, alpha)
    return weightedMean(derived, weights)


def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
    trust = trustValues(derived, trustFunc)
    weights = weightFunc(derived, trust, alpha)
    return weightedMeanTsval(derived, weights)


#def estimateMedian(trustFunc, weightFunc, alpha, derived):
#    trust = trustValues(derived, trustFunc)
#    weights = weightFunc(derived, trust, alpha)

#    return statistics.median([(derived[k]['long']-derived[k]['short']) for k,w in weights.items() if w > 0.0])

def estimateMedian(derived):
    return statistics.median([(d['long']-d['short']) for d in derived.values()])


def estimateMidsummary(derived):
    return midsummary([(d['long']-d['short']) for d in derived.values()])


def estimateTrimean(derived):
    return trimean([(d['long']-d['short']) for d in derived.values()])


def tTest(expected_mean, derived):
    diffs = [(d['long']-d['short']) for d in derived.values()]
    null_tval, null_pval = scipy.stats.ttest_1samp(diffs, 0.0)
    tval, pval = scipy.stats.ttest_1samp(diffs, expected_mean)

    if pval < null_pval:
        return 1
    else:
        return 0


def diffMedian(derived):
    l = [tc['long']-tc['short'] for tc in derived.values()]
    return statistics.median(l)


def subsample_ids(db, probe_type, subsample_size=None):
    cursor = db.conn.cursor()
    cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,))
    population_size = cursor.fetchone()[0]
    #print("population_size:", population_size)
    if subsample_size is None or subsample_size > population_size:
        subsample_size = population_size

    # Start at a random offset and wrap around to the beginning if the end
    # of the population is reached before the subsample is filled.
    start = numpy.random.randint(0, population_size)
    cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ? OFFSET ?", (probe_type,subsample_size,start))
    for row in cursor:
        subsample_size -= 1
        yield row['sample']

    if subsample_size > 0:
        cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ?", (probe_type,subsample_size))
        for row in cursor:
            yield row['sample']


def subsample(db, probe_type, subsample_size=None):
    cursor = db.conn.cursor()
    cursor.execute("SELECT count(test_case) FROM (SELECT test_case FROM probes GROUP BY test_case)")
    num_test_cases = cursor.fetchone()[0]

    for sid in subsample_ids(db, probe_type, subsample_size):
        cursor.execute("SELECT test_case,tc_order,time_of_day,reported,userspace_rtt,suspect,packet_rtt,tsval_rtt FROM probes p,analysis a WHERE p.sample=? and a.probe_id=p.id", (sid,))
        probes = cursor.fetchall()
        if len(probes) != num_test_cases:
            sys.stderr.write("WARN: sample %d had %d probes, but %d expected!  Discarding...\n" % (sid, len(probes), num_test_cases))
            continue
        yield (sid,[dict(r) for r in probes])

def subseries(db, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
    population_size = db.populationSize(probe_type)

    if size is None or size > population_size:
        size = population_size
    if offset is None or offset >= population_size or offset < 0:
        offset = numpy.random.randint(0, population_size)

    query="""
      SELECT %(field)s AS unusual_case,
             (SELECT avg(%(field)s) FROM probes,analysis
              WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
      FROM   (SELECT probes.sample,%(field)s FROM probes,analysis
              WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u
      LIMIT :size OFFSET :offset
    """ % {"field":field}

    params = {"probe_type":probe_type, "unusual_case":unusual_case, "offset":offset, "size":size}
    cursor = db.conn.cursor()
    cursor.execute(query, params)
    ret_val = [dict(row) for row in cursor.fetchall()]

    # Wrap around to the start of the series if the random offset left
    # fewer than `size` rows available.
    size -= len(ret_val)
    if size > 0:
        params['offset'] = 0
        params['size'] = size
        cursor.execute(query, params)
        ret_val += [dict(row) for row in cursor.fetchall()]

    return ret_val


# if test_cases=None, include all of them.  Otherwise, include only the specified test cases.
def samples2Distributions(samples, field, test_cases=None):
    ret_val = {}

    for sid,probes in samples:
        for p in probes:
            if p['test_case'] in ret_val:
                ret_val[p['test_case']].append(p[field])
            elif test_cases is None or p['test_case'] in test_cases:
                ret_val[p['test_case']] = [p[field]]

    return ret_val


def samples2MeanDiffs(samples, field, unusual_case):
    for sid,probes in samples:
        unusual_value = None
        for p in probes:
            if p['test_case'] == unusual_case:
                unusual_value = p[field]
                break
        yield statistics.mean([unusual_value-p[field] for p in probes if p['test_case'] != unusual_case])


def bootstrap(estimator, db, probe_type, test_cases, subsample_size, num_trials):
    ret_val = []
    for t in range(num_trials):
        ret_val.append(estimator(test_cases, subsample(db, probe_type, subsample_size)))

    return ret_val


def bootstrap2(estimator, db, probe_type, subsample_size, num_trials):
    ret_val = []
    for t in range(num_trials):
        ret_val.append(estimator(subsample(db, probe_type, subsample_size)))

    return ret_val


def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
    ret_val = []
    for t in range(num_trials):
        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))

    return ret_val


# Returns the test case name that clearly has higher RTT; otherwise, returns None
def boxTest(params, test_cases, samples):
    if len(test_cases) != 2:
        # XXX: somehow generalize the box test to handle more than 2 cases
        raise Exception("boxTest only supports exactly 2 test cases")
    dists = samples2Distributions(samples,'packet_rtt', test_cases) #XXX: field from params

    (test_case1,dist1),(test_case2,dist2) = dists.items()

    d1_high = numpy.percentile(dist1, params['high'])
    d2_low = numpy.percentile(dist2, params['low'])
    if d1_high < d2_low:
        return test_case2

    d1_low = numpy.percentile(dist1, params['low'])
    d2_high = numpy.percentile(dist2, params['high'])

    if d2_high < d1_low:
        return test_case1

    return None

# Returns 1 if unusual_case is unusual in the expected direction
#         0 if it isn't unusual
#        -1 if it is unusual in the wrong direction
def multiBoxTest(params, greater, samples):
    uc = [s['unusual_case'] for s in samples]
    rest = [s['other_cases'] for s in samples]

    uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
    rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low']))
    if uc_high < rest_low:
        if greater:
            return -1
        else:
            return 1

    if rest_high < uc_low:
        if greater:
            return 1
        else:
            return -1

    return 0

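# Illustration with made-up numbers: with params {'high':90, 'low':10},
# unusual-case percentiles (low=105, high=120) and other-case percentiles
# (low=95, high=100), rest_high (100) < uc_low (105), so the unusual case is
# distinctly higher and the test returns 1 if greater else -1.
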

# Returns 1 if unusual_case is unusual in the expected direction
#         0 otherwise
def summaryTest(f, params, greater, samples):
    diffs = [s['unusual_case']-s['other_cases'] for s in samples]

    mh = f(diffs, params['distance'])
    if greater:
        if mh > params['threshold']:
            return 1
        else:
            return 0
    else:
        if mh < params['threshold']:
            return 1
        else:
            return 0

midsummaryTest = functools.partial(summaryTest, midsummary)
trimeanTest = functools.partial(summaryTest, trimean)
ubersummaryTest = functools.partial(summaryTest, ubersummary)
quadsummaryTest = functools.partial(summaryTest, quadsummary)
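# Each partial binds the summary statistic as summaryTest's first argument,
# e.g. trimeanTest(params, greater, samples) is equivalent to
# summaryTest(trimean, params, greater, samples).
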

def rmse(expected, measurements):
    s = sum([(expected-m)**2 for m in measurements])/len(measurements)
    return math.sqrt(s)

def nrmse(expected, measurements):
    return rmse(expected, measurements)/(max(measurements)-min(measurements))
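# Worked example: rmse(5, [4,6]) = sqrt(((5-4)**2 + (5-6)**2)/2) = 1.0, and
# nrmse(5, [4,6]) = 1.0/(6-4) = 0.5 (RMSE normalized by the measurement range).
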


class KalmanFilter1D:
    def __init__(self, x0, P, R, Q):
        self.x = x0  # state estimate
        self.P = P   # estimate variance
        self.R = R   # measurement (sensor) noise
        self.Q = Q   # process (movement) noise

    def update(self, z):
        # Measurement update: blend the observation z with the current
        # estimate, weighted by their respective variances.
        self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
        self.P = 1. / (1./self.P + 1./self.R)

    def predict(self, u=0.0):
        # Prediction step: apply the control input u and grow the
        # uncertainty by the process noise.
        self.x += u
        self.P += self.Q

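# A minimal usage sketch (made-up measurements): feeding noisy observations of
# a roughly constant signal pulls the estimate x toward the true value while
# the variance P shrinks with each update.
#
#   kf = KalmanFilter1D(x0=0.0, P=10.0, R=4.0, Q=0.0)
#   for z in (5.2, 4.8, 5.1, 4.9):
#       kf.predict()
#       kf.update(z)
#   # kf.x has moved most of the way from 0 toward 5 and kf.P has shrunk
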

def kfilter(params, observations):
    # note: params is currently unused; the filter is configured from the data
    x = numpy.array(observations)
    movement = 0
    est = []
    var = []
    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
                        #P  = 10000,          # initial variance
                        P  = 10,          # initial variance
                        R  = numpy.std(x),   # sensor noise
                        Q  = 0)              # movement noise
    for _ in range(1):
        for d in x:
            kf.predict(movement)
            kf.update(d)
            est.append(kf.x)
            var.append(kf.P)

    return({'est':est, 'var':var})


def kalmanTest(params, greater, samples):
    diffs = [s['unusual_case']-s['other_cases'] for s in samples]

    m = kfilter(params, diffs)['est'][-1]
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0


def kalmanTest2(params, greater, samples):
    diffs = [s['unusual_case']-s['other_cases'] for s in samples]

    estimates = []
    size = 500
    for i in range(100):
        # Take a circular window of `size` diffs starting at a random offset,
        # wrapping around to the beginning when necessary.
        off = random.randrange(0,len(diffs))
        sub = diffs[off:off+size]
        if len(sub) < size:
            sub += diffs[0:size-len(sub)]
        estimates.append(kfilter(params, sub)['est'][-1])

    m = quadsummary(estimates)
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0