source: trunk/lib/nanownlib/stats.py @ 18

import sys
import os
import functools
import math
import statistics
import gzip
import random
import numpy

# Don't trust numpy's seeding
numpy.random.seed(random.SystemRandom().randint(0,2**32-1))


def mad(arr):
    """ Median Absolute Deviation: a "Robust" version of standard deviation.
        Indicates variability of the sample.
        https://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
    med = numpy.median(arr)
    return numpy.median(numpy.abs(arr - med))
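# Illustrative example (hypothetical data):
#   >>> mad([1, 1, 2, 2, 4, 6, 9])
#   1.0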


def cov(x,y):
    mx = statistics.mean(x)
    my = statistics.mean(y)
    products = []
    for i in range(0,len(x)):
        products.append((x[i] - mx)*(y[i] - my))

    return statistics.mean(products)


def difference(ls):
    return ls[0]-ls[1]

def product(ls):
    return ls[0]*ls[1]

def hypotenuse(ls):
    return math.hypot(ls[0],ls[1])

def trustValues(derived, trustFunc):
    ret_val = []
    for k,v in derived.items():
        ret_val.append((trustFunc((v['long'],v['short'])), k))

    ret_val.sort()
    return ret_val
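# Illustrative sketch (hypothetical data): 'derived' is assumed to map sample
# keys to dicts with 'long' and 'short' timing values, as used above.
#   derived = {1: {'long': 12.0, 'short': 10.0},
#              2: {'long': 11.5, 'short': 10.5},
#              3: {'long': 13.0, 'short': 10.0}}
#   trustValues(derived, difference)   # -> [(1.0, 2), (2.0, 1), (3.0, 3)]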


def prunedWeights(derived, trust, alpha):
    weights = {}

    threshold = len(trust)*(1.0-alpha)
    for i in range(0,len(trust)):
        if i < threshold:
            weights[trust[i][1]] = 1.0
        else:
            weights[trust[i][1]] = 0.0

    return weights


def linearWeights(derived, trust, alpha):
    x1 = trust[0][0]
    y1 = 1.0 + (alpha*10)
    x2 = trust[(len(trust)-1)//3][0]
    y2 = 1.0
    m = (y1-y2)/(x1-x2)
    b = y1 - m*x1

    weights = {}
    for t,k in trust:
        weights[k] = m*t+b
        if weights[k] < 0.0:
            weights[k] = 0.0

    return weights


def invertedWeights(derived,trust,alpha):
    # (x+1-first_sample)^(-alpha)
    #scale = trust[0][0]

    #weights = {}
    #for t,k in trust:
    #    weights[k] = (t+1-scale)**(-1.0*alpha)
    #    if weights[k] < 0.0:
    #        weights[k] = 0.0

    # The active implementation below applies a fixed hyperbolic falloff by
    # rank and ignores alpha.
    weights = {}
    for i in range(len(trust)):
        w = 10.0/(i+2.0)-0.2
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights


def arctanWeights(derived,trust,alpha):
    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
    minimum = trust[0][0]

    weights = {}
    for i in range(len(trust)):
        w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum))
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights


def arctanWeights2(derived,trust,alpha):
    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
    minimum = trust[0][0]
    stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median; currently unused below

    weights = {}
    for i in range(len(trust)):
        w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum))
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights


def midsummary(values, distance=25):
    #return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
    l,h = numpy.percentile(values, (50-distance,50+distance))
    return (l+h)/2.0

def trimean(values, distance=25):
    return (midsummary(values, distance) + statistics.median(values))/2

def ubersummary(values, distance=25):
    left2 = 50-distance
    left3 = 50-(distance/2.0)
    left1 = left2/2.0
    right2 = 50+distance
    right3 = 50+(distance/2.0)
    right1 = (right2+100)/2.0
    l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
    #print(l1,l2,l3,m,r3,r2,r1)
    return (l1+l2*4+l3+r3+r2*4+r1)/12.0
    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))


def quadsummary(values, distance=25):
    left1 = 50-distance
    left2 = (left1+50)/2.0
    right1 = 50+distance
    right2 = (right1+50)/2.0
    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
    #print(left1,left2,left3,50,right3,right2,right1)
    #print(l1,l2,l3,m,r3,r2,r1)
    return (l1+l2+r2+r1)/4.0
    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))


def septasummary(values, distance=25):
    left2 = 50-distance
    left3 = 50-(distance/2.0)
    left1 = left2/2.0
    right2 = 50+distance
    right3 = 50+(distance/2.0)
    right1 = (right2+100)/2.0
    l1,l2,l3,m,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,50,right3,right2,right1))
    return (l1+l2+l3+m+r3+r2+r1)/7.0
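# Illustrative comparison (hypothetical data): each of the summaries above is a
# robust location estimate built from a handful of percentiles.
#   x = numpy.random.normal(loc=5.0, scale=1.0, size=10000)
#   midsummary(x), trimean(x), ubersummary(x), quadsummary(x), septasummary(x)
#   # each should come out near 5.0 for a symmetric sample like this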


def tsvalwmean(subseries):
    weights = [(s['unusual_packet']+s['other_packet'])**2 for s in subseries]
    normalizer = sum(weights)/len(weights)
    return numpy.mean([weights[i]*(subseries[i]['unusual_tsval']-subseries[i]['other_tsval'])/normalizer
                       for i in range(len(weights))])
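# Sketch of the per-sample dict keys assumed here (hypothetical values):
#   subseries = [{'unusual_packet': 12.0, 'other_packet': 10.0,
#                 'unusual_tsval': 30.0, 'other_tsval': 20.0}]
#   tsvalwmean(subseries)  # weights each tsval difference by (unusual_packet+other_packet)**2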

#def tsvalwmean(subseries):
#    return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries])


def weightedMean(derived, weights):
    normalizer = sum(weights.values())/len(weights)
    return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()])

def weightedMeanTsval(derived, weights):
    normalizer = sum(weights.values())/len(weights)
    return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])


def estimateMean(trustFunc, weightFunc, alpha, derived):
    trust = trustValues(derived, trustFunc)
    weights = weightFunc(derived, trust, alpha)
    return weightedMean(derived, weights)
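# Illustrative pipeline (using the hypothetical 'derived' data sketched above):
#   estimateMean(difference, prunedWeights, 0.05, derived)
# computes trust values with difference(), keeps the first 95% of samples in
# the sorted trust ordering via prunedWeights(), and returns a weighted mean
# of the (long - short) differences.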


def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
    trust = trustValues(derived, trustFunc)
    weights = weightFunc(derived, trust, alpha)
    return weightedMeanTsval(derived, weights)


def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
    ret_val = []
    for t in range(num_trials):
        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))

    return ret_val


# Returns 1 if unusual_case is unusual in the expected direction
#         0 if it isn't unusual
#        -1 if it is unusual in the wrong direction
def multiBoxTest(params, greater, samples):
    uc = [s['unusual_packet'] for s in samples]
    rest = [s['other_packet'] for s in samples]

    uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
    rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low']))
    if uc_high < rest_low:
        if greater:
            return -1
        else:
            return 1

    if rest_high < uc_low:
        if greater:
            return 1
        else:
            return -1

    return 0
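# Illustrative call (hypothetical parameters and samples):
#   params = {'high': 90, 'low': 10}
#   samples = [{'unusual_packet': 105.0, 'other_packet': 100.0},
#              {'unusual_packet': 104.0, 'other_packet': 101.0}]
#   multiBoxTest(params, True, samples)   # -> 1 (boxes separated in the expected direction)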


# Returns 1 if unusual_case is unusual in the expected direction
#         0 otherwise
def summaryTest(f, params, greater, samples):
    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]

    mh = f(diffs, params['distance'])
    #print("estimate:", mh)
    if greater:
        if mh > params['threshold']:
            return 1
        else:
            return 0
    else:
        if mh < params['threshold']:
            return 1
        else:
            return 0


midsummaryTest = functools.partial(summaryTest, midsummary)
trimeanTest = functools.partial(summaryTest, trimean)
ubersummaryTest = functools.partial(summaryTest, ubersummary)
quadsummaryTest = functools.partial(summaryTest, quadsummary)
septasummaryTest = functools.partial(summaryTest, septasummary)
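# Illustrative call (hypothetical parameters; 'samples' dicts as in the
# multiBoxTest example above): each partial summarizes the per-sample
# differences and compares the result against a threshold.
#   params = {'distance': 25, 'threshold': 0.5}
#   midsummaryTest(params, True, samples)   # -> 1 if the midsummary of
#                                           #    (unusual - other) exceeds 0.5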

def rmse(expected, measurements):
    s = sum([(expected-m)**2 for m in measurements])/len(measurements)
    return math.sqrt(s)

def nrmse(expected, measurements):
    return rmse(expected, measurements)/(max(measurements)-min(measurements))
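# Worked example (illustrative):
#   rmse(5.0, [4.0, 5.0, 7.0])
#   # = sqrt(((5-4)**2 + (5-5)**2 + (5-7)**2) / 3) = sqrt(5/3) ≈ 1.29
#   nrmse(5.0, [4.0, 5.0, 7.0])
#   # = rmse / (7.0 - 4.0) ≈ 0.43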


class KalmanFilter1D:
    """Minimal scalar (one-dimensional) Kalman filter."""
    def __init__(self, x0, P, R, Q):
        self.x = x0   # state estimate
        self.P = P    # estimate variance
        self.R = R    # measurement noise variance
        self.Q = Q    # process (movement) noise variance

    def update(self, z):
        self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
        self.P = 1. / (1./self.P + 1./self.R)

    def predict(self, u=0.0):
        self.x += u
        self.P += self.Q
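# Minimal usage sketch (hypothetical parameters):
#   kf = KalmanFilter1D(x0=0.0, P=10.0, R=1.0, Q=0.0)
#   for z in (0.9, 1.1, 1.0):
#       kf.predict()
#       kf.update(z)
#   kf.x   # current state estimate; kf.P is its variance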


def kfilter(params, observations):
    x = numpy.array(observations)
    movement = 0
    est = []
    var = []
    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
                        #P  = 10000,         # initial variance
                        P  = 10,            # initial variance
                        R  = numpy.std(x),   # sensor noise
                        Q  = 0)              # movement noise
    for _ in range(1):
        for d in x:
            kf.predict(movement)
            kf.update(d)
            est.append(kf.x)
            var.append(kf.P)

    return({'est':est, 'var':var})
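# Illustrative call (hypothetical data): params is accepted for interface
# compatibility but is not used by kfilter itself.
#   obs = numpy.random.normal(loc=10.0, scale=2.0, size=100)
#   kfilter({}, obs)['est'][-1]   # final estimate, near 10.0 for data like this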


def kalmanTest(params, greater, samples):
    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]

    m = kfilter(params, diffs)['est'][-1]
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0


def tsvalwmeanTest(params, greater, samples):
    m = tsvalwmean(samples)
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0


from pykalman import KalmanFilter
def pyKalman4DTest(params, greater, samples):
    kp = params['kparams']
    #kp['initial_state_mean']=[quadsummary([s['unusual_packet'] for s in samples]),
    #                          quadsummary([s['other_packet'] for s in samples]),
    #                          numpy.mean([s['unusual_tsval'] for s in samples]),
    #                          numpy.mean([s['other_tsval'] for s in samples])]
    kf = KalmanFilter(n_dim_obs=4, n_dim_state=4, **kp)
    smooth,covariance = kf.smooth([(s['unusual_packet'],s['other_packet'],s['unusual_tsval'],s['other_tsval'])
                                   for s in samples])
    m = numpy.mean(smooth)
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0
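# Illustrative parameter structure (hypothetical values, not a tuned
# configuration; 'kparams' is passed straight through to pykalman's
# KalmanFilter constructor):
#   params = {'threshold': 0.5,
#             'kparams': {'initial_state_mean': [0.0, 0.0, 0.0, 0.0],
#                         'transition_covariance': numpy.eye(4)*1e-3,
#                         'observation_covariance': numpy.eye(4)}}
#   pyKalman4DTest(params, True, samples)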