import sys
import os
import functools
import math
import statistics
import gzip
import random
try:
    import numpy
except ImportError:
    sys.stderr.write('ERROR: Could not import numpy module. Ensure it is installed.\n')
    sys.stderr.write('       Under Debian, the package name is "python3-numpy".\n')
    sys.exit(1)

# Don't trust numpy's seeding
numpy.random.seed(random.SystemRandom().randint(0,2**32-1))

def mad(arr):
    """ Median Absolute Deviation: a "Robust" version of standard deviation.
        Indicates variability of the sample.
        https://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
    med = numpy.median(arr)
    return numpy.median(numpy.abs(arr - med))

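# Illustrative addition, not part of the original module: a minimal sketch of
# why mad() is used instead of the standard deviation.  A single gross outlier
# barely moves the MAD but inflates the stdev.  The _demo_mad name is ours.
def _demo_mad():
    clean = [10.0, 11.0, 12.0, 11.0, 10.0]
    dirty = clean + [500.0]                      # one gross outlier
    print('MAD   clean/dirty:', mad(clean), mad(dirty))                            # 1.0 / 1.0
    print('stdev clean/dirty:', statistics.stdev(clean), statistics.stdev(dirty))  # small / huge
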
def cov(x,y):
    mx = statistics.mean(x)
    my = statistics.mean(y)
    products = []
    for i in range(0,len(x)):
        products.append((x[i] - mx)*(y[i] - my))

    return statistics.mean(products)

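# Illustrative addition, not part of the original module: a small consistency
# check of cov() against numpy.  With ddof=0, numpy.cov computes the same
# population covariance that cov() computes as the mean of the cross-products.
def _demo_cov():
    x = [1.0, 2.0, 3.0, 4.0]
    y = [2.0, 4.0, 6.0, 8.0]
    print('cov()      :', cov(x, y))                        # 2.5
    print('numpy.cov():', numpy.cov(x, y, ddof=0)[0][1])    # 2.5
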
def OLSRegression(x,y):
    #print(x,y)
    x = numpy.array(x)
    y = numpy.array(y)
    #A = numpy.vstack([x, numpy.ones(len(x))]).T
    #m, c = numpy.linalg.lstsq(A, y)[0] # broken
    #c,m = numpy.polynomial.polynomial.polyfit(x, y, 1) # less accurate
    c,m = numpy.polynomial.Polynomial.fit(x,y,1).convert().coef

    #print(m,c)

    #import matplotlib.pyplot as plt
    #plt.clf()
    #plt.scatter(x, y)
    #plt.plot(x, m*x + c, 'r', label='Fitted line')
    #plt.show()

    return (m,c)

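# Illustrative addition, not part of the original module: a sanity check of
# OLSRegression() on points from a known line; the recovered slope and
# intercept should match the generating values to within floating-point error.
def _demo_ols():
    x = [0.0, 1.0, 2.0, 3.0, 4.0]
    y = [3.0*xi + 2.0 for xi in x]       # exact line y = 3x + 2, no noise
    m, c = OLSRegression(x, y)
    print('slope ~3:', m, ' intercept ~2:', c)
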
def difference(ls):
    return ls[0]-ls[1]

def product(ls):
    return ls[0]*ls[1]

def hypotenuse(ls):
    return math.hypot(ls[0],ls[1])

def trustValues(derived, trustFunc):
    ret_val = []
    for k,v in derived.items():
        ret_val.append((trustFunc((v['long'],v['short'])), k))

    ret_val.sort()
    return ret_val

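# Illustrative addition, not part of the original module: a sketch of the data
# flow into trustValues().  'derived' maps a sample key to its 'long' and
# 'short' components; the trust function collapses that pair into one score,
# and the result is sorted so the most trusted (lowest-score) samples come first.
def _demo_trust_values():
    derived = {'a': {'long': 3.0, 'short': 4.0},     # hypot = 5.0
               'b': {'long': 1.0, 'short': 1.0},     # hypot ~ 1.41
               'c': {'long': 6.0, 'short': 8.0}}     # hypot = 10.0
    print(trustValues(derived, hypotenuse))          # [(~1.41, 'b'), (5.0, 'a'), (10.0, 'c')]
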
def prunedWeights(derived, trust, alpha):
    weights = {}

    threshold = len(trust)*(1.0-alpha)
    for i in range(0,len(trust)):
        if i < threshold:
            weights[trust[i][1]] = 1.0
        else:
            weights[trust[i][1]] = 0.0

    return weights

def linearWeights(derived, trust, alpha):
    x1 = trust[0][0]
    y1 = 1.0 + (alpha*10)
    x2 = trust[(len(trust)-1)//3][0]
    y2 = 1.0
    m = (y1-y2)/(x1-x2)
    b = y1 - m*x1

    weights = {}
    for t,k in trust:
        weights[k] = m*t+b
        if weights[k] < 0.0:
            weights[k] = 0.0

    return weights

def invertedWeights(derived,trust,alpha):
    # (x+1-first_sample)^(-alpha)
    #scale = trust[0][0]

    #weights = {}
    #for t,k in trust:
    #    weights[k] = (t+1-scale)**(-1.0*alpha)
    #    if weights[k] < 0.0:
    #        weights[k] = 0.0

    weights = {}
    for i in range(len(trust)):
        w = 10.0/(i+2.0)-0.2
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights

def arctanWeights(derived,trust,alpha):
    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
    minimum = trust[0][0]

    weights = {}
    for i in range(len(trust)):
        w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum))
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights

def arctanWeights2(derived,trust,alpha):
    shift = trust[int((len(trust)-1)*(1.0-alpha))][0]
    minimum = trust[0][0]
    stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median; NOTE: computed but unused below

    weights = {}
    for i in range(len(trust)):
        w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum))
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights

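# Illustrative addition, not part of the original module: a sketch comparing
# the weighting schemes above on a toy trust list.  Each scheme maps trust
# scores to per-sample weights; all favor the most trusted (lowest-score)
# samples, just with different falloffs.  The _demo name and toy data are ours.
def _demo_weight_schemes():
    derived = {k: {'long': float(k), 'short': float(k)} for k in range(10)}
    trust = trustValues(derived, hypotenuse)
    for wf in (prunedWeights, linearWeights, invertedWeights, arctanWeights, arctanWeights2):
        weights = wf(derived, trust, alpha=0.1)
        print(wf.__name__, [round(weights[k], 2) for k in sorted(weights)])
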
def midsummary(values, distance=25):
    #return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
    l,h = numpy.percentile(values, (50-distance,50+distance))
    return (l+h)/2.0

def trimean(values, distance=25):
    return (midsummary(values, distance) + statistics.median(values))/2

def ubersummary(values, distance=25):
    left2 = 50-distance
    left3 = 50-(distance/2.0)
    left1 = left2/2.0
    right2 = 50+distance
    right3 = 50+(distance/2.0)
    right1 = (right2+100)/2.0
    l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
    #print(l1,l2,l3,m,r3,r2,r1)
    return (l1+l2*4+l3+r3+r2*4+r1)/12.0
    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))

def quadsummary(values, distance=25):
    left1 = 50-distance
    left2 = (left1+50)/2.0
    right1 = 50+distance
    right2 = (right1+50)/2.0
    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
    #print(left1,left2,left3,50,right3,right2,right1)
    #print(l1,l2,l3,m,r3,r2,r1)
    return (l1+l2+r2+r1)/4.0
    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))

def septasummary(values, distance=25):
    left2 = 50-distance
    left3 = 50-(distance/2.0)
    left1 = left2/2.0
    right2 = 50+distance
    right3 = 50+(distance/2.0)
    right1 = (right2+100)/2.0
    l1,l2,l3,m,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,50,right3,right2,right1))
    return (l1+l2+l3+m+r3+r2+r1)/7.0

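# Illustrative addition, not part of the original module: a sketch comparing
# the percentile-based location estimators above against the plain mean on
# data with an outlier.  The robust summaries stay near the bulk of the data.
def _demo_summaries():
    data = [10.0]*40 + [11.0]*40 + [12.0]*19 + [900.0]
    print('mean        :', statistics.mean(data))    # dragged up by the outlier
    print('midsummary  :', midsummary(data))
    print('trimean     :', trimean(data))
    print('ubersummary :', ubersummary(data))
    print('quadsummary :', quadsummary(data))
    print('septasummary:', septasummary(data))
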
def tsvalwmean(subseries):
    weights = [(s['unusual_packet']+s['other_packet'])**2 for s in subseries]
    normalizer = sum(weights)/len(weights)
    return numpy.mean([weights[i]*(subseries[i]['unusual_tsval']-subseries[i]['other_tsval'])/normalizer
                       for i in range(len(weights))])

#def tsvalwmean(subseries):
#    return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries])

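# Illustrative addition, not part of the original module: a sketch of the
# record format tsvalwmean() expects.  Each subseries entry carries packet
# counts (whose sum, squared, is the weight) and the two TCP timestamp values
# whose difference is averaged.  The toy values below are ours.
def _demo_tsvalwmean():
    subseries = [{'unusual_packet': 5, 'other_packet': 5,
                  'unusual_tsval': 1010, 'other_tsval': 1000},
                 {'unusual_packet': 1, 'other_packet': 1,
                  'unusual_tsval': 1200, 'other_tsval': 1000}]
    # about 17.3: the heavily weighted first entry pulls the estimate toward
    # its small difference (10) and away from the unweighted mean (105)
    print(tsvalwmean(subseries))
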
def weightedMean(derived, weights):
    normalizer = sum(weights.values())/len(weights)
    return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()])

def weightedMeanTsval(derived, weights):
    normalizer = sum(weights.values())/len(weights)
    return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])

def estimateMean(trustFunc, weightFunc, alpha, derived):
    trust = trustValues(derived, trustFunc)
    weights = weightFunc(derived, trust, alpha)
    return weightedMean(derived, weights)


def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
    trust = trustValues(derived, trustFunc)
    weights = weightFunc(derived, trust, alpha)
    return weightedMeanTsval(derived, weights)

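# Illustrative addition, not part of the original module: a sketch wiring the
# pieces together.  The trust function ranks samples, the weight function
# converts that ranking into weights, and weightedMean() averages the
# long/short differences under them.  The _demo name and toy data are ours.
def _demo_estimate_mean():
    derived = {k: {'long': 100.0 + k, 'short': 100.0} for k in range(20)}
    est = estimateMean(hypotenuse, prunedWeights, 0.1, derived)
    print('estimated long-short difference:', est)
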
def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
    ret_val = []
    for t in range(num_trials):
        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))

    return ret_val

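# Illustrative addition, not part of the original module: bootstrap3() assumes
# only that 'db' exposes a subseries(probe_type, unusual_case, subseries_size)
# method returning one resampled subseries per trial.  A hypothetical in-memory
# stub for local testing (the _FakeDB name and the call below are ours, not the
# project's API):
class _FakeDB:
    def __init__(self, samples):
        self.samples = samples
    def subseries(self, probe_type, unusual_case, subseries_size):
        # ignore probe_type/unusual_case and simply resample without replacement
        return random.sample(self.samples, subseries_size)
# e.g.: bootstrap3(tsvalwmean, _FakeDB(all_samples), 'train', 'example-case', 50, 100)
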
# Returns 1 if unusual_case is unusual in the expected direction
#         0 if it isn't unusual
#        -1 if it is unusual in the wrong direction
def multiBoxTest(params, greater, samples):
    uc = [s['unusual_packet'] for s in samples]
    rest = [s['other_packet'] for s in samples]

    uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
    rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low']))
    if uc_high < rest_low:
        if greater:
            return -1
        else:
            return 1

    if rest_high < uc_low:
        if greater:
            return 1
        else:
            return -1

    return 0

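# Illustrative addition, not part of the original module: multiBoxTest() on toy
# samples where the unusual case is clearly larger.  With non-overlapping
# percentile boxes and greater=True it reports 1; the opposite expectation
# yields -1.  The _demo name and toy data are ours.
def _demo_multibox():
    samples = [{'unusual_packet': 100 + i, 'other_packet': 10 + i} for i in range(50)]
    params = {'high': 90, 'low': 10}
    print(multiBoxTest(params, True, samples))    # 1
    print(multiBoxTest(params, False, samples))   # -1
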
# Returns 1 if unusual_case is unusual in the expected direction
#         0 otherwise
def summaryTest(f, params, greater, samples):
    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]

    mh = f(diffs, params['distance'])
    #print("estimate:", mh)
    if greater:
        if mh > params['threshold']:
            return 1
        else:
            return 0
    else:
        if mh < params['threshold']:
            return 1
        else:
            return 0

midsummaryTest = functools.partial(summaryTest, midsummary)
trimeanTest = functools.partial(summaryTest, trimean)
ubersummaryTest = functools.partial(summaryTest, ubersummary)
quadsummaryTest = functools.partial(summaryTest, quadsummary)
septasummaryTest = functools.partial(summaryTest, septasummary)

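# Illustrative addition, not part of the original module: the partials above
# pre-bind the summary statistic, so each *Test shares summaryTest's
# (params, greater, samples) interface.  A hypothetical call:
def _demo_summary_test():
    samples = [{'unusual_packet': 20 + random.random(), 'other_packet': 10 + random.random()}
               for _ in range(100)]
    params = {'distance': 25, 'threshold': 5.0}
    print(midsummaryTest(params, True, samples))   # 1: the ~10 unit gap exceeds the threshold
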
def rmse(expected, measurements):
    s = sum([(expected-m)**2 for m in measurements])/len(measurements)
    return math.sqrt(s)

def nrmse(expected, measurements):
    return rmse(expected, measurements)/(max(measurements)-min(measurements))

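# Illustrative addition, not part of the original module: a worked example of
# the error metrics.  For measurements [9, 11] against an expected value of 10,
# both squared errors are 1, so rmse is 1.0 and nrmse divides that by the
# measurement range (2).
def _demo_rmse():
    print(rmse(10.0, [9.0, 11.0]))    # 1.0
    print(nrmse(10.0, [9.0, 11.0]))   # 0.5
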
class KalmanFilter1D:
    def __init__(self, x0, P, R, Q):
        self.x = x0
        self.P = P
        self.R = R
        self.Q = Q

    def update(self, z):
        self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
        self.P = 1. / (1./self.P + 1./self.R)

    def predict(self, u=0.0):
        self.x += u
        self.P += self.Q

def kfilter(params, observations):
    x = numpy.array(observations)
    movement = 0
    est = []
    var = []
    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
                        #P = 10000,          # initial variance
                        P = 10,              # initial variance
                        R = numpy.std(x),    # sensor noise
                        Q = 0)               # movement noise
    for round in range(1):
        for d in x:
            kf.predict(movement)
            kf.update(d)
            est.append(kf.x)
            var.append(kf.P)

    return({'est':est, 'var':var})

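# Illustrative addition, not part of the original module: kfilter() on noisy
# observations of a constant.  The running estimate ('est') converges toward
# the true value while the state variance ('var') shrinks.  The params argument
# is not used inside kfilter itself, so an empty dict suffices here.
def _demo_kfilter():
    observations = [5.0 + random.gauss(0, 1.0) for _ in range(200)]
    result = kfilter({}, observations)
    print('final estimate:', result['est'][-1])   # close to 5.0
    print('final variance:', result['var'][-1])
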
def kalmanTest(params, greater, samples):
    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]

    m = kfilter(params, diffs)['est'][-1]
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0

def tsvalwmeanTest(params, greater, samples):
    m = tsvalwmean(samples)
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0

#from pykalman import KalmanFilter
# NOTE: pyKalman4DTest() relies on the KalmanFilter class from the pykalman
# package; it will raise NameError unless the import above is re-enabled and
# pykalman is installed.
def pyKalman4DTest(params, greater, samples):
    kp = params['kparams']
    #kp['initial_state_mean']=[quadsummary([s['unusual_packet'] for s in samples]),
    #                          quadsummary([s['other_packet'] for s in samples]),
    #                          numpy.mean([s['unusual_tsval'] for s in samples]),
    #                          numpy.mean([s['other_tsval'] for s in samples])]
    kf = KalmanFilter(n_dim_obs=4, n_dim_state=4, **kp)
    smooth,covariance = kf.smooth([(s['unusual_packet'],s['other_packet'],s['unusual_tsval'],s['other_tsval'])
                                   for s in samples])
    m = numpy.mean(smooth)
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0