1 | |
---|
2 | import sys |
---|
3 | import os |
---|
4 | import functools |
---|
5 | import math |
---|
6 | import statistics |
---|
7 | import gzip |
---|
8 | import random |
---|
# Fail early with an actionable message if numpy is missing, since the
# rest of this module depends on it.
try:
    import numpy
except ImportError:
    sys.stderr.write('ERROR: Could not import numpy module. Ensure it is installed.\n')
    sys.stderr.write('       Under Debian, the package name is "python3-numpy".\n')
    sys.exit(1)

# Don't trust numpy's seeding: reseed explicitly from the OS CSPRNG.
numpy.random.seed(random.SystemRandom().randint(0,2**32-1))
---|
18 | |
---|
19 | |
---|
def mad(arr):
    """Median Absolute Deviation: a "robust" analogue of standard deviation.

    Indicates variability of the sample.
    https://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    # compressed() drops any masked entries; for plain input it just
    # flattens.  (Avoiding masked arrays entirely would be faster.)
    data = numpy.ma.array(arr).compressed()
    center = numpy.median(data)
    return numpy.median(numpy.abs(data - center))
---|
28 | |
---|
29 | |
---|
def cov(x, y):
    """Population covariance of the paired sequences x and y.

    Both sequences must have the same length; the result is the mean of
    the elementwise products of deviations from each mean.
    """
    mx = statistics.mean(x)
    my = statistics.mean(y)
    return statistics.mean([(a - mx) * (b - my) for a, b in zip(x, y)])
---|
38 | |
---|
39 | |
---|
def OLSRegression(x, y):
    """Ordinary least-squares fit of y = m*x + c.

    Returns (m, c): slope and intercept of the fitted line.
    """
    xs = numpy.array(x)
    ys = numpy.array(y)
    # Polynomial.fit was the most accurate of the approaches tried here
    # (lstsq and the legacy polyfit were rejected); convert() maps the
    # coefficients back to the unscaled domain, lowest order first.
    intercept, slope = numpy.polynomial.Polynomial.fit(xs, ys, 1).convert().coef
    return (slope, intercept)
---|
58 | |
---|
59 | |
---|
def difference(ls):
    """Return ls[0] - ls[1] for a two-element sequence."""
    first, second = ls[0], ls[1]
    return first - second
---|
62 | |
---|
def product(ls):
    """Return ls[0] * ls[1] for a two-element sequence."""
    a, b = ls[0], ls[1]
    return a * b
---|
65 | |
---|
def hypotenuse(ls):
    """Return the Euclidean hypotenuse of the first two elements of ls."""
    a, b = ls[0], ls[1]
    return math.hypot(a, b)
---|
68 | |
---|
def trustValues(derived, trustFunc):
    """Score each entry of `derived` with trustFunc((long, short)).

    Returns a list of (score, key) pairs sorted ascending by score.
    """
    scored = [(trustFunc((v['long'], v['short'])), k)
              for k, v in derived.items()]
    return sorted(scored)
---|
76 | |
---|
77 | |
---|
def prunedWeights(derived, trust, alpha):
    """Binary weighting over the sorted (score, key) list `trust`.

    The most-trusted (1-alpha) fraction of samples get weight 1.0, the
    rest get 0.0.  `derived` is unused here (kept for API symmetry with
    the other weight functions).
    """
    cutoff = len(trust) * (1.0 - alpha)
    return {key: (1.0 if rank < cutoff else 0.0)
            for rank, (_score, key) in enumerate(trust)}
---|
89 | |
---|
90 | |
---|
def linearWeights(derived, trust, alpha):
    """Linearly decreasing weights over the sorted trust list.

    The line is anchored at the best score (weight 1 + 10*alpha) and at
    the score one third of the way down the list (weight 1.0); weights
    are clamped at zero.
    """
    x1, y1 = trust[0][0], 1.0 + (alpha * 10)
    x2, y2 = trust[(len(trust) - 1) // 3][0], 1.0
    slope = (y1 - y2) / (x1 - x2)
    intercept = y1 - slope * x1

    return {key: max(slope * score + intercept, 0.0) for score, key in trust}
---|
106 | |
---|
107 | |
---|
def invertedWeights(derived, trust, alpha):
    """Rank-based weights falling off as 10/(rank+2) - 0.2, clamped at 0.

    Only the position within the sorted `trust` list matters; the scores
    themselves (and `derived` and `alpha`) are ignored.
    """
    weights = {}
    for rank, (_score, key) in enumerate(trust):
        weights[key] = max(10.0 / (rank + 2.0) - 0.2, 0.0)
    return weights
---|
127 | |
---|
128 | |
---|
129 | |
---|
def arctanWeights(derived, trust, alpha):
    """Arctan-shaped weights over the sorted trust list.

    Weights are near pi for the most-trusted scores and roll off past the
    (1-alpha) quantile score (`shift`), scaled by the distance from the
    best score; clamped at zero.
    """
    shift = trust[int((len(trust) - 1) * (1.0 - alpha))][0]
    minimum = trust[0][0]

    weights = {}
    for score, key in trust:
        w = math.pi / 2.0 - math.atan(2 * (score - shift) / (shift - minimum))
        weights[key] = max(w, 0.0)
    return weights
---|
142 | |
---|
143 | |
---|
def arctanWeights2(derived, trust, alpha):
    """Like arctanWeights but with a steeper (3x) arctan roll-off.

    Weights roll off past the (1-alpha) quantile score of the sorted
    trust list and are clamped at zero.
    """
    shift = trust[int((len(trust) - 1) * (1.0 - alpha))][0]
    minimum = trust[0][0]
    # NOTE: the original also computed a `stretch` term (distance from the
    # minimum to the near-median score) but never used it; removed.

    weights = {}
    for i in range(len(trust)):
        w = math.pi / 2.0 - math.atan(3 * (trust[i][0] - shift) / (shift - minimum))
        if w < 0.0:
            w = 0.0
        weights[trust[i][1]] = w

    return weights
---|
157 | |
---|
158 | |
---|
def midsummary(values, distance=25):
    """Midsummary statistic: mean of the (50-distance) and (50+distance)
    percentiles of `values`."""
    lo, hi = numpy.percentile(values, (50 - distance, 50 + distance))
    return (lo + hi) / 2.0
---|
163 | |
---|
def trimean(values, distance=25):
    """Tukey-style trimean: average of the midsummary and the median."""
    mid = midsummary(values, distance)
    med = statistics.median(values)
    return (mid + med) / 2
---|
166 | |
---|
def ubersummary(values, distance=25):
    """Weighted average of six percentiles of `values`.

    The inner pair at 50 +/- distance is weighted 4x each; the remaining
    four percentiles get weight 1, for a total divisor of 12.
    """
    lo2 = 50 - distance
    lo3 = 50 - (distance / 2.0)
    lo1 = lo2 / 2.0
    hi2 = 50 + distance
    hi3 = 50 + (distance / 2.0)
    hi1 = (hi2 + 100) / 2.0
    l1, l2, l3, r3, r2, r1 = numpy.percentile(
        values, (lo1, lo2, lo3, hi3, hi2, hi1))
    return (l1 + l2 * 4 + l3 + r3 + r2 * 4 + r1) / 12.0
---|
178 | |
---|
179 | |
---|
def quadsummary(values, distance=25):
    """Mean of four percentiles: 50 +/- distance and the midpoints between
    each of those and the median."""
    outer_lo = 50 - distance
    inner_lo = (outer_lo + 50) / 2.0
    outer_hi = 50 + distance
    inner_hi = (outer_hi + 50) / 2.0
    l1, l2, r2, r1 = numpy.percentile(values, (outer_lo, inner_lo, inner_hi, outer_hi))
    return (l1 + l2 + r2 + r1) / 4.0
---|
190 | |
---|
191 | |
---|
def septasummary(values, distance=25):
    """Plain mean of seven percentiles of `values`: the six used by
    ubersummary plus the median, all weighted equally."""
    lo2 = 50 - distance
    lo3 = 50 - (distance / 2.0)
    lo1 = lo2 / 2.0
    hi2 = 50 + distance
    hi3 = 50 + (distance / 2.0)
    hi1 = (hi2 + 100) / 2.0
    l1, l2, l3, m, r3, r2, r1 = numpy.percentile(
        values, (lo1, lo2, lo3, 50, hi3, hi2, hi1))
    return (l1 + l2 + l3 + m + r3 + r2 + r1) / 7.0
---|
201 | |
---|
202 | |
---|
def tsvalwmean(subseries):
    """Weighted mean of (unusual_tsval - other_tsval) per sample.

    Each sample is weighted by its squared total packet count, with the
    weights normalized so they average to 1.
    """
    weights = [(s['unusual_packet'] + s['other_packet']) ** 2 for s in subseries]
    normalizer = sum(weights) / len(weights)
    terms = [w * (s['unusual_tsval'] - s['other_tsval']) / normalizer
             for w, s in zip(weights, subseries)]
    return numpy.mean(terms)
---|
208 | |
---|
209 | #def tsvalwmean(subseries): |
---|
210 | # return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries]) |
---|
211 | |
---|
212 | |
---|
def weightedMean(derived, weights):
    """Weighted mean of (long - short) across keys, with the weights
    normalized so they average to 1."""
    normalizer = sum(weights.values()) / len(weights)
    terms = [w * (derived[k]['long'] - derived[k]['short']) / normalizer
             for k, w in weights.items()]
    return statistics.mean(terms)
---|
216 | |
---|
def weightedMeanTsval(derived, weights):
    """Weighted mean of (long_tsval - short_tsval) across keys, with the
    weights normalized so they average to 1."""
    normalizer = sum(weights.values()) / len(weights)
    terms = [w * (derived[k]['long_tsval'] - derived[k]['short_tsval']) / normalizer
             for k, w in weights.items()]
    return statistics.mean(terms)
---|
220 | |
---|
221 | |
---|
222 | |
---|
223 | |
---|
def estimateMean(trustFunc, weightFunc, alpha, derived):
    """Score the samples with trustFunc, weight them via weightFunc, and
    return the weighted mean of the long/short differences."""
    ranked = trustValues(derived, trustFunc)
    return weightedMean(derived, weightFunc(derived, ranked, alpha))
---|
228 | |
---|
229 | |
---|
def estimateMeanTsval(trustFunc, weightFunc, alpha, derived):
    """Score the samples with trustFunc, weight them via weightFunc, and
    return the weighted mean of the long/short tsval differences."""
    ranked = trustValues(derived, trustFunc)
    return weightedMeanTsval(derived, weightFunc(derived, ranked, alpha))
---|
234 | |
---|
235 | |
---|
def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
    """Run `estimator` on num_trials freshly-drawn subseries from `db` and
    return the list of estimates."""
    return [estimator(db.subseries(probe_type, unusual_case, subseries_size))
            for _ in range(num_trials)]
---|
242 | |
---|
243 | |
---|
244 | # Returns 1 if unusual_case is unusual in the expected direction |
---|
245 | # 0 if it isn't unusual |
---|
246 | # -1 if it is unusual in the wrong direction |
---|
def multiBoxTest(params, greater, samples):
    """Percentile-box overlap test on unusual vs. other packet timings.

    Returns  1 if unusual_case is unusual in the expected direction,
             0 if it isn't unusual,
            -1 if it is unusual in the wrong direction.
    """
    uc = [s['unusual_packet'] for s in samples]
    rest = [s['other_packet'] for s in samples]

    uc_high, uc_low = numpy.percentile(uc, (params['high'], params['low']))
    rest_high, rest_low = numpy.percentile(rest, (params['high'], params['low']))

    if uc_high < rest_low:
        # Unusual-case box lies entirely below the rest.
        return -1 if greater else 1
    if rest_high < uc_low:
        # Unusual-case box lies entirely above the rest.
        return 1 if greater else -1
    return 0
---|
266 | |
---|
267 | |
---|
268 | # Returns 1 if unusual_case is unusual in the expected direction |
---|
269 | # 0 otherwise |
---|
def summaryTest(f, params, greater, samples):
    """Threshold test on a summary statistic of the per-sample differences.

    `f` is a summary function taking (diffs, distance).  Returns 1 if
    unusual_case is unusual in the expected direction, 0 otherwise.
    """
    diffs = [s['unusual_packet'] - s['other_packet'] for s in samples]
    estimate = f(diffs, params['distance'])
    if greater:
        return 1 if estimate > params['threshold'] else 0
    return 1 if estimate < params['threshold'] else 0
---|
285 | |
---|
286 | |
---|
# Ready-made summaryTest variants, one per summary statistic above; each
# still takes (params, greater, samples).
midsummaryTest = functools.partial(summaryTest, midsummary)
trimeanTest = functools.partial(summaryTest, trimean)
ubersummaryTest = functools.partial(summaryTest, ubersummary)
quadsummaryTest = functools.partial(summaryTest, quadsummary)
septasummaryTest = functools.partial(summaryTest, septasummary)
---|
292 | |
---|
def rmse(expected, measurements):
    """Root-mean-square error of `measurements` against a single
    `expected` value."""
    # Generator expression: no need to materialize a list just to sum it.
    mean_sq = sum((expected - m) ** 2 for m in measurements) / len(measurements)
    return math.sqrt(mean_sq)
---|
296 | |
---|
def nrmse(expected, measurements):
    """RMSE normalized by the range (max - min) of the measurements."""
    spread = max(measurements) - min(measurements)
    return rmse(expected, measurements) / spread
---|
299 | |
---|
300 | |
---|
class KalmanFilter1D:
    """Minimal scalar (1-D) Kalman filter.

    State held on the instance:
      x -- current state estimate
      P -- estimate variance
      R -- measurement (sensor) noise
      Q -- process (movement) noise
    """

    def __init__(self, x0, P, R, Q):
        self.x = x0
        self.P = P
        self.R = R
        self.Q = Q

    def update(self, z):
        # Blend measurement z with the prior, each weighted by the other's
        # variance, then shrink the estimate variance accordingly.
        blended = self.P * z + self.x * self.R
        self.x = blended / (self.P + self.R)
        self.P = 1. / (1./self.P + 1./self.R)

    def predict(self, u=0.0):
        # Apply control input u and grow the uncertainty by Q.
        self.x += u
        self.P += self.Q
---|
315 | |
---|
316 | |
---|
def kfilter(params, observations):
    """Run a 1-D Kalman filter over `observations`.

    Returns {'est': [...], 'var': [...]}: the state estimate and its
    variance recorded after each observation.  `params` is currently
    unused.
    """
    x = numpy.array(observations)
    movement = 0
    est = []
    var = []
    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
                        P = 10,              # initial variance
                        R = numpy.std(x),    # sensor noise
                        Q = 0)               # movement noise
    # The original wrapped this in `for round in range(1):` -- a dead
    # single-iteration loop that also shadowed the builtin `round`.
    # Removed; behavior is identical.
    for d in x:
        kf.predict(movement)
        kf.update(d)
        est.append(kf.x)
        var.append(kf.P)

    return({'est':est, 'var':var})
---|
335 | |
---|
336 | |
---|
def kalmanTest(params, greater, samples):
    """Threshold test on the final Kalman-filtered estimate of the
    per-sample differences.

    Returns 1 if unusual_case is unusual in the expected direction,
    0 otherwise.
    """
    diffs = [s['unusual_packet'] - s['other_packet'] for s in samples]
    final_estimate = kfilter(params, diffs)['est'][-1]
    if greater:
        return 1 if final_estimate > params['threshold'] else 0
    return 1 if final_estimate < params['threshold'] else 0
---|
351 | |
---|
352 | |
---|
def tsvalwmeanTest(params, greater, samples):
    """Threshold test on the weighted mean of the tsval differences.

    Returns 1 if unusual_case is unusual in the expected direction,
    0 otherwise.
    """
    estimate = tsvalwmean(samples)
    if greater:
        return 1 if estimate > params['threshold'] else 0
    return 1 if estimate < params['threshold'] else 0
---|
365 | |
---|
366 | |
---|
367 | #from pykalman import KalmanFilter |
---|
def pyKalman4DTest(params, greater, samples):
    """Threshold test using a 4-D Kalman smoother over the tuple
    (unusual_packet, other_packet, unusual_tsval, other_tsval).

    Returns 1 if the mean of the smoothed state matrix is past
    params['threshold'] in the expected direction, 0 otherwise.

    NOTE(review): this relies on `KalmanFilter` from pykalman, whose
    import is commented out above -- calling this as-is raises NameError;
    confirm before use.
    """
    kp = params['kparams']
    #kp['initial_state_mean']=[quadsummary([s['unusual_packet'] for s in samples]),
    #                          quadsummary([s['other_packet'] for s in samples]),
    #                          numpy.mean([s['unusual_tsval'] for s in samples]),
    #                          numpy.mean([s['other_tsval'] for s in samples])]
    kf = KalmanFilter(n_dim_obs=4, n_dim_state=4, **kp)
    smooth,covariance = kf.smooth([(s['unusual_packet'],s['other_packet'],s['unusual_tsval'],s['other_tsval'])
                                   for s in samples])
    # Mean over the entire smoothed state matrix (all four state dims).
    m = numpy.mean(smooth)
    if greater:
        if m > params['threshold']:
            return 1
        else:
            return 0
    else:
        if m < params['threshold']:
            return 1
        else:
            return 0
---|