1 | |
---|
2 | import sys |
---|
3 | import os |
---|
4 | import math |
---|
5 | import statistics |
---|
6 | import gzip |
---|
7 | import random |
---|
8 | import scipy |
---|
9 | import scipy.stats |
---|
10 | import numpy |
---|
11 | |
---|
12 | # Don't trust numpy's seeding |
---|
13 | numpy.random.seed(random.SystemRandom().randint(0,2**32-1)) |
---|
14 | |
---|
15 | |
---|
16 | def mad(arr): |
---|
17 | """ Median Absolute Deviation: a "Robust" version of standard deviation. |
---|
18 | Indices variabililty of the sample. |
---|
19 | https://en.wikipedia.org/wiki/Median_absolute_deviation |
---|
20 | """ |
---|
21 | arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays. |
---|
22 | med = numpy.median(arr) |
---|
23 | return numpy.median(numpy.abs(arr - med)) |
---|
24 | |
---|
25 | |
---|
26 | def cov(x,y): |
---|
27 | mx = statistics.mean(x) |
---|
28 | my = statistics.mean(y) |
---|
29 | products = [] |
---|
30 | for i in range(0,len(x)): |
---|
31 | products.append((x[i] - mx)*(y[i] - my)) |
---|
32 | |
---|
33 | return statistics.mean(products) |
---|
34 | |
---|
35 | |
---|
36 | def difference(ls): |
---|
37 | return ls[0]-ls[1] |
---|
38 | |
---|
39 | def product(ls): |
---|
40 | return ls[0]*ls[1] |
---|
41 | |
---|
42 | def hypotenuse(ls): |
---|
43 | return math.hypot(ls[0],ls[1]) |
---|
44 | |
---|
45 | def trustValues(derived, trustFunc): |
---|
46 | ret_val = [] |
---|
47 | for k,v in derived.items(): |
---|
48 | ret_val.append((trustFunc((v['long'],v['short'])), k)) |
---|
49 | |
---|
50 | ret_val.sort() |
---|
51 | return ret_val |
---|
52 | |
---|
53 | |
---|
54 | def prunedWeights(derived, trust, alpha): |
---|
55 | weights = {} |
---|
56 | |
---|
57 | threshold = len(trust)*(1.0-alpha) |
---|
58 | for i in range(0,len(trust)): |
---|
59 | if i < threshold: |
---|
60 | weights[trust[i][1]] = 1.0 |
---|
61 | else: |
---|
62 | weights[trust[i][1]] = 0.0 |
---|
63 | |
---|
64 | return weights |
---|
65 | |
---|
66 | |
---|
67 | def linearWeights(derived, trust, alpha): |
---|
68 | x1 = trust[0][0] |
---|
69 | y1 = 1.0 + (alpha*10) |
---|
70 | x2 = trust[(len(trust)-1)//3][0] |
---|
71 | y2 = 1.0 |
---|
72 | m = (y1-y2)/(x1-x2) |
---|
73 | b = y1 - m*x1 |
---|
74 | |
---|
75 | weights = {} |
---|
76 | for t,k in trust: |
---|
77 | weights[k] = m*t+b |
---|
78 | if weights[k] < 0.0: |
---|
79 | weights[k] = 0.0 |
---|
80 | |
---|
81 | return weights |
---|
82 | |
---|
83 | |
---|
84 | def invertedWeights(derived,trust,alpha): |
---|
85 | # (x+1-first_sample)^(-alpha) |
---|
86 | #scale = trust[0][0] |
---|
87 | |
---|
88 | #weights = {} |
---|
89 | #for t,k in trust: |
---|
90 | # weights[k] = (t+1-scale)**(-1.0*alpha) |
---|
91 | # if weights[k] < 0.0: |
---|
92 | # weights[k] = 0.0 |
---|
93 | |
---|
94 | weights = {} |
---|
95 | for i in range(len(trust)): |
---|
96 | w = 10.0/(i+2.0)-0.2 |
---|
97 | if w < 0.0: |
---|
98 | w = 0.0 |
---|
99 | weights[trust[i][1]] = w |
---|
100 | |
---|
101 | |
---|
102 | return weights |
---|
103 | |
---|
104 | |
---|
105 | |
---|
106 | def arctanWeights(derived,trust,alpha): |
---|
107 | shift = trust[int((len(trust)-1)*(1.0-alpha))][0] |
---|
108 | minimum = trust[0][0] |
---|
109 | |
---|
110 | weights = {} |
---|
111 | for i in range(len(trust)): |
---|
112 | w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum)) |
---|
113 | if w < 0.0: |
---|
114 | w = 0.0 |
---|
115 | weights[trust[i][1]] = w |
---|
116 | |
---|
117 | return weights |
---|
118 | |
---|
119 | |
---|
120 | def arctanWeights2(derived,trust,alpha): |
---|
121 | shift = trust[int((len(trust)-1)*(1.0-alpha))][0] |
---|
122 | minimum = trust[0][0] |
---|
123 | stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median |
---|
124 | |
---|
125 | weights = {} |
---|
126 | for i in range(len(trust)): |
---|
127 | w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum)) |
---|
128 | if w < 0.0: |
---|
129 | w = 0.0 |
---|
130 | weights[trust[i][1]] = w |
---|
131 | |
---|
132 | return weights |
---|
133 | |
---|
134 | |
---|
135 | def midhinge(values, distance=25): |
---|
136 | return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0 |
---|
137 | |
---|
138 | def trimean(values, distance=25): |
---|
139 | return (midhinge(values, distance) + statistics.median(values))/2 |
---|
140 | |
---|
141 | def weightedMean(derived, weights): |
---|
142 | normalizer = sum(weights.values())/len(weights) |
---|
143 | return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()]) |
---|
144 | |
---|
145 | def weightedMeanTsval(derived, weights): |
---|
146 | normalizer = sum(weights.values())/len(weights) |
---|
147 | return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()]) |
---|
148 | |
---|
149 | |
---|
150 | def estimateMean(trustFunc, weightFunc, alpha, derived): |
---|
151 | trust = trustValues(derived, trustFunc) |
---|
152 | weights = weightFunc(derived, trust, alpha) |
---|
153 | return weightedMean(derived, weights) |
---|
154 | |
---|
155 | |
---|
156 | def estimateMeanTsval(trustFunc, weightFunc, alpha, derived): |
---|
157 | trust = trustValues(derived, trustFunc) |
---|
158 | weights = weightFunc(derived, trust, alpha) |
---|
159 | return weightedMeanTsval(derived, weights) |
---|
160 | |
---|
161 | |
---|
162 | #def estimateMedian(trustFunc, weightFunc, alpha, derived): |
---|
163 | # trust = trustValues(derived, trustFunc) |
---|
164 | # weights = weightFunc(derived, trust, alpha) |
---|
165 | |
---|
166 | # return statistics.median([(derived[k]['long']-derived[k]['short']) for k,w in weights.items() if w > 0.0]) |
---|
167 | |
---|
168 | def estimateMedian(derived): |
---|
169 | return statistics.median([(d['long']-d['short']) for d in derived.values()]) |
---|
170 | |
---|
171 | |
---|
172 | def estimateMidhinge(derived): |
---|
173 | return midhinge([(d['long']-d['short']) for d in derived.values()]) |
---|
174 | |
---|
175 | |
---|
176 | def estimateTrimean(derived): |
---|
177 | return trimean([(d['long']-d['short']) for d in derived.values()]) |
---|
178 | |
---|
179 | |
---|
180 | def tTest(expected_mean, derived): |
---|
181 | diffs = [(d['long']-d['short']) for d in derived.values()] |
---|
182 | null_tval, null_pval = scipy.stats.ttest_1samp(diffs, 0.0) |
---|
183 | tval, pval = scipy.stats.ttest_1samp(diffs, expected_mean) |
---|
184 | |
---|
185 | if pval < null_pval: |
---|
186 | return 1 |
---|
187 | else: |
---|
188 | return 0 |
---|
189 | |
---|
190 | |
---|
191 | def diffMedian(derived): |
---|
192 | l = [tc['long']-tc['short'] for s,tc in derived.items()] |
---|
193 | return statistics.median(l) |
---|
194 | |
---|
195 | |
---|
196 | def subsample_ids(db, probe_type, subsample_size=None): |
---|
197 | cursor = db.conn.cursor() |
---|
198 | cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,)) |
---|
199 | population_size = cursor.fetchone()[0] |
---|
200 | #print("population_size:", population_size) |
---|
201 | if subsample_size == None or subsample_size > population_size: |
---|
202 | subsample_size = population_size |
---|
203 | |
---|
204 | start = numpy.random.random_integers(0,population_size-1) |
---|
205 | cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ? OFFSET ?", (probe_type,subsample_size,start)) |
---|
206 | for row in cursor: |
---|
207 | subsample_size -= 1 |
---|
208 | yield row['sample'] |
---|
209 | |
---|
210 | if subsample_size > 0: |
---|
211 | cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ?", (probe_type,subsample_size)) |
---|
212 | for row in cursor: |
---|
213 | yield row['sample'] |
---|
214 | |
---|
215 | |
---|
216 | def subsample(db, probe_type, subsample_size=None): |
---|
217 | cursor = db.conn.cursor() |
---|
218 | cursor.execute("SELECT count(test_case) FROM (SELECT test_case FROM probes GROUP BY test_case)") |
---|
219 | num_test_cases = cursor.fetchone()[0] |
---|
220 | |
---|
221 | for sid in subsample_ids(db, probe_type, subsample_size): |
---|
222 | cursor.execute("SELECT test_case,tc_order,time_of_day,reported,userspace_rtt,suspect,packet_rtt,tsval_rtt FROM probes p,analysis a WHERE p.sample=? and a.probe_id=p.id", (sid,)) |
---|
223 | probes = cursor.fetchall() |
---|
224 | if len(probes) != num_test_cases: |
---|
225 | sys.stderr.write("WARN: sample %d had %d probes, but %d expected! Discarding...\n" % (sid, len(probes), num_test_cases)) |
---|
226 | continue |
---|
227 | yield (sid,[dict(r) for r in probes]) |
---|
228 | |
---|
229 | |
---|
230 | def subseries(db, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'): |
---|
231 | population_size = db.populationSize(probe_type) |
---|
232 | |
---|
233 | if size == None or size > population_size: |
---|
234 | size = population_size |
---|
235 | if offset == None or offset >= population_size or offset < 0: |
---|
236 | offset = numpy.random.random_integers(0,population_size-1) |
---|
237 | |
---|
238 | query=""" |
---|
239 | SELECT %(field)s AS unusual_case, |
---|
240 | (SELECT avg(%(field)s) FROM probes,analysis |
---|
241 | WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases |
---|
242 | FROM (SELECT probes.sample,%(field)s FROM probes,analysis |
---|
243 | WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u |
---|
244 | LIMIT :size OFFSET :offset |
---|
245 | """ % {"field":field} |
---|
246 | |
---|
247 | params = {"probe_type":probe_type, "unusual_case":unusual_case, "offset":offset, "size":size} |
---|
248 | cursor = db.conn.cursor() |
---|
249 | cursor.execute(query, params) |
---|
250 | ret_val = [dict(row) for row in cursor.fetchall()] |
---|
251 | #for row in cursor: |
---|
252 | # size -= 1 |
---|
253 | # yield dict(row) |
---|
254 | |
---|
255 | size -= len(ret_val) |
---|
256 | if size > 0: |
---|
257 | params['offset'] = 0 |
---|
258 | params['size'] = size |
---|
259 | cursor.execute(query, params) |
---|
260 | ret_val += [dict(row) for row in cursor.fetchall()] |
---|
261 | #for row in cursor: |
---|
262 | # yield dict(row) |
---|
263 | |
---|
264 | return ret_val |
---|
265 | |
---|
266 | |
---|
267 | # if test_cases=None, include all of them. Otherwise, include only the specified test cases. |
---|
268 | def samples2Distributions(samples, field, test_cases=None): |
---|
269 | ret_val = {} |
---|
270 | |
---|
271 | for sid,probes in samples: |
---|
272 | for p in probes: |
---|
273 | if p['test_case'] in ret_val: |
---|
274 | ret_val[p['test_case']].append(p[field]) |
---|
275 | elif test_cases == None or p['test_case'] in test_cases: |
---|
276 | ret_val[p['test_case']] = [p[field]] |
---|
277 | |
---|
278 | |
---|
279 | return ret_val |
---|
280 | |
---|
281 | |
---|
282 | def samples2MeanDiffs(samples, field, unusual_case): |
---|
283 | ret_val = {} |
---|
284 | |
---|
285 | for sid,probes in samples: |
---|
286 | unusual_value = None |
---|
287 | for p in probes: |
---|
288 | if p['test_case'] == unusual_case: |
---|
289 | unusual_value = p[field] |
---|
290 | break |
---|
291 | yield statistics.mean([unusual_value-p[field] for p in probes if p['test_case'] != unusual_case]) |
---|
292 | |
---|
293 | |
---|
294 | def bootstrap(estimator, db, probe_type, test_cases, subsample_size, num_trials): |
---|
295 | ret_val = [] |
---|
296 | for t in range(num_trials): |
---|
297 | ret_val.append(estimator(test_cases, subsample(db, probe_type, subsample_size))) |
---|
298 | |
---|
299 | return ret_val |
---|
300 | |
---|
301 | |
---|
302 | def bootstrap2(estimator, db, probe_type, subsample_size, num_trials): |
---|
303 | ret_val = [] |
---|
304 | for t in range(num_trials): |
---|
305 | ret_val.append(estimator(subsample(db, probe_type, subsample_size))) |
---|
306 | |
---|
307 | return ret_val |
---|
308 | |
---|
309 | |
---|
310 | def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials): |
---|
311 | ret_val = [] |
---|
312 | for t in range(num_trials): |
---|
313 | ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size))) |
---|
314 | |
---|
315 | return ret_val |
---|
316 | |
---|
317 | |
---|
318 | # Returns the test case name that clearly has higher RTT; otherwise, returns None |
---|
319 | def boxTest(params, test_cases, samples): |
---|
320 | if len(test_cases) != 2: |
---|
321 | # XXX: somehow generalize the box test to handle more than 2 cases |
---|
322 | raise Exception() |
---|
323 | dists = samples2Distributions(samples,'packet_rtt', test_cases) #XXX: field from params |
---|
324 | |
---|
325 | tmp1,tmp2 = dists.items() |
---|
326 | test_case1,dist1 = tmp1 |
---|
327 | test_case2,dist2 = tmp2 |
---|
328 | |
---|
329 | d1_high = numpy.percentile(dist1, params['high']) |
---|
330 | d2_low = numpy.percentile(dist2, params['low']) |
---|
331 | if d1_high < d2_low: |
---|
332 | return test_case2 |
---|
333 | |
---|
334 | d1_low = numpy.percentile(dist1, params['low']) |
---|
335 | d2_high = numpy.percentile(dist2, params['high']) |
---|
336 | |
---|
337 | if d2_high < d1_low: |
---|
338 | return test_case1 |
---|
339 | |
---|
340 | return None |
---|
341 | |
---|
342 | |
---|
343 | # Returns 1 if unusual_case is unusual in the expected direction |
---|
344 | # 0 if it isn't unusual |
---|
345 | # -1 if it is unusual in the wrong direction |
---|
346 | def multiBoxTest(params, greater, samples): |
---|
347 | uc = [s['unusual_case'] for s in samples] |
---|
348 | rest = [s['other_cases'] for s in samples] |
---|
349 | |
---|
350 | uc_high = numpy.percentile(uc, params['high']) |
---|
351 | rest_low = numpy.percentile(rest, params['low']) |
---|
352 | if uc_high < rest_low: |
---|
353 | if greater: |
---|
354 | return -1 |
---|
355 | else: |
---|
356 | return 1 |
---|
357 | |
---|
358 | uc_low = numpy.percentile(uc, params['low']) |
---|
359 | rest_high = numpy.percentile(rest, params['high']) |
---|
360 | if rest_high < uc_low: |
---|
361 | if greater: |
---|
362 | return 1 |
---|
363 | else: |
---|
364 | return -1 |
---|
365 | |
---|
366 | return 0 |
---|
367 | |
---|
368 | |
---|
369 | # Returns 1 if unusual_case is unusual in the expected direction |
---|
370 | # 0 otherwise |
---|
371 | def midhingeTest(params, greater, samples): |
---|
372 | diffs = [s['unusual_case']-s['other_cases'] for s in samples] |
---|
373 | |
---|
374 | mh = midhinge(diffs, params['distance']) |
---|
375 | #mh = trimean(diffs, params['distance']) |
---|
376 | if greater: |
---|
377 | if mh > params['threshold']: |
---|
378 | return 1 |
---|
379 | else: |
---|
380 | return 0 |
---|
381 | else: |
---|
382 | if mh < params['threshold']: |
---|
383 | return 1 |
---|
384 | else: |
---|
385 | return 0 |
---|
386 | |
---|
387 | |
---|
388 | def rmse(expected, measurements): |
---|
389 | s = sum([(expected-m)**2 for m in measurements])/len(measurements) |
---|
390 | return math.sqrt(s) |
---|
391 | |
---|
392 | def nrmse(expected, measurements): |
---|
393 | return rmse(expected, measurements)/(max(measurements)-min(measurements)) |
---|