[4] | 1 | |
---|
| 2 | import sys |
---|
| 3 | import os |
---|
| 4 | import math |
---|
| 5 | import statistics |
---|
| 6 | import gzip |
---|
| 7 | import random |
---|
| 8 | import scipy |
---|
| 9 | import scipy.stats |
---|
| 10 | import numpy |
---|
| 11 | |
---|
| 12 | # Don't trust numpy's seeding |
---|
| 13 | numpy.random.seed(random.SystemRandom().randint(0,2**32-1)) |
---|
| 14 | |
---|
| 15 | |
---|
| 16 | def mad(arr): |
---|
| 17 | """ Median Absolute Deviation: a "Robust" version of standard deviation. |
---|
| 18 | Indices variabililty of the sample. |
---|
| 19 | https://en.wikipedia.org/wiki/Median_absolute_deviation |
---|
| 20 | """ |
---|
| 21 | arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays. |
---|
| 22 | med = numpy.median(arr) |
---|
| 23 | return numpy.median(numpy.abs(arr - med)) |
---|
| 24 | |
---|
| 25 | |
---|
| 26 | def cov(x,y): |
---|
| 27 | mx = statistics.mean(x) |
---|
| 28 | my = statistics.mean(y) |
---|
| 29 | products = [] |
---|
| 30 | for i in range(0,len(x)): |
---|
| 31 | products.append((x[i] - mx)*(y[i] - my)) |
---|
| 32 | |
---|
| 33 | return statistics.mean(products) |
---|
| 34 | |
---|
| 35 | |
---|
| 36 | def difference(ls): |
---|
| 37 | return ls[0]-ls[1] |
---|
| 38 | |
---|
| 39 | def product(ls): |
---|
| 40 | return ls[0]*ls[1] |
---|
| 41 | |
---|
| 42 | def hypotenuse(ls): |
---|
| 43 | return math.hypot(ls[0],ls[1]) |
---|
| 44 | |
---|
| 45 | def trustValues(derived, trustFunc): |
---|
| 46 | ret_val = [] |
---|
| 47 | for k,v in derived.items(): |
---|
| 48 | ret_val.append((trustFunc((v['long'],v['short'])), k)) |
---|
| 49 | |
---|
| 50 | ret_val.sort() |
---|
| 51 | return ret_val |
---|
| 52 | |
---|
| 53 | |
---|
| 54 | def prunedWeights(derived, trust, alpha): |
---|
| 55 | weights = {} |
---|
| 56 | |
---|
| 57 | threshold = len(trust)*(1.0-alpha) |
---|
| 58 | for i in range(0,len(trust)): |
---|
| 59 | if i < threshold: |
---|
| 60 | weights[trust[i][1]] = 1.0 |
---|
| 61 | else: |
---|
| 62 | weights[trust[i][1]] = 0.0 |
---|
| 63 | |
---|
| 64 | return weights |
---|
| 65 | |
---|
| 66 | |
---|
| 67 | def linearWeights(derived, trust, alpha): |
---|
| 68 | x1 = trust[0][0] |
---|
| 69 | y1 = 1.0 + (alpha*10) |
---|
| 70 | x2 = trust[(len(trust)-1)//3][0] |
---|
| 71 | y2 = 1.0 |
---|
| 72 | m = (y1-y2)/(x1-x2) |
---|
| 73 | b = y1 - m*x1 |
---|
| 74 | |
---|
| 75 | weights = {} |
---|
| 76 | for t,k in trust: |
---|
| 77 | weights[k] = m*t+b |
---|
| 78 | if weights[k] < 0.0: |
---|
| 79 | weights[k] = 0.0 |
---|
| 80 | |
---|
| 81 | return weights |
---|
| 82 | |
---|
| 83 | |
---|
| 84 | def invertedWeights(derived,trust,alpha): |
---|
| 85 | # (x+1-first_sample)^(-alpha) |
---|
| 86 | #scale = trust[0][0] |
---|
| 87 | |
---|
| 88 | #weights = {} |
---|
| 89 | #for t,k in trust: |
---|
| 90 | # weights[k] = (t+1-scale)**(-1.0*alpha) |
---|
| 91 | # if weights[k] < 0.0: |
---|
| 92 | # weights[k] = 0.0 |
---|
| 93 | |
---|
| 94 | weights = {} |
---|
| 95 | for i in range(len(trust)): |
---|
| 96 | w = 10.0/(i+2.0)-0.2 |
---|
| 97 | if w < 0.0: |
---|
| 98 | w = 0.0 |
---|
| 99 | weights[trust[i][1]] = w |
---|
| 100 | |
---|
| 101 | |
---|
| 102 | return weights |
---|
| 103 | |
---|
| 104 | |
---|
| 105 | |
---|
| 106 | def arctanWeights(derived,trust,alpha): |
---|
| 107 | shift = trust[int((len(trust)-1)*(1.0-alpha))][0] |
---|
| 108 | minimum = trust[0][0] |
---|
| 109 | |
---|
| 110 | weights = {} |
---|
| 111 | for i in range(len(trust)): |
---|
| 112 | w = math.pi/2.0 - math.atan(2*(trust[i][0] - shift)/(shift-minimum)) |
---|
| 113 | if w < 0.0: |
---|
| 114 | w = 0.0 |
---|
| 115 | weights[trust[i][1]] = w |
---|
| 116 | |
---|
| 117 | return weights |
---|
| 118 | |
---|
| 119 | |
---|
| 120 | def arctanWeights2(derived,trust,alpha): |
---|
| 121 | shift = trust[int((len(trust)-1)*(1.0-alpha))][0] |
---|
| 122 | minimum = trust[0][0] |
---|
| 123 | stretch = trust[int((len(trust)-1)*0.5)][0] - minimum # near median |
---|
| 124 | |
---|
| 125 | weights = {} |
---|
| 126 | for i in range(len(trust)): |
---|
| 127 | w = math.pi/2.0 - math.atan(3*(trust[i][0] - shift)/(shift-minimum)) |
---|
| 128 | if w < 0.0: |
---|
| 129 | w = 0.0 |
---|
| 130 | weights[trust[i][1]] = w |
---|
| 131 | |
---|
| 132 | return weights |
---|
| 133 | |
---|
| 134 | |
---|
| 135 | def midhinge(values, distance=25): |
---|
| 136 | return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0 |
---|
| 137 | |
---|
| 138 | def trimean(values, distance=25): |
---|
| 139 | return (midhinge(values, distance) + statistics.median(values))/2 |
---|
| 140 | |
---|
| 141 | def weightedMean(derived, weights): |
---|
| 142 | normalizer = sum(weights.values())/len(weights) |
---|
| 143 | return statistics.mean([w*(derived[k]['long']-derived[k]['short'])/normalizer for k,w in weights.items()]) |
---|
| 144 | |
---|
| 145 | def weightedMeanTsval(derived, weights): |
---|
| 146 | normalizer = sum(weights.values())/len(weights) |
---|
| 147 | return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()]) |
---|
| 148 | |
---|
| 149 | |
---|
| 150 | def estimateMean(trustFunc, weightFunc, alpha, derived): |
---|
| 151 | trust = trustValues(derived, trustFunc) |
---|
| 152 | weights = weightFunc(derived, trust, alpha) |
---|
| 153 | return weightedMean(derived, weights) |
---|
| 154 | |
---|
| 155 | |
---|
| 156 | def estimateMeanTsval(trustFunc, weightFunc, alpha, derived): |
---|
| 157 | trust = trustValues(derived, trustFunc) |
---|
| 158 | weights = weightFunc(derived, trust, alpha) |
---|
| 159 | return weightedMeanTsval(derived, weights) |
---|
| 160 | |
---|
| 161 | |
---|
| 162 | #def estimateMedian(trustFunc, weightFunc, alpha, derived): |
---|
| 163 | # trust = trustValues(derived, trustFunc) |
---|
| 164 | # weights = weightFunc(derived, trust, alpha) |
---|
| 165 | |
---|
| 166 | # return statistics.median([(derived[k]['long']-derived[k]['short']) for k,w in weights.items() if w > 0.0]) |
---|
| 167 | |
---|
| 168 | def estimateMedian(derived): |
---|
| 169 | return statistics.median([(d['long']-d['short']) for d in derived.values()]) |
---|
| 170 | |
---|
| 171 | |
---|
| 172 | def estimateMidhinge(derived): |
---|
| 173 | return midhinge([(d['long']-d['short']) for d in derived.values()]) |
---|
| 174 | |
---|
| 175 | |
---|
| 176 | def estimateTrimean(derived): |
---|
| 177 | return trimean([(d['long']-d['short']) for d in derived.values()]) |
---|
| 178 | |
---|
| 179 | |
---|
| 180 | def tTest(expected_mean, derived): |
---|
| 181 | diffs = [(d['long']-d['short']) for d in derived.values()] |
---|
| 182 | null_tval, null_pval = scipy.stats.ttest_1samp(diffs, 0.0) |
---|
| 183 | tval, pval = scipy.stats.ttest_1samp(diffs, expected_mean) |
---|
| 184 | |
---|
| 185 | if pval < null_pval: |
---|
| 186 | return 1 |
---|
| 187 | else: |
---|
| 188 | return 0 |
---|
| 189 | |
---|
| 190 | |
---|
| 191 | def diffMedian(derived): |
---|
| 192 | l = [tc['long']-tc['short'] for s,tc in derived.items()] |
---|
| 193 | return statistics.median(l) |
---|
| 194 | |
---|
| 195 | |
---|
| 196 | def subsample_ids(db, probe_type, subsample_size=None): |
---|
| 197 | cursor = db.conn.cursor() |
---|
| 198 | cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,)) |
---|
| 199 | population_size = cursor.fetchone()[0] |
---|
| 200 | #print("population_size:", population_size) |
---|
| 201 | if subsample_size == None or subsample_size > population_size: |
---|
| 202 | subsample_size = population_size |
---|
| 203 | |
---|
| 204 | start = numpy.random.random_integers(0,population_size-1) |
---|
| 205 | cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ? OFFSET ?", (probe_type,subsample_size,start)) |
---|
| 206 | for row in cursor: |
---|
| 207 | subsample_size -= 1 |
---|
| 208 | yield row['sample'] |
---|
| 209 | |
---|
| 210 | if subsample_size > 0: |
---|
| 211 | cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ?", (probe_type,subsample_size)) |
---|
| 212 | for row in cursor: |
---|
| 213 | yield row['sample'] |
---|
| 214 | |
---|
| 215 | |
---|
| 216 | def subsample(db, probe_type, subsample_size=None): |
---|
| 217 | cursor = db.conn.cursor() |
---|
| 218 | cursor.execute("SELECT count(test_case) FROM (SELECT test_case FROM probes GROUP BY test_case)") |
---|
| 219 | num_test_cases = cursor.fetchone()[0] |
---|
| 220 | |
---|
| 221 | for sid in subsample_ids(db, probe_type, subsample_size): |
---|
| 222 | cursor.execute("SELECT test_case,tc_order,time_of_day,reported,userspace_rtt,suspect,packet_rtt,tsval_rtt FROM probes p,analysis a WHERE p.sample=? and a.probe_id=p.id", (sid,)) |
---|
| 223 | probes = cursor.fetchall() |
---|
| 224 | if len(probes) != num_test_cases: |
---|
| 225 | sys.stderr.write("WARN: sample %d had %d probes, but %d expected! Discarding...\n" % (sid, len(probes), num_test_cases)) |
---|
| 226 | continue |
---|
| 227 | yield (sid,[dict(r) for r in probes]) |
---|
| 228 | |
---|
| 229 | # if test_cases=None, include all of them. Otherwise, include only the specified test cases. |
---|
| 230 | def samples2Distributions(samples, field, test_cases=None): |
---|
| 231 | ret_val = {} |
---|
| 232 | |
---|
| 233 | for sid,probes in samples: |
---|
| 234 | for p in probes: |
---|
| 235 | if p['test_case'] in ret_val: |
---|
| 236 | ret_val[p['test_case']].append(p[field]) |
---|
| 237 | elif test_cases == None or p['test_case'] in test_cases: |
---|
| 238 | ret_val[p['test_case']] = [p[field]] |
---|
| 239 | |
---|
| 240 | |
---|
| 241 | return ret_val |
---|
| 242 | |
---|
| 243 | |
---|
| 244 | def samples2MeanDiffs(samples, field, unusual_case): |
---|
| 245 | ret_val = {} |
---|
| 246 | |
---|
| 247 | for sid,probes in samples: |
---|
| 248 | unusual_value = None |
---|
| 249 | for p in probes: |
---|
| 250 | if p['test_case'] == unusual_case: |
---|
| 251 | unusual_value = p[field] |
---|
| 252 | break |
---|
| 253 | yield statistics.mean([unusual_value-p[field] for p in probes if p['test_case'] != unusual_case]) |
---|
| 254 | |
---|
| 255 | |
---|
| 256 | def bootstrap(estimator, db, probe_type, test_cases, subsample_size, num_trials): |
---|
| 257 | ret_val = [] |
---|
| 258 | for t in range(num_trials): |
---|
| 259 | ret_val.append(estimator(test_cases, subsample(db, probe_type, subsample_size))) |
---|
| 260 | |
---|
| 261 | return ret_val |
---|
| 262 | |
---|
| 263 | |
---|
| 264 | def bootstrap2(estimator, db, probe_type, subsample_size, num_trials): |
---|
| 265 | ret_val = [] |
---|
| 266 | for t in range(num_trials): |
---|
| 267 | ret_val.append(estimator(subsample(db, probe_type, subsample_size))) |
---|
| 268 | |
---|
| 269 | return ret_val |
---|
| 270 | |
---|
| 271 | |
---|
| 272 | # Returns the test case name that clearly has higher RTT; otherwise, returns None |
---|
| 273 | def boxTest(params, test_cases, samples): |
---|
| 274 | if len(test_cases) != 2: |
---|
| 275 | # XXX: somehow generalize the box test to handle more than 2 cases |
---|
| 276 | raise Exception() |
---|
| 277 | dists = samples2Distributions(samples,'packet_rtt', test_cases) #XXX: field from params |
---|
| 278 | |
---|
| 279 | tmp1,tmp2 = dists.items() |
---|
| 280 | test_case1,dist1 = tmp1 |
---|
| 281 | test_case2,dist2 = tmp2 |
---|
| 282 | |
---|
| 283 | d1_high = numpy.percentile(dist1, params['high']) |
---|
| 284 | d2_low = numpy.percentile(dist2, params['low']) |
---|
| 285 | if d1_high < d2_low: |
---|
| 286 | return test_case2 |
---|
| 287 | |
---|
| 288 | d1_low = numpy.percentile(dist1, params['low']) |
---|
| 289 | d2_high = numpy.percentile(dist2, params['high']) |
---|
| 290 | |
---|
| 291 | if d2_high < d1_low: |
---|
| 292 | return test_case1 |
---|
| 293 | |
---|
| 294 | return None |
---|
| 295 | |
---|
| 296 | |
---|
| 297 | # Returns 1 if unusual_case is unusual in the expected direction |
---|
| 298 | # 0 if it isn't unusual |
---|
| 299 | # -1 if it is unusual in the wrong direction |
---|
| 300 | def multiBoxTest(params, unusual_case, greater, samples): |
---|
| 301 | #XXX: packet_rtt field from params |
---|
| 302 | dists = samples2Distributions(samples, 'packet_rtt') |
---|
| 303 | |
---|
| 304 | uc = dists[unusual_case] |
---|
| 305 | rest = [] |
---|
| 306 | for tc,d in dists.items(): |
---|
| 307 | if tc != unusual_case: |
---|
| 308 | rest.extend(d) |
---|
| 309 | |
---|
| 310 | uc_high = numpy.percentile(uc, params['high']) |
---|
| 311 | rest_low = numpy.percentile(rest, params['low']) |
---|
| 312 | if uc_high < rest_low: |
---|
| 313 | if greater: |
---|
| 314 | return -1 |
---|
| 315 | else: |
---|
| 316 | return 1 |
---|
| 317 | |
---|
| 318 | uc_low = numpy.percentile(uc, params['low']) |
---|
| 319 | rest_high = numpy.percentile(rest, params['high']) |
---|
| 320 | if rest_high < uc_low: |
---|
| 321 | if greater: |
---|
| 322 | return 1 |
---|
| 323 | else: |
---|
| 324 | return -1 |
---|
| 325 | |
---|
| 326 | return 0 |
---|
| 327 | |
---|
| 328 | |
---|
| 329 | # Returns 1 if unusual_case is unusual in the expected direction |
---|
| 330 | # 0 otherwise |
---|
| 331 | def midhingeTest(params, unusual_case, greater, samples): |
---|
| 332 | diffs = list(samples2MeanDiffs(samples, 'packet_rtt', unusual_case)) |
---|
| 333 | |
---|
| 334 | mh = midhinge(diffs, params['distance']) |
---|
| 335 | if greater: |
---|
| 336 | if mh > params['threshold']: |
---|
| 337 | return 1 |
---|
| 338 | else: |
---|
| 339 | return 0 |
---|
| 340 | else: |
---|
| 341 | if mh < params['threshold']: |
---|
| 342 | return 1 |
---|
| 343 | else: |
---|
| 344 | return 0 |
---|
| 345 | |
---|
| 346 | |
---|
| 347 | def rmse(expected, measurements): |
---|
| 348 | s = sum([(expected-m)**2 for m in measurements])/len(measurements) |
---|
| 349 | return math.sqrt(s) |
---|
| 350 | |
---|
| 351 | def nrmse(expected, measurements): |
---|
| 352 | return rmse(expected, measurements)/(max(measurements)-min(measurements)) |
---|