Changeset 11 for trunk/lib/nanownlib/stats.py
Timestamp: 07/16/15 20:40:01
File: 1 edited
Legend:
- Unmodified lines have no prefix
- Added lines are prefixed with "+"
- Removed lines are prefixed with "-"
- "…" marks elided, unchanged lines
--- trunk/lib/nanownlib/stats.py (r10)
+++ trunk/lib/nanownlib/stats.py (r11)

 def ubersummary(values, distance=25):
     left2 = 50-distance
+    left3 = 50-(distance/2.0)
     left1 = left2/2.0
-    left3 = (left2+50)/2.0
     right2 = 50+distance
-    right3 = (right2+50)/2.0
+    right3 = 50+(distance/2.0)
     right1 = (right2+100)/2.0
     l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
-    #print(left1,left2,left3,50,right3,right2,right1)
     #print(l1,l2,l3,m,r3,r2,r1)
     return (l1+l2*4+l3+r3+r2*4+r1)/12.0
     #return statistics.mean((l1,l2,l3,m,r3,r2,r1))

-def quadsummary(values, distance=25):
-    left2 = 50-distance
-    left1 = left2/2.0
-    right2 = 50+distance
-    right1 = (right2+100)/2.0
-    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
-    #print(left1,left2,left3,50,right3,right2,right1)
-    #print(l1,l2,l3,m,r3,r2,r1)
-    return (l1+l2+r2+r1)/4.0
-    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))

 def quadsummary(values, distance=25):
     left1 = 50-distance
…
     #return statistics.mean((l1,l2,l3,m,r3,r2,r1))


+def tsvalwmean(subseries):
+    weights = [(s['unusual_packet']+s['other_packet'])**2 for s in subseries]
+    normalizer = sum(weights)/len(weights)
+    return numpy.mean([weights[i]*(subseries[i]['unusual_tsval']-subseries[i]['other_tsval'])/normalizer
+                       for i in range(len(weights))])
+
+#def tsvalwmean(subseries):
+#    return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries])
+
+
 def weightedMean(derived, weights):
     normalizer = sum(weights.values())/len(weights)
…
     return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])

+
+

 def estimateMean(trustFunc, weightFunc, alpha, derived):
…


-#def estimateMedian(trustFunc, weightFunc, alpha, derived):
-#    trust = trustValues(derived, trustFunc)
-#    weights = weightFunc(derived, trust, alpha)
-
-#    return statistics.median([(derived[k]['long']-derived[k]['short']) for k,w in weights.items() if w > 0.0])
-
-def estimateMedian(derived):
-    return statistics.median([(d['long']-d['short']) for d in derived.values()])
-
-
-def estimateMidsummary(derived):
-    return midsummary([(d['long']-d['short']) for d in derived.values()])
-
-
-def estimateTrimean(derived):
-    return trimean([(d['long']-d['short']) for d in derived.values()])
-
-
-def tTest(expected_mean, derived):
-    diffs = [(d['long']-d['short']) for d in derived.values()]
-    null_tval, null_pval = scipy.stats.ttest_1samp(diffs, 0.0)
-    tval, pval = scipy.stats.ttest_1samp(diffs, expected_mean)
-
-    if pval < null_pval:
-        return 1
-    else:
-        return 0
-
-
-def diffMedian(derived):
-    l = [tc['long']-tc['short'] for s,tc in derived.items()]
-    return statistics.median(l)
-
-
-def subsample_ids(db, probe_type, subsample_size=None):
-    cursor = db.conn.cursor()
-    cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,))
-    population_size = cursor.fetchone()[0]
-    #print("population_size:", population_size)
-    if subsample_size == None or subsample_size > population_size:
-        subsample_size = population_size
-
-    start = numpy.random.random_integers(0,population_size-1)
-    cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ? OFFSET ?", (probe_type,subsample_size,start))
-    for row in cursor:
-        subsample_size -= 1
-        yield row['sample']
-
-    if subsample_size > 0:
-        cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ?", (probe_type,subsample_size))
-        for row in cursor:
-            yield row['sample']
-
-
-def subsample(db, probe_type, subsample_size=None):
-    cursor = db.conn.cursor()
-    cursor.execute("SELECT count(test_case) FROM (SELECT test_case FROM probes GROUP BY test_case)")
-    num_test_cases = cursor.fetchone()[0]
-
-    for sid in subsample_ids(db, probe_type, subsample_size):
-        cursor.execute("SELECT test_case,tc_order,time_of_day,reported,userspace_rtt,suspect,packet_rtt,tsval_rtt FROM probes p,analysis a WHERE p.sample=? and a.probe_id=p.id", (sid,))
-        probes = cursor.fetchall()
-        if len(probes) != num_test_cases:
-            sys.stderr.write("WARN: sample %d had %d probes, but %d expected! Discarding...\n" % (sid, len(probes), num_test_cases))
-            continue
-        yield (sid,[dict(r) for r in probes])
-
-
-def subseries(db, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
-    population_size = db.populationSize(probe_type)
-
-    if size == None or size > population_size:
-        size = population_size
-    if offset == None or offset >= population_size or offset < 0:
-        offset = numpy.random.random_integers(0,population_size-1)
-
-    query="""
-      SELECT %(field)s AS unusual_case,
-             (SELECT avg(%(field)s) FROM probes,analysis
-              WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
-      FROM (SELECT probes.sample,%(field)s FROM probes,analysis
-            WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u
-      LIMIT :size OFFSET :offset
-    """ % {"field":field}
-
-    params = {"probe_type":probe_type, "unusual_case":unusual_case, "offset":offset, "size":size}
-    cursor = db.conn.cursor()
-    cursor.execute(query, params)
-    ret_val = [dict(row) for row in cursor.fetchall()]
-    #for row in cursor:
-    #    size -= 1
-    #    yield dict(row)
-
-    size -= len(ret_val)
-    if size > 0:
-        params['offset'] = 0
-        params['size'] = size
-        cursor.execute(query, params)
-        ret_val += [dict(row) for row in cursor.fetchall()]
-        #for row in cursor:
-        #    yield dict(row)
-
-    return ret_val
-
-
-# if test_cases=None, include all of them.  Otherwise, include only the specified test cases.
-def samples2Distributions(samples, field, test_cases=None):
-    ret_val = {}
-
-    for sid,probes in samples:
-        for p in probes:
-            if p['test_case'] in ret_val:
-                ret_val[p['test_case']].append(p[field])
-            elif test_cases == None or p['test_case'] in test_cases:
-                ret_val[p['test_case']] = [p[field]]
-
-
-    return ret_val
-
-
-def samples2MeanDiffs(samples, field, unusual_case):
-    ret_val = {}
-
-    for sid,probes in samples:
-        unusual_value = None
-        for p in probes:
-            if p['test_case'] == unusual_case:
-                unusual_value = p[field]
-                break
-        yield statistics.mean([unusual_value-p[field] for p in probes if p['test_case'] != unusual_case])
-
-
-def bootstrap(estimator, db, probe_type, test_cases, subsample_size, num_trials):
-    ret_val = []
-    for t in range(num_trials):
-        ret_val.append(estimator(test_cases, subsample(db, probe_type, subsample_size)))
-
-    return ret_val
-
-
-def bootstrap2(estimator, db, probe_type, subsample_size, num_trials):
-    ret_val = []
-    for t in range(num_trials):
-        ret_val.append(estimator(subsample(db, probe_type, subsample_size)))
-
-    return ret_val
-
-
 def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
     ret_val = []
…


-# Returns the test case name that clearly has higher RTT; otherwise, returns None
-def boxTest(params, test_cases, samples):
-    if len(test_cases) != 2:
-        # XXX: somehow generalize the box test to handle more than 2 cases
-        raise Exception()
-    dists = samples2Distributions(samples,'packet_rtt', test_cases) #XXX: field from params
-
-    tmp1,tmp2 = dists.items()
-    test_case1,dist1 = tmp1
-    test_case2,dist2 = tmp2
-
-    d1_high = numpy.percentile(dist1, params['high'])
-    d2_low = numpy.percentile(dist2, params['low'])
-    if d1_high < d2_low:
-        return test_case2
-
-    d1_low = numpy.percentile(dist1, params['low'])
-    d2_high = numpy.percentile(dist2, params['high'])
-
-    if d2_high < d1_low:
-        return test_case1
-
-    return None
-
-
 # Returns 1 if unusual_case is unusual in the expected direction
 #         0 if it isn't unusual
 #         -1 if it is unusual in the wrong direction
 def multiBoxTest(params, greater, samples):
-    uc = [s['unusual_case'] for s in samples]
-    rest = [s['other_cases'] for s in samples]
+    uc = [s['unusual_packet'] for s in samples]
+    rest = [s['other_packet'] for s in samples]

     uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
…
 #         0 otherwise
 def summaryTest(f, params, greater, samples):
-    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
+    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]

     mh = f(diffs, params['distance'])
…
         else:
             return 0
+

 midsummaryTest = functools.partial(summaryTest, midsummary)
…

 def kfilter(params, observations):
-    x = numpy.array(observations)
+    x = numpy.array(observations)
     movement = 0
-    est = []
+    est = []
     var = []
     kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
-                        #P = 10000,
-                        P = 10,              # initial variance
+                        #P = 10000,          # initial variance
+                        P = 10,              # initial variance
                         R = numpy.std(x),    # msensor noise
                         Q = 0)               # movement noise
…

 def kalmanTest(params, greater, samples):
-    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
+    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]

     m = kfilter(params, diffs)['est'][-1]
…


-def kalmanTest2(params, greater, samples):
-    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
-
-    estimates = []
-    size = 500
-    for i in range(100):
-        off = random.randrange(0,len(diffs))
-        sub = diffs[off:size]
-        if len(sub) < size:
-            sub += diffs[0:size-len(sub)]
-        estimates.append(kfilter(params, sub)['est'][-1])
-
-    m = quadsummary(estimates)
+def tsvalwmeanTest(params, greater, samples):
+    m = tsvalwmean(samples)
     if greater:
         if m > params['threshold']:
…
         else:
             return 0
-
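
Note: the ubersummary() edit appears to be a pure simplification. With left2 = 50-distance, the old left3 = (left2+50)/2.0 reduces algebraically to 50-(distance/2.0), and likewise the old right3 = (right2+50)/2.0 reduces to 50+(distance/2.0), so r11 samples the same percentile positions as r10; the only other change in that function is dropping an already commented-out debug print. A standalone sanity check, not part of the changeset (the positions_r10/positions_r11 helpers exist only for this comparison):

def positions_r10(distance):
    # Percentile positions as computed before this changeset (r10).
    left2 = 50 - distance
    left1 = left2 / 2.0
    left3 = (left2 + 50) / 2.0
    right2 = 50 + distance
    right3 = (right2 + 50) / 2.0
    right1 = (right2 + 100) / 2.0
    return (left1, left2, left3, right3, right2, right1)

def positions_r11(distance):
    # Percentile positions as computed after this changeset (r11).
    left2 = 50 - distance
    left3 = 50 - (distance / 2.0)
    left1 = left2 / 2.0
    right2 = 50 + distance
    right3 = 50 + (distance / 2.0)
    right1 = (right2 + 100) / 2.0
    return (left1, left2, left3, right3, right2, right1)

for d in (5, 10, 25, 40):
    assert positions_r10(d) == positions_r11(d)  # same positions for the distances checked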
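
Note: the new tsvalwmean() computes a weighted mean of the per-record tsval differences (unusual_tsval - other_tsval), weighting each record by the squared sum of its unusual_packet and other_packet fields and dividing by the average weight; tsvalwmeanTest() then compares that estimate against params['threshold'] in the direction given by greater. A rough usage sketch with made-up records (the field values below are illustrative only; the function body is copied from r11 above so the sketch is self-contained):

import numpy

def tsvalwmean(subseries):
    # Weight each record by the squared sum of its packet fields, then take the
    # mean of the normalized, weighted tsval differences (as introduced in r11).
    weights = [(s['unusual_packet'] + s['other_packet'])**2 for s in subseries]
    normalizer = sum(weights) / len(weights)
    return numpy.mean([weights[i] * (subseries[i]['unusual_tsval'] - subseries[i]['other_tsval']) / normalizer
                       for i in range(len(weights))])

# Hypothetical records with the keys the r11 code expects; values are invented.
samples = [
    {'unusual_packet': 12, 'other_packet': 11, 'unusual_tsval': 1500.0, 'other_tsval': 1487.5},
    {'unusual_packet': 10, 'other_packet': 10, 'unusual_tsval': 1490.0, 'other_tsval': 1481.0},
    {'unusual_packet': 14, 'other_packet': 13, 'unusual_tsval': 1505.0, 'other_tsval': 1490.0},
]

print("weighted tsval difference estimate:", tsvalwmean(samples))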