- Timestamp: 07/09/15 19:01:23 (9 years ago)
- Location: trunk
- Files: 3 edited (trunk/bin/train, trunk/lib/nanownlib/stats.py, trunk/lib/nanownlib/storage.py)
Legend:
- Unmodified: context lines, shown with no prefix
- Added: lines prefixed with "+"
- Removed: lines prefixed with "-"
- "…" marks elided, unchanged code between hunks
trunk/bin/train
--- trunk/bin/train (r7)
+++ trunk/bin/train (r8)

-def trainBoxTest(db, test_cases, longest, subsample_size):
+def trainBoxTest(db, unusual_case, greater, subseries_size):
 
     def trainAux(low,high,num_trials):
-        estimator = functools.partial(boxTest, {'low':low, 'high':high})
-        estimates = bootstrap(estimator, db, 'train', test_cases, subsample_size, num_trials)
-        null_estimates = bootstrap(estimator, db, 'train_null', test_cases, subsample_size, num_trials)
-
-        #XXX: need to have a configurable policy on what we're looking for.
-        #     which is longest or which is shortest?
-        bad_estimates = len([e for e in estimates if e != longest])
-        bad_null_estimates = len([e for e in null_estimates if e != None])
-
-        false_negatives = 100.0*bad_estimates/num_trials
-        false_positives = 100.0*bad_null_estimates/num_trials
-        return false_positives,false_negatives
-
-    start = time.time()
-    wt = WorkerThreads(2, trainAux)
-
-    width = 2.0
-    performance = []
-    percentiles = list(range(0,50,2))
-    for low in percentiles:
-        wt.addJob(low, (low,low+width,200))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
-
-    lows = [p[1] for p in performance[0:5]]
-    widths = [w/10.0 for w in range(0,105,5)]
-    performance = []
-    for width in widths:
-        false_positives = []
-        false_negatives = []
-        for low in lows:
-            wt.addJob(low,(low,low+width,150))
-        wt.wait()
-        while not wt.resultq.empty():
-            job_id,errors = wt.resultq.get()
-            fp,fn = errors
-            false_negatives.append(fn)
-            false_positives.append(fp)
-
-        #print(width, false_negatives)
-        #print(width, false_positives)
-        performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
-                            width, statistics.mean(false_negatives), statistics.mean(false_positives)))
-    performance.sort()
-    pprint.pprint(performance)
-    good_width = performance[0][1]
-    print("good_width:",good_width)
-
-
-    lc = {}
-    for low in lows:
-        if low-1 > 0:
-            lc[low-1] = None
-        lc[low] = None
-        lc[low+1] = None
-    lows = lc.keys()
-
-    performance = []
-    for low in lows:
-        wt.addJob(low, (low,low+good_width,300))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    best_low = performance[0][1]
-    print("best_low:", best_low)
-
-
-    widths = [good_width-0.4,good_width-0.3,good_width-0.2,good_width-0.1,
-              good_width,good_width+0.1,good_width+0.2,good_width+0.3,good_width+0.4]
-    performance = []
-    for width in widths:
-        wt.addJob(width, (best_low,best_low+width,200))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    best_width=performance[0][1]
-    print("best_width:",best_width)
-    print("final_performance:", performance[0][0])
-
-    return {"low":best_low,"high":best_low+good_width}
-
-
-def trainBoxTest2(db, unusual_case, greater, subsample_size):
-
-    def trainAux(low,high,num_trials):
-        estimator = functools.partial(multiBoxTest, {'low':low, 'high':high}, unusual_case, greater)
-        estimates = bootstrap2(estimator, db, 'train', subsample_size, num_trials)
-        null_estimates = bootstrap2(estimator, db, 'train_null', subsample_size, num_trials)
+        estimator = functools.partial(multiBoxTest, {'low':low, 'high':high}, greater)
+        estimates = bootstrap3(estimator, db, 'train', unusual_case, subseries_size, num_trials)
+        null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, subseries_size, num_trials)
 
         bad_estimates = len([e for e in estimates if e != 1])
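The rewritten trainAux above scores a candidate (low, high) box by bootstrapping: it runs the estimator num_trials times on subseries drawn from the 'train' population (where the timing difference is present) and from 'train_null' (where it is not), then reports the miss rate and false-alarm rate as percentages. A minimal, self-contained sketch of that measurement, assuming an estimator that returns 1 when it detects the unusual case (bootstrap3() and the database layout are replaced by plain lists here):

    import random

    def bootstrap_error_rates(estimator, signal_pop, null_pop, subseries_size, num_trials):
        # Resample each population and count the estimator's mistakes on both.
        missed = 0        # trials on real data where the signal was not called
        false_alarms = 0  # trials on null data where a signal was called anyway
        for _ in range(num_trials):
            if estimator(random.choices(signal_pop, k=subseries_size)) != 1:
                missed += 1
            if estimator(random.choices(null_pop, k=subseries_size)) == 1:
                false_alarms += 1
        # Percentages, matching trainAux's return convention
        return 100.0*false_alarms/num_trials, 100.0*missed/num_trials

Note that the real bootstrap3()/db.subseries() draw contiguous windows from a cached population (see the storage.py hunk further down) rather than the independent resamples random.choices() produces here.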
…
     num_trials = 200
-    width = 2.0
-    performance = []
-    percentiles = list(range(0,50,2))
-    for low in percentiles:
+    width = 1.0
+    performance = []
+    for low in range(0,50):
         wt.addJob(low, (low,low+width,num_trials))
     wt.wait()
…
     print(time.time()-start)
 
-    num_trials = 150
+    num_trials = 200
     lows = [p[1] for p in performance[0:5]]
-    widths = [w/10.0 for w in range(0,105,5)]
+    widths = [w/10.0 for w in range(5,65,5)]
     performance = []
     for width in widths:
…
         #print(width, false_negatives)
         #print(width, false_positives)
-        performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
+        #performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
+        #                    width, statistics.mean(false_negatives), statistics.mean(false_positives)))
+        performance.append((abs(statistics.mean(false_positives)-statistics.mean(false_negatives)),
                             width, statistics.mean(false_negatives), statistics.mean(false_positives)))
     performance.sort()
…
 
-    lc = {}
-    for low in lows:
-        if low-1 >= 0:
-            lc[low-1] = None
-        lc[low] = None
-        lc[low+1] = None
-    lows = lc.keys()
-    print("candidate lows:")
-    pprint.pprint(lows)
-
-    num_trials = 300
+    num_trials = 500
     performance = []
     for low in lows:
…
     print("best_low:", best_low)
 
-    num_trials = 200
-    widths = [good_width-0.4,good_width-0.3,good_width-0.2,good_width-0.1,
-              good_width,good_width+0.1,good_width+0.2,good_width+0.3,good_width+0.4]
+
+    num_trials = 500
+    widths = [good_width+(x/10.0) for x in range(-6,7) if good_width+(x/10.0) > 0.0]
     performance = []
     for width in widths:
…
     print("final_performance:", performance[0][0])
 
-    params = json.dumps({"low":best_low,"high":best_low+good_width})
+    params = json.dumps({"low":best_low,"high":best_low+best_width})
     return {'algorithm':"boxtest",
             'params':params,
-            'sample_size':subsample_size,
+            'sample_size':subseries_size,
             'num_trials':num_trials,
             'trial_type':"train",
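The remaining bin/train hunks below touch trainMidhinge, which seeds its threshold search with trimean(mean_diffs)/2.0. For reference, these are the standard Tukey definitions of the two quartile-based location estimators involved; the library's own midhinge()/trimean() presumably match them, though the extra 'distance' argument seen in stats.py (which appears to widen the percentile pair) is not modeled in this sketch:

    import numpy

    def midhinge(values):
        q1, q3 = numpy.percentile(values, [25, 75])
        return (q1 + q3) / 2.0           # midpoint of the interquartile box

    def trimean(values):
        q1, q2, q3 = numpy.percentile(values, [25, 50, 75])
        return (q1 + 2.0*q2 + q3) / 4.0  # equivalently, (median + midhinge)/2

Both are far more robust to heavy-tailed RTT noise than the mean, which makes half the trimean of the observed differences a reasonable initial detection threshold.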
…
 
     #determine expected delta based on differences
-    start = time.time()
-    mean_diffs = [s['unusual_case']-s['other_cases'] for s in subseries(db, 'train', unusual_case)]
+    mean_diffs = [s['unusual_case']-s['other_cases'] for s in db.subseries('train', unusual_case)]
     threshold = trimean(mean_diffs)/2.0
-    print("initial threshold:", threshold)
-    print("median threshold:", statistics.median(mean_diffs)/2.0)
-    print("midhinge threshold:", midhinge(mean_diffs)/2.0)
-    print("trimean threshold:", trimean(mean_diffs)/2.0)
-
-    mean_diffs = [s['unusual_case']-s['other_cases'] for s in subseries(db, 'train_null', unusual_case)]
-    print(len(mean_diffs))
-    print("null mean:", statistics.mean(mean_diffs))
-    print("null median:", statistics.median(mean_diffs))
-    print("null midhinge:", midhinge(mean_diffs))
-    print("null trimean:", trimean(mean_diffs))
-    print(time.time()-start)
-
-    start = time.time()
-    wt = WorkerThreads(4, trainAux)
-
-    num_trials = 20
-    performance = []
-    #for distance in range(1,46,4):
-    for distance in range(25,46,4):
+    print("init_threshold:", threshold)
+
+    wt = WorkerThreads(2, trainAux)
+
+    num_trials = 500
+    performance = []
+    for distance in range(1,50):
         wt.addJob(distance, (distance,threshold,num_trials))
     wt.wait()
…
         fp,fn = errors
         performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    #for distance in range(25,46,4):
-    #    job_id = distance
-    #    fp,fn = trainAux(distance, threshold, num_trials)
-    #    performance.append(((fp+fn)/2.0, job_id, fn, fp))
-
-    performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+
+    performance.sort()
+    #pprint.pprint(performance)
     good_distance = performance[0][1]
     print("good_distance:",good_distance)
…
 
-    num_trials = 20
-    start = time.time()
-    performance = []
-    for t in range(80,125,5):
+    num_trials = 500
+    performance = []
+    for t in range(50,154,4):
         wt.addJob(threshold*(t/100.0), (good_distance,threshold*(t/100.0),num_trials))
     wt.wait()
…
         performance.append(((fp+fn)/2.0, job_id, fn, fp))
     performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+    #pprint.pprint(performance)
     good_threshold = performance[0][1]
     print("good_threshold:", good_threshold)
…
 
-    num_trials = 20
-    start = time.time()
-    performance = []
-    for d in range(-4,5):
-        wt.addJob(good_distance+d, (good_distance+d,good_threshold,num_trials))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+    num_trials = 500
+    performance = []
+    for d in [good_distance+s for s in range(-4,5) if good_distance+s > -1]:
+        wt.addJob(d, (d,good_threshold,num_trials))
+    wt.wait()
+    while not wt.resultq.empty():
+        job_id,errors = wt.resultq.get()
+        fp,fn = errors
+        performance.append(((fp+fn)/2.0, job_id, fn, fp))
+    performance.sort()
+    #pprint.pprint(performance)
     best_distance = performance[0][1]
     print("best_distance:",best_distance)
-
-    num_trials = 20
-    start = time.time()
+
+
+    num_trials = 500
     performance = []
     for t in range(95,106):
…
         performance.append(((fp+fn)/2.0, job_id, fn, fp))
     performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+    #pprint.pprint(performance)
     best_threshold = performance[0][1]
     print("best_threshold:", best_threshold)
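Each tuning pass in this file follows the same fan-out/drain pattern: queue one job per candidate value, wait for the pool, then pull (job_id, (fp, fn)) pairs off wt.resultq. The WorkerThreads helper itself is not part of this changeset; the following reconstruction is hypothetical, but it shows the shape such a class would need to satisfy the call sites above:

    import threading, queue

    class WorkerThreads(object):
        # A fixed pool of threads that applies `target` to queued argument
        # tuples and pushes (job_id, result) pairs onto resultq.
        def __init__(self, num_workers, target):
            self.workq = queue.Queue()
            self.resultq = queue.Queue()
            self.target = target
            for _ in range(num_workers):
                threading.Thread(target=self._worker, daemon=True).start()

        def _worker(self):
            while True:
                job_id, args = self.workq.get()
                self.resultq.put((job_id, self.target(*args)))
                self.workq.task_done()

        def addJob(self, job_id, args):
            self.workq.put((job_id, args))

        def wait(self):
            self.workq.join()  # block until every queued job has finished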
…
 import cProfile
 
+
+for size in (500,1000,2000,4000,5000,6000):
+    start = time.time()
+    #cProfile.run('results = trainMidhinge(db, unusual_case, greater, 100)')
+    results = trainMidhinge(db, unusual_case, greater, size)
+    #db.addClassifierResults(results)
+    print("midhinge result:")
+    pprint.pprint(results)
+    print(":", time.time()-start)
+
+sys.exit(0)
+
 start = time.time()
-cProfile.run('results = trainMidhinge(db, unusual_case, greater, 100)')
-#results = trainMidhinge(db, unusual_case, greater, 100)
+results = trainBoxTest(db, unusual_case, greater, 6000)
 #db.addClassifierResults(results)
-print("midhinge result:", results)
-end = time.time()
-print(":", end-start)
-
-sys.exit(0)
-
-start = time.time()
-results = trainBoxTest2(db, unusual_case, greater, 6000)
-db.addClassifierResults(results)
-print("multi box test result:", results)
-end = time.time()
-print(":", end-start)
-
-#start = time.time()
-#print("box test params:", trainBoxTest(db, test_cases, 'long', 100))
-#end = time.time()
-#print(":", end-start)
+print("multi box test result:")
+pprint.pprint(results)
+print(":", time.time()-start)
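Taken together, the tuning passes amount to a simple coordinate descent: sweep one parameter over a coarse grid with the others held fixed, keep the winner, then refine around it with more trials; the new __main__ harness additionally times trainMidhinge across subseries sizes to expose the accuracy/cost trade-off. A generic sketch of a single sweep, where evaluate() is a hypothetical stand-in for one bootstrapped error measurement:

    def sweep(evaluate, candidates):
        # evaluate(value) -> combined error score; smaller is better
        performance = sorted((evaluate(v), v) for v in candidates)
        return performance[0][1]   # keep the best-scoring candidate

    # e.g. good_distance = sweep(lambda d: combined_error(d, threshold), range(1, 50))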
trunk/lib/nanownlib/stats.py
--- trunk/lib/nanownlib/stats.py (r7)
+++ trunk/lib/nanownlib/stats.py (r8)

     ret_val = []
     for t in range(num_trials):
-        ret_val.append(estimator(subseries(db,probe_type, unusual_case, subseries_size)))
+        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))
 
     return ret_val
…
 # 0 if it isn't unusual
 # -1 if it is unusual in the wrong direction
-def multiBoxTest(params, unusual_case, greater, samples):
-    #XXX: packet_rtt field from params
-    dists = samples2Distributions(samples, 'packet_rtt')
-
-    uc = dists[unusual_case]
-    rest = []
-    for tc,d in dists.items():
-        if tc != unusual_case:
-            rest.extend(d)
-
+def multiBoxTest(params, greater, samples):
+    uc = [s['unusual_case'] for s in samples]
+    rest = [s['other_cases'] for s in samples]
+
     uc_high = numpy.percentile(uc, params['high'])
     rest_low = numpy.percentile(rest, params['low'])
…
 
     mh = midhinge(diffs, params['distance'])
+    #mh = trimean(diffs, params['distance'])
     if greater:
         if mh > params['threshold']:
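The substantive change in multiBoxTest is the shape of its input: instead of regrouping raw probes by test case with samples2Distributions(), the estimator now receives rows whose aggregation was already done in SQL by db.subseries(). Illustrative values only:

    samples = [                    # one dict per row of the subseries window
        {'unusual_case': 14203.0, 'other_cases': 14100.5},
        {'unusual_case': 14388.0, 'other_cases': 14102.0},
    ]

    uc   = [s['unusual_case'] for s in samples]  # RTTs of the suspected case
    rest = [s['other_cases'] for s in samples]   # mean RTT of the other cases

This moves the per-test-case bookkeeping out of the inner bootstrap loop and into a single cached query, which is exactly what the storage.py hunk below adds.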
trunk/lib/nanownlib/storage.py
--- trunk/lib/nanownlib/storage.py (r7)
+++ trunk/lib/nanownlib/storage.py (r8)

 import threading
 import sqlite3
+
+import numpy
 
 def _newid():
…
     cursor = None
     _population_sizes = None
+    _population_cache = None
 
     def __init__(self, path):
…
         self.conn.row_factory = sqlite3.Row
         self._population_sizes = {}
+        self._population_cache = {}
 
         if not exists:
…
         self.conn.close()
 
+
     def populationSize(self, probe_type):
         if probe_type in self._population_sizes:
…
             print(e)
             return 0
+
+
+    def subseries(self, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
+        if (probe_type,unusual_case,field) not in self._population_cache:
+            query="""
+            SELECT %(field)s AS unusual_case,
+                   (SELECT avg(%(field)s) FROM probes,analysis
+                    WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
+            FROM (SELECT probes.sample,%(field)s FROM probes,analysis
+                  WHERE analysis.probe_id=probes.id AND probes.test_case=:unusual_case AND probes.type=:probe_type) u
+            """ % {"field":field}
+
+            params = {"probe_type":probe_type, "unusual_case":unusual_case}
+            cursor = self.conn.cursor()
+            cursor.execute(query, params)
+            self._population_cache[(probe_type,unusual_case,field)] = [dict(row) for row in cursor.fetchall()]
+
+        population = self._population_cache[(probe_type,unusual_case,field)]
+
+        if size == None or size > len(population):
+            size = len(population)
+        if offset == None or offset >= len(population) or offset < 0:
+            offset = numpy.random.random_integers(0,len(population)-1)
+
+        ret_val = population[offset:offset+size]
+        if len(ret_val) < size:
+            ret_val += population[0:size-len(ret_val)]
+
+        return ret_val
+
+
+    def clearCache(self):
+        self._population_cache = {}
+
 
     def _insert(self, table, row):
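A usage sketch for the new caching subseries(); the class name Database and the test-case value are assumptions for illustration, and only the method names and semantics come from the diff:

    db = Database('probes.db')   # hypothetical constructor and path

    # The first call executes the correlated subquery once and caches all rows...
    window = db.subseries('train', unusual_case='case_7', size=1000)

    # ...later calls slice a random contiguous window out of the cached
    # population, wrapping around the end if needed, with no further SQL.
    window2 = db.subseries('train', unusual_case='case_7', size=1000)

    db.clearCache()              # invalidate after inserting new probes

One portability note: numpy.random.random_integers() is deprecated in newer NumPy releases; numpy.random.randint(0, len(population)) selects from the same range, since randint's upper bound is exclusive.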