Changeset 8


Timestamp: 07/09/15 19:01:23 (9 years ago)
Author: tim
Message: .
Location: trunk
Files: 3 edited

  • trunk/bin/train (r7 → r8)

…
 
 
-def trainBoxTest(db, test_cases, longest, subsample_size):
+
+def trainBoxTest(db, unusual_case, greater, subseries_size):
 
     def trainAux(low,high,num_trials):
-        estimator = functools.partial(boxTest, {'low':low, 'high':high})
-        estimates = bootstrap(estimator, db, 'train', test_cases, subsample_size, num_trials)
-        null_estimates = bootstrap(estimator, db, 'train_null', test_cases, subsample_size, num_trials)
-
-        #XXX: need to have a configurable policy on what we're looking for.
-        #     which is longest or which is shortest?
-        bad_estimates = len([e for e in estimates if e != longest])
-        bad_null_estimates = len([e for e in null_estimates if e != None])
-
-        false_negatives = 100.0*bad_estimates/num_trials
-        false_positives = 100.0*bad_null_estimates/num_trials
-        return false_positives,false_negatives
-
-    start = time.time()
-    wt = WorkerThreads(2, trainAux)
-
-    width = 2.0
-    performance = []
-    percentiles = list(range(0,50,2))
-    for low in percentiles:
-        wt.addJob(low, (low,low+width,200))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
-
-    lows = [p[1] for p in performance[0:5]]
-    widths = [w/10.0 for w in range(0,105,5)]
-    performance = []
-    for width in widths:
-        false_positives = []
-        false_negatives = []
-        for low in lows:
-            wt.addJob(low,(low,low+width,150))
-        wt.wait()
-        while not wt.resultq.empty():
-            job_id,errors = wt.resultq.get()
-            fp,fn = errors
-            false_negatives.append(fn)
-            false_positives.append(fp)
-
-        #print(width, false_negatives)
-        #print(width, false_positives)
-        performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
-                            width, statistics.mean(false_negatives), statistics.mean(false_positives)))
-    performance.sort()
-    pprint.pprint(performance)
-    good_width = performance[0][1]
-    print("good_width:",good_width)
-
-
-    lc = {}
-    for low in lows:
-        if low-1 > 0:
-            lc[low-1] = None
-        lc[low] = None
-        lc[low+1] = None
-    lows = lc.keys()
-
-    performance = []
-    for low in lows:
-        wt.addJob(low, (low,low+good_width,300))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    best_low = performance[0][1]
-    print("best_low:", best_low)
-
-
-    widths = [good_width-0.4,good_width-0.3,good_width-0.2,good_width-0.1,
-              good_width,good_width+0.1,good_width+0.2,good_width+0.3,good_width+0.4]
-    performance = []
-    for width in widths:
-        wt.addJob(width, (best_low,best_low+width,200))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    best_width=performance[0][1]
-    print("best_width:",best_width)
-    print("final_performance:", performance[0][0])
-
-    return {"low":best_low,"high":best_low+good_width}
-
-
-def trainBoxTest2(db, unusual_case, greater, subsample_size):
-
-    def trainAux(low,high,num_trials):
-        estimator = functools.partial(multiBoxTest, {'low':low, 'high':high}, unusual_case, greater)
-        estimates = bootstrap2(estimator, db, 'train', subsample_size, num_trials)
-        null_estimates = bootstrap2(estimator, db, 'train_null', subsample_size, num_trials)
+        estimator = functools.partial(multiBoxTest, {'low':low, 'high':high}, greater)
+        estimates = bootstrap3(estimator, db, 'train', unusual_case, subseries_size, num_trials)
+        null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, subseries_size, num_trials)
 
         bad_estimates = len([e for e in estimates if e != 1])
…
 
     num_trials = 200
-    width = 2.0
-    performance = []
-    percentiles = list(range(0,50,2))
-    for low in percentiles:
+    width = 1.0
+    performance = []
+    for low in range(0,50):
         wt.addJob(low, (low,low+width,num_trials))
     wt.wait()
…
     print(time.time()-start)
 
-    num_trials = 150
+    num_trials = 200
     lows = [p[1] for p in performance[0:5]]
-    widths = [w/10.0 for w in range(0,105,5)]
+    widths = [w/10.0 for w in range(5,65,5)]
     performance = []
     for width in widths:
…
         #print(width, false_negatives)
         #print(width, false_positives)
-        performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
+        #performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
+        #                    width, statistics.mean(false_negatives), statistics.mean(false_positives)))
+        performance.append((abs(statistics.mean(false_positives)-statistics.mean(false_negatives)),
                             width, statistics.mean(false_negatives), statistics.mean(false_positives)))
     performance.sort()
…
 
 
-    lc = {}
-    for low in lows:
-        if low-1 >= 0:
-            lc[low-1] = None
-        lc[low] = None
-        lc[low+1] = None
-    lows = lc.keys()
-    print("candidate lows:")
-    pprint.pprint(lows)
-
-    num_trials = 300
+    num_trials = 500
     performance = []
     for low in lows:
…
     print("best_low:", best_low)
 
-    num_trials = 200
-    widths = [good_width-0.4,good_width-0.3,good_width-0.2,good_width-0.1,
-              good_width,good_width+0.1,good_width+0.2,good_width+0.3,good_width+0.4]
+
+    num_trials = 500
+    widths = [good_width+(x/10.0) for x in range(-6,7) if good_width+(x/10.0) > 0.0]
     performance = []
     for width in widths:
…
     print("final_performance:", performance[0][0])
 
-    params = json.dumps({"low":best_low,"high":best_low+good_width})
+    params = json.dumps({"low":best_low,"high":best_low+best_width})
     return {'algorithm':"boxtest",
             'params':params,
-            'sample_size':subsample_size,
+            'sample_size':subseries_size,
             'num_trials':num_trials,
             'trial_type':"train",
…
 
     #determine expected delta based on differences
-    start = time.time()
-    mean_diffs = [s['unusual_case']-s['other_cases'] for s in subseries(db, 'train', unusual_case)]
+    mean_diffs = [s['unusual_case']-s['other_cases'] for s in db.subseries('train', unusual_case)]
     threshold = trimean(mean_diffs)/2.0
-    print("initial threshold:", threshold)
-    print("median threshold:", statistics.median(mean_diffs)/2.0)
-    print("midhinge threshold:", midhinge(mean_diffs)/2.0)
-    print("trimean threshold:", trimean(mean_diffs)/2.0)
-
-    mean_diffs = [s['unusual_case']-s['other_cases'] for s in subseries(db, 'train_null', unusual_case)]
-    print(len(mean_diffs))
-    print("null mean:", statistics.mean(mean_diffs))
-    print("null median:", statistics.median(mean_diffs))
-    print("null midhinge:", midhinge(mean_diffs))
-    print("null trimean:", trimean(mean_diffs))
-    print(time.time()-start)
-
-
-    start = time.time()
-    wt = WorkerThreads(4, trainAux)
-
-    num_trials = 20
-    performance = []
-    #for distance in range(1,46,4):
-    for distance in range(25,46,4):
+    print("init_threshold:", threshold)
+
+    wt = WorkerThreads(2, trainAux)
+
+    num_trials = 500
+    performance = []
+    for distance in range(1,50):
         wt.addJob(distance, (distance,threshold,num_trials))
     wt.wait()
…
         fp,fn = errors
         performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    #for distance in range(25,46,4):
-    #    job_id = distance
-    #    fp,fn = trainAux(distance, threshold, num_trials)
-    #    performance.append(((fp+fn)/2.0, job_id, fn, fp))
-
-    performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+
+    performance.sort()
+    #pprint.pprint(performance)
     good_distance = performance[0][1]
     print("good_distance:",good_distance)
 
 
-    num_trials = 20
-    start = time.time()
-    performance = []
-    for t in range(80,125,5):
+    num_trials = 500
+    performance = []
+    for t in range(50,154,4):
         wt.addJob(threshold*(t/100.0), (good_distance,threshold*(t/100.0),num_trials))
     wt.wait()
…
         performance.append(((fp+fn)/2.0, job_id, fn, fp))
     performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+    #pprint.pprint(performance)
     good_threshold = performance[0][1]
     print("good_threshold:", good_threshold)
 
 
-    num_trials = 20
-    start = time.time()
-    performance = []
-    for d in range(-4,5):
-        wt.addJob(good_distance+d, (good_distance+d,good_threshold,num_trials))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+    num_trials = 500
+    performance = []
+    for d in [good_distance+s for s in range(-4,5) if good_distance+s > -1]:
+        wt.addJob(d, (d,good_threshold,num_trials))
+    wt.wait()
+    while not wt.resultq.empty():
+        job_id,errors = wt.resultq.get()
+        fp,fn = errors
+        performance.append(((fp+fn)/2.0, job_id, fn, fp))
+    performance.sort()
+    #pprint.pprint(performance)
     best_distance = performance[0][1]
     print("best_distance:",best_distance)
-
-    num_trials = 20
-    start = time.time()
+
+
+    num_trials = 500
     performance = []
     for t in range(95,106):
…
         performance.append(((fp+fn)/2.0, job_id, fn, fp))
     performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+    #pprint.pprint(performance)
     best_threshold = performance[0][1]
     print("best_threshold:", best_threshold)
…
 import cProfile
 
+
+
+
+for size in (500,1000,2000,4000,5000,6000):
+    start = time.time()
+    #cProfile.run('results = trainMidhinge(db, unusual_case, greater, 100)')
+    results = trainMidhinge(db, unusual_case, greater, size)
+    #db.addClassifierResults(results)
+    print("midhinge result:")
+    pprint.pprint(results)
+    print(":", time.time()-start)
+
+sys.exit(0)
+
 start = time.time()
-cProfile.run('results = trainMidhinge(db, unusual_case, greater, 100)')
-#results = trainMidhinge(db, unusual_case, greater, 100)
+results = trainBoxTest(db, unusual_case, greater, 6000)
 #db.addClassifierResults(results)
-print("midhinge result:", results)
-end = time.time()
-print(":", end-start)
-
-sys.exit(0)
-
-start = time.time()
-results = trainBoxTest2(db, unusual_case, greater, 6000)
-db.addClassifierResults(results)
-print("multi box test result:", results)
-end = time.time()
-print(":", end-start)
-
-#start = time.time()
-#print("box test params:", trainBoxTest(db, test_cases, 'long', 100))
-#end = time.time()
-#print(":", end-start)
-
-
+print("multi box test result:")
+pprint.pprint(results)
+print(":", time.time()-start)
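
Taken together, the rewritten trainBoxTest (which absorbs the old trainBoxTest2) is a coarse-to-fine grid search: sweep the low percentile at a fixed width of 1.0, keep the five best lows, sweep candidate widths over those lows (r8 ranks widths by how closely false positives and false negatives balance), then refine low and width around the winners. A runnable sketch of that search pattern, using a toy error function in place of the bootstrap3 estimators (which need the probe database); the balance criterion is simplified to plain mean error here:

import statistics

def search_box_params(error_fn):
    # error_fn(low, high) -> (false_positives, false_negatives), in percent.
    # Stands in for the bootstrap3-based trainAux in the changeset.
    def score(low, width):
        fp, fn = error_fn(low, low + width)
        return (fp + fn) / 2.0

    # Stage 1: coarse sweep of the low percentile at a fixed width.
    width = 1.0
    ranked = sorted((score(low, width), low) for low in range(0, 50))
    lows = [low for _, low in ranked[:5]]

    # Stage 2: sweep candidate widths over the surviving lows.
    widths = [w / 10.0 for w in range(5, 65, 5)]
    ranked = sorted((statistics.mean(score(low, w) for low in lows), w)
                    for w in widths)
    good_width = ranked[0][1]

    # Stage 3: refine low at the chosen width, then width at the chosen low.
    ranked = sorted((score(low, good_width), low) for low in lows)
    best_low = ranked[0][1]
    widths = [good_width + x / 10.0 for x in range(-6, 7)
              if good_width + x / 10.0 > 0.0]
    ranked = sorted((score(best_low, w), w) for w in widths)
    best_width = ranked[0][1]
    return {"low": best_low, "high": best_low + best_width}

# Toy error surface, just to make the sketch runnable:
print(search_box_params(lambda low, high: (abs(low - 10.0), abs(high - low - 2.5))))

Each stage shrinks the candidate set, so the expensive bootstrap trials are spent only near promising parameter values.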
  • trunk/lib/nanownlib/stats.py (r7 → r8)

…
     ret_val = []
     for t in range(num_trials):
-        ret_val.append(estimator(subseries(db, probe_type, unusual_case, subseries_size)))
+        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))
 
     return ret_val
…
 #         0 if it isn't unusual
 #        -1 if it is unusual in the wrong direction
-def multiBoxTest(params, unusual_case, greater, samples):
-    #XXX: packet_rtt field from params
-    dists = samples2Distributions(samples, 'packet_rtt')
-
-    uc = dists[unusual_case]
-    rest = []
-    for tc,d in dists.items():
-        if tc != unusual_case:
-            rest.extend(d)
-
+def multiBoxTest(params, greater, samples):
+    uc = [s['unusual_case'] for s in samples]
+    rest = [s['other_cases'] for s in samples]
+
     uc_high = numpy.percentile(uc, params['high'])
     rest_low = numpy.percentile(rest, params['low'])
…
 
     mh = midhinge(diffs, params['distance'])
+    #mh = trimean(diffs, params['distance'])
     if greater:
         if mh > params['threshold']:
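
With this change multiBoxTest consumes the paired rows that storage.subseries() emits directly, dropping samples2Distributions and the unusual_case argument. A sketch of the revised call shape, with a plausible box-overlap rule filled in behind the two percentile lines the diff shows; the overlap logic is an assumption, since the changeset elides the body between those lines and the 1/0/-1 return described in the comments above:

import numpy

def multi_box_test_sketch(params, greater, samples):
    # samples: rows from db.subseries(), each {'unusual_case': rtt,
    # 'other_cases': mean rtt of the remaining test cases in that sample}.
    uc = [s['unusual_case'] for s in samples]
    rest = [s['other_cases'] for s in samples]

    # The two lines the changeset shows:
    uc_high = numpy.percentile(uc, params['high'])
    rest_low = numpy.percentile(rest, params['low'])
    # Assumed counterparts for the opposite box edges:
    uc_low = numpy.percentile(uc, params['low'])
    rest_high = numpy.percentile(rest, params['high'])

    if greater:                  # unusual case expected to be slower
        if uc_low > rest_high:
            return 1             # boxes separate, expected direction
        if uc_high < rest_low:
            return -1            # separate, but in the wrong direction
        return 0                 # boxes overlap: not unusual
    else:                        # unusual case expected to be faster
        if uc_high < rest_low:
            return 1
        if uc_low > rest_high:
            return -1
        return 0

# e.g. one trial (db and unusual_case as in bin/train):
#   multi_box_test_sketch({'low': 5, 'high': 10}, True,
#                         db.subseries('train', unusual_case, 100))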
  • trunk/lib/nanownlib/storage.py (r7 → r8)

…
 import threading
 import sqlite3
+
+import numpy
 
 def _newid():
…
     cursor = None
     _population_sizes = None
+    _population_cache = None
 
     def __init__(self, path):
…
         self.conn.row_factory = sqlite3.Row
         self._population_sizes = {}
+        self._population_cache = {}
 
         if not exists:
…
             self.conn.close()
 
+
     def populationSize(self, probe_type):
         if probe_type in self._population_sizes:
…
             print(e)
             return 0
+
+
+    def subseries(self, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
+        if (probe_type,unusual_case,field) not in self._population_cache:
+            query="""
+            SELECT %(field)s AS unusual_case,
+                   (SELECT avg(%(field)s) FROM probes,analysis
+                    WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
+            FROM   (SELECT probes.sample,%(field)s FROM probes,analysis
+                    WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u
+            """ % {"field":field}
+
+            params = {"probe_type":probe_type, "unusual_case":unusual_case}
+            cursor = self.conn.cursor()
+            cursor.execute(query, params)
+            self._population_cache[(probe_type,unusual_case,field)] = [dict(row) for row in cursor.fetchall()]
+
+        population = self._population_cache[(probe_type,unusual_case,field)]
+
+        if size == None or size > len(population):
+            size = len(population)
+        if offset == None or offset >= len(population) or offset < 0:
+            offset = numpy.random.random_integers(0,len(population)-1)
+
+        ret_val = population[offset:offset+size]
+        if len(ret_val) < size:
+            ret_val += population[0:size-len(ret_val)]
+
+        return ret_val
+
+
+    def clearCache(self):
+        self._population_cache = {}
+
 
     def _insert(self, table, row):
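
The new storage.subseries() runs its paired-sample query once per (probe_type, unusual_case, field) key, caches the rows, and then serves size-length windows of the cached population from a random offset, wrapping past the end so a window is never short. The wrap is the subtle part; a minimal self-contained sketch of just that step, with random.randrange standing in for the numpy.random.random_integers call in the diff:

import random

def circular_window(population, size=None, offset=None):
    # Mirror of the slicing logic in storage.subseries(): take `size`
    # consecutive rows starting at `offset`, wrapping around the end of
    # the cached population so every window comes back full length.
    if size is None or size > len(population):
        size = len(population)
    if offset is None or not (0 <= offset < len(population)):
        offset = random.randrange(len(population))

    window = population[offset:offset + size]
    if len(window) < size:                    # ran off the end: wrap
        window += population[:size - len(window)]
    return window

print(circular_window(list(range(10)), size=4, offset=8))  # [8, 9, 0, 1]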