Changeset 11 for trunk


Timestamp: 07/16/15 20:40:01 (9 years ago)
Author: tim
Message: .
Location: trunk
Files: 1 added, 6 edited

  • trunk/bin/graph

    r10 r11  
    4040
    4141
    42 def differences(db, unusual_case, column='packet_rtt'):
    43     cursor = db.conn.cursor()
    44     query="""
    45       SELECT %(column)s-(SELECT avg(%(column)s) FROM probes,analysis
    46                          WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type in ('train','test') AND sample=u.sample)
    47       FROM (SELECT probes.sample,%(column)s FROM probes,analysis
    48                          WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type in ('train','test')) u
    49       """ % {"column":column}
    50     params = {"unusual_case":unusual_case}
    51     cursor.execute(query, params)
    52     for row in cursor:
    53         yield row[0]
     42def differences(db, unusual_case, rtt_type='packet'):
     43    ret_val = [s['unusual_'+rtt_type]-s['other_'+rtt_type] for s in db.subseries('train', unusual_case)]
     44    ret_val += [s['unusual_'+rtt_type]-s['other_'+rtt_type] for s in db.subseries('test', unusual_case)]
     45    return ret_val
     46
     47def null_differences(db, unusual_case, rtt_type='packet'):
     48    ret_val = [s['unusual_'+rtt_type]-s['other_'+rtt_type] for s in db.subseries('train_null', unusual_case)]
     49    return ret_val
    5450
    5551
     
    9187print('packet_rtt diff ubersummary: %f' % ubersummary(diffs))
    9288print('packet_rtt diff MAD: %f' % mad(diffs))
    93 print('reported diff trimean: %f' % trimean(reported_diffs))
    94 print('reported diff quadsummary: %f' % quadsummary(reported_diffs))
    95 print('reported diff ubersummary: %f' % ubersummary(reported_diffs))
    96 print('reported diff MAD: %f' % mad(reported_diffs))
    97 
    98 import cProfile
    99 kresults = kfilter({},diffs)
    100 #print('packet_rtt diff kfilter: ', numpy.mean(kresults['est']), kresults['var'])
    101 print('packet_rtt diff kfilter: ', kresults['est'][-1], kresults['var'][-1])
    102 kresults = kfilter({},reported_diffs)
    103 #print('reported diff kfilter: ', numpy.mean(kresults['est']), kresults['var'][-1])
    104 print('reported diff kfilter: ', kresults['est'][-1], kresults['var'][-1])
     89try:
     90    print('reported diff trimean: %f' % trimean(reported_diffs))
     91    print('reported diff quadsummary: %f' % quadsummary(reported_diffs))
     92    print('reported diff ubersummary: %f' % ubersummary(reported_diffs))
     93    print('reported diff MAD: %f' % mad(reported_diffs))
     94
     95    import cProfile
     96    start = time.time()
     97    kresults = kfilter({},diffs)
     98    #print('packet_rtt diff kfilter: ', numpy.mean(kresults['est']), kresults['var'])
     99    print('packet_rtt diff kfilter: ', kresults['est'][-1], kresults['var'][-1])
     100    kresults = kfilter({},reported_diffs)
     101    #print('reported diff kfilter: ', numpy.mean(kresults['est']), kresults['var'][-1])
     102    print('reported diff kfilter: ', kresults['est'][-1], kresults['var'][-1])
     103    print("kfilter time: %f" % (time.time()-start))
     104except:
     105    pass
     106
     107print('tsval diff mean: %f' % numpy.mean(differences(db, 'long', 'tsval')))
     108print('tsval null diff mean: %f' % numpy.mean(null_differences(db, 'long', 'tsval')))
     109print('tsval diff weighted mean: %f' % tsvalwmean(db.subseries('train','long')+db.subseries('test','long')))
     110print('tsval null diff weighted mean: %f' % tsvalwmean(db.subseries('train_null','long')))
    105111
    106112
     
    110116#cut_off_high = all_data[int(len(all_data)*0.997)]
    111117
     118
     119def plotSingleProbe(probe_id=None):
     120    if probe_id == None:
     121        cursor = db.conn.cursor()
     122        query="""SELECT probe_id FROM analysis WHERE suspect='' ORDER BY probe_id DESC limit 1 OFFSET 10"""
     123        cursor.execute(query)
     124        probe_id = cursor.fetchone()[0]
     125   
     126    cursor = db.conn.cursor()
     127    query="""SELECT observed,payload_len FROM packets WHERE probe_id=? AND sent=1"""
     128    cursor.execute(query, (probe_id,))
     129    pkts = cursor.fetchall()
     130    sent_payload = [row[0] for row in pkts if row[1] != 0]
     131    sent_other = [row[0] for row in pkts if row[1] == 0]
     132   
     133    query="""SELECT observed,payload_len FROM packets WHERE probe_id=? AND sent=0"""
     134    cursor.execute(query, (probe_id,))
     135    pkts = cursor.fetchall()
     136    rcvd_payload = [row[0] for row in pkts if row[1] != 0]
     137    rcvd_other = [row[0] for row in pkts if row[1] == 0]
     138   
     139    #query="""SELECT reported,time_of_day FROM probes WHERE id=?"""
     140    #cursor.execute(query, (probe_id,))
     141    #reported,tod = cursor.fetchone()
     142    #userspace_times = [sent_times[0]-reported/3.0, sent_times[0]+reported]
     143
     144    print("single probe counts:",len(sent_payload),len(sent_other),len(rcvd_payload),len(rcvd_other))
     145    plt.clf()
     146    plt.title("Single HTTP Request - Packet Times")
     147    sp = plt.eventplot(sent_payload, colors=('red',), lineoffsets=8, linewidths=2, alpha=0.6,label='sent')
     148    so = plt.eventplot(sent_other, colors=('red',), lineoffsets=6, linewidths=2, alpha=0.6,label='sent')
     149    rp = plt.eventplot(rcvd_payload, colors=('blue',), lineoffsets=4, linewidths=2, alpha=0.6,label='received')
     150    ro = plt.eventplot(rcvd_other, colors=('blue',), lineoffsets=2, linewidths=2, alpha=0.6,label='received')
     151    #plt.legend((s,r), ('sent','received'))
     152    #plt.savefig('../img/http-packet-times.svg')
     153    plt.show()
     154
     155#plotSingleProbe()
     156
     157
     158def graphTestResults():
     159    plt.clf()
     160    plt.title("Test Results")
     161    plt.xlabel('sample size')
     162    plt.ylabel('error rate')
     163    legend = []
     164    colors = ['red','blue','green','purple','orange','black','brown']
     165    color_id = 0
     166
     167    cursor = db.conn.cursor()
     168    query = """
     169      SELECT classifier FROM classifier_results GROUP BY classifier ORDER BY classifier;
     170    """
     171    cursor.execute(query)
     172    classifiers = []
     173    for c in cursor:
     174        classifiers.append(c[0])
     175
     176    for classifier in classifiers:
     177        query="""
     178        SELECT params FROM classifier_results
     179        WHERE trial_type='test'
     180         AND classifier=:classifier
     181         AND (false_positives+false_negatives)/2.0 < 5.0
     182        ORDER BY num_observations,(false_positives+false_negatives)
     183        LIMIT 1
     184        """
     185        cursor.execute(query, {'classifier':classifier})
     186        row = cursor.fetchone()
     187        if row == None:
     188            query="""
     189            SELECT params FROM classifier_results
     190            WHERE trial_type='test' and classifier=:classifier
     191            ORDER BY (false_positives+false_negatives),num_observations
     192            LIMIT 1
     193            """
     194            cursor.execute(query, {'classifier':classifier})
     195            row = cursor.fetchone()
     196            if row == None:
     197                sys.stderr.write("WARN: couldn't find test results for classifier '%s'.\n" % classifier)
     198                continue
     199
     200        best_params = row[0]
     201        query="""
     202        SELECT num_observations,(false_positives+false_negatives)/2.0 FROM classifier_results
     203        WHERE trial_type='test'
     204         AND classifier=:classifier
     205         AND params=:params
     206        ORDER BY num_observations
     207        """
     208        cursor.execute(query, {'classifier':classifier,'params':best_params})
     209
     210        num_obs = []
     211        performance = []
     212        for row in cursor:
     213            num_obs.append(row[0])
     214            performance.append(row[1])
     215        #print(num_obs,performance)
     216        path = plt.scatter(num_obs, performance, color=colors[color_id], s=4, alpha=0.8, linewidths=3.0)
     217        plt.plot(num_obs, performance, color=colors[color_id], alpha=0.8)
     218        legend.append((classifier,path))
     219        color_id = (color_id+1) % len(colors)
     220
     221    plt.legend([l[1] for l in legend], [l[0] for l in legend], scatterpoints=1, fontsize='xx-small')
     222    plt.show()
     223       
     224graphTestResults()
     225
     226sys.exit(0)
    112227
    113228plt.clf()
     
    122237plt.show()
    123238
     239
     240
     241plt.clf()
     242plt.title("Simple HTTP Request")
     243plt.xlabel('Time of Day')
     244plt.ylabel('')
     245s = plt.scatter(sent_times, [2]*len(sent_times), s=3, color='red', alpha=0.9)
     246r = plt.scatter(rcvd_times, [1]*len(rcvd_times), s=3, color='blue', alpha=0.9)
     247plt.legend((s,r), ('sent','received'), scatterpoints=1)
     248plt.show()
     249
     250sys.exit(0)
    124251short_overtime,long_overtime,diff_overtime = None,None,None
    125252
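
    The rewritten differences() and null_differences() helpers above now lean entirely on
    db.subseries() instead of an inline SQL query. A minimal sketch of how they might be
    exercised, assuming the definitions above and a populated session database (the
    filename and the 'long' test case name are illustrative):

        import numpy
        import nanownlib.storage

        db = nanownlib.storage.db('sample-session.db')      # hypothetical session database
        diffs = differences(db, 'long')                      # per-sample packet_rtt deltas
        null_diffs = null_differences(db, 'long')            # deltas from the train_null series
        print('packet_rtt diff mean: %f' % numpy.mean(diffs))
        print('packet_rtt null diff mean: %f' % numpy.mean(null_diffs))
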
  • trunk/bin/train

    r10 r11  
    2525    sys.path.append("%s/../lib" % script_dir)
    2626
     27
    2728from nanownlib import *
    2829from nanownlib.stats import *
     30from nanownlib.train import *
    2931from nanownlib.parallel import WorkerThreads
    3032import nanownlib.storage
     
    3638#parser.add_argument('-c', dest='cases', type=str, default='{"short":10000,"long":1010000}',
    3739#                    help='JSON representation of echo timing cases. Default: {"short":10000,"long":1010000}')
    38 parser.add_argument('--retrain', action='append', default=[], help='Force a classifier to be retrained.  May be specified multiple times.')
     40parser.add_argument('--unusual-case', action='store', default=None, help='Specify the unusual case and whether it is greater than the other cases.  Format: {case name},{1 or 0}')
     41parser.add_argument('--retrain', action='append', default=[], help='Force a classifier to be retrained (and retested).  May be specified multiple times.')
    3942parser.add_argument('--retest', action='append', default=[], help='Force a classifier to be retested.  May be specified multiple times.')
    4043parser.add_argument('session_data', default=None,
    4144                    help='Database file storing session information')
    4245options = parser.parse_args()
     46db = nanownlib.storage.db(options.session_data)
    4347
    4448
    45 def trainBoxTest(db, unusual_case, greater, num_observations):
    46     db.resetOffsets()
    47    
    48     def trainAux(low,high,num_trials):
    49         estimator = functools.partial(multiBoxTest, {'low':low, 'high':high}, greater)
    50         estimates = bootstrap3(estimator, db, 'train', unusual_case, num_observations, num_trials)
    51         null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, num_observations, num_trials)
    52 
    53         bad_estimates = len([e for e in estimates if e != 1])
    54         bad_null_estimates = len([e for e in null_estimates if e != 0])
    55        
    56         false_negatives = 100.0*bad_estimates/num_trials
    57         false_positives = 100.0*bad_null_estimates/num_trials
    58         return false_positives,false_negatives
    59 
    60     #start = time.time()
    61     wt = WorkerThreads(2, trainAux)
    62    
    63     num_trials = 200
    64     width = 1.0
    65     performance = []
    66     for low in range(0,50):
    67         wt.addJob(low, (low,low+width,num_trials))
    68     wt.wait()
    69     while not wt.resultq.empty():
    70         job_id,errors = wt.resultq.get()
    71         fp,fn = errors
    72         performance.append(((fp+fn)/2.0, job_id, fn, fp))
    73     performance.sort()
    74     #pprint.pprint(performance)
    75     #print(time.time()-start)
    76    
    77     num_trials = 200
    78     lows = [p[1] for p in performance[0:5]]
    79     widths = [w/10.0 for w in range(5,65,5)]
    80     performance = []
    81     for width in widths:
    82         false_positives = []
    83         false_negatives = []
    84         for low in lows:
    85             wt.addJob(low,(low,low+width,num_trials))
    86         wt.wait()
    87         while not wt.resultq.empty():
    88             job_id,errors = wt.resultq.get()
    89             fp,fn = errors
    90             false_negatives.append(fn)
    91             false_positives.append(fp)
    92 
    93         #print(width, false_negatives)
    94         #print(width, false_positives)
    95         #performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
    96         #                    width, statistics.mean(false_negatives), statistics.mean(false_positives)))
    97         performance.append((abs(statistics.mean(false_positives)-statistics.mean(false_negatives)),
    98                             width, statistics.mean(false_negatives), statistics.mean(false_positives)))
    99     performance.sort()
    100     #pprint.pprint(performance)
    101     good_width = performance[0][1]
    102     #print("good_width:",good_width)
    103 
    104 
    105     num_trials = 500
    106     performance = []
    107     for low in lows:
    108         wt.addJob(low, (low,low+good_width,num_trials))
    109     wt.wait()
    110     while not wt.resultq.empty():
    111         job_id,errors = wt.resultq.get()
    112         fp,fn = errors
    113         performance.append(((fp+fn)/2.0, job_id, fn, fp))
    114     performance.sort()
    115     #pprint.pprint(performance)
    116     best_low = performance[0][1]
    117     #print("best_low:", best_low)
    118 
    119    
    120     num_trials = 500
    121     widths = [good_width+(x/100.0) for x in range(-70,75,5) if good_width+(x/100.0) > 0.0]
    122     performance = []
    123     for width in widths:
    124         wt.addJob(width, (best_low,best_low+width,num_trials))
    125     wt.wait()
    126     while not wt.resultq.empty():
    127         job_id,errors = wt.resultq.get()
    128         fp,fn = errors
    129         #performance.append(((fp+fn)/2.0, job_id, fn, fp))
    130         performance.append((abs(fp-fn), job_id, fn, fp))
    131     performance.sort()
    132     #pprint.pprint(performance)
    133     best_width=performance[0][1]
    134     #print("best_width:",best_width)
    135     #print("final_performance:", performance[0][0])
    136 
    137     wt.stop()
    138     params = json.dumps({"low":best_low,"high":best_low+best_width})
    139     return {'trial_type':"train",
    140             'num_observations':num_observations,
    141             'num_trials':num_trials,
    142             'params':params,
    143             'false_positives':performance[0][3],
    144             'false_negatives':performance[0][2]}
    145 
    146 
    147 def trainSummary(summaryFunc, db, unusual_case, greater, num_observations):
    148     db.resetOffsets()
    149     stest = functools.partial(summaryTest, summaryFunc)
    150    
    151     def trainAux(distance, threshold, num_trials):
    152         estimator = functools.partial(stest, {'distance':distance,'threshold':threshold}, greater)
    153         estimates = bootstrap3(estimator, db, 'train', unusual_case, num_observations, num_trials)
    154         null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, num_observations, num_trials)
    155 
    156         bad_estimates = len([e for e in estimates if e != 1])
    157         bad_null_estimates = len([e for e in null_estimates if e != 0])
    158        
    159         false_negatives = 100.0*bad_estimates/num_trials
    160         false_positives = 100.0*bad_null_estimates/num_trials
    161         return false_positives,false_negatives
    162 
    163     #determine expected delta based on differences
    164     mean_diffs = [s['unusual_case']-s['other_cases'] for s in db.subseries('train', unusual_case)]
    165     threshold = summaryFunc(mean_diffs)/2.0
    166     #print("init_threshold:", threshold)
    167    
    168     wt = WorkerThreads(2, trainAux)
    169    
    170     num_trials = 500
    171     performance = []
    172     for distance in range(1,50):
    173         wt.addJob(distance, (distance,threshold,num_trials))
    174     wt.wait()
    175     while not wt.resultq.empty():
    176         job_id,errors = wt.resultq.get()
    177         fp,fn = errors
    178         performance.append(((fp+fn)/2.0, job_id, fn, fp))
    179    
    180     performance.sort()
    181     #pprint.pprint(performance)
    182     good_distance = performance[0][1]
    183     #print("good_distance:",good_distance)
    184 
    185    
    186     num_trials = 500
    187     performance = []
    188     for t in range(80,122,2):
    189         wt.addJob(threshold*(t/100.0), (good_distance,threshold*(t/100.0),num_trials))
    190     wt.wait()
    191     while not wt.resultq.empty():
    192         job_id,errors = wt.resultq.get()
    193         fp,fn = errors
    194         #performance.append(((fp+fn)/2.0, job_id, fn, fp))
    195         performance.append((abs(fp-fn), job_id, fn, fp))
    196     performance.sort()
    197     #pprint.pprint(performance)
    198     good_threshold = performance[0][1]
    199     #print("good_threshold:", good_threshold)
    200 
    201    
    202     num_trials = 500
    203     performance = []
    204     for d in [good_distance+s for s in range(-4,5) if good_distance+s > -1]:
    205         wt.addJob(d, (d,good_threshold,num_trials))
    206     wt.wait()
    207     while not wt.resultq.empty():
    208         job_id,errors = wt.resultq.get()
    209         fp,fn = errors
    210         performance.append(((fp+fn)/2.0, job_id, fn, fp))
    211     performance.sort()
    212     #pprint.pprint(performance)
    213     best_distance = performance[0][1]
    214     #print("best_distance:",best_distance)
    215 
    216    
    217     num_trials = 500
    218     performance = []
    219     for t in range(90,111):
    220         wt.addJob(good_threshold*(t/100.0), (best_distance,good_threshold*(t/100.0),num_trials))
    221     wt.wait()
    222     while not wt.resultq.empty():
    223         job_id,errors = wt.resultq.get()
    224         fp,fn = errors
    225         #performance.append(((fp+fn)/2.0, job_id, fn, fp))
    226         performance.append((abs(fp-fn), job_id, fn, fp))
    227     performance.sort()
    228     #pprint.pprint(performance)
    229     best_threshold = performance[0][1]
    230     #print("best_threshold:", best_threshold)
    231 
    232     wt.stop()
    233     params = json.dumps({'distance':best_distance,'threshold':best_threshold})
    234     return {'trial_type':"train",
    235             'num_observations':num_observations,
    236             'num_trials':num_trials,
    237             'params':params,
    238             'false_positives':performance[0][3],
    239             'false_negatives':performance[0][2]}
    240 
    241 
    242 def trainKalman(db, unusual_case, greater, num_observations):
    243     db.resetOffsets()
    244 
    245     def trainAux(params, num_trials):
    246         estimator = functools.partial(kalmanTest, params, greater)
    247         estimates = bootstrap3(estimator, db, 'train', unusual_case, num_observations, num_trials)
    248         null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, num_observations, num_trials)
    249        
    250         bad_estimates = len([e for e in estimates if e != 1])
    251         bad_null_estimates = len([e for e in null_estimates if e != 0])
    252        
    253         false_negatives = 100.0*bad_estimates/num_trials
    254         false_positives = 100.0*bad_null_estimates/num_trials
    255         return false_positives,false_negatives
    256    
    257     mean_diffs = [s['unusual_case']-s['other_cases'] for s in db.subseries('train', unusual_case)]
    258     good_threshold = kfilter({},mean_diffs)['est'][-1]/2.0
    259 
    260     wt = WorkerThreads(2, trainAux)
    261     num_trials = 200
    262     performance = []
    263     for t in range(90,111):
    264         params = {'threshold':good_threshold*(t/100.0)}
    265         wt.addJob(good_threshold*(t/100.0), (params,num_trials))
    266     wt.wait()
    267     while not wt.resultq.empty():
    268         job_id,errors = wt.resultq.get()
    269         fp,fn = errors
    270         #performance.append(((fp+fn)/2.0, job_id, fn, fp))
    271         performance.append((abs(fp-fn), job_id, fn, fp))
    272     performance.sort()
    273     #pprint.pprint(performance)
    274     best_threshold = performance[0][1]
    275     #print("best_threshold:", best_threshold)
    276     params = {'threshold':best_threshold}
    277 
    278     wt.stop()
    279    
    280     return {'trial_type':"train",
    281             'num_observations':num_observations,
    282             'num_trials':num_trials,
    283             'params':json.dumps(params),
    284             'false_positives':performance[0][3],
    285             'false_negatives':performance[0][2]}
    286 
    287    
    288     #determine expected delta based on differences
    289 classifiers = {'boxtest':{'train':trainBoxTest, 'test':multiBoxTest, 'train_results':[]},
    290                'midsummary':{'train':functools.partial(trainSummary, midsummary), 'test':midsummaryTest, 'train_results':[]},
    291                #'ubersummary':{'train':functools.partial(trainSummary, ubersummary), 'test':ubersummaryTest, 'train_results':[]},
    292                'quadsummary':{'train':functools.partial(trainSummary, quadsummary), 'test':quadsummaryTest, 'train_results':[]},
    293                'kalman':{'train':trainKalman, 'test':kalmanTest, 'train_results':[]},
    294                #'_trimean':{'train':None, 'test':trimeanTest, 'train_results':[]},
    295               }
    296 
    297 
    298 db = nanownlib.storage.db(options.session_data)
    299 
    300 import cProfile
    30149
    30250def trainClassifier(db, unusual_case, greater, classifier, retrain=False):
     
    32472        print("number of observations: %d | error: %f | false_positives: %f | false_negatives: %f | train time: %s | params: %s"
    32573              % (num_obs, error, result['false_positives'],result['false_negatives'], train_time, result['params']))
    326         db.addClassifierResults(result)
     74        db.addClassifierResult(result)
    32775        classifiers[classifier]['train_results'].append(result)
    32876
     
    355103
    356104
     105    def getResult(classifier, params, num_obs, num_trials):
     106        jparams = json.dumps(params, sort_keys=True)
     107        result = db.fetchClassifierResult(classifier, 'test', num_obs, jparams)
     108        if result:
     109            fp = result['false_positives']
     110            fn = result['false_negatives']
     111        else:
     112            fp,fn = testAux(params, num_trials, num_obs)
     113            result = {'classifier':classifier,
     114                      'trial_type':"test",
     115                      'num_observations':num_obs,
     116                      'num_trials':num_trials,
     117                      'params':jparams,
     118                      'false_positives':fp,
     119                      'false_negatives':fn}
     120            db.addClassifierResult(result)
     121        return ((fp+fn)/2.0,result)
     122   
    357123    if retest:
    358124        print("Dropping stored test results...")
     
    368134   
    369135        print("initial test")
    370         fp,fn = testAux(params, num_trials, num_obs)
    371         error = (fp+fn)/2.0
     136        error,result = getResult(classifier,params,num_obs,num_trials)
    372137        print("walking up")
    373138        while (error > target_error) and (num_obs < max_obs):
     
    375140            #print("increase_factor:", increase_factor)
    376141            num_obs = min(int(increase_factor*num_obs), max_obs)
    377             fp,fn = testAux(params, num_trials, num_obs)
    378             error = (fp+fn)/2.0
     142            error,result = getResult(classifier,params,num_obs,num_trials)
    379143
    380144        print("walking down")
    381145        while (num_obs > 0):
    382             current_best = (num_obs,error,params,fp,fn)
     146            current_best = (error,result)
    383147            num_obs = int(0.95*num_obs)
    384             fp,fn = testAux(params, num_trials, num_obs)
    385             error = (fp+fn)/2.0
     148            error,result = getResult(classifier,params,num_obs,num_trials)
    386149            if error > target_error:
    387150                break
    388151       
    389         test_results.append(current_best)
    390 
    391     test_results.sort()
    392     best_obs,error,best_params,fp,fn = test_results[0]
    393    
    394     return {'classifier':classifier,
    395             'trial_type':"test",
    396             'num_observations':best_obs,
    397             'num_trials':num_trials,
    398             'params':best_params,
    399             'false_positives':fp,
    400             'false_negatives':fn}
     152    return current_best
    401153
    402154
    403 start = time.time()
    404 unusual_case,unusual_diff = findUnusualTestCase(db)
    405 greater = (unusual_diff > 0)
    406 print("unusual_case:", unusual_case)
    407 print("unusual_diff:", unusual_diff)
    408 end = time.time()
    409 print(":", end-start)
     155if options.unusual_case != None:
     156    unusual_case,greater = options.unusual_case.split(',')
     157    greater = bool(int(greater))
     158else:
     159    start = time.time()
     160    unusual_case,unusual_diff = findUnusualTestCase(db)
     161    greater = (unusual_diff > 0)
     162    print("unusual_case:", unusual_case)
     163    print("unusual_diff:", unusual_diff)
     164    end = time.time()
     165    print(":", end-start)
     166
    410167
    411168for c in sorted(classifiers.keys()):
     
    424181    start = time.time()
    425182    print("Testing %s..." % c)
    426     result = testClassifier(db, unusual_case, greater, c, c in options.retest)
     183    error,result = testClassifier(db, unusual_case, greater, c, c in (options.retest+options.retrain))
    427184    print("%s result:" % c)
    428185    pprint.pprint(result)
    429     classifiers[c]['test_error'] = (result['false_positives']+result['false_negatives'])/2.0
     186    classifiers[c]['test_error'] = error
    430187    print("completed in:", time.time()-start)
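
    The new getResult() helper in bin/train avoids re-running test trials when a matching
    result is already stored. A rough sketch of that caching pattern, using only the
    db.fetchClassifierResult()/db.addClassifierResult() calls shown in this changeset
    (run_trials is a hypothetical stand-in for the existing testAux):

        import json

        def cached_test_result(db, classifier, params, num_obs, num_trials, run_trials):
            # Stored results are keyed by classifier, observation count and the params JSON
            jparams = json.dumps(params, sort_keys=True)
            result = db.fetchClassifierResult(classifier, 'test', num_obs, jparams)
            if result is None:
                fp, fn = run_trials(params, num_trials, num_obs)   # expensive bootstrap step
                result = {'classifier': classifier, 'trial_type': 'test',
                          'num_observations': num_obs, 'num_trials': num_trials,
                          'params': jparams,
                          'false_positives': fp, 'false_negatives': fn}
                db.addClassifierResult(result)
            return ((result['false_positives'] + result['false_negatives']) / 2.0, result)
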
  • trunk/lib/nanownlib/__init__.py

    r10 r11  
    220220        if p['sent']==1 and (seen[key]['observed'] > p['observed']): #earliest sent
    221221            seen[key] = p
    222             suspect += 's'
     222            suspect += 's' # duplicated sent packets
    223223            continue
    224224        if p['sent']==0 and (seen[key]['observed'] > p['observed']): #earliest rcvd
    225225            seen[key] = p
    226             suspect += 'r'
     226            suspect += 'r' # duplicated received packets
    227227            continue
    228228   
     
    236236    suspect,packets = removeDuplicatePackets(packets)
    237237
    238     sort_key = lambda d: (d['tcpseq'],d['observed'])
     238    sort_key = lambda d: (d['observed'],d['tcpseq'])
     239    alt_key = lambda d: (d['tcpseq'],d['observed'])
    239240    sent = sorted((p for p in packets if p['sent']==1 and p['payload_len']>0), key=sort_key)
    240241    rcvd = sorted((p for p in packets if p['sent']==0 and p['payload_len']>0), key=sort_key)
    241 
    242     alt_key = lambda d: (d['observed'],d['tcpseq'])
    243242    rcvd_alt = sorted((p for p in packets if p['sent']==0 and p['payload_len']>0), key=alt_key)
    244243
    245244    s_off = trim_sent
    246245    if s_off >= len(sent):
     246        suspect += 'd' # dropped packet?
    247247        s_off = -1
    248248    last_sent = sent[s_off]
    249249
    250250    r_off = len(rcvd) - trim_rcvd - 1
    251     if r_off <= 0:
     251    if r_off < 0:
     252        suspect += 'd' # dropped packet?
    252253        r_off = 0
    253254    last_rcvd = rcvd[r_off]
    254255    if last_rcvd != rcvd_alt[r_off]:
    255         suspect += 'R'
     256        suspect += 'R' # reordered received packets
    256257   
    257258    packet_rtt = last_rcvd['observed'] - last_sent['observed']
     
    263264        last_sent_ack = min(((p['observed'],p) for p in packets
    264265                             if p['sent']==0 and p['payload_len']+last_sent['tcpseq']==p['tcpack']))[1]
     266       
    265267    except Exception as e:
    266268        sys.stderr.write("WARN: Could not find last_sent_ack.\n")
     
    301303def analyzeProbes(db):
    302304    db.conn.execute("CREATE INDEX IF NOT EXISTS packets_probe ON packets (probe_id)")
     305    db.conn.commit()
     306
    303307    pcursor = db.conn.cursor()
    304     db.conn.commit()
    305 
    306308    pcursor.execute("SELECT tcpts_mean FROM meta")
    307309    try:
     
    346348            db.addTrimAnalyses([analysis])
    347349        except Exception as e:
    348             traceback.print_exc()
     350            #traceback.print_exc()
    349351            sys.stderr.write("WARN: couldn't find enough packets for probe_id=%s\n" % probe_id)
    350352       
     
    367369                    analysis['probe_id'] = probe_id
    368370                except Exception as e:
    369                     print(e)
    370                     sys.stderr.write("WARN: couldn't find enough packets for probe_id=%s\n" % pid)
     371                    #traceback.print_exc()
     372                    sys.stderr.write("WARN: couldn't find enough packets for probe_id=%s\n" % probe_id)
    371373                   
    372374                db.addTrimAnalyses([analysis])
  • trunk/lib/nanownlib/parallel.py

    r9 r11  
    11#
    22
     3import sys
    34import threading
    45import queue
     
    3334                self.resultq.put((job_id, self.target(*args)))
    3435            except Exception as e:
    35                 sys.stderr.write("ERROR: Job '%s' failed with '%s'.  Dropping...\n",
     36                sys.stderr.write("ERROR: Job '%s' failed with '%s'.  Dropping...\n" %
    3637                                 (str(job_id),str(e)))
    3738            self.workq.task_done()
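
    The parallel.py change fixes a latent bug: sys.stderr.write() accepts a single string,
    so passing the format arguments as extra positional arguments (the r9 form) would raise
    a TypeError instead of logging. The corrected call applies the % operator first, e.g.
    (values illustrative):

        import sys
        job_id, err = 'job-7', 'division by zero'
        sys.stderr.write("ERROR: Job '%s' failed with '%s'.  Dropping...\n" % (job_id, err))
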
  • trunk/lib/nanownlib/stats.py

    r10 r11  
    144144def ubersummary(values, distance=25):
    145145    left2 = 50-distance
     146    left3 = 50-(distance/2.0)
    146147    left1 = left2/2.0
    147     left3 = (left2+50)/2.0
    148148    right2 = 50+distance
    149     right3 = (right2+50)/2.0
     149    right3 = 50+(distance/2.0)
    150150    right1 = (right2+100)/2.0
    151151    l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
    152     #print(left1,left2,left3,50,right3,right2,right1)
    153152    #print(l1,l2,l3,m,r3,r2,r1)
    154153    return (l1+l2*4+l3+r3+r2*4+r1)/12.0
    155154    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
    156155
    157 def quadsummary(values, distance=25):
    158     left2 = 50-distance
    159     left1 = left2/2.0
    160     right2 = 50+distance
    161     right1 = (right2+100)/2.0
    162     l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
    163     #print(left1,left2,left3,50,right3,right2,right1)
    164     #print(l1,l2,l3,m,r3,r2,r1)
    165     return (l1+l2+r2+r1)/4.0
    166     #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
    167 
     156   
    168157def quadsummary(values, distance=25):
    169158    left1 = 50-distance
     
    177166    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
    178167
    179    
     168
     169def tsvalwmean(subseries):
     170    weights = [(s['unusual_packet']+s['other_packet'])**2 for s in subseries]
     171    normalizer = sum(weights)/len(weights)
     172    return numpy.mean([weights[i]*(subseries[i]['unusual_tsval']-subseries[i]['other_tsval'])/normalizer
     173                       for i in range(len(weights))])
     174
     175#def tsvalwmean(subseries):
     176#    return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries])
     177
     178
    180179def weightedMean(derived, weights):
    181180    normalizer = sum(weights.values())/len(weights)
     
    186185    return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])
    187186
     187
     188   
    188189
    189190def estimateMean(trustFunc, weightFunc, alpha, derived):
     
    199200
    200201
    201 #def estimateMedian(trustFunc, weightFunc, alpha, derived):
    202 #    trust = trustValues(derived, trustFunc)
    203 #    weights = weightFunc(derived, trust, alpha)
    204 
    205 #    return statistics.median([(derived[k]['long']-derived[k]['short']) for k,w in weights.items() if w > 0.0])
    206 
    207 def estimateMedian(derived):
    208     return statistics.median([(d['long']-d['short']) for d in derived.values()])
    209 
    210 
    211 def estimateMidsummary(derived):
    212     return midsummary([(d['long']-d['short']) for d in derived.values()])
    213 
    214 
    215 def estimateTrimean(derived):
    216     return trimean([(d['long']-d['short']) for d in derived.values()])
    217 
    218 
    219 def tTest(expected_mean, derived):
    220     diffs = [(d['long']-d['short']) for d in derived.values()]
    221     null_tval, null_pval = scipy.stats.ttest_1samp(diffs, 0.0)
    222     tval, pval = scipy.stats.ttest_1samp(diffs, expected_mean)
    223 
    224     if pval < null_pval:
    225         return 1
    226     else:
    227         return 0
    228 
    229    
    230 def diffMedian(derived):
    231     l = [tc['long']-tc['short'] for s,tc in derived.items()]
    232     return statistics.median(l)
    233 
    234 
    235 def subsample_ids(db, probe_type, subsample_size=None):
    236     cursor = db.conn.cursor()
    237     cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,))
    238     population_size = cursor.fetchone()[0]
    239     #print("population_size:", population_size)
    240     if subsample_size == None or subsample_size > population_size:
    241         subsample_size = population_size
    242    
    243     start = numpy.random.random_integers(0,population_size-1)
    244     cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ? OFFSET ?", (probe_type,subsample_size,start))
    245     for row in cursor:
    246         subsample_size -= 1
    247         yield row['sample']
    248 
    249     if subsample_size > 0:
    250         cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ?", (probe_type,subsample_size))
    251         for row in cursor:
    252             yield row['sample']
    253    
    254 
    255 def subsample(db, probe_type, subsample_size=None):
    256     cursor = db.conn.cursor()
    257     cursor.execute("SELECT count(test_case) FROM (SELECT test_case FROM probes GROUP BY test_case)")
    258     num_test_cases = cursor.fetchone()[0]
    259    
    260     for sid in subsample_ids(db, probe_type, subsample_size):
    261         cursor.execute("SELECT test_case,tc_order,time_of_day,reported,userspace_rtt,suspect,packet_rtt,tsval_rtt FROM probes p,analysis a WHERE p.sample=? and a.probe_id=p.id", (sid,))
    262         probes = cursor.fetchall()
    263         if len(probes) != num_test_cases:
    264             sys.stderr.write("WARN: sample %d had %d probes, but %d expected!  Discarding...\n" % (sid, len(probes), num_test_cases))
    265             continue
    266         yield (sid,[dict(r) for r in probes])
    267 
    268 
    269 def subseries(db, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
    270     population_size = db.populationSize(probe_type)
    271 
    272     if size == None or size > population_size:
    273         size = population_size
    274     if offset == None or offset >= population_size or offset < 0:
    275         offset = numpy.random.random_integers(0,population_size-1)
    276 
    277     query="""
    278       SELECT %(field)s AS unusual_case,
    279              (SELECT avg(%(field)s) FROM probes,analysis
    280               WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
    281       FROM   (SELECT probes.sample,%(field)s FROM probes,analysis
    282               WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u
    283       LIMIT :size OFFSET :offset
    284     """ % {"field":field}
    285    
    286     params = {"probe_type":probe_type, "unusual_case":unusual_case, "offset":offset, "size":size}
    287     cursor = db.conn.cursor()
    288     cursor.execute(query, params)
    289     ret_val = [dict(row) for row in cursor.fetchall()]
    290     #for row in cursor:
    291     #    size -= 1
    292     #    yield dict(row)
    293 
    294     size -= len(ret_val)
    295     if size > 0:
    296         params['offset'] = 0
    297         params['size'] = size
    298         cursor.execute(query, params)
    299         ret_val += [dict(row) for row in cursor.fetchall()]
    300         #for row in cursor:
    301         #    yield dict(row)
    302    
    303     return ret_val
    304 
    305 
    306 # if test_cases=None, include all of them.  Otherwise, include only the specified test cases.
    307 def samples2Distributions(samples, field, test_cases=None):
    308     ret_val = {}
    309    
    310     for sid,probes in samples:
    311         for p in probes:
    312             if p['test_case'] in ret_val:
    313                 ret_val[p['test_case']].append(p[field])
    314             elif test_cases == None or p['test_case'] in test_cases:
    315                 ret_val[p['test_case']] = [p[field]]
    316                
    317                
    318     return ret_val
    319 
    320 
    321 def samples2MeanDiffs(samples, field, unusual_case):
    322     ret_val = {}
    323    
    324     for sid,probes in samples:
    325         unusual_value = None
    326         for p in probes:
    327             if p['test_case'] == unusual_case:
    328                 unusual_value = p[field]
    329                 break
    330         yield statistics.mean([unusual_value-p[field] for p in probes if p['test_case'] != unusual_case])
    331 
    332 
    333 def bootstrap(estimator, db, probe_type, test_cases, subsample_size, num_trials):
    334     ret_val = []
    335     for t in range(num_trials):
    336         ret_val.append(estimator(test_cases, subsample(db, probe_type, subsample_size)))
    337 
    338     return ret_val
    339 
    340 
    341 def bootstrap2(estimator, db, probe_type, subsample_size, num_trials):
    342     ret_val = []
    343     for t in range(num_trials):
    344         ret_val.append(estimator(subsample(db, probe_type, subsample_size)))
    345 
    346     return ret_val
    347 
    348 
    349202def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
    350203    ret_val = []
     
    355208
    356209
    357 # Returns the test case name that clearly has higher RTT; otherwise, returns None
    358 def boxTest(params, test_cases, samples):
    359     if len(test_cases) != 2:
    360         # XXX: somehow generalize the box test to handle more than 2 cases
    361         raise Exception()
    362     dists = samples2Distributions(samples,'packet_rtt', test_cases) #XXX: field from params
    363 
    364     tmp1,tmp2 = dists.items()
    365     test_case1,dist1 = tmp1
    366     test_case2,dist2 = tmp2
    367    
    368     d1_high = numpy.percentile(dist1, params['high'])
    369     d2_low = numpy.percentile(dist2, params['low'])
    370     if d1_high < d2_low:
    371         return test_case2
    372 
    373     d1_low = numpy.percentile(dist1, params['low'])
    374     d2_high = numpy.percentile(dist2, params['high'])
    375 
    376     if d2_high < d1_low:
    377         return test_case1
    378    
    379     return None
    380 
    381 
    382210# Returns 1 if unusual_case is unusual in the expected direction
    383211#         0 if it isn't unusual
    384212#        -1 if it is unusual in the wrong direction
    385213def multiBoxTest(params, greater, samples):
    386     uc = [s['unusual_case'] for s in samples]
    387     rest = [s['other_cases'] for s in samples]
     214    uc = [s['unusual_packet'] for s in samples]
     215    rest = [s['other_packet'] for s in samples]
    388216   
    389217    uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
     
    407235#         0 otherwise
    408236def summaryTest(f, params, greater, samples):
    409     diffs = [s['unusual_case']-s['other_cases'] for s in samples]
     237    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]
    410238
    411239    mh = f(diffs, params['distance'])
     
    420248        else:
    421249            return 0
     250
    422251
    423252midsummaryTest = functools.partial(summaryTest, midsummary)
     
    451280
    452281def kfilter(params, observations):
    453     x = numpy.array(observations)
     282    x = numpy.array(observations)   
    454283    movement = 0
    455     est = []   
     284    est = []
    456285    var = []
    457286    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
    458                         #P  = 10000,          # initial variance
    459                         P  = 10,          # initial variance
     287                        #P  = 10000,         # initial variance
     288                        P  = 10,            # initial variance
     460289                        R  = numpy.std(x),   # sensor noise
    461290                        Q  = 0)              # movement noise
     
    471300
    472301def kalmanTest(params, greater, samples):
    473     diffs = [s['unusual_case']-s['other_cases'] for s in samples]
     302    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]
    474303
    475304    m = kfilter(params, diffs)['est'][-1]
     
    486315
    487316
    488 def kalmanTest2(params, greater, samples):
    489     diffs = [s['unusual_case']-s['other_cases'] for s in samples]
    490 
    491     estimates = []
    492     size = 500
    493     for i in range(100):
    494         off = random.randrange(0,len(diffs))
    495         sub = diffs[off:size]
    496         if len(sub) < size:
    497             sub += diffs[0:size-len(sub)]
    498         estimates.append(kfilter(params, sub)['est'][-1])
    499            
    500     m = quadsummary(estimates)
     317def tsvalwmeanTest(params, greater, samples):
     318    m = tsvalwmean(samples)
    501319    if greater:
    502320        if m > params['threshold']:
     
    509327        else:
    510328            return 0
    511 
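
    The new tsvalwmean() computes a weighted average of each sample's TCP-timestamp RTT
    difference, with weights proportional to the square of the sample's combined packet
    RTT. A small illustrative check (field names follow the revised subseries() columns;
    the numbers are made up):

        rows = [{'unusual_packet': 2000.0, 'other_packet': 1900.0,
                 'unusual_tsval': 2100.0, 'other_tsval': 1950.0},
                {'unusual_packet': 2500.0, 'other_packet': 2400.0,
                 'unusual_tsval': 2600.0, 'other_tsval': 2450.0}]
        # Both tsval deltas are 150, so the weighted mean is also 150 regardless of weights.
        print(tsvalwmean(rows))   # -> 150.0
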
  • trunk/lib/nanownlib/storage.py

    r10 r11  
    115115
    116116
    117     def subseries(self, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
    118         cache_key = (probe_type,unusual_case,field)
    119        
     117    def subseries(self, probe_type, unusual_case, size=None, offset=None):
     118        cache_key = (probe_type,unusual_case)
    120119        if cache_key not in self._population_cache:
    121120            query="""
    122             SELECT %(field)s AS unusual_case,
    123                    (SELECT avg(%(field)s) FROM probes,analysis
    124                     WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
    125             FROM   (SELECT probes.sample,%(field)s FROM probes,analysis
     121            SELECT packet_rtt AS unusual_packet,
     122                   (SELECT avg(packet_rtt) FROM probes,analysis
     123                    WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_packet,
     124
     125                   tsval_rtt AS unusual_tsval,
     126                   (SELECT avg(tsval_rtt) FROM probes,analysis
     127                    WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_tsval,
     128
     129                   reported AS unusual_reported,
     130                   (SELECT avg(reported) FROM probes,analysis
     131                    WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_reported
     132
     133            FROM   (SELECT probes.sample,packet_rtt,tsval_rtt,reported FROM probes,analysis
    126134                    WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u
    127             """ % {"field":field}
     135            """
    128136   
    129137            params = {"probe_type":probe_type, "unusual_case":unusual_case}
     
    207215        return [self._insert('trim_analysis', row) for row in analyses]
    208216
    209     def addClassifierResults(self, results):
     217    def addClassifierResult(self, results):
    210218        ret_val = self._insert('classifier_results', results)
    211219        self.conn.commit()
    212220        return ret_val
    213221
    214     def fetchClassifierResult(self, classifier, trial_type, num_observations):
     222    def fetchClassifierResult(self, classifier, trial_type, num_observations, params=None):
    215223        query = """
    216224          SELECT * FROM classifier_results
    217           WHERE classifier=? AND trial_type=? AND num_observations=?
    218           ORDER BY false_positives+false_negatives
    219           LIMIT 1;
     225            WHERE classifier=:classifier
     226                  AND trial_type=:trial_type
     227                  AND num_observations=:num_observations"""
     228        if params != None:
     229            query += """
     230                  AND params=:params"""
     231        query += """
     232            ORDER BY false_positives+false_negatives
     233            LIMIT 1
    220234        """
     235
     236        qparams = {'classifier':classifier, 'trial_type':trial_type,
     237                   'num_observations':num_observations,'params':params}
    221238        cursor = self.conn.cursor()
    222         cursor.execute(query, (classifier, trial_type, num_observations))
     239        cursor.execute(query, qparams)
    223240        ret_val = cursor.fetchone()
    224 
    225241        if ret_val != None:
    226242            ret_val = dict(ret_val)
    227243        return ret_val
    228 
     244   
    229245    def deleteClassifierResults(self, classifier, trial_type, num_observations=None):
    230246        params = {"classifier":classifier,"trial_type":trial_type,"num_observations":num_observations}
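
    With the r11 query, db.subseries() returns each sample's packet_rtt, tsval_rtt and
    reported values for the unusual case alongside the averages over the other cases. A
    hypothetical peek at the resulting row shape (the session filename and test case name
    are placeholders):

        import nanownlib.storage
        db = nanownlib.storage.db('session.db')     # placeholder filename
        row = db.subseries('train', 'long')[0]
        print(sorted(row.keys()))
        # expected keys, per the SELECT above:
        # ['other_packet', 'other_reported', 'other_tsval',
        #  'unusual_packet', 'unusual_reported', 'unusual_tsval']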