Changeset 10 for trunk

Timestamp: 07/13/15 19:16:30 (9 years ago)
Author: tim
Message: .
Location: trunk
Files: 6 edited
  • trunk/bin/analyze_packets

    r5 → r10

  start = time.time()
+ import cProfile
+ #cProfile.run('num_probes = analyzeProbes(db)')
  num_probes = analyzeProbes(db)
  end = time.time()
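
    The commented-out cProfile.run() call above shows the intended profiling hook. As a minimal, self-contained sketch of the same idea (analyzeProbes and db are stand-ins here, not the script's real objects):

        import cProfile
        import pstats

        def analyzeProbes(db):        # stand-in for the real analysis routine
            return sum(i*i for i in range(100000))

        db = None                     # placeholder for the script's database handle
        profiler = cProfile.Profile()
        profiler.enable()
        num_probes = analyzeProbes(db)
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(5)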
  • trunk/bin/graph

    r6 → r10

  import json

+ import numpy
  import matplotlib.mlab as mlab
  import matplotlib.pyplot as plt

…

  print('packet_rtt diff median: %f' % statistics.median(diffs))
- print('packet_rtt diff midhinge: %f' % midhinge(diffs))
+ print('packet_rtt diff midhinge: %f' % midsummary(diffs))
  print('packet_rtt diff trimean: %f' % trimean(diffs))
+ print('packet_rtt diff quadsummary: %f' % quadsummary(diffs))
+ print('packet_rtt diff ubersummary: %f' % ubersummary(diffs))
  print('packet_rtt diff MAD: %f' % mad(diffs))
  print('reported diff trimean: %f' % trimean(reported_diffs))
+ print('reported diff quadsummary: %f' % quadsummary(reported_diffs))
+ print('reported diff ubersummary: %f' % ubersummary(reported_diffs))
  print('reported diff MAD: %f' % mad(reported_diffs))
+
+ import cProfile
+ kresults = kfilter({},diffs)
+ #print('packet_rtt diff kfilter: ', numpy.mean(kresults['est']), kresults['var'])
+ print('packet_rtt diff kfilter: ', kresults['est'][-1], kresults['var'][-1])
+ kresults = kfilter({},reported_diffs)
+ #print('reported diff kfilter: ', numpy.mean(kresults['est']), kresults['var'][-1])
+ print('reported diff kfilter: ', kresults['est'][-1], kresults['var'][-1])
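
    kfilter (added to nanownlib.stats in this changeset) returns the full history of estimates and variances; the script prints only the final, most-converged values. A shape mock-up of that access pattern (values invented):

        kresults = {'est': [12.1, 12.4, 12.3, 12.35],   # running state estimates
                    'var': [9.0, 4.7, 3.2, 2.4]}        # shrinking variances
        print('packet_rtt diff kfilter: ', kresults['est'][-1], kresults['var'][-1])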
  • trunk/bin/train

    r9 → r10

  from nanownlib import *
+ from nanownlib.stats import *
+ from nanownlib.parallel import WorkerThreads
  import nanownlib.storage
- from nanownlib.stats import boxTest,multiBoxTest,subsample,bootstrap,bootstrap2,trimean,midhinge,midhingeTest,samples2Distributions,samples2MeanDiffs
- from nanownlib.parallel import WorkerThreads
+

…

  #parser.add_argument('-c', dest='cases', type=str, default='{"short":10000,"long":1010000}',
  #                    help='JSON representation of echo timing cases. Default: {"short":10000,"long":1010000}')
+ parser.add_argument('--retrain', action='append', default=[], help='Force a classifier to be retrained.  May be specified multiple times.')
+ parser.add_argument('--retest', action='append', default=[], help='Force a classifier to be retested.  May be specified multiple times.')
  parser.add_argument('session_data', default=None,
                      help='Database file storing session information')
  options = parser.parse_args()

-
-
- def trainBoxTest(db, unusual_case, greater, subseries_size):
-
+
+ def trainBoxTest(db, unusual_case, greater, num_observations):
+     db.resetOffsets()
+
      def trainAux(low,high,num_trials):
          estimator = functools.partial(multiBoxTest, {'low':low, 'high':high}, greater)
-         estimates = bootstrap3(estimator, db, 'train', unusual_case, subseries_size, num_trials)
-         null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, subseries_size, num_trials)
+         estimates = bootstrap3(estimator, db, 'train', unusual_case, num_observations, num_trials)
+         null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, num_observations, num_trials)

          bad_estimates = len([e for e in estimates if e != 1])

…

      num_trials = 500
-     widths = [good_width+(x/100.0) for x in range(-60,75,5) if good_width+(x/100.0) > 0.0]
+     widths = [good_width+(x/100.0) for x in range(-70,75,5) if good_width+(x/100.0) > 0.0]
      performance = []
      for width in widths:

…

          job_id,errors = wt.resultq.get()
          fp,fn = errors
-         performance.append(((fp+fn)/2.0, job_id, fn, fp))
+         #performance.append(((fp+fn)/2.0, job_id, fn, fp))
+         performance.append((abs(fp-fn), job_id, fn, fp))
      performance.sort()
      #pprint.pprint(performance)

…

      wt.stop()
      params = json.dumps({"low":best_low,"high":best_low+best_width})
-     return {'algorithm':"boxtest",
+     return {'trial_type':"train",
+             'num_observations':num_observations,
+             'num_trials':num_trials,
              'params':params,
-             'sample_size':subseries_size,
-             'num_trials':num_trials,
-             'trial_type':"train",
              'false_positives':performance[0][3],
              'false_negatives':performance[0][2]}


- def trainMidhinge(db, unusual_case, greater, subseries_size):
-
+ def trainSummary(summaryFunc, db, unusual_case, greater, num_observations):
+     db.resetOffsets()
+     stest = functools.partial(summaryTest, summaryFunc)
+
      def trainAux(distance, threshold, num_trials):
-         estimator = functools.partial(midhingeTest, {'distance':distance,'threshold':threshold}, greater)
-         estimates = bootstrap3(estimator, db, 'train', unusual_case, subseries_size, num_trials)
-         null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, subseries_size, num_trials)
+         estimator = functools.partial(stest, {'distance':distance,'threshold':threshold}, greater)
+         estimates = bootstrap3(estimator, db, 'train', unusual_case, num_observations, num_trials)
+         null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, num_observations, num_trials)

          bad_estimates = len([e for e in estimates if e != 1])

…

      #determine expected delta based on differences
      mean_diffs = [s['unusual_case']-s['other_cases'] for s in db.subseries('train', unusual_case)]
-     threshold = trimean(mean_diffs)/2.0
+     threshold = summaryFunc(mean_diffs)/2.0
      #print("init_threshold:", threshold)

…

      num_trials = 500
      performance = []
-     for t in range(50,154,4):
+     for t in range(80,122,2):
          wt.addJob(threshold*(t/100.0), (good_distance,threshold*(t/100.0),num_trials))
      wt.wait()

…

          job_id,errors = wt.resultq.get()
          fp,fn = errors
-         performance.append(((fp+fn)/2.0, job_id, fn, fp))
+         #performance.append(((fp+fn)/2.0, job_id, fn, fp))
+         performance.append((abs(fp-fn), job_id, fn, fp))
      performance.sort()
      #pprint.pprint(performance)

…

          job_id,errors = wt.resultq.get()
          fp,fn = errors
-         performance.append(((fp+fn)/2.0, job_id, fn, fp))
+         #performance.append(((fp+fn)/2.0, job_id, fn, fp))
+         performance.append((abs(fp-fn), job_id, fn, fp))
      performance.sort()
      #pprint.pprint(performance)

…

      wt.stop()
      params = json.dumps({'distance':best_distance,'threshold':best_threshold})
-     return {'algorithm':"midhinge",
+     return {'trial_type':"train",
+             'num_observations':num_observations,
+             'num_trials':num_trials,
              'params':params,
-             'sample_size':subseries_size,
-             'num_trials':num_trials,
-             'trial_type':"train",
              'false_positives':performance[0][3],
              'false_negatives':performance[0][2]}


- classifiers = {'boxtest':{'train':trainBoxTest, 'test':multiBoxTest},
-                'midhinge':{'train':trainMidhinge, 'test':midhinge}}
+ def trainKalman(db, unusual_case, greater, num_observations):
+     db.resetOffsets()
+
+     def trainAux(params, num_trials):
+         estimator = functools.partial(kalmanTest, params, greater)
+         estimates = bootstrap3(estimator, db, 'train', unusual_case, num_observations, num_trials)
+         null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, num_observations, num_trials)
+
+         bad_estimates = len([e for e in estimates if e != 1])
+         bad_null_estimates = len([e for e in null_estimates if e != 0])
+
+         false_negatives = 100.0*bad_estimates/num_trials
+         false_positives = 100.0*bad_null_estimates/num_trials
+         return false_positives,false_negatives
+
+     mean_diffs = [s['unusual_case']-s['other_cases'] for s in db.subseries('train', unusual_case)]
+     good_threshold = kfilter({},mean_diffs)['est'][-1]/2.0
+
+     wt = WorkerThreads(2, trainAux)
+     num_trials = 200
+     performance = []
+     for t in range(90,111):
+         params = {'threshold':good_threshold*(t/100.0)}
+         wt.addJob(good_threshold*(t/100.0), (params,num_trials))
+     wt.wait()
+     while not wt.resultq.empty():
+         job_id,errors = wt.resultq.get()
+         fp,fn = errors
+         #performance.append(((fp+fn)/2.0, job_id, fn, fp))
+         performance.append((abs(fp-fn), job_id, fn, fp))
+     performance.sort()
+     #pprint.pprint(performance)
+     best_threshold = performance[0][1]
+     #print("best_threshold:", best_threshold)
+     params = {'threshold':best_threshold}
+
+     wt.stop()
+
+     return {'trial_type':"train",
+             'num_observations':num_observations,
+             'num_trials':num_trials,
+             'params':json.dumps(params),
+             'false_positives':performance[0][3],
+             'false_negatives':performance[0][2]}
+
+
+     #determine expected delta based on differences
+ classifiers = {'boxtest':{'train':trainBoxTest, 'test':multiBoxTest, 'train_results':[]},
+                'midsummary':{'train':functools.partial(trainSummary, midsummary), 'test':midsummaryTest, 'train_results':[]},
+                #'ubersummary':{'train':functools.partial(trainSummary, ubersummary), 'test':ubersummaryTest, 'train_results':[]},
+                'quadsummary':{'train':functools.partial(trainSummary, quadsummary), 'test':quadsummaryTest, 'train_results':[]},
+                'kalman':{'train':trainKalman, 'test':kalmanTest, 'train_results':[]},
+                #'_trimean':{'train':None, 'test':trimeanTest, 'train_results':[]},
+               }
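
    The classifiers table binds each summary statistic to the shared trainSummary routine with functools.partial, so every 'train' entry exposes the same call signature. A simplified sketch of the pattern (the trainer signature here is reduced for illustration):

        import functools

        def train_summary(summary_func, data, num_observations):
            # Generic trainer parameterized by its summary statistic.
            return summary_func(data[:num_observations])

        def midsummary(values):
            s = sorted(values)
            return (s[len(s)//4] + s[3*len(s)//4]) / 2.0   # rough 25th/75th midpoint

        train_mid = functools.partial(train_summary, midsummary)
        print(train_mid([5, 1, 4, 2, 3], 5))   # same as train_summary(midsummary, ...)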
     
  import cProfile

- def trainClassifier(db, unusual_case, greater, trainer):
+ def trainClassifier(db, unusual_case, greater, classifier, retrain=False):
+     if retrain:
+         print("Dropping stored training results...")
+         db.deleteClassifierResults(classifier, 'train')
+
+     trainer = classifiers[classifier]['train']
      threshold = 5.0 # in percent
-     size = 4000
+     num_obs = 1000
+     max_obs = int(db.populationSize('train')/5)
      result = None
-     while size < db.populationSize('train')/5:
-         size = min(size*2, int(db.populationSize('train')/5))
-         result = trainer(db,unusual_case,greater,size)
+     while num_obs < max_obs:
+         num_obs = min(int(num_obs*1.5), max_obs)
+         result = db.fetchClassifierResult(classifier, 'train', num_obs)
+         if result != None:
+             train_time = "(stored)"
+         else:
+             start = time.time()
+             result = trainer(db,unusual_case,greater,num_obs)
+             result['classifier'] = classifier
+             train_time = "%f" % (time.time()-start)
+
          error = statistics.mean([result['false_positives'],result['false_negatives']])
-         print("subseries size: %d | error: %f | false_positives: %f | false_negatives: %f"
-               % (size,error,result['false_positives'],result['false_negatives']))
+         print("number of observations: %d | error: %f | false_positives: %f | false_negatives: %f | train time: %s | params: %s"
+               % (num_obs, error, result['false_positives'],result['false_negatives'], train_time, result['params']))
+         db.addClassifierResults(result)
+         classifiers[classifier]['train_results'].append(result)
+
          if error < threshold:
              break
-     if result != None:
-         db.addClassifierResults(result)

      return result
+
+
+
+ def testClassifier(db, unusual_case, greater, classifier, retest=False):
+     target_error = 5.0 # in percent
+     num_trials = 1000
+     max_obs = int(db.populationSize('test')/5)
+
+     tester = classifiers[classifier]['test']
+
+     def testAux(params, num_trials, num_observations):
+         estimator = functools.partial(tester, params, greater)
+         estimates = bootstrap3(estimator, db, 'test', unusual_case, num_observations, num_trials)
+         null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, num_observations, num_trials)
+
+         bad_estimates = len([e for e in estimates if e != 1])
+         bad_null_estimates = len([e for e in null_estimates if e != 0])
+
+         false_negatives = 100.0*bad_estimates/num_trials
+         false_positives = 100.0*bad_null_estimates/num_trials
+         print("testAux:", num_observations, false_positives, false_negatives, params)
+         return false_positives,false_negatives
+
+
+     if retest:
+         print("Dropping stored test results...")
+         db.deleteClassifierResults(classifier, 'test')
+
+
+     test_results = []
+     lte = math.log(target_error/100.0)
+     for tr in classifiers[classifier]['train_results']:
+         db.resetOffsets()
+         params = json.loads(tr['params'])
+         num_obs = tr['num_observations']
+
+         print("initial test")
+         fp,fn = testAux(params, num_trials, num_obs)
+         error = (fp+fn)/2.0
+         print("walking up")
+         while (error > target_error) and (num_obs < max_obs):
+             increase_factor = 1.5 * lte/math.log(error/100.0) # don't ask how I came up with this
+             #print("increase_factor:", increase_factor)
+             num_obs = min(int(increase_factor*num_obs), max_obs)
+             fp,fn = testAux(params, num_trials, num_obs)
+             error = (fp+fn)/2.0
+
+         print("walking down")
+         while (num_obs > 0):
+             current_best = (num_obs,error,params,fp,fn)
+             num_obs = int(0.95*num_obs)
+             fp,fn = testAux(params, num_trials, num_obs)
+             error = (fp+fn)/2.0
+             if error > target_error:
+                 break
+
+         test_results.append(current_best)
+
+     test_results.sort()
+     best_obs,error,best_params,fp,fn = test_results[0]
+
+     return {'classifier':classifier,
+             'trial_type':"test",
+             'num_observations':best_obs,
+             'num_trials':num_trials,
+             'params':best_params,
+             'false_positives':fp,
+             'false_negatives':fn}
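
    The "walking up" loop grows num_obs by a factor tied to the ratio of log error rates: far above the target error the factor is large, and it falls toward 1.5 as the error approaches the target. Illustrative arithmetic with invented error rates:

        import math

        target_error = 5.0                     # percent, as in testClassifier
        lte = math.log(target_error/100.0)
        for error in (40.0, 20.0, 10.0, 6.0):  # hypothetical observed error rates
            increase_factor = 1.5 * lte/math.log(error/100.0)
            print("error %.1f%% -> grow num_obs by %.2fx" % (error, increase_factor))
        # error 40.0% -> grow num_obs by 4.90x
        # error 20.0% -> grow num_obs by 2.79x
        # error 10.0% -> grow num_obs by 1.95x
        # error 6.0% -> grow num_obs by 1.60x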
     
  print(":", end-start)

-
- for c,funcs in classifiers.items():
+ for c in sorted(classifiers.keys()):
+     if classifiers[c]['train'] == None:
+         continue
      start = time.time()
      print("Training %s..." % c)
-     result = trainClassifier(db, unusual_case, greater, funcs['train'])
+     result = trainClassifier(db, unusual_case, greater, c, c in options.retrain)
      print("%s result:" % c)
      pprint.pprint(result)
      print("completed in:", time.time()-start)

- sys.exit(0)
-
- start = time.time()
- results = trainBoxTest(db, unusual_case, greater, 6000)
- #db.addClassifierResults(results)
- print("multi box test result:")
- pprint.pprint(results)
- print(":", time.time()-start)
+ db.clearCache()
+
+ for c in sorted(classifiers.keys()):
+     start = time.time()
+     print("Testing %s..." % c)
+     result = testClassifier(db, unusual_case, greater, c, c in options.retest)
+     print("%s result:" % c)
+     pprint.pprint(result)
+     classifiers[c]['test_error'] = (result['false_positives']+result['false_negatives'])/2.0
+     print("completed in:", time.time()-start)
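
    Because --retrain and --retest use argparse's append action, repeating a flag accumulates classifier names, which the loops above check with `c in options.retrain` / `c in options.retest`. A small demonstration of the accumulation (argument values invented):

        import argparse

        parser = argparse.ArgumentParser()
        parser.add_argument('--retrain', action='append', default=[])
        parser.add_argument('--retest', action='append', default=[])
        parser.add_argument('session_data')
        options = parser.parse_args(['--retrain', 'kalman',
                                     '--retest', 'kalman', '--retest', 'boxtest',
                                     'session.db'])
        print(options.retrain)   # ['kalman']
        print(options.retest)    # ['kalman', 'boxtest']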
  • trunk/lib/nanownlib/__init__.py

    r6 → r10

  import sys
  import time
+ import traceback
  import random
  import argparse
     
  def removeDuplicatePackets(packets):
      #return packets
-     suspect = None
+     suspect = ''
      seen = {}
      # XXX: Need to review this deduplication algorithm and make sure it is correct
      for p in packets:
          key = (p['sent'],p['tcpseq'],p['tcpack'],p['payload_len'])
-         if (key not in seen)\
-            or p['sent']==1 and (seen[key]['observed'] < p['observed'])\
-            or p['sent']==0 and (seen[key]['observed'] > p['observed']):
-             #if (key not in seen) or (seen[key]['observed'] > p['observed']):
+         if (key not in seen):
              seen[key] = p
-
-     if len(seen) < len(packets):
-         suspect = 'd'
-         #sys.stderr.write("INFO: removed %d duplicate packets.\n" % (len(packets) - len(seen)))
+             continue
+         if p['sent']==1 and (seen[key]['observed'] > p['observed']): #earliest sent
+             seen[key] = p
+             suspect += 's'
+             continue
+         if p['sent']==0 and (seen[key]['observed'] > p['observed']): #earliest rcvd
+             seen[key] = p
+             suspect += 'r'
+             continue
+
+     #if len(seen) < len(packets):
+     #   sys.stderr.write("INFO: removed %d duplicate packets.\n" % (len(packets) - len(seen)))

      return suspect,seen.values()
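
    The rewritten deduplication keeps the earliest-observed copy of each sent and each received packet, accumulating per-packet flags in the suspect string instead of a single 'd' marker. A self-contained sketch of the keep-earliest policy (packet records invented):

        packets = [
            {'sent':1, 'tcpseq':100, 'tcpack':1,   'payload_len':32, 'observed':5000},
            {'sent':1, 'tcpseq':100, 'tcpack':1,   'payload_len':32, 'observed':4990},  # earlier duplicate
            {'sent':0, 'tcpseq':200, 'tcpack':101, 'payload_len':16, 'observed':6100},
        ]

        suspect = ''
        seen = {}
        for p in packets:
            key = (p['sent'], p['tcpseq'], p['tcpack'], p['payload_len'])
            if key not in seen:
                seen[key] = p
            elif seen[key]['observed'] > p['observed']:
                seen[key] = p                              # keep the earliest observation
                suspect += 's' if p['sent'] else 'r'

        print(suspect, sorted(v['observed'] for v in seen.values()))   # s [4990, 6100]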
     
      suspect,packets = removeDuplicatePackets(packets)

-     #sort_key = lambda d: (d['tcpseq'],d['tcpack'])
-     sort_key = lambda d: (d['observed'],d['tcpseq'])
+     sort_key = lambda d: (d['tcpseq'],d['observed'])
      sent = sorted((p for p in packets if p['sent']==1 and p['payload_len']>0), key=sort_key)
      rcvd = sorted((p for p in packets if p['sent']==0 and p['payload_len']>0), key=sort_key)

-     if len(sent) <= trim_sent:
-         last_sent = sent[-1]
-     else:
-         last_sent = sent[trim_sent]
-
-     if len(rcvd) <= trim_rcvd:
-         last_rcvd = rcvd[0]
-     else:
-         last_rcvd = rcvd[len(rcvd)-1-trim_rcvd]
+     alt_key = lambda d: (d['observed'],d['tcpseq'])
+     rcvd_alt = sorted((p for p in packets if p['sent']==0 and p['payload_len']>0), key=alt_key)
+
+     s_off = trim_sent
+     if s_off >= len(sent):
+         s_off = -1
+     last_sent = sent[s_off]
+
+     r_off = len(rcvd) - trim_rcvd - 1
+     if r_off <= 0:
+         r_off = 0
+     last_rcvd = rcvd[r_off]
+     if last_rcvd != rcvd_alt[r_off]:
+         suspect += 'R'

      packet_rtt = last_rcvd['observed'] - last_sent['observed']
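
    The new trimming logic indexes the trim_sent-th sent packet and the trim_rcvd-th-from-last received packet, clamping out-of-range offsets, and flags the probe with 'R' when the tcpseq and observed-time orderings of received packets disagree. The offset clamping in isolation (list values illustrative):

        rcvd = list(range(10))          # stand-in for 10 received packets
        for trim_rcvd in (0, 3, 12):    # 12 exceeds the list length
            r_off = len(rcvd) - trim_rcvd - 1
            if r_off <= 0:
                r_off = 0               # clamp: fall back to the first packet
            print(trim_rcvd, '->', rcvd[r_off])    # 0 -> 9, 3 -> 6, 12 -> 0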
     
      query="""
        SELECT packet_rtt-(SELECT avg(packet_rtt) FROM probes,trim_analysis
-                          WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND sample=u.sample AND probes.type in ('train','test'))
-       FROM (SELECT probes.sample,packet_rtt FROM probes,trim_analysis WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case=:unusual_case AND probes.type in ('train','test')) u
+                          WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND sample=u.s AND probes.type in ('train','test'))
+       FROM (SELECT probes.sample s,packet_rtt FROM probes,trim_analysis WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case=:unusual_case AND probes.type in ('train','test') AND 1 NOT IN (select 1 from probes p,trim_analysis t WHERE p.sample=s AND t.probe_id=p.id AND t.suspect LIKE '%R%')) u
+     """
+     query="""
+       SELECT packet_rtt-(SELECT avg(packet_rtt) FROM probes,trim_analysis
+                          WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND sample=u.s AND probes.type in ('train','test'))
+       FROM (SELECT probes.sample s,packet_rtt FROM probes,trim_analysis WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case=:unusual_case AND probes.type in ('train','test')) u
      """

     
      differences = [row[0] for row in cursor]

-     return trimean(differences),mad(differences)
+     return ubersummary(differences),mad(differences)

    284299
     
      db.conn.execute("CREATE INDEX IF NOT EXISTS packets_probe ON packets (probe_id)")
      pcursor = db.conn.cursor()
-     kcursor = db.conn.cursor()
+     db.conn.commit()

      pcursor.execute("SELECT tcpts_mean FROM meta")

…

      pcursor.execute("DELETE FROM trim_analysis")
      db.conn.commit()
+
+     def loadPackets(db):
+         cursor = db.conn.cursor()
+         cursor.execute("SELECT * FROM packets ORDER BY probe_id")
+
+         probe_id = None
+         entry = []
+         ret_val = []
+         for p in cursor:
+             if probe_id == None:
+                 probe_id = p['probe_id']
+             if p['probe_id'] != probe_id:
+                 ret_val.append((probe_id,entry))
+                 probe_id = p['probe_id']
+                 entry = []
+             entry.append(dict(p))
+         ret_val.append((probe_id,entry))
+         return ret_val
+
+     start = time.time()
+     packet_cache = loadPackets(db)
+     print("packets loaded in: %f" % (time.time()-start))

      count = 0
      sent_tally = []
      rcvd_tally = []
-     for pid, in pcursor.execute("SELECT id FROM probes"):
-         kcursor.execute("SELECT * FROM packets WHERE probe_id=?", (pid,))
+     for probe_id,packets in packet_cache:
          try:
-             analysis,s,r = analyzePackets(kcursor.fetchall(), timestamp_precision)
-             analysis['probe_id'] = pid
+             analysis,s,r = analyzePackets(packets, timestamp_precision)
+             analysis['probe_id'] = probe_id
              sent_tally.append(s)
              rcvd_tally.append(r)
+             db.addTrimAnalyses([analysis])
          except Exception as e:
-             print(e)
-             sys.stderr.write("WARN: couldn't find enough packets for probe_id=%s\n" % pid)
+             traceback.print_exc()
+             sys.stderr.write("WARN: couldn't find enough packets for probe_id=%s\n" % probe_id)

          #print(pid,analysis)
-         db.addTrimAnalyses([analysis])
          count += 1
      db.conn.commit()
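
    loadPackets replaces one query per probe with a single ordered scan, grouping rows by probe_id in memory. The same consecutive grouping can be expressed with itertools.groupby; a sketch over made-up row dicts:

        import itertools

        rows = [{'probe_id': 1, 'tcpseq': 10}, {'probe_id': 1, 'tcpseq': 11},
                {'probe_id': 2, 'tcpseq': 20}]           # stand-ins for packets rows

        packet_cache = [(pid, list(group)) for pid, group
                        in itertools.groupby(rows, key=lambda r: r['probe_id'])]
        print([(pid, len(pkts)) for pid, pkts in packet_cache])   # [(1, 2), (2, 1)]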
     
              if strim == 0 and rtrim == 0:
                  continue # no point in doing 0,0 again
-             for pid, in pcursor.execute("SELECT id FROM probes"):
-                 kcursor.execute("SELECT * FROM packets WHERE probe_id=?", (pid,))
+             for probe_id,packets in packet_cache:
                  try:
-                     analysis,s,r = analyzePackets(kcursor.fetchall(), timestamp_precision, strim, rtrim)
-                     analysis['probe_id'] = pid
+                     analysis,s,r = analyzePackets(packets, timestamp_precision, strim, rtrim)
+                     analysis['probe_id'] = probe_id
                  except Exception as e:
                      print(e)

                  db.addTrimAnalyses([analysis])
-             db.conn.commit()
+     db.conn.commit()

      # Populate analysis table so findUnusualTestCase can give us a starting point
     
      for strim in range(1,num_sent):
          delta,mad = evaluations[(strim,0)]
-         if abs(good_delta - delta) < abs(delta_margin*good_delta) and mad < good_mad:
+         if delta*good_delta > 0.0 and (abs(good_delta) - abs(delta)) < abs(delta_margin*good_delta) and mad < good_mad:
              best_strim = strim
          else:

…

      for rtrim in range(1,num_rcvd):
          delta,mad = evaluations[(best_strim,rtrim)]
-         if (abs(delta) > abs(good_delta) or abs(good_delta - delta) < abs(delta_margin*good_delta)) and mad < good_mad:
+         if delta*good_delta > 0.0 and (abs(good_delta) - abs(delta)) < abs(delta_margin*good_delta) and mad < good_mad:
              best_rtrim = rtrim
          else:
     
      cursor = db.conn.cursor()
      cursor.execute("SELECT packet_rtt FROM probes,analysis WHERE probes.id=analysis.probe_id AND probes.type in ('train','test')")
-     global_tm = trimean([row['packet_rtt'] for row in cursor])
+     global_tm = quadsummary([row['packet_rtt'] for row in cursor])

      tm_abs = []

…

      for tc in test_cases:
          cursor.execute("SELECT packet_rtt FROM probes,analysis WHERE probes.id=analysis.probe_id AND probes.type in ('train','test') AND probes.test_case=?", (tc,))
-         tm_map[tc] = trimean([row['packet_rtt'] for row in cursor])
+         tm_map[tc] = quadsummary([row['packet_rtt'] for row in cursor])
          tm_abs.append((abs(tm_map[tc]-global_tm), tc))

      magnitude,tc = max(tm_abs)
      cursor.execute("SELECT packet_rtt FROM probes,analysis WHERE probes.id=analysis.probe_id AND probes.type in ('train','test') AND probes.test_case<>?", (tc,))
-     remaining_tm = trimean([row['packet_rtt'] for row in cursor])
+     remaining_tm = quadsummary([row['packet_rtt'] for row in cursor])

      ret_val = (tc, tm_map[tc]-remaining_tm)
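
    findUnusualTestCase now ranks test cases by how far their quadsummary RTT deviates from the global summary and returns the largest outlier together with its offset from the remaining cases. The max-deviation selection in miniature (numbers invented):

        tm_map = {'short': 5000.0, 'long': 5042.0, 'mid': 5003.0}  # per-case summaries
        global_tm = 5008.0

        tm_abs = [(abs(tm - global_tm), tc) for tc, tm in tm_map.items()]
        magnitude, tc = max(tm_abs)
        print(tc, magnitude)    # long 34.0 -- 'long' is the unusual case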
  • trunk/lib/nanownlib/stats.py

    r8 → r10

  import sys
  import os
+ import functools
  import math
  import statistics
     

- def midhinge(values, distance=25):
-     return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
+ def midsummary(values, distance=25):
+     #return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
+     l,h = numpy.percentile(values, (50-distance,50+distance))
+     return (l+h)/2.0

  def trimean(values, distance=25):
-     return (midhinge(values, distance) + statistics.median(values))/2
-
+     return (midsummary(values, distance) + statistics.median(values))/2
+
+ def ubersummary(values, distance=25):
+     left2 = 50-distance
+     left1 = left2/2.0
+     left3 = (left2+50)/2.0
+     right2 = 50+distance
+     right3 = (right2+50)/2.0
+     right1 = (right2+100)/2.0
+     l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
+     #print(left1,left2,left3,50,right3,right2,right1)
+     #print(l1,l2,l3,m,r3,r2,r1)
+     return (l1+l2*4+l3+r3+r2*4+r1)/12.0
+     #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
+
+ def quadsummary(values, distance=25):
+     left2 = 50-distance
+     left1 = left2/2.0
+     right2 = 50+distance
+     right1 = (right2+100)/2.0
+     l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
+     #print(left1,left2,left3,50,right3,right2,right1)
+     #print(l1,l2,l3,m,r3,r2,r1)
+     return (l1+l2+r2+r1)/4.0
+     #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
+
+ def quadsummary(values, distance=25):
+     left1 = 50-distance
+     left2 = (left1+50)/2.0
+     right1 = 50+distance
+     right2 = (right1+50)/2.0
+     l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
+     #print(left1,left2,left3,50,right3,right2,right1)
+     #print(l1,l2,l3,m,r3,r2,r1)
+     return (l1+l2+r2+r1)/4.0
+     #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
+
+
  def weightedMean(derived, weights):
      normalizer = sum(weights.values())/len(weights)
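
    Note that this revision defines quadsummary twice; the second definition (percentiles at 50±distance plus their midpoints toward the median) shadows the first. A sketch of these percentile-averaging summaries on synthetic heavy-tailed data (distribution parameters invented):

        import numpy

        rng = numpy.random.RandomState(0)
        values = numpy.concatenate([rng.normal(100, 5, 950),    # bulk of the samples
                                    rng.normal(500, 50, 50)])   # heavy upper tail

        # The effective quadsummary with distance=25 averages the 25/37.5/62.5/75th percentiles.
        l1, l2, r2, r1 = numpy.percentile(values, (25, 37.5, 62.5, 75))
        print('quadsummary: %f' % ((l1+l2+r2+r1)/4.0))   # stays near 100
        print('mean: %f' % values.mean())                # dragged upward by the tail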
     

- def estimateMidhinge(derived):
-     return midhinge([(d['long']-d['short']) for d in derived.values()])
+ def estimateMidsummary(derived):
+     return midsummary([(d['long']-d['short']) for d in derived.values()])

     
      rest = [s['other_cases'] for s in samples]

-     uc_high = numpy.percentile(uc, params['high'])
-     rest_low = numpy.percentile(rest, params['low'])
+     uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
+     rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low']))
      if uc_high < rest_low:
          if greater:

…

              return 1

-     uc_low = numpy.percentile(uc, params['low'])
-     rest_high = numpy.percentile(rest, params['high'])
      if rest_high < uc_low:
          if greater:
    369406# Returns 1 if unusual_case is unusual in the expected direction
    370407#         0 otherwise
    371 def midhingeTest(params, greater, samples):
     408def summaryTest(f, params, greater, samples):
    372409    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
    373410
    374     mh = midhinge(diffs, params['distance'])
    375     #mh = trimean(diffs, params['distance'])
     411    mh = f(diffs, params['distance'])
    376412    if greater:
    377413        if mh > params['threshold']:
     
    385421            return 0
    386422
     423midsummaryTest = functools.partial(summaryTest, midsummary)
     424trimeanTest = functools.partial(summaryTest, trimean)
     425ubersummaryTest = functools.partial(summaryTest, ubersummary)
     426quadsummaryTest = functools.partial(summaryTest, quadsummary)
    387427
    388428def rmse(expected, measurements):
     
  def nrmse(expected, measurements):
      return rmse(expected, measurements)/(max(measurements)-min(measurements))
+
+
+ class KalmanFilter1D:
+     def __init__(self, x0, P, R, Q):
+         self.x = x0
+         self.P = P
+         self.R = R
+         self.Q = Q
+
+     def update(self, z):
+         self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
+         self.P = 1. / (1./self.P + 1./self.R)
+
+     def predict(self, u=0.0):
+         self.x += u
+         self.P += self.Q
+
+
+ def kfilter(params, observations):
+     x = numpy.array(observations)
+     movement = 0
+     est = []
+     var = []
+     kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
+                         #P  = 10000,          # initial variance
+                         P  = 10,          # initial variance
+                         R  = numpy.std(x),   # msensor noise
+                         Q  = 0)              # movement noise
+     for round in range(1):
+         for d in x:
+             kf.predict(movement)
+             kf.update(d)
+             est.append(kf.x)
+             var.append(kf.P)
+
+     return({'est':est, 'var':var})
+
+
+ def kalmanTest(params, greater, samples):
+     diffs = [s['unusual_case']-s['other_cases'] for s in samples]
+
+     m = kfilter(params, diffs)['est'][-1]
+     if greater:
+         if m > params['threshold']:
+             return 1
+         else:
+             return 0
+     else:
+         if m < params['threshold']:
+             return 1
+         else:
+             return 0
+
+
+ def kalmanTest2(params, greater, samples):
+     diffs = [s['unusual_case']-s['other_cases'] for s in samples]
+
+     estimates = []
+     size = 500
+     for i in range(100):
+         off = random.randrange(0,len(diffs))
+         sub = diffs[off:size]
+         if len(sub) < size:
+             sub += diffs[0:size-len(sub)]
+         estimates.append(kfilter(params, sub)['est'][-1])
+
+     m = quadsummary(estimates)
+     if greater:
+         if m > params['threshold']:
+             return 1
+         else:
+             return 0
+     else:
+         if m < params['threshold']:
+             return 1
+         else:
+             return 0
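
    KalmanFilter1D is a scalar filter: update() blends each observation with the prior state, weighted by their variances, and with Q=0 it behaves as a noise-weighted running average that converges toward the underlying value. A standalone sketch (noise parameters invented):

        class KalmanFilter1D:
            def __init__(self, x0, P, R, Q):
                self.x, self.P, self.R, self.Q = x0, P, R, Q
            def update(self, z):
                self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
                self.P = 1. / (1./self.P + 1./self.R)
            def predict(self, u=0.0):
                self.x += u                    # movement model (no-op when u=0)
                self.P += self.Q

        kf = KalmanFilter1D(x0=0.0, P=10.0, R=4.0, Q=0.0)
        for z in (2.1, 1.9, 2.2, 2.0):         # noisy observations of a constant near 2.0
            kf.predict()
            kf.update(z)
        print(kf.x, kf.P)    # estimate moves toward 2.0 while the variance shrinks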
  • trunk/lib/nanownlib/storage.py

    r9 → r10

  import os
  import uuid
+ import random
  import threading
  import sqlite3

  import numpy
+ # Don't trust numpy's seeding
+ numpy.random.seed(random.SystemRandom().randint(0,2**32-1))

  def _newid():
     
      _population_sizes = None
      _population_cache = None
+     _offset_cache = None
+     _cur_offsets = None

      def __init__(self, path):

…

          self._population_sizes = {}
          self._population_cache = {}
+         self._offset_cache = {}
+         self._cur_offsets = {}

          if not exists:
    7986            self.conn.execute(
    8087                """CREATE TABLE classifier_results (id BLOB PRIMARY KEY,
    81                                                     algorithm TEXT,
     88                                                    classifier TEXT,
     89                                                    trial_type TEXT,
     90                                                    num_observations INTEGER,
     91                                                    num_trials INTEGER,
    8292                                                    params TEXT,
    83                                                     sample_size INTEGER,
    84                                                     num_trials INTEGER,
    85                                                     trial_type TEXT,
    8693                                                    false_positives REAL,
    8794                                                    false_negatives REAL)
     
    109116
      def subseries(self, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
-         if (probe_type,unusual_case,field) not in self._population_cache:
+         cache_key = (probe_type,unusual_case,field)
+
+         if cache_key not in self._population_cache:
              query="""
              SELECT %(field)s AS unusual_case,

…

              cursor = self.conn.cursor()
              cursor.execute(query, params)
-             self._population_cache[(probe_type,unusual_case,field)] = [dict(row) for row in cursor.fetchall()]
-
-         population = self._population_cache[(probe_type,unusual_case,field)]
+             p = [dict(row) for row in cursor.fetchall()]
+             self._population_cache[cache_key] = p
+             self._offset_cache[cache_key] = tuple(numpy.random.random_integers(0,len(p)-1, len(p)/5))
+             self._cur_offsets[cache_key] = 0
+
+         population = self._population_cache[cache_key]

          if size == None or size > len(population):
              size = len(population)
          if offset == None or offset >= len(population) or offset < 0:
-             offset = numpy.random.random_integers(0,len(population)-1)
-
+             offset = self._offset_cache[cache_key][self._cur_offsets[cache_key]]
+             self._cur_offsets[cache_key] = (offset + 1) % len(self._offset_cache[cache_key])
+
          try:
-             ret_val = population[offset:offset+size]
+             offset = int(offset)
+             size = int(size)
          except Exception as e:
              print(e, offset, size)
-
+             return None
+
+         ret_val = population[offset:offset+size]
          if len(ret_val) < size:
              ret_val += population[0:size-len(ret_val)]

          return ret_val
-
-
+
+
+     def resetOffsets(self):
+         for k in self._cur_offsets.keys():
+             self._cur_offsets[k] = 0
+
+
      def clearCache(self):
          self._population_cache = {}
+         self._offset_cache = {}
+         self._cur_offsets = {}

    146169       
     
          self.conn.commit()
          return ret_val
+
+     def fetchClassifierResult(self, classifier, trial_type, num_observations):
+         query = """
+           SELECT * FROM classifier_results
+           WHERE classifier=? AND trial_type=? AND num_observations=?
+           ORDER BY false_positives+false_negatives
+           LIMIT 1;
+         """
+         cursor = self.conn.cursor()
+         cursor.execute(query, (classifier, trial_type, num_observations))
+         ret_val = cursor.fetchone()
+
+         if ret_val != None:
+             ret_val = dict(ret_val)
+         return ret_val
+
+     def deleteClassifierResults(self, classifier, trial_type, num_observations=None):
+         params = {"classifier":classifier,"trial_type":trial_type,"num_observations":num_observations}
+         query = """
+           DELETE FROM classifier_results
+           WHERE classifier=:classifier AND trial_type=:trial_type
+         """
+         if num_observations != None:
+             query += " AND num_observations=:num_observations"
+
+         self.conn.execute(query, params)
+         self.conn.commit()
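
    Together these methods let trainClassifier use the classifier_results table as a memo cache: look up a stored result for a (classifier, trial_type, num_observations) triple and delete rows when --retrain/--retest force recomputation. A hypothetical usage sketch (the constructor name and database path are assumptions based on bin/train):

        import nanownlib.storage

        db = nanownlib.storage.db('session.db')   # assumed constructor, as used by bin/train
        cached = db.fetchClassifierResult('boxtest', 'train', 1500)
        if cached is None:
            print('no stored result; training required')
        else:
            print('reusing stored result:', cached['params'])

        db.deleteClassifierResults('boxtest', 'train')   # force retraining later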