Timestamp: 07/16/15 20:40:01 (9 years ago)
Author: tim
Message: .
File: 1 edited
Legend: unchanged context lines have no prefix; lines added in r11 are prefixed with +; lines removed from r10 are prefixed with -.
  • trunk/lib/nanownlib/stats.py

r10 → r11
 def ubersummary(values, distance=25):
     left2 = 50-distance
+    left3 = 50-(distance/2.0)
     left1 = left2/2.0
-    left3 = (left2+50)/2.0
     right2 = 50+distance
-    right3 = (right2+50)/2.0
+    right3 = 50+(distance/2.0)
     right1 = (right2+100)/2.0
     l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
-    #print(left1,left2,left3,50,right3,right2,right1)
     #print(l1,l2,l3,m,r3,r2,r1)
     return (l1+l2*4+l3+r3+r2*4+r1)/12.0
     #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
 
-def quadsummary(values, distance=25):
-    left2 = 50-distance
-    left1 = left2/2.0
-    right2 = 50+distance
-    right1 = (right2+100)/2.0
-    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
-    #print(left1,left2,left3,50,right3,right2,right1)
-    #print(l1,l2,l3,m,r3,r2,r1)
-    return (l1+l2+r2+r1)/4.0
-    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
-
+
 def quadsummary(values, distance=25):
     left1 = 50-distance
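
Note on this hunk: the rewritten left3/right3 expressions are numerically equivalent to the old ones, since (left2+50)/2 = (50-distance+50)/2 = 50-distance/2 and (right2+50)/2 = 50+distance/2; the substantive changes are computing them up front, dropping the stale #print line, and deleting the duplicate quadsummary() definition in favor of the one retained below. A minimal usage sketch of the revised ubersummary() follows; the function body is copied from the r11 side of this hunk, while the gamma-distributed sample is an illustrative assumption, not part of the changeset.

    import numpy

    # Copied from the r11 version of ubersummary() above; the synthetic
    # sample below is illustrative only.
    def ubersummary(values, distance=25):
        left2 = 50-distance
        left3 = 50-(distance/2.0)
        left1 = left2/2.0
        right2 = 50+distance
        right3 = 50+(distance/2.0)
        right1 = (right2+100)/2.0
        l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
        return (l1+l2*4+l3+r3+r2*4+r1)/12.0

    sample = numpy.random.gamma(shape=2.0, scale=10.0, size=10000)
    print("median     :", numpy.median(sample))
    print("ubersummary:", ubersummary(sample))
    print("mean       :", numpy.mean(sample))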
     
     #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
 
-
+
+def tsvalwmean(subseries):
+    weights = [(s['unusual_packet']+s['other_packet'])**2 for s in subseries]
+    normalizer = sum(weights)/len(weights)
+    return numpy.mean([weights[i]*(subseries[i]['unusual_tsval']-subseries[i]['other_tsval'])/normalizer
+                       for i in range(len(weights))])
+
+#def tsvalwmean(subseries):
+#    return numpy.mean([(s['unusual_tsval']-s['other_tsval']) for s in subseries])
+
+
 def weightedMean(derived, weights):
     normalizer = sum(weights.values())/len(weights)
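
Note on this hunk: the new tsvalwmean() averages the per-sample TSval RTT differences, weighting each sample by the squared sum of its two packet RTT values and normalizing by the mean weight, so samples with larger packet RTTs dominate. A minimal sketch with hand-built records follows; the function body is copied from the added lines above, while the numeric values are made up for illustration (real rows come from a subseries query).

    import numpy

    # tsvalwmean() copied from the added lines in this changeset.
    def tsvalwmean(subseries):
        weights = [(s['unusual_packet']+s['other_packet'])**2 for s in subseries]
        normalizer = sum(weights)/len(weights)
        return numpy.mean([weights[i]*(subseries[i]['unusual_tsval']-subseries[i]['other_tsval'])/normalizer
                           for i in range(len(weights))])

    # Hand-built records for illustration only.
    rows = [{'unusual_packet': 120.0, 'other_packet': 115.0,
             'unusual_tsval': 30.0, 'other_tsval': 28.0},
            {'unusual_packet': 90.0, 'other_packet': 92.0,
             'unusual_tsval': 29.0, 'other_tsval': 29.5}]
    print(tsvalwmean(rows))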
     
     return statistics.mean([w*(derived[k]['long_tsval']-derived[k]['short_tsval'])/normalizer for k,w in weights.items()])
 
+
+
 
 def estimateMean(trustFunc, weightFunc, alpha, derived):
     
 
 
-#def estimateMedian(trustFunc, weightFunc, alpha, derived):
-#    trust = trustValues(derived, trustFunc)
-#    weights = weightFunc(derived, trust, alpha)
-
-#    return statistics.median([(derived[k]['long']-derived[k]['short']) for k,w in weights.items() if w > 0.0])
-
-def estimateMedian(derived):
-    return statistics.median([(d['long']-d['short']) for d in derived.values()])
-
-
-def estimateMidsummary(derived):
-    return midsummary([(d['long']-d['short']) for d in derived.values()])
-
-
-def estimateTrimean(derived):
-    return trimean([(d['long']-d['short']) for d in derived.values()])
-
-
-def tTest(expected_mean, derived):
-    diffs = [(d['long']-d['short']) for d in derived.values()]
-    null_tval, null_pval = scipy.stats.ttest_1samp(diffs, 0.0)
-    tval, pval = scipy.stats.ttest_1samp(diffs, expected_mean)
-
-    if pval < null_pval:
-        return 1
-    else:
-        return 0
-
-
-def diffMedian(derived):
-    l = [tc['long']-tc['short'] for s,tc in derived.items()]
-    return statistics.median(l)
-
-
-def subsample_ids(db, probe_type, subsample_size=None):
-    cursor = db.conn.cursor()
-    cursor.execute("SELECT max(c) FROM (SELECT count(sample) c FROM probes WHERE type=? GROUP BY test_case)", (probe_type,))
-    population_size = cursor.fetchone()[0]
-    #print("population_size:", population_size)
-    if subsample_size == None or subsample_size > population_size:
-        subsample_size = population_size
-
-    start = numpy.random.random_integers(0,population_size-1)
-    cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ? OFFSET ?", (probe_type,subsample_size,start))
-    for row in cursor:
-        subsample_size -= 1
-        yield row['sample']
-
-    if subsample_size > 0:
-        cursor.execute("SELECT sample FROM probes WHERE type=? GROUP BY sample ORDER BY sample LIMIT ?", (probe_type,subsample_size))
-        for row in cursor:
-            yield row['sample']
-
-
-def subsample(db, probe_type, subsample_size=None):
-    cursor = db.conn.cursor()
-    cursor.execute("SELECT count(test_case) FROM (SELECT test_case FROM probes GROUP BY test_case)")
-    num_test_cases = cursor.fetchone()[0]
-
-    for sid in subsample_ids(db, probe_type, subsample_size):
-        cursor.execute("SELECT test_case,tc_order,time_of_day,reported,userspace_rtt,suspect,packet_rtt,tsval_rtt FROM probes p,analysis a WHERE p.sample=? and a.probe_id=p.id", (sid,))
-        probes = cursor.fetchall()
-        if len(probes) != num_test_cases:
-            sys.stderr.write("WARN: sample %d had %d probes, but %d expected!  Discarding...\n" % (sid, len(probes), num_test_cases))
-            continue
-        yield (sid,[dict(r) for r in probes])
-
-
-def subseries(db, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
-    population_size = db.populationSize(probe_type)
-
-    if size == None or size > population_size:
-        size = population_size
-    if offset == None or offset >= population_size or offset < 0:
-        offset = numpy.random.random_integers(0,population_size-1)
-
-    query="""
-      SELECT %(field)s AS unusual_case,
-             (SELECT avg(%(field)s) FROM probes,analysis
-              WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
-      FROM   (SELECT probes.sample,%(field)s FROM probes,analysis
-              WHERE analysis.probe_id=probes.id AND probes.test_case =:unusual_case AND probes.type=:probe_type) u
-      LIMIT :size OFFSET :offset
-    """ % {"field":field}
-
-    params = {"probe_type":probe_type, "unusual_case":unusual_case, "offset":offset, "size":size}
-    cursor = db.conn.cursor()
-    cursor.execute(query, params)
-    ret_val = [dict(row) for row in cursor.fetchall()]
-    #for row in cursor:
-    #    size -= 1
-    #    yield dict(row)
-
-    size -= len(ret_val)
-    if size > 0:
-        params['offset'] = 0
-        params['size'] = size
-        cursor.execute(query, params)
-        ret_val += [dict(row) for row in cursor.fetchall()]
-        #for row in cursor:
-        #    yield dict(row)
-
-    return ret_val
-
-
-# if test_cases=None, include all of them.  Otherwise, include only the specified test cases.
-def samples2Distributions(samples, field, test_cases=None):
-    ret_val = {}
-
-    for sid,probes in samples:
-        for p in probes:
-            if p['test_case'] in ret_val:
-                ret_val[p['test_case']].append(p[field])
-            elif test_cases == None or p['test_case'] in test_cases:
-                ret_val[p['test_case']] = [p[field]]
-
-
-    return ret_val
-
-
-def samples2MeanDiffs(samples, field, unusual_case):
-    ret_val = {}
-
-    for sid,probes in samples:
-        unusual_value = None
-        for p in probes:
-            if p['test_case'] == unusual_case:
-                unusual_value = p[field]
-                break
-        yield statistics.mean([unusual_value-p[field] for p in probes if p['test_case'] != unusual_case])
-
-
-def bootstrap(estimator, db, probe_type, test_cases, subsample_size, num_trials):
-    ret_val = []
-    for t in range(num_trials):
-        ret_val.append(estimator(test_cases, subsample(db, probe_type, subsample_size)))
-
-    return ret_val
-
-
-def bootstrap2(estimator, db, probe_type, subsample_size, num_trials):
-    ret_val = []
-    for t in range(num_trials):
-        ret_val.append(estimator(subsample(db, probe_type, subsample_size)))
-
-    return ret_val
-
-
 def bootstrap3(estimator, db, probe_type, unusual_case, subseries_size, num_trials):
     ret_val = []
     
 
 
-# Returns the test case name that clearly has higher RTT; otherwise, returns None
-def boxTest(params, test_cases, samples):
-    if len(test_cases) != 2:
-        # XXX: somehow generalize the box test to handle more than 2 cases
-        raise Exception()
-    dists = samples2Distributions(samples,'packet_rtt', test_cases) #XXX: field from params
-
-    tmp1,tmp2 = dists.items()
-    test_case1,dist1 = tmp1
-    test_case2,dist2 = tmp2
-
-    d1_high = numpy.percentile(dist1, params['high'])
-    d2_low = numpy.percentile(dist2, params['low'])
-    if d1_high < d2_low:
-        return test_case2
-
-    d1_low = numpy.percentile(dist1, params['low'])
-    d2_high = numpy.percentile(dist2, params['high'])
-
-    if d2_high < d1_low:
-        return test_case1
-
-    return None
-
-
 # Returns 1 if unusual_case is unusual in the expected direction
 #         0 if it isn't unusual
 #        -1 if it is unusual in the wrong direction
 def multiBoxTest(params, greater, samples):
-    uc = [s['unusual_case'] for s in samples]
-    rest = [s['other_cases'] for s in samples]
+    uc = [s['unusual_packet'] for s in samples]
+    rest = [s['other_packet'] for s in samples]
 
     uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
     
 #         0 otherwise
 def summaryTest(f, params, greater, samples):
-    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
+    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]
 
     mh = f(diffs, params['distance'])
     
         else:
             return 0
+
 
 midsummaryTest = functools.partial(summaryTest, midsummary)
     
 
 def kfilter(params, observations):
-    x = numpy.array(observations)
+    x = numpy.array(observations)
     movement = 0
-    est = []
+    est = []
     var = []
     kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
-                        #P  = 10000,          # initial variance
-                        P  = 10,          # initial variance
+                        #P  = 10000,         # initial variance
+                        P  = 10,            # initial variance
                         R  = numpy.std(x),   # msensor noise
                         Q  = 0)              # movement noise
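
Note on this hunk (whitespace and comment alignment only): kfilter() seeds the project's KalmanFilter1D class, which is not part of this changeset, with quadsummary(x) as the initial state, P = 10 as the initial variance, the sample standard deviation as the measurement noise R, and zero movement noise Q. For readers without that class, a generic scalar Kalman update with the same parameterization looks roughly like the sketch below; it is an independent illustration, not the project's implementation, and numpy.median stands in for quadsummary().

    import numpy

    # Constant-state scalar Kalman filter mirroring the parameter choices
    # kfilter() makes; not the repository's KalmanFilter1D.
    def scalar_kalman(observations, P=10.0, Q=0.0):
        x = numpy.asarray(observations, dtype=float)
        R = numpy.std(x)                 # measurement noise, as in kfilter()
        state, p = numpy.median(x), P    # initial state and variance
        est, var = [], []
        for z in x:
            p = p + Q                    # predict: state assumed constant
            k = p / (p + R)              # Kalman gain
            state = state + k * (z - state)
            p = (1.0 - k) * p
            est.append(state)
            var.append(p)
        return {'est': est, 'var': var}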
     
 
 def kalmanTest(params, greater, samples):
-    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
+    diffs = [s['unusual_packet']-s['other_packet'] for s in samples]
 
     m = kfilter(params, diffs)['est'][-1]
     
 
 
-def kalmanTest2(params, greater, samples):
-    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
-
-    estimates = []
-    size = 500
-    for i in range(100):
-        off = random.randrange(0,len(diffs))
-        sub = diffs[off:size]
-        if len(sub) < size:
-            sub += diffs[0:size-len(sub)]
-        estimates.append(kfilter(params, sub)['est'][-1])
-
-    m = quadsummary(estimates)
+def tsvalwmeanTest(params, greater, samples):
+    m = tsvalwmean(samples)
     if greater:
         if m > params['threshold']:
     
         else:
             return 0
-