Changeset 10 for trunk/lib/nanownlib


Timestamp: 07/13/15 19:16:30
Author:    tim
Message:   .
Location:  trunk/lib/nanownlib
Files:     3 edited

Legend: lines beginning with '+' were added, '-' removed; unprefixed lines are unchanged context.
  • trunk/lib/nanownlib/__init__.py

--- trunk/lib/nanownlib/__init__.py (r6)
+++ trunk/lib/nanownlib/__init__.py (r10)
@@ -4 +4 @@
 import sys
 import time
+import traceback
 import random
 import argparse
     
@@ -209 +210 @@
 def removeDuplicatePackets(packets):
     #return packets
-    suspect = None
+    suspect = ''
     seen = {}
     # XXX: Need to review this deduplication algorithm and make sure it is correct
     for p in packets:
         key = (p['sent'],p['tcpseq'],p['tcpack'],p['payload_len'])
-        if (key not in seen)\
-           or p['sent']==1 and (seen[key]['observed'] < p['observed'])\
-           or p['sent']==0 and (seen[key]['observed'] > p['observed']):
-            #if (key not in seen) or (seen[key]['observed'] > p['observed']):
+        if (key not in seen):
             seen[key] = p
-
-    if len(seen) < len(packets):
-        suspect = 'd'
-        #sys.stderr.write("INFO: removed %d duplicate packets.\n" % (len(packets) - len(seen)))
+            continue
+        if p['sent']==1 and (seen[key]['observed'] > p['observed']): #earliest sent
+            seen[key] = p
+            suspect += 's'
+            continue
+        if p['sent']==0 and (seen[key]['observed'] > p['observed']): #earliest rcvd
+            seen[key] = p
+            suspect += 'r'
+            continue
+
+    #if len(seen) < len(packets):
+    #   sys.stderr.write("INFO: removed %d duplicate packets.\n" % (len(packets) - len(seen)))

     return suspect,seen.values()
     
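The rewritten deduplication keeps, for each (sent, tcpseq, tcpack, payload_len) key, the earliest-observed packet, and accumulates per-direction suspect flags ('s' for sent, 'r' for received) instead of the old single 'd'. A minimal standalone sketch of that behavior (the packet dicts and values are invented for illustration):

    def remove_duplicate_packets(packets):
        # Keep the earliest-observed packet per flow key; flag replaced
        # sent ('s') and received ('r') duplicates separately.
        suspect = ''
        seen = {}
        for p in packets:
            key = (p['sent'], p['tcpseq'], p['tcpack'], p['payload_len'])
            if key not in seen:
                seen[key] = p
            elif seen[key]['observed'] > p['observed']:
                seen[key] = p
                suspect += 's' if p['sent'] == 1 else 'r'
        return suspect, list(seen.values())

    pkts = [{'sent':1,'tcpseq':1,'tcpack':0,'payload_len':10,'observed':100.0},
            {'sent':1,'tcpseq':1,'tcpack':0,'payload_len':10,'observed':90.0}]
    print(remove_duplicate_packets(pkts))  # ('s', [{... 'observed': 90.0}])
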
@@ -230 +236 @@
     suspect,packets = removeDuplicatePackets(packets)

-    #sort_key = lambda d: (d['tcpseq'],d['tcpack'])
-    sort_key = lambda d: (d['observed'],d['tcpseq'])
+    sort_key = lambda d: (d['tcpseq'],d['observed'])
     sent = sorted((p for p in packets if p['sent']==1 and p['payload_len']>0), key=sort_key)
     rcvd = sorted((p for p in packets if p['sent']==0 and p['payload_len']>0), key=sort_key)

-    if len(sent) <= trim_sent:
-        last_sent = sent[-1]
-    else:
-        last_sent = sent[trim_sent]
-
-    if len(rcvd) <= trim_rcvd:
-        last_rcvd = rcvd[0]
-    else:
-        last_rcvd = rcvd[len(rcvd)-1-trim_rcvd]
+    alt_key = lambda d: (d['observed'],d['tcpseq'])
+    rcvd_alt = sorted((p for p in packets if p['sent']==0 and p['payload_len']>0), key=alt_key)
+
+    s_off = trim_sent
+    if s_off >= len(sent):
+        s_off = -1
+    last_sent = sent[s_off]
+
+    r_off = len(rcvd) - trim_rcvd - 1
+    if r_off <= 0:
+        r_off = 0
+    last_rcvd = rcvd[r_off]
+    if last_rcvd != rcvd_alt[r_off]:
+        suspect += 'R'

     packet_rtt = last_rcvd['observed'] - last_sent['observed']
     
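The trimming now indexes directly into the sorted lists: trim_sent skips the first N sent payloads, trim_rcvd drops the last N received ones, and an 'R' suspect flag marks probes where an observed-time ordering disagrees with the sequence-number ordering. A toy example of the offset arithmetic (timestamps invented):

    sent_times = [10.0, 11.0, 12.0, 13.0]        # observed send timestamps
    rcvd_times = [20.0, 21.0, 22.0, 23.0, 24.0]  # observed receive timestamps
    trim_sent, trim_rcvd = 1, 2

    s_off = trim_sent if trim_sent < len(sent_times) else -1
    r_off = max(len(rcvd_times) - trim_rcvd - 1, 0)
    packet_rtt = rcvd_times[r_off] - sent_times[s_off]
    print(packet_rtt)  # 22.0 - 11.0 == 11.0
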
@@ -272 +282 @@
     query="""
       SELECT packet_rtt-(SELECT avg(packet_rtt) FROM probes,trim_analysis
-                         WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND sample=u.sample AND probes.type in ('train','test'))
-      FROM (SELECT probes.sample,packet_rtt FROM probes,trim_analysis WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case=:unusual_case AND probes.type in ('train','test')) u
+                         WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND sample=u.s AND probes.type in ('train','test'))
+      FROM (SELECT probes.sample s,packet_rtt FROM probes,trim_analysis WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case=:unusual_case AND probes.type in ('train','test') AND 1 NOT IN (select 1 from probes p,trim_analysis t WHERE p.sample=s AND t.probe_id=p.id AND t.suspect LIKE '%R%')) u
+    """
+    query="""
+      SELECT packet_rtt-(SELECT avg(packet_rtt) FROM probes,trim_analysis
+                         WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND sample=u.s AND probes.type in ('train','test'))
+      FROM (SELECT probes.sample s,packet_rtt FROM probes,trim_analysis WHERE sent_trimmed=:strim AND rcvd_trimmed=:rtrim AND trim_analysis.probe_id=probes.id AND probes.test_case=:unusual_case AND probes.type in ('train','test')) u
     """

     
@@ -280 +295 @@
     differences = [row[0] for row in cursor]

-    return trimean(differences),mad(differences)
+    return ubersummary(differences),mad(differences)


     
@@ -287 +302 @@
     db.conn.execute("CREATE INDEX IF NOT EXISTS packets_probe ON packets (probe_id)")
     pcursor = db.conn.cursor()
-    kcursor = db.conn.cursor()
+    db.conn.commit()

     pcursor.execute("SELECT tcpts_mean FROM meta")
     
@@ -297 +312 @@
     pcursor.execute("DELETE FROM trim_analysis")
     db.conn.commit()
+
+    def loadPackets(db):
+        cursor = db.conn.cursor()
+        cursor.execute("SELECT * FROM packets ORDER BY probe_id")
+
+        probe_id = None
+        entry = []
+        ret_val = []
+        for p in cursor:
+            if probe_id == None:
+                probe_id = p['probe_id']
+            if p['probe_id'] != probe_id:
+                ret_val.append((probe_id,entry))
+                probe_id = p['probe_id']
+                entry = []
+            entry.append(dict(p))
+        ret_val.append((probe_id,entry))
+        return ret_val
+
+    start = time.time()
+    packet_cache = loadPackets(db)
+    print("packets loaded in: %f" % (time.time()-start))

     count = 0
     sent_tally = []
     rcvd_tally = []
-    for pid, in pcursor.execute("SELECT id FROM probes"):
-        kcursor.execute("SELECT * FROM packets WHERE probe_id=?", (pid,))
+    for probe_id,packets in packet_cache:
         try:
-            analysis,s,r = analyzePackets(kcursor.fetchall(), timestamp_precision)
-            analysis['probe_id'] = pid
+            analysis,s,r = analyzePackets(packets, timestamp_precision)
+            analysis['probe_id'] = probe_id
             sent_tally.append(s)
             rcvd_tally.append(r)
+            db.addTrimAnalyses([analysis])
         except Exception as e:
-            print(e)
-            sys.stderr.write("WARN: couldn't find enough packets for probe_id=%s\n" % pid)
+            traceback.print_exc()
+            sys.stderr.write("WARN: couldn't find enough packets for probe_id=%s\n" % probe_id)

         #print(pid,analysis)
-        db.addTrimAnalyses([analysis])
         count += 1
     db.conn.commit()
     
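loadPackets replaces one SELECT per probe with a single ORDER BY probe_id scan that is grouped in Python. The hand-rolled grouping is essentially itertools.groupby over the sorted cursor; a sketch with invented rows:

    import itertools

    def load_packets_grouped(rows):
        # rows must already be sorted by probe_id (the ORDER BY does this)
        return [(pid, [dict(r) for r in grp])
                for pid, grp in itertools.groupby(rows, key=lambda r: r['probe_id'])]

    rows = [{'probe_id': 1, 'observed': 10}, {'probe_id': 1, 'observed': 11},
            {'probe_id': 2, 'observed': 12}]
    print(load_packets_grouped(rows))  # [(1, [..., ...]), (2, [...])]
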
@@ -326 +362 @@
             if strim == 0 and rtrim == 0:
                 continue # no point in doing 0,0 again
-            for pid, in pcursor.execute("SELECT id FROM probes"):
-                kcursor.execute("SELECT * FROM packets WHERE probe_id=?", (pid,))
+            for probe_id,packets in packet_cache:
                 try:
-                    analysis,s,r = analyzePackets(kcursor.fetchall(), timestamp_precision, strim, rtrim)
-                    analysis['probe_id'] = pid
+                    analysis,s,r = analyzePackets(packets, timestamp_precision, strim, rtrim)
+                    analysis['probe_id'] = probe_id
                 except Exception as e:
                     print(e)
     
@@ -336 +371 @@

                 db.addTrimAnalyses([analysis])
-            db.conn.commit()
+    db.conn.commit()

     # Populate analysis table so findUnusualTestCase can give us a starting point
     
@@ -359 +394 @@
     for strim in range(1,num_sent):
         delta,mad = evaluations[(strim,0)]
-        if abs(good_delta - delta) < abs(delta_margin*good_delta) and mad < good_mad:
+        if delta*good_delta > 0.0 and (abs(good_delta) - abs(delta)) < abs(delta_margin*good_delta) and mad < good_mad:
             best_strim = strim
         else:
     
@@ -367 +402 @@
     for rtrim in range(1,num_rcvd):
         delta,mad = evaluations[(best_strim,rtrim)]
-        if (abs(delta) > abs(good_delta) or abs(good_delta - delta) < abs(delta_margin*good_delta)) and mad < good_mad:
+        if delta*good_delta > 0.0 and (abs(good_delta) - abs(delta)) < abs(delta_margin*good_delta) and mad < good_mad:
             best_rtrim = rtrim
         else:
     
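Both loops now share the same acceptance rule: the trimmed delta must keep the sign of good_delta and lose no more than delta_margin of its magnitude, in addition to improving the MAD. A paraphrase with invented numbers shows what the new sign check adds:

    def keep_trim(delta, good_delta, mad, good_mad, delta_margin=0.1):
        return (delta*good_delta > 0.0
                and (abs(good_delta) - abs(delta)) < abs(delta_margin*good_delta)
                and mad < good_mad)

    print(keep_trim(delta=95.0,  good_delta=100.0, mad=4.0, good_mad=5.0))  # True
    print(keep_trim(delta=-95.0, good_delta=100.0, mad=4.0, good_mad=5.0))  # False: sign flipped
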
@@ -425 +460 @@
     cursor = db.conn.cursor()
     cursor.execute("SELECT packet_rtt FROM probes,analysis WHERE probes.id=analysis.probe_id AND probes.type in ('train','test')")
-    global_tm = trimean([row['packet_rtt'] for row in cursor])
+    global_tm = quadsummary([row['packet_rtt'] for row in cursor])

     tm_abs = []
     
@@ -432 +467 @@
     for tc in test_cases:
         cursor.execute("SELECT packet_rtt FROM probes,analysis WHERE probes.id=analysis.probe_id AND probes.type in ('train','test') AND probes.test_case=?", (tc,))
-        tm_map[tc] = trimean([row['packet_rtt'] for row in cursor])
+        tm_map[tc] = quadsummary([row['packet_rtt'] for row in cursor])
         tm_abs.append((abs(tm_map[tc]-global_tm), tc))

     magnitude,tc = max(tm_abs)
     cursor.execute("SELECT packet_rtt FROM probes,analysis WHERE probes.id=analysis.probe_id AND probes.type in ('train','test') AND probes.test_case<>?", (tc,))
-    remaining_tm = trimean([row['packet_rtt'] for row in cursor])
+    remaining_tm = quadsummary([row['packet_rtt'] for row in cursor])

     ret_val = (tc, tm_map[tc]-remaining_tm)
  • trunk/lib/nanownlib/stats.py

--- trunk/lib/nanownlib/stats.py (r8)
+++ trunk/lib/nanownlib/stats.py (r10)
@@ -2 +2 @@
 import sys
 import os
+import functools
 import math
 import statistics
     
@@ -133 +134 @@


-def midhinge(values, distance=25):
-    return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
+def midsummary(values, distance=25):
+    #return (numpy.percentile(values, 50-distance) + numpy.percentile(values, 50+distance))/2.0
+    l,h = numpy.percentile(values, (50-distance,50+distance))
+    return (l+h)/2.0

 def trimean(values, distance=25):
-    return (midhinge(values, distance) + statistics.median(values))/2
-
+    return (midsummary(values, distance) + statistics.median(values))/2
+
+def ubersummary(values, distance=25):
+    left2 = 50-distance
+    left1 = left2/2.0
+    left3 = (left2+50)/2.0
+    right2 = 50+distance
+    right3 = (right2+50)/2.0
+    right1 = (right2+100)/2.0
+    l1,l2,l3,r3,r2,r1 = numpy.percentile(values, (left1,left2,left3,right3,right2,right1))
+    #print(left1,left2,left3,50,right3,right2,right1)
+    #print(l1,l2,l3,m,r3,r2,r1)
+    return (l1+l2*4+l3+r3+r2*4+r1)/12.0
+    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
+
+def quadsummary(values, distance=25):
+    left2 = 50-distance
+    left1 = left2/2.0
+    right2 = 50+distance
+    right1 = (right2+100)/2.0
+    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
+    #print(left1,left2,left3,50,right3,right2,right1)
+    #print(l1,l2,l3,m,r3,r2,r1)
+    return (l1+l2+r2+r1)/4.0
+    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
+
+def quadsummary(values, distance=25):
+    left1 = 50-distance
+    left2 = (left1+50)/2.0
+    right1 = 50+distance
+    right2 = (right1+50)/2.0
+    l1,l2,r2,r1 = numpy.percentile(values, (left1,left2,right2,right1))
+    #print(left1,left2,left3,50,right3,right2,right1)
+    #print(l1,l2,l3,m,r3,r2,r1)
+    return (l1+l2+r2+r1)/4.0
+    #return statistics.mean((l1,l2,l3,m,r3,r2,r1))
+
+
 def weightedMean(derived, weights):
     normalizer = sum(weights.values())/len(weights)
     
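These are all percentile-based robust location estimators: midsummary averages two symmetric percentiles, trimean mixes that with the median, and ubersummary/quadsummary average more percentile points. (Note that quadsummary is defined twice in the hunk above, so the second definition is the one that takes effect at import time.) A standalone sketch of the effective quadsummary on data with a gross outlier (numbers invented):

    import statistics
    import numpy

    def quadsummary(values, distance=25):
        # Mirrors the second definition above: with distance=25 this averages
        # the 25th, 37.5th, 62.5th and 75th percentiles.
        left1 = 50-distance
        left2 = (left1+50)/2.0
        right1 = 50+distance
        right2 = (right1+50)/2.0
        l1, l2, r2, r1 = numpy.percentile(values, (left1, left2, right2, right1))
        return (l1+l2+r2+r1)/4.0

    data = [1, 2, 3, 4, 5, 100]              # one gross outlier
    print(round(statistics.mean(data), 2))   # 19.17, dragged by the outlier
    print(quadsummary(data))                 # 3.5, stays with the bulk
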
@@ -170 +209 @@


-def estimateMidhinge(derived):
-    return midhinge([(d['long']-d['short']) for d in derived.values()])
+def estimateMidsummary(derived):
+    return midsummary([(d['long']-d['short']) for d in derived.values()])


     
@@ -348 +387 @@
     rest = [s['other_cases'] for s in samples]

-    uc_high = numpy.percentile(uc, params['high'])
-    rest_low = numpy.percentile(rest, params['low'])
+    uc_high,uc_low = numpy.percentile(uc, (params['high'],params['low']))
+    rest_high,rest_low = numpy.percentile(rest, (params['high'],params['low']))
     if uc_high < rest_low:
         if greater:
     
@@ -356 +395 @@
             return 1

-    uc_low = numpy.percentile(uc, params['low'])
-    rest_high = numpy.percentile(rest, params['high'])
     if rest_high < uc_low:
         if greater:
     
@@ -369 +406 @@
 # Returns 1 if unusual_case is unusual in the expected direction
 #         0 otherwise
-def midhingeTest(params, greater, samples):
+def summaryTest(f, params, greater, samples):
     diffs = [s['unusual_case']-s['other_cases'] for s in samples]

-    mh = midhinge(diffs, params['distance'])
-    #mh = trimean(diffs, params['distance'])
+    mh = f(diffs, params['distance'])
     if greater:
         if mh > params['threshold']:
     
@@ -385 +421 @@
             return 0

+midsummaryTest = functools.partial(summaryTest, midsummary)
+trimeanTest = functools.partial(summaryTest, trimean)
+ubersummaryTest = functools.partial(summaryTest, ubersummary)
+quadsummaryTest = functools.partial(summaryTest, quadsummary)

 def rmse(expected, measurements):
     
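functools.partial pre-binds the summary statistic, so each *Test variant keeps the (params, greater, samples) signature existing callers expect. A self-contained sketch with a hypothetical median-based variant (sample values invented):

    import functools
    import statistics

    def summary_test(f, params, greater, samples):
        # Same shape as summaryTest above; f is any location estimator.
        diffs = [s['unusual_case'] - s['other_cases'] for s in samples]
        m = f(diffs, params['distance'])
        if greater:
            return 1 if m > params['threshold'] else 0
        return 1 if m < params['threshold'] else 0

    median_test = functools.partial(summary_test, lambda v, dist: statistics.median(v))
    samples = [{'unusual_case': 12.0, 'other_cases': 10.0}]
    print(median_test({'distance': 25, 'threshold': 1.0}, True, samples))  # 1
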
@@ -392 +432 @@
 def nrmse(expected, measurements):
     return rmse(expected, measurements)/(max(measurements)-min(measurements))
+
+
+class KalmanFilter1D:
+    def __init__(self, x0, P, R, Q):
+        self.x = x0
+        self.P = P
+        self.R = R
+        self.Q = Q
+
+    def update(self, z):
+        self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
+        self.P = 1. / (1./self.P + 1./self.R)
+
+    def predict(self, u=0.0):
+        self.x += u
+        self.P += self.Q
+
+
+def kfilter(params, observations):
+    x = numpy.array(observations)
+    movement = 0
+    est = []
+    var = []
+    kf = KalmanFilter1D(x0 = quadsummary(x), # initial state
+                        #P  = 10000,          # initial variance
+                        P  = 10,          # initial variance
+                        R  = numpy.std(x),   # sensor noise
+                        Q  = 0)              # movement noise
+    for round in range(1):
+        for d in x:
+            kf.predict(movement)
+            kf.update(d)
+            est.append(kf.x)
+            var.append(kf.P)
+
+    return({'est':est, 'var':var})
+
+
+def kalmanTest(params, greater, samples):
+    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
+
+    m = kfilter(params, diffs)['est'][-1]
+    if greater:
+        if m > params['threshold']:
+            return 1
+        else:
+            return 0
+    else:
+        if m < params['threshold']:
+            return 1
+        else:
+            return 0
+
+
+def kalmanTest2(params, greater, samples):
+    diffs = [s['unusual_case']-s['other_cases'] for s in samples]
+
+    estimates = []
+    size = 500
+    for i in range(100):
+        off = random.randrange(0,len(diffs))
+        sub = diffs[off:size]
+        if len(sub) < size:
+            sub += diffs[0:size-len(sub)]
+        estimates.append(kfilter(params, sub)['est'][-1])
+
+    m = quadsummary(estimates)
+    if greater:
+        if m > params['threshold']:
+            return 1
+        else:
+            return 0
+    else:
+        if m < params['threshold']:
+            return 1
+        else:
+            return 0
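KalmanFilter1D is a scalar filter: update() blends each measurement into the state x, weighted by the variances P and R, and with Q=0 and no movement the estimate settles on an average of the observations. (Note that kalmanTest2's `sub = diffs[off:size]` reads as though `diffs[off:off+size]` was intended, given the wrap-around fill that follows.) A self-contained convergence sketch (data invented):

    import numpy

    class KalmanFilter1D:
        def __init__(self, x0, P, R, Q):
            self.x, self.P, self.R, self.Q = x0, P, R, Q
        def update(self, z):
            self.x = (self.P * z + self.x * self.R) / (self.P + self.R)
            self.P = 1. / (1./self.P + 1./self.R)
        def predict(self, u=0.0):
            self.x += u
            self.P += self.Q

    rng = numpy.random.default_rng(0)
    obs = rng.normal(5.0, 2.0, 200)          # noisy reads of a constant 5.0
    kf = KalmanFilter1D(x0=obs[0], P=10, R=numpy.std(obs), Q=0)
    for z in obs:
        kf.predict()
        kf.update(z)
    print(round(kf.x, 1))                    # converges near 5.0
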
  • trunk/lib/nanownlib/storage.py

--- trunk/lib/nanownlib/storage.py (r9)
+++ trunk/lib/nanownlib/storage.py (r10)
@@ -4 +4 @@
 import os
 import uuid
+import random
 import threading
 import sqlite3

 import numpy
+# Don't trust numpy's seeding
+numpy.random.seed(random.SystemRandom().randint(0,2**32-1))

 def _newid():
     
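The module-import reseed above pulls a seed from the OS entropy pool via random.SystemRandom rather than trusting numpy's default seeding. For reference, a sketch of the same idea alongside numpy's newer Generator API (the latter is an alternative, not what this commit uses):

    import random
    import numpy

    # What the commit does: reseed the global numpy PRNG from OS entropy.
    numpy.random.seed(random.SystemRandom().randint(0, 2**32 - 1))

    # Newer alternative: a local Generator, seeded from OS entropy by default,
    # which avoids global PRNG state entirely.
    rng = numpy.random.default_rng()
    print(rng.integers(0, 100))
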
@@ -18 +21 @@
     _population_sizes = None
     _population_cache = None
+    _offset_cache = None
+    _cur_offsets = None

     def __init__(self, path):
     
@@ -26 +31 @@
         self._population_sizes = {}
         self._population_cache = {}
+        self._offset_cache = {}
+        self._cur_offsets = {}

         if not exists:
     
@@ -79 +86 @@
             self.conn.execute(
                 """CREATE TABLE classifier_results (id BLOB PRIMARY KEY,
-                                                    algorithm TEXT,
+                                                    classifier TEXT,
+                                                    trial_type TEXT,
+                                                    num_observations INTEGER,
+                                                    num_trials INTEGER,
                                                     params TEXT,
-                                                    sample_size INTEGER,
-                                                    num_trials INTEGER,
-                                                    trial_type TEXT,
                                                     false_positives REAL,
                                                     false_negatives REAL)
     
@@ -109 +116 @@

     def subseries(self, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
-        if (probe_type,unusual_case,field) not in self._population_cache:
+        cache_key = (probe_type,unusual_case,field)
+
+        if cache_key not in self._population_cache:
             query="""
             SELECT %(field)s AS unusual_case,
     
@@ -121 +130 @@
             cursor = self.conn.cursor()
             cursor.execute(query, params)
-            self._population_cache[(probe_type,unusual_case,field)] = [dict(row) for row in cursor.fetchall()]
-
-        population = self._population_cache[(probe_type,unusual_case,field)]
+            p = [dict(row) for row in cursor.fetchall()]
+            self._population_cache[cache_key] = p
+            self._offset_cache[cache_key] = tuple(numpy.random.random_integers(0,len(p)-1, len(p)/5))
+            self._cur_offsets[cache_key] = 0
+
+        population = self._population_cache[cache_key]

         if size == None or size > len(population):
             size = len(population)
         if offset == None or offset >= len(population) or offset < 0:
-            offset = numpy.random.random_integers(0,len(population)-1)
-
+            offset = self._offset_cache[cache_key][self._cur_offsets[cache_key]]
+            self._cur_offsets[cache_key] = (offset + 1) % len(self._offset_cache[cache_key])
+
         try:
-            ret_val = population[offset:offset+size]
+            offset = int(offset)
+            size = int(size)
         except Exception as e:
             print(e, offset, size)
-
+            return None
+
+        ret_val = population[offset:offset+size]
         if len(ret_val) < size:
             ret_val += population[0:size-len(ret_val)]

         return ret_val
-
-
+
+
+    def resetOffsets(self):
+        for k in self._cur_offsets.keys():
+            self._cur_offsets[k] = 0
+
+
     def clearCache(self):
         self._population_cache = {}
+        self._offset_cache = {}
+        self._cur_offsets = {}

     
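subseries() used to draw a fresh random offset per call; now a pool of offsets is drawn once per cache key and consumed in rotation, and resetOffsets() rewinds the rotation so runs can be replayed. A standalone sketch of the scheme (it assumes the intent of the `(offset + 1)` cursor update above was sequential cycling, and it swaps the deprecated numpy.random.random_integers for randint plus integer division):

    import numpy

    population = list(range(1000))          # invented population
    offsets = tuple(numpy.random.randint(0, len(population), len(population)//5))
    cur = 0

    def subseries(size):
        global cur
        offset = offsets[cur]
        cur = (cur + 1) % len(offsets)      # rotate through the fixed pool
        ret = population[offset:offset+size]
        if len(ret) < size:                 # wrap past the end, as above
            ret += population[0:size-len(ret)]
        return ret

    print(len(subseries(300)))              # always 300
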
@@ -188 +211 @@
         self.conn.commit()
         return ret_val
+
+    def fetchClassifierResult(self, classifier, trial_type, num_observations):
+        query = """
+          SELECT * FROM classifier_results
+          WHERE classifier=? AND trial_type=? AND num_observations=?
+          ORDER BY false_positives+false_negatives
+          LIMIT 1;
+        """
+        cursor = self.conn.cursor()
+        cursor.execute(query, (classifier, trial_type, num_observations))
+        ret_val = cursor.fetchone()
+
+        if ret_val != None:
+            ret_val = dict(ret_val)
+        return ret_val
+
+    def deleteClassifierResults(self, classifier, trial_type, num_observations=None):
+        params = {"classifier":classifier,"trial_type":trial_type,"num_observations":num_observations}
+        query = """
+          DELETE FROM classifier_results
+          WHERE classifier=:classifier AND trial_type=:trial_type
+        """
+        if num_observations != None:
+            query += " AND num_observations=:num_observations"
+
+        self.conn.execute(query, params)
+        self.conn.commit()
+
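fetchClassifierResult returns the stored result with the lowest combined error rate for a given classifier, trial type, and observation count. A self-contained sqlite3 sketch of the query's semantics against the new schema (rows invented):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.row_factory = sqlite3.Row
    conn.execute("""CREATE TABLE classifier_results
                      (id BLOB PRIMARY KEY, classifier TEXT, trial_type TEXT,
                       num_observations INTEGER, num_trials INTEGER, params TEXT,
                       false_positives REAL, false_negatives REAL)""")
    conn.execute("INSERT INTO classifier_results VALUES (x'01','quadsummary','train',1000,100,'{}',5.0,4.0)")
    conn.execute("INSERT INTO classifier_results VALUES (x'02','quadsummary','train',1000,100,'{}',2.0,1.0)")

    # Same shape as fetchClassifierResult: lowest combined error wins.
    row = conn.execute("""SELECT * FROM classifier_results
                          WHERE classifier=? AND trial_type=? AND num_observations=?
                          ORDER BY false_positives+false_negatives LIMIT 1""",
                       ('quadsummary', 'train', 1000)).fetchone()
    print(dict(row)['false_negatives'])  # 1.0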