- Timestamp: 07/09/15 19:01:23 (9 years ago)
- Location: trunk
- Files: 3 edited (trunk/bin/train, trunk/lib/nanownlib/stats.py, trunk/lib/nanownlib/storage.py)
Legend:
- Unmodified: context lines, shown with no prefix
- Added: lines prefixed with "+"
- Removed: lines prefixed with "-"
- "…" marks elided, unchanged code between hunks
trunk/bin/train
--- trunk/bin/train (r7)
+++ trunk/bin/train (r8)

-def trainBoxTest(db, test_cases, longest, subsample_size):
+def trainBoxTest(db, unusual_case, greater, subseries_size):
 
     def trainAux(low,high,num_trials):
-        estimator = functools.partial(boxTest, {'low':low, 'high':high})
-        estimates = bootstrap(estimator, db, 'train', test_cases, subsample_size, num_trials)
-        null_estimates = bootstrap(estimator, db, 'train_null', test_cases, subsample_size, num_trials)
-
-        #XXX: need to have a configurable policy on what we're looking for.
-        #     which is longest or which is shortest?
-        bad_estimates = len([e for e in estimates if e != longest])
-        bad_null_estimates = len([e for e in null_estimates if e != None])
-
-        false_negatives = 100.0*bad_estimates/num_trials
-        false_positives = 100.0*bad_null_estimates/num_trials
-        return false_positives,false_negatives
-
-    start = time.time()
-    wt = WorkerThreads(2, trainAux)
-
-    width = 2.0
-    performance = []
-    percentiles = list(range(0,50,2))
-    for low in percentiles:
-        wt.addJob(low, (low,low+width,200))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
-
-    lows = [p[1] for p in performance[0:5]]
-    widths = [w/10.0 for w in range(0,105,5)]
-    performance = []
-    for width in widths:
-        false_positives = []
-        false_negatives = []
-        for low in lows:
-            wt.addJob(low,(low,low+width,150))
-        wt.wait()
-        while not wt.resultq.empty():
-            job_id,errors = wt.resultq.get()
-            fp,fn = errors
-            false_negatives.append(fn)
-            false_positives.append(fp)
-
-        #print(width, false_negatives)
-        #print(width, false_positives)
-        performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
-                            width, statistics.mean(false_negatives), statistics.mean(false_positives)))
-    performance.sort()
-    pprint.pprint(performance)
-    good_width = performance[0][1]
-    print("good_width:",good_width)
-
-
-    lc = {}
-    for low in lows:
-        if low-1 > 0:
-            lc[low-1] = None
-        lc[low] = None
-        lc[low+1] = None
-    lows = lc.keys()
-
-    performance = []
-    for low in lows:
-        wt.addJob(low, (low,low+good_width,300))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    best_low = performance[0][1]
-    print("best_low:", best_low)
-
-
-    widths = [good_width-0.4,good_width-0.3,good_width-0.2,good_width-0.1,
-              good_width,good_width+0.1,good_width+0.2,good_width+0.3,good_width+0.4]
-    performance = []
-    for width in widths:
-        wt.addJob(width, (best_low,best_low+width,200))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    best_width=performance[0][1]
-    print("best_width:",best_width)
-    print("final_performance:", performance[0][0])
-
-    return {"low":best_low,"high":best_low+good_width}
-
-
-def trainBoxTest2(db, unusual_case, greater, subsample_size):
-
-    def trainAux(low,high,num_trials):
-        estimator = functools.partial(multiBoxTest, {'low':low, 'high':high}, unusual_case, greater)
-        estimates = bootstrap2(estimator, db, 'train', subsample_size, num_trials)
-        null_estimates = bootstrap2(estimator, db, 'train_null', subsample_size, num_trials)
+        estimator = functools.partial(multiBoxTest, {'low':low, 'high':high}, greater)
+        estimates = bootstrap3(estimator, db, 'train', unusual_case, subseries_size, num_trials)
+        null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, subseries_size, num_trials)
 
         bad_estimates = len([e for e in estimates if e != 1])
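The rewritten trainAux above scores a candidate (low, high) box by bootstrapping: it runs the estimator num_trials times on subseries drawn from the 'train' population (where the timing difference is present) and from 'train_null' (where it is not), then reports the miss rate and false-alarm rate as percentages. A minimal, self-contained sketch of that measurement, assuming an estimator that returns 1 when it detects the unusual case (bootstrap3() and the database layout are replaced by plain lists here):

    import random

    def bootstrap_error_rates(estimator, signal_pop, null_pop, subseries_size, num_trials):
        # Resample each population and count the estimator's mistakes on both.
        missed = 0        # trials on real data where the signal was not called
        false_alarms = 0  # trials on null data where a signal was called anyway
        for _ in range(num_trials):
            if estimator(random.choices(signal_pop, k=subseries_size)) != 1:
                missed += 1
            if estimator(random.choices(null_pop, k=subseries_size)) == 1:
                false_alarms += 1
        # Percentages, matching trainAux's return convention
        return 100.0*false_alarms/num_trials, 100.0*missed/num_trials

Note that the real bootstrap3()/db.subseries() draw contiguous windows from a cached population (see the storage.py hunk further down) rather than the independent resamples random.choices() produces here.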
…
     num_trials = 200
-    width = 2.0
-    performance = []
-    percentiles = list(range(0,50,2))
-    for low in percentiles:
+    width = 1.0
+    performance = []
+    for low in range(0,50):
         wt.addJob(low, (low,low+width,num_trials))
     wt.wait()
…
     print(time.time()-start)
 
-    num_trials = 150
+    num_trials = 200
     lows = [p[1] for p in performance[0:5]]
-    widths = [w/10.0 for w in range(0,105,5)]
+    widths = [w/10.0 for w in range(5,65,5)]
     performance = []
     for width in widths:
…
         #print(width, false_negatives)
         #print(width, false_positives)
-        performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
+        #performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
+        #                    width, statistics.mean(false_negatives), statistics.mean(false_positives)))
+        performance.append((abs(statistics.mean(false_positives)-statistics.mean(false_negatives)),
                             width, statistics.mean(false_negatives), statistics.mean(false_positives)))
     performance.sort()
…
 
-    lc = {}
-    for low in lows:
-        if low-1 >= 0:
-            lc[low-1] = None
-        lc[low] = None
-        lc[low+1] = None
-    lows = lc.keys()
-    print("candidate lows:")
-    pprint.pprint(lows)
-
-    num_trials = 300
+    num_trials = 500
     performance = []
     for low in lows:
…
     print("best_low:", best_low)
 
-    num_trials = 200
-    widths = [good_width-0.4,good_width-0.3,good_width-0.2,good_width-0.1,
-              good_width,good_width+0.1,good_width+0.2,good_width+0.3,good_width+0.4]
+
+    num_trials = 500
+    widths = [good_width+(x/10.0) for x in range(-6,7) if good_width+(x/10.0) > 0.0]
     performance = []
     for width in widths:
…
     print("final_performance:", performance[0][0])
 
-    params = json.dumps({"low":best_low,"high":best_low+good_width})
+    params = json.dumps({"low":best_low,"high":best_low+best_width})
     return {'algorithm':"boxtest",
             'params':params,
-            'sample_size':subsample_size,
+            'sample_size':subseries_size,
             'num_trials':num_trials,
             'trial_type':"train",
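The remaining bin/train hunks below touch trainMidhinge, which seeds its threshold search with trimean(mean_diffs)/2.0. For reference, these are the standard Tukey definitions of the two quartile-based location estimators involved; the library's own midhinge()/trimean() presumably match them, though the extra 'distance' argument seen in stats.py (which appears to widen the percentile pair) is not modeled in this sketch:

    import numpy

    def midhinge(values):
        q1, q3 = numpy.percentile(values, [25, 75])
        return (q1 + q3) / 2.0           # midpoint of the interquartile box

    def trimean(values):
        q1, q2, q3 = numpy.percentile(values, [25, 50, 75])
        return (q1 + 2.0*q2 + q3) / 4.0  # equivalently, (median + midhinge)/2

Both are far more robust to heavy-tailed RTT noise than the mean, which makes half the trimean of the observed differences a reasonable initial detection threshold.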
…
 
     #determine expected delta based on differences
-    start = time.time()
-    mean_diffs = [s['unusual_case']-s['other_cases'] for s in subseries(db, 'train', unusual_case)]
+    mean_diffs = [s['unusual_case']-s['other_cases'] for s in db.subseries('train', unusual_case)]
     threshold = trimean(mean_diffs)/2.0
-    print("initial threshold:", threshold)
-    print("median threshold:", statistics.median(mean_diffs)/2.0)
-    print("midhinge threshold:", midhinge(mean_diffs)/2.0)
-    print("trimean threshold:", trimean(mean_diffs)/2.0)
-
-    mean_diffs = [s['unusual_case']-s['other_cases'] for s in subseries(db, 'train_null', unusual_case)]
-    print(len(mean_diffs))
-    print("null mean:", statistics.mean(mean_diffs))
-    print("null median:", statistics.median(mean_diffs))
-    print("null midhinge:", midhinge(mean_diffs))
-    print("null trimean:", trimean(mean_diffs))
-    print(time.time()-start)
-
-    start = time.time()
-    wt = WorkerThreads(4, trainAux)
-
-    num_trials = 20
-    performance = []
-    #for distance in range(1,46,4):
-    for distance in range(25,46,4):
+    print("init_threshold:", threshold)
+
+    wt = WorkerThreads(2, trainAux)
+
+    num_trials = 500
+    performance = []
+    for distance in range(1,50):
         wt.addJob(distance, (distance,threshold,num_trials))
     wt.wait()
…
         fp,fn = errors
         performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    #for distance in range(25,46,4):
-    #    job_id = distance
-    #    fp,fn = trainAux(distance, threshold, num_trials)
-    #    performance.append(((fp+fn)/2.0, job_id, fn, fp))
-
-    performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+
+    performance.sort()
+    #pprint.pprint(performance)
     good_distance = performance[0][1]
     print("good_distance:",good_distance)
…
 
-    num_trials = 20
-    start = time.time()
-    performance = []
-    for t in range(80,125,5):
+    num_trials = 500
+    performance = []
+    for t in range(50,154,4):
         wt.addJob(threshold*(t/100.0), (good_distance,threshold*(t/100.0),num_trials))
     wt.wait()
…
         performance.append(((fp+fn)/2.0, job_id, fn, fp))
     performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+    #pprint.pprint(performance)
     good_threshold = performance[0][1]
     print("good_threshold:", good_threshold)
…
 
-    num_trials = 20
-    start = time.time()
-    performance = []
-    for d in range(-4,5):
-        wt.addJob(good_distance+d, (good_distance+d,good_threshold,num_trials))
-    wt.wait()
-    while not wt.resultq.empty():
-        job_id,errors = wt.resultq.get()
-        fp,fn = errors
-        performance.append(((fp+fn)/2.0, job_id, fn, fp))
-    performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+    num_trials = 500
+    performance = []
+    for d in [good_distance+s for s in range(-4,5) if good_distance+s > -1]:
+        wt.addJob(d, (d,good_threshold,num_trials))
+    wt.wait()
+    while not wt.resultq.empty():
+        job_id,errors = wt.resultq.get()
+        fp,fn = errors
+        performance.append(((fp+fn)/2.0, job_id, fn, fp))
+    performance.sort()
+    #pprint.pprint(performance)
     best_distance = performance[0][1]
     print("best_distance:",best_distance)
-
-    num_trials = 20
-    start = time.time()
+
+
+    num_trials = 500
     performance = []
     for t in range(95,106):
…
         performance.append(((fp+fn)/2.0, job_id, fn, fp))
     performance.sort()
-    pprint.pprint(performance)
-    print(time.time()-start)
+    #pprint.pprint(performance)
     best_threshold = performance[0][1]
     print("best_threshold:", best_threshold)
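Each tuning pass in this file follows the same fan-out/drain pattern: queue one job per candidate value, wait for the pool, then pull (job_id, (fp, fn)) pairs off wt.resultq. The WorkerThreads helper itself is not part of this changeset; the following reconstruction is hypothetical, but it shows the shape such a class would need to satisfy the call sites above:

    import threading, queue

    class WorkerThreads(object):
        # A fixed pool of threads that applies `target` to queued argument
        # tuples and pushes (job_id, result) pairs onto resultq.
        def __init__(self, num_workers, target):
            self.workq = queue.Queue()
            self.resultq = queue.Queue()
            self.target = target
            for _ in range(num_workers):
                threading.Thread(target=self._worker, daemon=True).start()

        def _worker(self):
            while True:
                job_id, args = self.workq.get()
                self.resultq.put((job_id, self.target(*args)))
                self.workq.task_done()

        def addJob(self, job_id, args):
            self.workq.put((job_id, args))

        def wait(self):
            self.workq.join()  # block until every queued job has finished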
…
 import cProfile
 
+
+for size in (500,1000,2000,4000,5000,6000):
+    start = time.time()
+    #cProfile.run('results = trainMidhinge(db, unusual_case, greater, 100)')
+    results = trainMidhinge(db, unusual_case, greater, size)
+    #db.addClassifierResults(results)
+    print("midhinge result:")
+    pprint.pprint(results)
+    print(":", time.time()-start)
+
+sys.exit(0)
+
 start = time.time()
-cProfile.run('results = trainMidhinge(db, unusual_case, greater, 100)')
-#results = trainMidhinge(db, unusual_case, greater, 100)
+results = trainBoxTest(db, unusual_case, greater, 6000)
 #db.addClassifierResults(results)
-print("midhinge result:", results)
-end = time.time()
-print(":", end-start)
-
-sys.exit(0)
-
-start = time.time()
-results = trainBoxTest2(db, unusual_case, greater, 6000)
-db.addClassifierResults(results)
-print("multi box test result:", results)
-end = time.time()
-print(":", end-start)
-
-#start = time.time()
-#print("box test params:", trainBoxTest(db, test_cases, 'long', 100))
-#end = time.time()
-#print(":", end-start)
+print("multi box test result:")
+pprint.pprint(results)
+print(":", time.time()-start)
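Taken together, the tuning passes amount to a simple coordinate descent: sweep one parameter over a coarse grid with the others held fixed, keep the winner, then refine around it with more trials; the new __main__ harness additionally times trainMidhinge across subseries sizes to expose the accuracy/cost trade-off. A generic sketch of a single sweep, where evaluate() is a hypothetical stand-in for one bootstrapped error measurement:

    def sweep(evaluate, candidates):
        # evaluate(value) -> combined error score; smaller is better
        performance = sorted((evaluate(v), v) for v in candidates)
        return performance[0][1]   # keep the best-scoring candidate

    # e.g. good_distance = sweep(lambda d: combined_error(d, threshold), range(1, 50))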
trunk/lib/nanownlib/stats.py
--- trunk/lib/nanownlib/stats.py (r7)
+++ trunk/lib/nanownlib/stats.py (r8)

     ret_val = []
     for t in range(num_trials):
-        ret_val.append(estimator(subseries(db,probe_type, unusual_case, subseries_size)))
+        ret_val.append(estimator(db.subseries(probe_type, unusual_case, subseries_size)))
 
     return ret_val
…
 # 0 if it isn't unusual
 # -1 if it is unusual in the wrong direction
-def multiBoxTest(params, unusual_case, greater, samples):
-    #XXX: packet_rtt field from params
-    dists = samples2Distributions(samples, 'packet_rtt')
-
-    uc = dists[unusual_case]
-    rest = []
-    for tc,d in dists.items():
-        if tc != unusual_case:
-            rest.extend(d)
-
+def multiBoxTest(params, greater, samples):
+    uc = [s['unusual_case'] for s in samples]
+    rest = [s['other_cases'] for s in samples]
+
     uc_high = numpy.percentile(uc, params['high'])
     rest_low = numpy.percentile(rest, params['low'])
…
 
     mh = midhinge(diffs, params['distance'])
+    #mh = trimean(diffs, params['distance'])
     if greater:
         if mh > params['threshold']:
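The substantive change in multiBoxTest is the shape of its input: instead of regrouping raw probes by test case with samples2Distributions(), the estimator now receives rows whose aggregation was already done in SQL by db.subseries(). Illustrative values only:

    samples = [                    # one dict per row of the subseries window
        {'unusual_case': 14203.0, 'other_cases': 14100.5},
        {'unusual_case': 14388.0, 'other_cases': 14102.0},
    ]

    uc   = [s['unusual_case'] for s in samples]  # RTTs of the suspected case
    rest = [s['other_cases'] for s in samples]   # mean RTT of the other cases

This moves the per-test-case bookkeeping out of the inner bootstrap loop and into a single cached query, which is exactly what the storage.py hunk below adds.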
trunk/lib/nanownlib/storage.py
--- trunk/lib/nanownlib/storage.py (r7)
+++ trunk/lib/nanownlib/storage.py (r8)

 import threading
 import sqlite3
+
+import numpy
 
 def _newid():
…
     cursor = None
     _population_sizes = None
+    _population_cache = None
 
     def __init__(self, path):
…
         self.conn.row_factory = sqlite3.Row
         self._population_sizes = {}
+        self._population_cache = {}
 
         if not exists:
…
         self.conn.close()
 
+
     def populationSize(self, probe_type):
         if probe_type in self._population_sizes:
…
             print(e)
             return 0
+
+
+    def subseries(self, probe_type, unusual_case, size=None, offset=None, field='packet_rtt'):
+        if (probe_type,unusual_case,field) not in self._population_cache:
+            query="""
+            SELECT %(field)s AS unusual_case,
+                   (SELECT avg(%(field)s) FROM probes,analysis
+                    WHERE analysis.probe_id=probes.id AND probes.test_case!=:unusual_case AND probes.type=:probe_type AND sample=u.sample) AS other_cases
+            FROM (SELECT probes.sample,%(field)s FROM probes,analysis
+                  WHERE analysis.probe_id=probes.id AND probes.test_case=:unusual_case AND probes.type=:probe_type) u
+            """ % {"field":field}
+
+            params = {"probe_type":probe_type, "unusual_case":unusual_case}
+            cursor = self.conn.cursor()
+            cursor.execute(query, params)
+            self._population_cache[(probe_type,unusual_case,field)] = [dict(row) for row in cursor.fetchall()]
+
+        population = self._population_cache[(probe_type,unusual_case,field)]
+
+        if size == None or size > len(population):
+            size = len(population)
+        if offset == None or offset >= len(population) or offset < 0:
+            offset = numpy.random.random_integers(0,len(population)-1)
+
+        ret_val = population[offset:offset+size]
+        if len(ret_val) < size:
+            ret_val += population[0:size-len(ret_val)]
+
+        return ret_val
+
+
+    def clearCache(self):
+        self._population_cache = {}
+
 
     def _insert(self, table, row):
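A usage sketch for the new caching subseries(); the class name Database and the test-case value are assumptions for illustration, and only the method names and semantics come from the diff:

    db = Database('probes.db')   # hypothetical constructor and path

    # The first call executes the correlated subquery once and caches all rows...
    window = db.subseries('train', unusual_case='case_7', size=1000)

    # ...later calls slice a random contiguous window out of the cached
    # population, wrapping around the end if needed, with no further SQL.
    window2 = db.subseries('train', unusual_case='case_7', size=1000)

    db.clearCache()              # invalidate after inserting new probes

One portability note: numpy.random.random_integers() is deprecated in newer NumPy releases; numpy.random.randint(0, len(population)) selects from the same range, since randint's upper bound is exclusive.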