source: trunk/bin/train @ 7

Last change on this file since 7 was 7, checked in by tim, 9 years ago

.

  • Property svn:executable set to *
File size: 14.4 KB
Line 
1#!/usr/bin/env python3
2#-*- mode: Python;-*-
3
4import sys
5import os
6import time
7import random
8import statistics
9import functools
10import argparse
11import threading
12import queue
13import pprint
14import json
15
16
# When running from a development checkout, add the sibling lib/ directory to
# the import path so nanownlib can be found without being installed.  A release
# build substitutes a real version string for VERSION, skipping this.
VERSION = "{DEVELOPMENT}"
if VERSION == "{DEVELOPMENT}":
    script_dir = '.'
    try:
        script_dir = os.path.dirname(os.path.realpath(__file__))
    except Exception:
        # __file__ may be undefined (e.g. frozen/embedded interpreters);
        # fall back to the invocation path.  Narrowed from a bare "except:"
        # so KeyboardInterrupt/SystemExit are not swallowed.
        try:
            script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        except Exception:
            pass  # best effort: leave script_dir as '.'
    sys.path.append("%s/../lib" % script_dir)
28
29from nanownlib import *
30import nanownlib.storage
31from nanownlib.stats import boxTest,multiBoxTest,subsample,bootstrap,bootstrap2,trimean,midhinge,midhingeTest,samples2Distributions,samples2MeanDiffs
32
# Command-line interface: one required positional argument naming the
# session database produced by a prior sampling run.
parser = argparse.ArgumentParser(description="")
#parser.add_argument('-c', dest='cases', type=str, default='{"short":10000,"long":1010000}',
#                    help='JSON representation of echo timing cases. Default: {"short":10000,"long":1010000}')
parser.add_argument(
    'session_data',
    default=None,
    help='Database file storing session information')
options = parser.parse_args()
40
41
42
class WorkerThreads(object):
    """A minimal thread pool.

    Jobs are submitted with addJob(job_id, args); each worker calls
    target(*args) and puts (job_id, result) on resultq.  Consumers drain
    resultq themselves after wait().

    Attributes:
      workq   -- queue.Queue of pending (job_id, args) items (None = shutdown)
      resultq -- queue.Queue of completed (job_id, result) items
      target  -- callable invoked as target(*args) for each job
    """

    def __init__(self, num_workers, target):
        self.workq = queue.Queue()
        self.resultq = queue.Queue()
        self.target = target

        self.workers = []
        for _ in range(num_workers):
            t = threading.Thread(target=self._worker)
            t.daemon = True  # don't keep the interpreter alive on exit
            t.start()
            self.workers.append(t)

    def _worker(self):
        """Worker loop: process jobs until a None sentinel is dequeued."""
        while True:
            item = self.workq.get()
            if item is None:  # shutdown sentinel from stop()
                self.workq.task_done()
                break

            job_id, args = item
            self.resultq.put((job_id, self.target(*args)))
            self.workq.task_done()

    def addJob(self, job_id, args):
        """Enqueue one job; args is the argument tuple passed to target."""
        self.workq.put((job_id, args))

    def wait(self):
        """Block until every queued job has been processed."""
        self.workq.join()

    def stop(self):
        """Shut down all workers and wait for them to exit.

        Bug fix: the original iterated over the unqualified name 'workers',
        which raised NameError; it must be self.workers.
        """
        for _ in range(len(self.workers)):
            self.workq.put(None)  # one sentinel per worker
        for w in self.workers:
            w.join()
82
83           
def trainBoxTest(db, test_cases, longest, subsample_size):
    """Search for a good (low, high) percentile box for boxTest.

    Coarse-to-fine coordinate search:
      1. scan candidate 'low' percentiles at a fixed width of 2.0,
      2. scan widths for the 5 best lows (averaging error over those lows),
      3. refine the low (and its neighbors) at the chosen width,
      4. refine the width around the chosen width at the best low.
    Error rates are estimated by bootstrap resampling over the 'train'
    (signal expected) and 'train_null' (no signal expected) trial sets.

    Arguments:
      db             -- nanownlib.storage.db session database
      test_cases     -- timing case definitions passed through to bootstrap
      longest        -- case label a correct 'train' estimate must report
      subsample_size -- bootstrap subsample size

    Returns {'low': best_low, 'high': best_low + best_width}.
    """

    def trainAux(low, high, num_trials):
        # Return (false_positive_pct, false_negative_pct) for one box.
        estimator = functools.partial(boxTest, {'low':low, 'high':high})
        estimates = bootstrap(estimator, db, 'train', test_cases, subsample_size, num_trials)
        null_estimates = bootstrap(estimator, db, 'train_null', test_cases, subsample_size, num_trials)

        #XXX: need to have a configurable policy on what we're looking for.
        #     which is longest or which is shortest?
        bad_estimates = len([e for e in estimates if e != longest])
        # Any non-None estimate on null data is a false positive.
        # (Fixed "!= None" to the idiomatic "is not None".)
        bad_null_estimates = len([e for e in null_estimates if e is not None])

        false_negatives = 100.0*bad_estimates/num_trials
        false_positives = 100.0*bad_null_estimates/num_trials
        return false_positives,false_negatives

    start = time.time()
    wt = WorkerThreads(2, trainAux)

    # Phase 1: coarse scan of lows at fixed width.
    width = 2.0
    performance = []
    percentiles = list(range(0,50,2))
    for low in percentiles:
        wt.addJob(low, (low,low+width,200))
    wt.wait()
    while not wt.resultq.empty():
        job_id,errors = wt.resultq.get()
        fp,fn = errors
        # Sort key is the mean of FP and FN rates; job_id is the candidate.
        performance.append(((fp+fn)/2.0, job_id, fn, fp))
    performance.sort()
    pprint.pprint(performance)
    print(time.time()-start)

    # Phase 2: scan widths for the 5 best lows, averaging error rates.
    lows = [p[1] for p in performance[0:5]]
    widths = [w/10.0 for w in range(0,105,5)]
    performance = []
    for width in widths:
        false_positives = []
        false_negatives = []
        for low in lows:
            wt.addJob(low,(low,low+width,150))
        wt.wait()
        while not wt.resultq.empty():
            job_id,errors = wt.resultq.get()
            fp,fn = errors
            false_negatives.append(fn)
            false_positives.append(fp)

        performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
                            width, statistics.mean(false_negatives), statistics.mean(false_positives)))
    performance.sort()
    pprint.pprint(performance)
    good_width = performance[0][1]
    print("good_width:",good_width)

    # Phase 3: refine the low by also trying each candidate's neighbors.
    lc = {}
    for low in lows:
        # Bug fix: was "low-1 > 0", which dropped the valid percentile 0;
        # trainBoxTest2 already uses ">= 0".
        if low-1 >= 0:
            lc[low-1] = None
        lc[low] = None
        lc[low+1] = None
    lows = lc.keys()

    performance = []
    for low in lows:
        wt.addJob(low, (low,low+good_width,300))
    wt.wait()
    while not wt.resultq.empty():
        job_id,errors = wt.resultq.get()
        fp,fn = errors
        performance.append(((fp+fn)/2.0, job_id, fn, fp))
    performance.sort()
    pprint.pprint(performance)
    best_low = performance[0][1]
    print("best_low:", best_low)

    # Phase 4: refine the width around good_width at the best low.
    widths = [good_width-0.4,good_width-0.3,good_width-0.2,good_width-0.1,
              good_width,good_width+0.1,good_width+0.2,good_width+0.3,good_width+0.4]
    performance = []
    for width in widths:
        wt.addJob(width, (best_low,best_low+width,200))
    wt.wait()
    while not wt.resultq.empty():
        job_id,errors = wt.resultq.get()
        fp,fn = errors
        performance.append(((fp+fn)/2.0, job_id, fn, fp))
    performance.sort()
    pprint.pprint(performance)
    best_width=performance[0][1]
    print("best_width:",best_width)
    print("final_performance:", performance[0][0])

    # Bug fix: the original returned best_low+good_width, silently
    # discarding the refined best_width computed in phase 4.
    return {"low":best_low,"high":best_low+best_width}
181
182
def trainBoxTest2(db, unusual_case, greater, subsample_size):
    """Search for a good (low, high) percentile box for multiBoxTest.

    Same coarse-to-fine coordinate search as trainBoxTest, but scoring
    with multiBoxTest on a single known unusual_case: estimates of 1 on
    'train' data are correct detections, estimates of 0 on 'train_null'
    data are correct rejections.

    Arguments:
      db             -- nanownlib.storage.db session database
      unusual_case   -- label of the timing case believed to differ
      greater        -- True if the unusual case is expected to be slower
      subsample_size -- bootstrap subsample size

    Returns a classifier-result dict (algorithm, params JSON, sample_size,
    num_trials, trial_type, false_positives, false_negatives) suitable for
    db.addClassifierResults().
    """

    def trainAux(low,high,num_trials):
        # Return (false_positive_pct, false_negative_pct) for one box.
        estimator = functools.partial(multiBoxTest, {'low':low, 'high':high}, unusual_case, greater)
        estimates = bootstrap2(estimator, db, 'train', subsample_size, num_trials)
        null_estimates = bootstrap2(estimator, db, 'train_null', subsample_size, num_trials)

        bad_estimates = len([e for e in estimates if e != 1])
        bad_null_estimates = len([e for e in null_estimates if e != 0])

        false_negatives = 100.0*bad_estimates/num_trials
        false_positives = 100.0*bad_null_estimates/num_trials
        return false_positives,false_negatives

    start = time.time()
    wt = WorkerThreads(2, trainAux)

    # Phase 1: coarse scan of lows at fixed width.
    num_trials = 200
    width = 2.0
    performance = []
    percentiles = list(range(0,50,2))
    for low in percentiles:
        wt.addJob(low, (low,low+width,num_trials))
    wt.wait()
    while not wt.resultq.empty():
        job_id,errors = wt.resultq.get()
        fp,fn = errors
        # Sort key is the mean of FP and FN rates; job_id is the candidate.
        performance.append(((fp+fn)/2.0, job_id, fn, fp))
    performance.sort()
    pprint.pprint(performance)
    print(time.time()-start)

    # Phase 2: scan widths for the 5 best lows, averaging error rates.
    num_trials = 150
    lows = [p[1] for p in performance[0:5]]
    widths = [w/10.0 for w in range(0,105,5)]
    performance = []
    for width in widths:
        false_positives = []
        false_negatives = []
        for low in lows:
            wt.addJob(low,(low,low+width,num_trials))
        wt.wait()
        while not wt.resultq.empty():
            job_id,errors = wt.resultq.get()
            fp,fn = errors
            false_negatives.append(fn)
            false_positives.append(fp)

        performance.append(((statistics.mean(false_positives)+statistics.mean(false_negatives))/2.0,
                            width, statistics.mean(false_negatives), statistics.mean(false_positives)))
    performance.sort()
    pprint.pprint(performance)
    good_width = performance[0][1]
    print("good_width:",good_width)

    # Phase 3: refine the low by also trying each candidate's neighbors.
    lc = {}
    for low in lows:
        if low-1 >= 0:
            lc[low-1] = None
        lc[low] = None
        lc[low+1] = None
    lows = lc.keys()
    print("candidate lows:")
    pprint.pprint(lows)

    num_trials = 300
    performance = []
    for low in lows:
        wt.addJob(low, (low,low+good_width,num_trials))
    wt.wait()
    while not wt.resultq.empty():
        job_id,errors = wt.resultq.get()
        fp,fn = errors
        performance.append(((fp+fn)/2.0, job_id, fn, fp))
    performance.sort()
    pprint.pprint(performance)
    best_low = performance[0][1]
    print("best_low:", best_low)

    # Phase 4: refine the width around good_width at the best low.
    num_trials = 200
    widths = [good_width-0.4,good_width-0.3,good_width-0.2,good_width-0.1,
              good_width,good_width+0.1,good_width+0.2,good_width+0.3,good_width+0.4]
    performance = []
    for width in widths:
        wt.addJob(width, (best_low,best_low+width,num_trials))
    wt.wait()
    while not wt.resultq.empty():
        job_id,errors = wt.resultq.get()
        fp,fn = errors
        performance.append(((fp+fn)/2.0, job_id, fn, fp))
    performance.sort()
    pprint.pprint(performance)
    best_width=performance[0][1]
    print("best_width:",best_width)
    print("final_performance:", performance[0][0])

    # Bug fix: the original serialized best_low+good_width even though the
    # reported FP/FN rates come from the best_width run of phase 4; use the
    # refined best_width so params and error rates agree.
    params = json.dumps({"low":best_low,"high":best_low+best_width})
    return {'algorithm':"boxtest",
            'params':params,
            'sample_size':subsample_size,
            'num_trials':num_trials,
            'trial_type':"train",
            'false_positives':performance[0][3],
            'false_negatives':performance[0][2]}
290
291
def trainMidhinge(db, unusual_case, greater, subseries_size):
    """Search for good (distance, threshold) parameters for midhingeTest.

    Derives an initial threshold from the trimean of per-subseries mean
    differences, then coordinate-searches distance and threshold in
    alternating coarse/fine passes, scoring candidates by bootstrap false
    positive/negative rates.

    Arguments:
      db             -- nanownlib.storage.db session database
      unusual_case   -- label of the timing case believed to differ
      greater        -- True if the unusual case is expected to be slower
      subseries_size -- bootstrap subseries size

    Returns a classifier-result dict (algorithm, params JSON, sample_size,
    num_trials, trial_type, false_positives, false_negatives).

    NOTE(review): bootstrap3 and subseries are not in the explicit
    nanownlib.stats import list at the top of this file; presumably they
    come from "from nanownlib import *" -- TODO confirm.
    """

    def trainAux(distance, threshold, num_trials):
        # Return (false_positive_pct, false_negative_pct) for one
        # (distance, threshold) pair.
        estimator = functools.partial(midhingeTest, {'distance':distance,'threshold':threshold}, greater)
        estimates = bootstrap3(estimator, db, 'train', unusual_case, subseries_size, num_trials)
        null_estimates = bootstrap3(estimator, db, 'train_null', unusual_case, subseries_size, num_trials)

        # An estimate of 1 means "effect detected": 'train' runs should all
        # report 1 and 'train_null' runs should all report 0.
        bad_estimates = len([e for e in estimates if e != 1])
        bad_null_estimates = len([e for e in null_estimates if e != 0])

        false_negatives = 100.0*bad_estimates/num_trials
        false_positives = 100.0*bad_null_estimates/num_trials
        return false_positives,false_negatives

    #determine expected delta based on differences
    start = time.time()
    mean_diffs = [s['unusual_case']-s['other_cases'] for s in subseries(db, 'train', unusual_case)]
    threshold = trimean(mean_diffs)/2.0
    print("initial threshold:", threshold)
    print("median threshold:", statistics.median(mean_diffs)/2.0)
    print("midhinge threshold:", midhinge(mean_diffs)/2.0)
    print("trimean threshold:", trimean(mean_diffs)/2.0)

    # Diagnostics: the same statistics over the null data, which should
    # hover near zero if the null set is clean.
    mean_diffs = [s['unusual_case']-s['other_cases'] for s in subseries(db, 'train_null', unusual_case)]
    print(len(mean_diffs))
    print("null mean:", statistics.mean(mean_diffs))
    print("null median:", statistics.median(mean_diffs))
    print("null midhinge:", midhinge(mean_diffs))
    print("null trimean:", trimean(mean_diffs))
    print(time.time()-start)


    start = time.time()
    wt = WorkerThreads(4, trainAux)

    # Pass 1: coarse scan of distance at the initial threshold.
    num_trials = 20
    performance = []
    #for distance in range(1,46,4):
    for distance in range(25,46,4):
        wt.addJob(distance, (distance,threshold,num_trials))
    wt.wait()
    while not wt.resultq.empty():
        job_id,errors = wt.resultq.get()
        fp,fn = errors
        # Sort key is the mean of FP and FN rates; job_id is the candidate.
        performance.append(((fp+fn)/2.0, job_id, fn, fp))
    #for distance in range(25,46,4):
    #    job_id = distance
    #    fp,fn = trainAux(distance, threshold, num_trials)
    #    performance.append(((fp+fn)/2.0, job_id, fn, fp))

    performance.sort()
    pprint.pprint(performance)
    print(time.time()-start)
    good_distance = performance[0][1]
    print("good_distance:",good_distance)

    # Pass 2: coarse scan of threshold (80%..120%) at good_distance.
    num_trials = 20
    start = time.time()
    performance = []
    for t in range(80,125,5):
        wt.addJob(threshold*(t/100.0), (good_distance,threshold*(t/100.0),num_trials))
    wt.wait()
    while not wt.resultq.empty():
        job_id,errors = wt.resultq.get()
        fp,fn = errors
        performance.append(((fp+fn)/2.0, job_id, fn, fp))
    performance.sort()
    pprint.pprint(performance)
    print(time.time()-start)
    good_threshold = performance[0][1]
    print("good_threshold:", good_threshold)

    # Pass 3: refine distance within +/-4 of good_distance.
    num_trials = 20
    start = time.time()
    performance = []
    for d in range(-4,5):
        wt.addJob(good_distance+d, (good_distance+d,good_threshold,num_trials))
    wt.wait()
    while not wt.resultq.empty():
        job_id,errors = wt.resultq.get()
        fp,fn = errors
        performance.append(((fp+fn)/2.0, job_id, fn, fp))
    performance.sort()
    pprint.pprint(performance)
    print(time.time()-start)
    best_distance = performance[0][1]
    print("best_distance:",best_distance)

    # Pass 4: refine threshold (95%..105%) at best_distance.
    num_trials = 20
    start = time.time()
    performance = []
    for t in range(95,106):
        wt.addJob(good_threshold*(t/100.0), (best_distance,good_threshold*(t/100.0),num_trials))
    wt.wait()
    while not wt.resultq.empty():
        job_id,errors = wt.resultq.get()
        fp,fn = errors
        performance.append(((fp+fn)/2.0, job_id, fn, fp))
    performance.sort()
    pprint.pprint(performance)
    print(time.time()-start)
    best_threshold = performance[0][1]
    print("best_threshold:", best_threshold)

    params = json.dumps({'distance':best_distance,'threshold':best_threshold})
    return {'algorithm':"midhinge",
            'params':params,
            'sample_size':subseries_size,
            'num_trials':num_trials,
            'trial_type':"train",
            'false_positives':performance[0][3],
            'false_negatives':performance[0][2]}
406
407
#classifiers = {'boxtest':{'train':trainBoxTest2, 'test':multiBoxTest},
#               'midhinge':{'train':trainMidhinge, 'test':midhinge}}


# ---- script body (runs at import time; no __main__ guard) ----
db = nanownlib.storage.db(options.session_data)
#cursor = db.cursor()
#cursor.execute("SELECT min(sample) min, max(sample) max FROM probes")
#train_start,test_end = cursor.fetchone()
#train_end = int(test_end-train_start)
#test_start = train_end+1
#subsample_size = min(10000,(train_end-train_start+1)/4)

# Determine which timing case looks anomalous and in which direction.
# findUnusualTestCase presumably comes from "from nanownlib import *"
# -- TODO confirm.
start = time.time()
unusual_case,unusual_diff = findUnusualTestCase(db)
greater = (unusual_diff > 0)
print("unusual_case:", unusual_case)
print("unusual_diff:", unusual_diff)
end = time.time()
print(":", end-start)

# NOTE(review): mid-file import; used only for the profiling run below.
import cProfile

# Profile the midhinge training run.  cProfile.run() exec()s the statement
# in the __main__ namespace, so 'results' is visible afterwards when this
# script is executed directly -- NOTE(review): confirm for other contexts.
start = time.time()
cProfile.run('results = trainMidhinge(db, unusual_case, greater, 100)')
#results = trainMidhinge(db, unusual_case, greater, 100)
#db.addClassifierResults(results)
print("midhinge result:", results)
end = time.time()
print(":", end-start)

sys.exit(0)

# NOTE(review): everything below is unreachable because of sys.exit(0)
# above -- apparently a development toggle for which trainer to run.
start = time.time()
results = trainBoxTest2(db, unusual_case, greater, 6000)
db.addClassifierResults(results)
print("multi box test result:", results)
end = time.time()
print(":", end-start)

#start = time.time()
#print("box test params:", trainBoxTest(db, test_cases, 'long', 100))
#end = time.time()
#print(":", end-start)
Note: See TracBrowser for help on using the repository browser.