diff --git a/apps/CardinalBanditsPureExploration/algs/KLUCB.py b/apps/CardinalBanditsPureExploration/algs/KLUCB.py
index 15b37e83..906735db 100644
--- a/apps/CardinalBanditsPureExploration/algs/KLUCB.py
+++ b/apps/CardinalBanditsPureExploration/algs/KLUCB.py
@@ -15,148 +15,166 @@
 """
+from __future__ import print_function
 import numpy
 import numpy.random
-class MyAlg:
-    def initExp(self,butler,n,R,failure_probability,params={}):
-        butler.algorithms.set(key='n', value=n)
-        butler.algorithms.set(key='delta',value=failure_probability)
-        butler.algorithms.set(key='R',value=R)
-
-        empty_list = numpy.zeros(n).tolist()
-        butler.algorithms.set(key='Xsum',value=empty_list)
-        butler.algorithms.set(key='X2sum',value=empty_list)
-        butler.algorithms.set(key='T',value=empty_list)
-
-        priority_list = numpy.random.permutation(n).tolist()
-        butler.algorithms.set(key='priority_list',value=priority_list)
-
-        return True
-
-
-    def getQuery(self,butler,participant_uid):
-        participant_dict = butler.participants.get(uid=participant_uid)
-        do_not_ask_hash = {key: True for key in participant_dict.get('do_not_ask_list',[])}
-
-        kv_dict = butler.algorithms.increment_many(key_value_dict={'priority_list':0,'priority_list_cnt':1})
-        priority_list = kv_dict['priority_list']
-        priority_list_cnt = kv_dict['priority_list_cnt']
-
-        k = (priority_list_cnt-1) % len(priority_list)
-        while k<len(priority_list):
[... removed lines lost in extraction: the remainder of the old getQuery, processAnswer, and getModel, and the start of the old computeUCB ...]
-    while (upper-lower)>accuracy:
-        new=leftright(muhat,lower,upper,threshold)
-        lower=new[0]
-        upper=new[1]
-        UCB=new[2]
-    return UCB
-
-### leftright is the core function, decides which way to proceed with the bisection
-
-def leftright(muhat,lower,upper,threshold):
-    if muhat*(1-muhat)!=0:
-        shit=(upper+lower)/2
-        KL=(muhat*numpy.log(muhat/shit))+((1-muhat)*numpy.log((1-muhat)/(1-shit)))
-        if KL>=threshold:
-            return [lower,shit,(shit+lower)/2]
-        if KL<threshold:
-            return [shit,upper,(shit+upper)/2]
-    if muhat==0:
-        shit=(upper+lower)/2
-        KL=(1-muhat)*numpy.log((1-muhat)/(1-shit))
-        if KL>=threshold:
-            return [lower,shit,(shit+lower)/2]
-        if KL<threshold:
-            return [shit,upper,(shit+upper)/2]
-    if muhat==1:
-        return [1,1,1]
+[... added lines lost in extraction: the rewritten MyAlg class ...]
+def computeUCB(muhat, threshold, accuracy=(10 ** (-6))):
+    lower = muhat
+    upper = 1
+    UCB = (lower + upper) / 2
+    while (upper - lower) > accuracy:
+        new = leftright(muhat, lower, upper, threshold)
+        lower = new[0]
+        upper = new[1]
+        UCB = new[2]
+    return UCB
+### leftright is the core function, decides which way to proceed with the bisection
+def leftright(muhat, lower, upper, threshold):
+    if muhat * (1 - muhat) != 0:
+        shit = (upper + lower) / 2
+        KL = (muhat * numpy.log(muhat / shit)) + (
+            (1 - muhat) * numpy.log((1 - muhat) / (1 - shit))
+        )
+        if KL >= threshold:
+            return [lower, shit, (shit + lower) / 2]
+        if KL < threshold:
+            return [shit, upper, (shit + upper) / 2]
+    if muhat == 0:
+        shit = (upper + lower) / 2
+        KL = (1 - muhat) * numpy.log((1 - muhat) / (1 - shit))
+        if KL >= threshold:
+            return [lower, shit, (shit + lower) / 2]
+        if KL < threshold:
+            return [shit, upper, (shit + upper) / 2]
+    if muhat == 1:
+        return [1, 1, 1]
diff --git a/apps/CardinalBanditsPureExploration/algs/LilUCB.py b/apps/CardinalBanditsPureExploration/algs/LilUCB.py
index 13004d99..fb011a2c 100644
--- a/apps/CardinalBanditsPureExploration/algs/LilUCB.py
+++ b/apps/CardinalBanditsPureExploration/algs/LilUCB.py
@@ -4,107 +4,121 @@
 last updated: 11/13/2015
 """
+from __future__ import print_function
 import numpy
 import numpy.random
-class MyAlg:
-    def initExp(self,butler,n,R,failure_probability,params={}):
-        butler.algorithms.set(key='n', value=n)
-        butler.algorithms.set(key='delta',value=failure_probability)
-        butler.algorithms.set(key='R',value=R)
-
-        empty_list = numpy.zeros(n).tolist()
-        butler.algorithms.set(key='Xsum',value=empty_list)
-        butler.algorithms.set(key='X2sum',value=empty_list)
-        butler.algorithms.set(key='T',value=empty_list)
-
-        priority_list = numpy.random.permutation(n).tolist()
-        butler.algorithms.set(key='priority_list',value=priority_list)
-
-        return True
-
-
-    def getQuery(self,butler,participant_uid):
-        participant_dict = butler.participants.get(uid=participant_uid)
-        do_not_ask_hash = {key: True for key in participant_dict.get('do_not_ask_list',[])}
-
-        kv_dict = butler.algorithms.increment_many(key_value_dict={'priority_list':0,'priority_list_cnt':1})
-        priority_list = kv_dict['priority_list']
-        priority_list_cnt = kv_dict['priority_list_cnt']
-
-        k = (priority_list_cnt-1) % len(priority_list)
-        while k<len(priority_list):
[... removed lines lost in extraction: the do-not-ask loop and the UCB computation of the old getQuery ...]
-        if len(A)>0:
-            index = numpy.random.choice(A)
-        else:
-            index = numpy.argmax(UCB)
-
-        alt_index = numpy.random.choice(n)
-        while alt_index==index:
-            alt_index = numpy.random.choice(n)
-
-        random_fork = numpy.random.choice(2)
-        if random_fork==0:
-            return [index,alt_index,index]
-        else:
-            return [alt_index,index,index]
-
-
-    def processAnswer(self,butler, left_id=0, right_id=0, painted_id=0, winner_id=0):
-        alt_index = left_id
-        if left_id==painted_id:
-            alt_index = right_id
-
-        reward = 0.
-        if painted_id==winner_id:
-            reward = 1.
-
-        butler.algorithms.increment_many(key_value_dict={'Xsum_'+str(painted_id):reward,
-                                                         'T_'+str(painted_id):1.,
-                                                         'total_pulls':1})
-
-        return True
-
-    def getModel(self,butler):
-        keys = butler.algorithms.get(key='keys')
-        key_value_dict = butler.algorithms.get(key=keys)
-
-        n = butler.algorithms.get(key='n')
-
-        sumX = [key_value_dict['Xsum_'+str(i)] for i in range(n)]
-        T = [key_value_dict['T_'+str(i)] for i in range(n)]
-
-        mu = numpy.zeros(n)
-        for i in range(n):
-            if T[i]==0 or mu[i]==float('inf'):
-                mu[i] = -1
-            else:
-                mu[i] = sumX[i] / T[i]
-
-        prec = [numpy.sqrt(1.0/max(1,t)) for t in T]
-
-        return mu.tolist(), prec
-
-def computeUCB(muhat,threshold,accuracy=(10**(-6))):
-    lower=muhat
-    upper=1
-    UCB=(lower+upper)/2
-    while (upper-lower)>accuracy:
-        new=leftright(muhat,lower,upper,threshold)
-        lower=new[0]
-        upper=new[1]
-        UCB=new[2]
-    return UCB
+[... added lines lost in extraction: the class and initExp definitions ...]
+        butler.algorithms.set(key="n", value=n)
+        butler.algorithms.set(key="failure_probability", value=failure_probability)
+
+        arm_key_value_dict = {}
+        for i in range(n):
+            arm_key_value_dict["Xsum_" + str(i)] = 0.
+            arm_key_value_dict["T_" + str(i)] = 0.
+        arm_key_value_dict.update({"total_pulls": 0})
+
+        butler.algorithms.set(key="keys", value=list(arm_key_value_dict.keys()))
+        butler.algorithms.set_many(key_value_dict=arm_key_value_dict)
+
+        return True
+
+    def getQuery(self, butler, participant_uid):
+        beta = 0.0  # algorithm parameter
+
+        keys = butler.algorithms.get(key="keys")
+        key_value_dict = butler.algorithms.get(key=keys)
+        delta = butler.algorithms.get(key="failure_probability")
+        n = butler.algorithms.get(key="n")
+
+        sumX = [key_value_dict["Xsum_" + str(i)] for i in range(n)]
+        T = [key_value_dict["T_" + str(i)] for i in range(n)]
+
+        sigma_sq = 0.25
+
+        mu = numpy.zeros(n)
+        UCB = numpy.zeros(n)
+        A = []
+        for i in range(n):
+            if T[i] == 0:
+                mu[i] = float("inf")
+                UCB[i] = float("inf")
+                A.append(i)
+            else:
+                # T[i] is the number of times arm has been pulled
+                # X[i] is incremented by 0 or 1 depending on answer.
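+                # The KL-UCB index computed below is (approximately) the
+                # largest q in [mu[i], 1] with KL(mu[i], q) <= threshold,
+                # where KL is the Bernoulli relative entropy; computeUCB
+                # finds it by bisection via leftright.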
+                mu[i] = sumX[i] / T[i]
+                threshold = numpy.log(2 * T[i] ** 2 / delta) / T[i]
+                UCB[i] = computeUCB(muhat=mu[i], threshold=threshold)
+
+        if len(A) > 0:
+            index = numpy.random.choice(A)
+        else:
+            index = numpy.argmax(UCB)
+
+        alt_index = numpy.random.choice(n)
+        while alt_index == index:
+            alt_index = numpy.random.choice(n)
+
+        random_fork = numpy.random.choice(2)
+        if random_fork == 0:
+            return [index, alt_index, index]
+        else:
+            return [alt_index, index, index]
+
+    def processAnswer(self, butler, left_id=0, right_id=0, painted_id=0, winner_id=0):
+        alt_index = left_id
+        if left_id == painted_id:
+            alt_index = right_id
+
+        reward = 0.
+        if painted_id == winner_id:
+            reward = 1.
+
+        butler.algorithms.increment_many(
+            key_value_dict={
+                "Xsum_" + str(painted_id): reward,
+                "T_" + str(painted_id): 1.,
+                "total_pulls": 1,
+            }
+        )
+
+        return True
+
+    def getModel(self, butler):
+        keys = butler.algorithms.get(key="keys")
+        key_value_dict = butler.algorithms.get(key=keys)
+
+        n = butler.algorithms.get(key="n")
+
+        sumX = [key_value_dict["Xsum_" + str(i)] for i in range(n)]
+        T = [key_value_dict["T_" + str(i)] for i in range(n)]
+
+        mu = numpy.zeros(n)
+        for i in range(n):
+            if T[i] == 0 or mu[i] == float("inf"):
+                mu[i] = -1
+            else:
+                mu[i] = sumX[i] / T[i]
+
+        prec = [numpy.sqrt(1.0 / max(1, t)) for t in T]
+
+        return mu.tolist(), prec
+
+
+def computeUCB(muhat, threshold, accuracy=(10 ** (-6))):
+    lower = muhat
+    upper = 1
+    UCB = (lower + upper) / 2
+    while (upper - lower) > accuracy:
+        new = leftright(muhat, lower, upper, threshold)
+        lower = new[0]
+        upper = new[1]
+        UCB = new[2]
+    return UCB
+
 ### leftright is the core function, decides which way to proceed with the bisection
-def leftright(muhat,lower,upper,threshold):
-    if muhat*(1-muhat)!=0:
-        shit=(upper+lower)/2
-        KL=(muhat*numpy.log(muhat/shit))+((1-muhat)*numpy.log((1-muhat)/(1-shit)))
-        if KL>=threshold:
-            return [lower,shit,(shit+lower)/2]
-        if KL<threshold:
-            return [shit,upper,(shit+upper)/2]
-    if muhat==0:
-        shit=(upper+lower)/2
-        KL=(1-muhat)*numpy.log((1-muhat)/(1-shit))
-        if KL>=threshold:
-            return [lower,shit,(shit+lower)/2]
-        if KL<threshold:
-            return [shit,upper,(shit+upper)/2]
-    if muhat==1:
-        return [1,1,1]
+def leftright(muhat, lower, upper, threshold):
+    if muhat * (1 - muhat) != 0:
+        shit = (upper + lower) / 2
+        KL = (muhat * numpy.log(muhat / shit)) + (
+            (1 - muhat) * numpy.log((1 - muhat) / (1 - shit))
+        )
+        if KL >= threshold:
+            return [lower, shit, (shit + lower) / 2]
+        if KL < threshold:
+            return [shit, upper, (shit + upper) / 2]
+    if muhat == 0:
+        shit = (upper + lower) / 2
+        KL = (1 - muhat) * numpy.log((1 - muhat) / (1 - shit))
+        if KL >= threshold:
+            return [lower, shit, (shit + lower) / 2]
+        if KL < threshold:
+            return [shit, upper, (shit + upper) / 2]
+    if muhat == 1:
+        return [1, 1, 1]
diff --git a/apps/DuelingBanditsPureExploration/algs/BR_LilUCB.py b/apps/DuelingBanditsPureExploration/algs/BR_LilUCB.py
index 6fe91c24..bb2a1ee7 100644
--- a/apps/DuelingBanditsPureExploration/algs/BR_LilUCB.py
+++ b/apps/DuelingBanditsPureExploration/algs/BR_LilUCB.py
@@ -7,103 +7,110 @@
 Jamieson et al "Sparse Borda Bandits," AISTATS 2015.
 """
+from __future__ import print_function
 import numpy
 import numpy.random
 import next.utils as utils
+
 class MyAlg:
-    def initExp(self, butler, n, failure_probability, params=None):
-        """
+    def initExp(self, butler, n, failure_probability, params=None):
+        """
         This function is meant to set keys used later by the algorithm implemented
         in this file.
         """
-        butler.algorithms.set(key='n', value=n)
-        butler.algorithms.set(key='failure_probability', value=failure_probability)
-
-        arm_key_value_dict = {}
-        for i in range(n):
-            arm_key_value_dict['Xsum_'+str(i)] = 0.
-            arm_key_value_dict['T_'+str(i)] = 0.
-        arm_key_value_dict.update({'total_pulls':0})
-
-        butler.algorithms.set(key='keys', value=list(arm_key_value_dict.keys()))
-        butler.algorithms.set_many(key_value_dict=arm_key_value_dict)
-
-        return True
-
-    def getQuery(self, butler, participant_uid):
-        beta = 0.0 # algorithm parameter
-
-        keys = butler.algorithms.get(key='keys')
-        key_value_dict = butler.algorithms.get(key=keys)
-        delta = butler.algorithms.get(key='failure_probability')
-        n = butler.algorithms.get(key='n')
-
-        sumX = [key_value_dict['Xsum_'+str(i)] for i in range(n)]
-        T = [key_value_dict['T_'+str(i)] for i in range(n)]
-
-        sigma_sq = 0.25
-
-        mu = numpy.zeros(n)
-        UCB = numpy.zeros(n)
-        A = []
-        for i in range(n):
-            if T[i]==0:
-                mu[i] = float('inf')
-                UCB[i] = float('inf')
-                A.append(i)
-            else:
-                mu[i] = sumX[i] / T[i]
-                # UCB[i] = mu[i] + (1+beta)*numpy.sqrt( 2.0*sigma_sq*numpy.log( numpy.log(4*T[i])/delta ) / T[i] )
-                UCB[i] = mu[i] + (1+beta)*numpy.sqrt( 2.0*sigma_sq*numpy.log( 4*T[i]*T[i]/delta ) / T[i] )
-
-        if len(A)>0:
-            index = numpy.random.choice(A)
-        else:
-            index = numpy.argmax(UCB)
-
-        alt_index = numpy.random.choice(n)
-        while alt_index==index:
-            alt_index = numpy.random.choice(n)
-
-        random_fork = numpy.random.choice(2)
-        if random_fork==0:
-            return [index,alt_index,index]
-        else:
-            return [alt_index,index,index]
-
-
-    def processAnswer(self,butler, left_id=0, right_id=0, painted_id=0, winner_id=0):
-        alt_index = left_id
-        if left_id==painted_id:
-            alt_index = right_id
-
-        reward = 0.
-        if painted_id==winner_id:
-            reward = 1.
-
-        butler.algorithms.increment_many(key_value_dict={'Xsum_'+str(painted_id):reward,
-                                                         'T_'+str(painted_id):1.,
-                                                         'total_pulls':1})
-
-        return True
-
-    def getModel(self,butler):
-        keys = butler.algorithms.get(key='keys')
-        key_value_dict = butler.algorithms.get(key=keys)
-
-        n = butler.algorithms.get(key='n')
-
-        sumX = [key_value_dict['Xsum_'+str(i)] for i in range(n)]
-        T = [key_value_dict['T_'+str(i)] for i in range(n)]
-
-        mu = numpy.zeros(n)
-        for i in range(n):
-            if T[i]==0 or mu[i]==float('inf'):
-                mu[i] = -1
-            else:
-                mu[i] = sumX[i] / T[i]
-
-        prec = [numpy.sqrt(1.0/max(1,t)) for t in T]
-
-        return mu.tolist(), prec
+        butler.algorithms.set(key="n", value=n)
+        butler.algorithms.set(key="failure_probability", value=failure_probability)
+
+        arm_key_value_dict = {}
+        for i in range(n):
+            arm_key_value_dict["Xsum_" + str(i)] = 0.
+            arm_key_value_dict["T_" + str(i)] = 0.
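+        # Xsum_i is the running sum of 0/1 rewards for arm i and T_i is its
+        # pull count; keeping one key per arm lets processAnswer update both
+        # atomically through increment_many.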
+        arm_key_value_dict.update({"total_pulls": 0})
+
+        butler.algorithms.set(key="keys", value=list(arm_key_value_dict.keys()))
+        butler.algorithms.set_many(key_value_dict=arm_key_value_dict)
+
+        return True
+
+    def getQuery(self, butler, participant_uid):
+        beta = 0.0  # algorithm parameter
+
+        keys = butler.algorithms.get(key="keys")
+        key_value_dict = butler.algorithms.get(key=keys)
+        delta = butler.algorithms.get(key="failure_probability")
+        n = butler.algorithms.get(key="n")
+
+        sumX = [key_value_dict["Xsum_" + str(i)] for i in range(n)]
+        T = [key_value_dict["T_" + str(i)] for i in range(n)]
+
+        sigma_sq = 0.25
+
+        mu = numpy.zeros(n)
+        UCB = numpy.zeros(n)
+        A = []
+        for i in range(n):
+            if T[i] == 0:
+                mu[i] = float("inf")
+                UCB[i] = float("inf")
+                A.append(i)
+            else:
+                mu[i] = sumX[i] / T[i]
+                # UCB[i] = mu[i] + (1+beta)*numpy.sqrt( 2.0*sigma_sq*numpy.log( numpy.log(4*T[i])/delta ) / T[i] )
+                UCB[i] = mu[i] + (1 + beta) * numpy.sqrt(
+                    2.0 * sigma_sq * numpy.log(4 * T[i] * T[i] / delta) / T[i]
+                )
+
+        if len(A) > 0:
+            index = numpy.random.choice(A)
+        else:
+            index = numpy.argmax(UCB)
+
+        alt_index = numpy.random.choice(n)
+        while alt_index == index:
+            alt_index = numpy.random.choice(n)
+
+        random_fork = numpy.random.choice(2)
+        if random_fork == 0:
+            return [index, alt_index, index]
+        else:
+            return [alt_index, index, index]
+
+    def processAnswer(self, butler, left_id=0, right_id=0, painted_id=0, winner_id=0):
+        alt_index = left_id
+        if left_id == painted_id:
+            alt_index = right_id
+
+        reward = 0.
+        if painted_id == winner_id:
+            reward = 1.
+
+        butler.algorithms.increment_many(
+            key_value_dict={
+                "Xsum_" + str(painted_id): reward,
+                "T_" + str(painted_id): 1.,
+                "total_pulls": 1,
+            }
+        )
+
+        return True
+
+    def getModel(self, butler):
+        keys = butler.algorithms.get(key="keys")
+        key_value_dict = butler.algorithms.get(key=keys)
+
+        n = butler.algorithms.get(key="n")
+
+        sumX = [key_value_dict["Xsum_" + str(i)] for i in range(n)]
+        T = [key_value_dict["T_" + str(i)] for i in range(n)]
+
+        mu = numpy.zeros(n)
+        for i in range(n):
+            if T[i] == 0 or mu[i] == float("inf"):
+                mu[i] = -1
+            else:
+                mu[i] = sumX[i] / T[i]
+
+        prec = [numpy.sqrt(1.0 / max(1, t)) for t in T]
+
+        return mu.tolist(), prec
diff --git a/apps/DuelingBanditsPureExploration/algs/BR_Random.py b/apps/DuelingBanditsPureExploration/algs/BR_Random.py
index 51f8215d..464eed33 100644
--- a/apps/DuelingBanditsPureExploration/algs/BR_Random.py
+++ b/apps/DuelingBanditsPureExploration/algs/BR_Random.py
@@ -7,34 +7,37 @@
 Jamieson et al "Sparse Borda Bandits," AISTATS 2015.
 """
+from __future__ import print_function
 import numpy
 import numpy.random
 import next.utils as utils
+
 class MyAlg:
-    app_id = 'DuelingBanditsPureExploration'
+    app_id = "DuelingBanditsPureExploration"
+
     def initExp(self, butler, n=None, failure_probability=None, params=None):
        """
        This function is meant to set keys used later by the algorithm implemented
        in this file.
        """
-        butler.algorithms.set(key='n', value=n)
-        butler.algorithms.set(key='failure_probability', value=failure_probability)
+        butler.algorithms.set(key="n", value=n)
+        butler.algorithms.set(key="failure_probability", value=failure_probability)
 
         arm_key_value_dict = {}
         for i in range(n):
-            arm_key_value_dict['Xsum_'+str(i)] = 0.
-            arm_key_value_dict['T_'+str(i)] = 0.
+            arm_key_value_dict["Xsum_" + str(i)] = 0.
+            arm_key_value_dict["T_" + str(i)] = 0.
-        arm_key_value_dict.update({'total_pulls':0})
+        arm_key_value_dict.update({"total_pulls": 0})
 
-        butler.algorithms.set(key='keys', value=list(arm_key_value_dict.keys()))
+        butler.algorithms.set(key="keys", value=list(arm_key_value_dict.keys()))
         butler.algorithms.set_many(key_value_dict=arm_key_value_dict)
 
         return True
 
     def getQuery(self, butler, participant_uid):
-        n = butler.algorithms.get(key='n')
+        n = butler.algorithms.get(key="n")
 
         index = numpy.random.choice(n)
         alt_index = numpy.random.choice(n)
@@ -42,43 +45,44 @@ def getQuery(self, butler, participant_uid):
             alt_index = numpy.random.choice(n)
 
         random_fork = numpy.random.choice(2)
-        if random_fork==0:
-            return [index,alt_index,index]
+        if random_fork == 0:
+            return [index, alt_index, index]
         else:
-            return [alt_index,index,index]
+            return [alt_index, index, index]
 
-    def processAnswer(self,butler, left_id=0, right_id=0, painted_id=0, winner_id=0):
+    def processAnswer(self, butler, left_id=0, right_id=0, painted_id=0, winner_id=0):
         alt_index = left_id
-        if left_id==painted_id:
+        if left_id == painted_id:
             alt_index = right_id
 
         reward = 0.0
-        if painted_id==winner_id:
+        if painted_id == winner_id:
             reward = 1.
 
-        butler.algorithms.increment_many(key_value_dict=
-                                         {'Xsum_'+str(painted_id):reward,
-                                          'T_'+str(painted_id):1.0,
-                                          'total_pulls':1})
+        butler.algorithms.increment_many(
+            key_value_dict={
+                "Xsum_" + str(painted_id): reward,
+                "T_" + str(painted_id): 1.0,
+                "total_pulls": 1,
+            }
+        )
         return True
 
-    def getModel(self,butler):
-        keys = butler.algorithms.get(key='keys')
+    def getModel(self, butler):
+        keys = butler.algorithms.get(key="keys")
         key_value_dict = butler.algorithms.get(key=keys)
 
-        n = butler.algorithms.get(key='n')
+        n = butler.algorithms.get(key="n")
 
-        sumX = [key_value_dict['Xsum_'+str(i)] for i in range(n)]
-        T = [key_value_dict['T_'+str(i)] for i in range(n)]
+        sumX = [key_value_dict["Xsum_" + str(i)] for i in range(n)]
+        T = [key_value_dict["T_" + str(i)] for i in range(n)]
 
-        mu = numpy.zeros(n, dtype='float')
+        mu = numpy.zeros(n, dtype="float")
         for i in range(n):
-            if T[i]==0 or mu[i]==float('inf'):
+            if T[i] == 0 or mu[i] == float("inf"):
                 mu[i] = -1
             else:
                 mu[i] = sumX[i] * 1.0 / T[i]
 
-        prec = [numpy.sqrt(1.0/max(1,t)) for t in T]
+        prec = [numpy.sqrt(1.0 / max(1, t)) for t in T]
         return mu.tolist(), prec
-
-
diff --git a/apps/DuelingBanditsPureExploration/algs/ValidationSampling.py b/apps/DuelingBanditsPureExploration/algs/ValidationSampling.py
index 5ad53ac6..d0c41848 100644
--- a/apps/DuelingBanditsPureExploration/algs/ValidationSampling.py
+++ b/apps/DuelingBanditsPureExploration/algs/ValidationSampling.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import numpy as np
 import time
 import next.utils as utils
@@ -14,55 +15,60 @@ def getRandomQuery(n):
 class MyAlg:
     def initExp(self, butler, n=None, failure_probability=None):
-        butler.algorithms.set(key='n', value=n)
-        butler.algorithms.set(key='failure_probability',
-                              value=failure_probability)
+        butler.algorithms.set(key="n", value=n)
+        butler.algorithms.set(key="failure_probability", value=failure_probability)
 
         alg = butler.algorithms.get()
-        params = alg.get(u'params', None)
-        butler.algorithms.set(key='params', value=params)
+        params = alg.get(u"params", None)
+        butler.algorithms.set(key="params", value=params)
 
         if params:
-            if 'query_list' in params:
-                query_list = params['query_list']
+            if "query_list" in params:
+                query_list = params["query_list"]
 
-            elif 'num_tries' in params:
-                num_tries = params['num_tries']
+            elif "num_tries" in params:
+                num_tries = params["num_tries"]
                 query_list = [getRandomQuery(n) for _ in range(num_tries)]
             else:
-                raise ValueError('Either specify "query_list" or "num_tries" '
-                                 'in params')
+                raise ValueError(
+                    'Either specify "query_list" or "num_tries" ' "in params"
+                )
         else:
-            raise Exception("For ValidationSampling you must specify "
-                            "'query_list' or 'num_tries' in params")
+            raise Exception(
+                "For ValidationSampling you must specify "
+                "'query_list' or 'num_tries' in params"
+            )
 
         arm_key_value_dict = {}
         for i in range(n):
-            arm_key_value_dict['Xsum_'+str(i)] = 0.
-            arm_key_value_dict['T_'+str(i)] = 0.
+            arm_key_value_dict["Xsum_" + str(i)] = 0.
+            arm_key_value_dict["T_" + str(i)] = 0.
 
-        arm_key_value_dict.update({'total_pulls': 0})
+        arm_key_value_dict.update({"total_pulls": 0})
 
-        butler.algorithms.set(key='query_list', value=query_list)
+        butler.algorithms.set(key="query_list", value=query_list)
 
-        butler.algorithms.set(key='keys', value=list(arm_key_value_dict.keys()))
+        butler.algorithms.set(key="keys", value=list(arm_key_value_dict.keys()))
         butler.algorithms.set_many(key_value_dict=arm_key_value_dict)
 
         return True
 
     def getQuery(self, butler, participant_uid):
-        num_ans = butler.algorithms.get(key='total_pulls')
-        query_list = butler.algorithms.get(key='query_list')
+        num_ans = butler.algorithms.get(key="total_pulls")
+        query_list = butler.algorithms.get(key="query_list")
         i = num_ans % len(query_list)
         query = query_list[i]
         return query + [query[0]]
 
     def processAnswer(self, butler, left_id, right_id, painted_id, winner_id):
-        butler.algorithms.increment_many(key_value_dict=
-                                         {'Xsum_'+str(painted_id): 1.0,
-                                          'T_'+str(painted_id): 1.0,
-                                          'total_pulls': 1})
+        butler.algorithms.increment_many(
+            key_value_dict={
+                "Xsum_" + str(painted_id): 1.0,
+                "T_" + str(painted_id): 1.0,
+                "total_pulls": 1,
+            }
+        )
 
         # The following lines enforce "do not ask". The query list gets shorter
         # each time this function is called (and a question is answered).
@@ -74,19 +80,19 @@ def processAnswer(self, butler, left_id, right_id, painted_id, winner_id):
         return True
 
     def getModel(self, butler):
-        keys = butler.algorithms.get(key='keys')
+        keys = butler.algorithms.get(key="keys")
         key_value_dict = butler.algorithms.get(key=keys)
 
-        n = butler.algorithms.get(key='n')
+        n = butler.algorithms.get(key="n")
 
-        sumX = [key_value_dict['Xsum_'+str(i)] for i in range(n)]
-        T = [key_value_dict['T_'+str(i)] for i in range(n)]
+        sumX = [key_value_dict["Xsum_" + str(i)] for i in range(n)]
+        T = [key_value_dict["T_" + str(i)] for i in range(n)]
 
-        mu = np.zeros(n, dtype='float')
+        mu = np.zeros(n, dtype="float")
         for i in range(n):
-            if T[i] == 0 or mu[i] == float('inf'):
+            if T[i] == 0 or mu[i] == float("inf"):
                 mu[i] = -1
             else:
                 mu[i] = sumX[i] * 1.0 / T[i]
 
-    prec = [np.sqrt(1.0/max(1, t)) for t in T]
+    prec = [np.sqrt(1.0 / max(1, t)) for t in T]
     return mu.tolist(), prec
diff --git a/apps/DuelingBanditsPureExploration/dashboard/Dashboard.py b/apps/DuelingBanditsPureExploration/dashboard/Dashboard.py
index 5fb78ad7..e717007a 100644
--- a/apps/DuelingBanditsPureExploration/dashboard/Dashboard.py
+++ b/apps/DuelingBanditsPureExploration/dashboard/Dashboard.py
@@ -1,12 +1,14 @@
+from __future__ import print_function
 import json
 import next.utils as utils
 from next.apps.AppDashboard import AppDashboard
 
+
 class MyAppDashboard(AppDashboard):
-    def __init__(self,db,ell):
-        AppDashboard.__init__(self,db,ell)
+    def __init__(self, db, ell):
+        AppDashboard.__init__(self, db, ell)
 
-    def most_current_ranking(self,app, butler, alg_label):
+    def most_current_ranking(self, app, butler, alg_label):
         """
         Description: Returns a ranking of arms in the form of a list of dictionaries,
         which is convenient for downstream applications
@@ -23,17 +25,19 @@ def most_current_ranking(self,app, butler, alg_label):
             (int) index : index of target
             (int) ranking : rank (0 to number of targets - 1) representing belief of being best arm
         """
-        item = app.getModel(json.dumps({'exp_uid':app.exp_uid, 'args': {'alg_label':alg_label}}))
+        item = app.getModel(
+            json.dumps({"exp_uid": app.exp_uid, "args": {"alg_label": alg_label}})
+        )
         return_dict = {}
-        return_dict['headers'] = [{'label':'Rank','field':'rank'},
-                                  {'label':'Target','field':'index'},
-                                  {'label':'Score','field':'score'},
-                                  {'label':'Precision','field':'precision'}]
-        for target in item['targets']:
-            for key in ['score', 'precision']:
-                target[key] = '{:0.5f}'.format(target[key])
-        return_dict['data'] = item['targets']
-        return_dict['plot_type'] = 'columnar_table'
+        return_dict["headers"] = [
+            {"label": "Rank", "field": "rank"},
+            {"label": "Target", "field": "index"},
+            {"label": "Score", "field": "score"},
+            {"label": "Precision", "field": "precision"},
+        ]
+        for target in item["targets"]:
+            for key in ["score", "precision"]:
+                target[key] = "{:0.5f}".format(target[key])
+        return_dict["data"] = item["targets"]
+        return_dict["plot_type"] = "columnar_table"
         return return_dict
-
-
diff --git a/apps/DuelingBanditsPureExploration/myApp.py b/apps/DuelingBanditsPureExploration/myApp.py
index beb5bda8..bd511d39 100644
--- a/apps/DuelingBanditsPureExploration/myApp.py
+++ b/apps/DuelingBanditsPureExploration/myApp.py
@@ -2,6 +2,7 @@
 # x change the algorithm definitions. Done for LilUCB only
 # o explore the dashboard, see what you need to change
 # ? modify the widgets?
+from __future__ import print_function
 import json
 import numpy
 import next.apps.SimpleTargetManager
@@ -9,8 +10,8 @@
 class MyApp:
-    def __init__(self,db):
-        self.app_id = 'DuelingBanditsPureExploration'
+    def __init__(self, db):
+        self.app_id = "DuelingBanditsPureExploration"
         self.TargetManager = next.apps.SimpleTargetManager.SimpleTargetManager(db)
 
     def initExp(self, butler, init_algs, args):
@@ -38,16 +39,18 @@ def initExp(self, butler, init_algs, args):
             args: The experiment data, potentially modified.
         """
         # TODO: change this in every app type coded thus far!
-        if 'targetset' in args['targets'].keys():
-            n = len(args['targets']['targetset'])
-            self.TargetManager.set_targetset(butler.exp_uid, args['targets']['targetset'])
+        if "targetset" in args["targets"].keys():
+            n = len(args["targets"]["targetset"])
+            self.TargetManager.set_targetset(
+                butler.exp_uid, args["targets"]["targetset"]
+            )
         else:
-            n = args['targets']['n']
-        args['n'] = n
-        del args['targets']
+            n = args["targets"]["n"]
+        args["n"] = n
+        del args["targets"]
 
         alg_data = {}
-        algorithm_keys = ['n', 'failure_probability']
+        algorithm_keys = ["n", "failure_probability"]
         for key in algorithm_keys:
             alg_data[key] = args[key]
 
@@ -55,54 +58,70 @@ def initExp(self, butler, init_algs, args):
         return args
 
     def getQuery(self, butler, alg, args):
-        alg_response = alg({'participant_uid':args['participant_uid']})
-        targets = [self.TargetManager.get_target_item(butler.exp_uid, alg_response[i])
-                   for i in [0, 1, 2]]
-
-        targets_list = [{'target':targets[0],'label':'left'},
-                        {'target':targets[1],'label':'right'}]
-
-
-        if targets[0]['target_id'] == targets[-1]['target_id']:
-            targets_list[0]['flag'] = 1
-            targets_list[1]['flag'] = 0
+        alg_response = alg({"participant_uid": args["participant_uid"]})
+        targets = [
+            self.TargetManager.get_target_item(butler.exp_uid, alg_response[i])
+            for i in [0, 1, 2]
+        ]
+
+        targets_list = [
+            {"target": targets[0], "label": "left"},
+            {"target": targets[1], "label": "right"},
+        ]
+
+        if targets[0]["target_id"] == targets[-1]["target_id"]:
+            targets_list[0]["flag"] = 1
+            targets_list[1]["flag"] = 0
         else:
-            targets_list[0]['flag'] = 0
-            targets_list[1]['flag'] = 1
+            targets_list[0]["flag"] = 0
+            targets_list[1]["flag"] = 1
 
-        return_dict = {'target_indices':targets_list}
+        return_dict = {"target_indices": targets_list}
 
         experiment_dict = butler.experiment.get()
-
-        #if 'labels' in experiment_dict['args']['rating_scale']:
-            #labels = experiment_dict['args']['rating_scale']['labels']
-            #return_dict.update({'labels':labels})
-        if 'context' in experiment_dict['args'] and 'context_type' in experiment_dict['args']:
-            return_dict.update({'context':experiment_dict['args']['context'],'context_type':experiment_dict['args']['context_type']})
+        # if 'labels' in experiment_dict['args']['rating_scale']:
+        #     labels = experiment_dict['args']['rating_scale']['labels']
+        #     return_dict.update({'labels':labels})
+
+        if (
+            "context" in experiment_dict["args"]
+            and "context_type" in experiment_dict["args"]
+        ):
+            return_dict.update(
+                {
+                    "context": experiment_dict["args"]["context"],
+                    "context_type": experiment_dict["args"]["context_type"],
+                }
+            )
 
         return return_dict
 
     def processAnswer(self, butler, alg, args):
-        query = butler.queries.get(uid=args['query_uid'])
-        targets = query['target_indices']
+        query = butler.queries.get(uid=args["query_uid"])
+        targets = query["target_indices"]
         for target in targets:
-            if target['label'] == 'left':
-                left_id = target['target']['target_id']
-            if target['label'] == 'right':
-                right_id = target['target']['target_id']
-            if target['flag'] == 1:
-                painted_id = target['target']['target_id']
-
-        winner_id = args['target_winner']
-        butler.experiment.increment(key='num_reported_answers_for_' + query['alg_label'])
-
-        alg({'left_id':left_id,
-             'right_id':right_id,
-             'winner_id':winner_id,
-             'painted_id':painted_id})
-        return {'winner_id':winner_id}
-
+            if target["label"] == "left":
+                left_id = target["target"]["target_id"]
+            if target["label"] == "right":
+                right_id = target["target"]["target_id"]
+            if target["flag"] == 1:
+                painted_id = target["target"]["target_id"]
+
+        winner_id = args["target_winner"]
+        butler.experiment.increment(
+            key="num_reported_answers_for_" + query["alg_label"]
+        )
+
+        alg(
+            {
+                "left_id": left_id,
+                "right_id": right_id,
+                "winner_id": winner_id,
+                "painted_id": painted_id,
+            }
+        )
+        return {"winner_id": winner_id}
 
     def getModel(self, butler, alg, args):
         scores, precisions = alg()
@@ -115,31 +134,46 @@ def getModel(self, butler, alg, args):
 
         targets = []
         for index in range(n):
-            targets.append( {'index':indexes[index],
-                             'target':self.TargetManager.get_target_item(butler.exp_uid, indexes[index]),
-                             'rank':ranks[index],
-                             'score':scores[index],
-                             'precision':precisions[index]} )
-        num_reported_answers = butler.experiment.get('num_reported_answers')
-        return {'targets': targets, 'num_reported_answers':num_reported_answers}
-
+            targets.append(
+                {
+                    "index": indexes[index],
+                    "target": self.TargetManager.get_target_item(
+                        butler.exp_uid, indexes[index]
+                    ),
+                    "rank": ranks[index],
+                    "score": scores[index],
+                    "precision": precisions[index],
+                }
+            )
+        num_reported_answers = butler.experiment.get("num_reported_answers")
+        return {"targets": targets, "num_reported_answers": num_reported_answers}
 
     def format_responses(self, responses):
         formatted = []
         for response in responses:
-            targets = {'target_' + target['label']: target['target']['primary_description']
-                       for target in response['target_indices']}
-            ids = {target['label'] + '_id': target['target']['target_id']
-                   for target in response['target_indices']}
-            if 'winner_id' not in response:
+            targets = {
+                "target_" + target["label"]: target["target"]["primary_description"]
+                for target in response["target_indices"]
+            }
+            ids = {
+                target["label"] + "_id": target["target"]["target_id"]
+                for target in response["target_indices"]
+            }
+            if "winner_id" not in response:
                 continue
-            won = {t['target']['target_id'] == response['winner_id']: t
-                   for t in response['target_indices']}
+            won = {
+                t["target"]["target_id"] == response["winner_id"]: t
+                for t in response["target_indices"]
+            }
             winner = won[True]
-            response.update({'target_winner': winner['target']['primary_description'],
-                             'winner_id': winner['target']['target_id']})
-
-            for key in ['_id', 'target_indices']:
+            response.update(
+                {
+                    "target_winner": winner["target"]["primary_description"],
+                    "winner_id": winner["target"]["target_id"],
+                }
+            )
+
+            for key in ["_id", "target_indices"]:
                 if key in response:
                     del response[key]
             response.update(targets)
diff --git a/apps/DuelingBanditsPureExploration/tests/test_api.py b/apps/DuelingBanditsPureExploration/tests/test_api.py
index ed82cce8..b44dc76b 100644
--- a/apps/DuelingBanditsPureExploration/tests/test_api.py
+++ b/apps/DuelingBanditsPureExploration/tests/test_api.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import numpy
 import numpy as np
 import numpy.random
@@ -11,66 +12,76 @@
 from multiprocessing import Pool
 import os
 import sys
+
 try:
     import next.apps.test_utils as test_utils
 except:
-    file_dir = '/'.join(__file__.split('/')[:-1])
-    sys.path.append('{}/../../../next/apps'.format(file_dir))
+    file_dir = "/".join(__file__.split("/")[:-1])
+    sys.path.append("{}/../../../next/apps".format(file_dir))
     import test_utils
 
 
 def test_validation_params():
-    params = [{'num_tries': 5},
-              {'query_list': [[0, 1], [1, 2], [3, 4]]}]
+    params = [{"num_tries": 5}, {"query_list": [[0, 1], [1, 2], [3, 4]]}]
    for param in params:
        print(param)
        test_api(params=param)
 
 
-def test_api(assert_200=True, num_arms=5, num_clients=8, delta=0.05,
-             total_pulls_per_client=5, num_experiments=1,
-             params={'num_tries': 5}):
+def test_api(
+    assert_200=True,
+    num_arms=5,
+    num_clients=8,
+    delta=0.05,
+    total_pulls_per_client=5,
+    num_experiments=1,
+    params={"num_tries": 5},
+):
 
-    app_id = 'DuelingBanditsPureExploration'
-    true_means = numpy.array(range(num_arms)[::-1])/float(num_arms)
+    app_id = "DuelingBanditsPureExploration"
+    true_means = numpy.array(range(num_arms)[::-1]) / float(num_arms)
     pool = Pool(processes=num_clients)
-    supported_alg_ids = ['BR_LilUCB', 'BR_Random', 'ValidationSampling', 'BR_KLUCB']
+    supported_alg_ids = ["BR_LilUCB", "BR_Random", "ValidationSampling", "BR_KLUCB"]
 
     alg_list = []
     for i, alg_id in enumerate(supported_alg_ids):
         alg_item = {}
-        if alg_id == 'ValidationSampling':
-            alg_item['params'] = params
-        alg_item['alg_id'] = alg_id
-        alg_item['alg_label'] = alg_id+'_'+str(i)
+        if alg_id == "ValidationSampling":
+            alg_item["params"] = params
+        alg_item["alg_id"] = alg_id
+        alg_item["alg_label"] = alg_id + "_" + str(i)
         alg_list.append(alg_item)
 
     params = []
     for algorithm in alg_list:
-        params.append({'alg_label': algorithm['alg_label'], 'proportion':1./len(alg_list)})
+        params.append(
+            {"alg_label": algorithm["alg_label"], "proportion": 1. / len(alg_list)}
+        )
     algorithm_management_settings = {}
-    algorithm_management_settings['mode'] = 'fixed_proportions'
-    algorithm_management_settings['params'] = params
+    algorithm_management_settings["mode"] = "fixed_proportions"
+    algorithm_management_settings["params"] = params
 
-    print algorithm_management_settings
+    print(algorithm_management_settings)
 
     #################################################
     # Test POST Experiment
     #################################################
     initExp_args_dict = {}
-    initExp_args_dict['args'] = {'alg_list': alg_list,
-                                 'algorithm_management_settings': algorithm_management_settings,
-                                 'context': 'Context for Dueling Bandits',
-                                 'context_type': 'text',
-                                 'debrief': 'Test debrief.',
-                                 'failure_probability': 0.05,
-                                 'instructions': 'Test instructions.',
-                                 'participant_to_algorithm_management': 'one_to_many',
-                                 'targets': {'n': num_arms}}
-
-    initExp_args_dict['app_id'] = app_id
-    initExp_args_dict['site_id'] = 'replace this with working site id'
-    initExp_args_dict['site_key'] = 'replace this with working site key'
+    initExp_args_dict["args"] = {
+        "alg_list": alg_list,
+        "algorithm_management_settings": algorithm_management_settings,
+        "context": "Context for Dueling Bandits",
+        "context_type": "text",
+        "debrief": "Test debrief.",
+        "failure_probability": 0.05,
+        "instructions": "Test instructions.",
+        "participant_to_algorithm_management": "one_to_many",
+        "targets": {"n": num_arms},
+    }
+
+    initExp_args_dict["app_id"] = app_id
+    initExp_args_dict["site_id"] = "replace this with working site id"
+    initExp_args_dict["site_key"] = "replace this with working site key"
 
     exp_info = []
     for ell in range(num_experiments):
@@ -80,13 +91,14 @@ def test_api(assert_200=True, num_arms=5, num_clients=8, delta=0.05,
 
     participants = []
     pool_args = []
     for i in range(num_clients):
-        participant_uid = '%030x' % random.randrange(16**30)
+        participant_uid = "%030x" % random.randrange(16 ** 30)
         participants.append(participant_uid)
         experiment = numpy.random.choice(exp_info)
-        exp_uid = experiment['exp_uid']
-        pool_args.append((exp_uid, participant_uid, total_pulls_per_client,
-                          true_means,assert_200))
+        exp_uid = experiment["exp_uid"]
+        pool_args.append(
+            (exp_uid, participant_uid, total_pulls_per_client, true_means, assert_200)
+        )
 
     results = pool.map(simulate_one_client, pool_args)
 
@@ -95,37 +107,39 @@
     test_utils.getModel(exp_uid, app_id, supported_alg_ids, alg_list)
 
+
 def simulate_one_client(input_args):
-    exp_uid,participant_uid,total_pulls,true_means,assert_200 = input_args
+    exp_uid, participant_uid, total_pulls, true_means, assert_200 = input_args
     getQuery_times = []
     processAnswer_times = []
     for t in range(total_pulls):
-        print "    Participant {} had {} total pulls: ".format(participant_uid, t)
+        print("    Participant {} had {} total pulls: ".format(participant_uid, t))
 
         # test POST getQuery #
         # return a widget 1/5 of the time (normally, use HTML)
-        widget = random.choice([True] + 4*[False])
-        getQuery_args_dict = {'args': {'participant_uid': participant_uid,
-                                       'widget': widget},
-                              'exp_uid': exp_uid}
+        widget = random.choice([True] + 4 * [False])
+        getQuery_args_dict = {
+            "args": {"participant_uid": participant_uid, "widget": widget},
+            "exp_uid": exp_uid,
+        }
         query_dict, dt = test_utils.getQuery(getQuery_args_dict)
         getQuery_times.append(dt)
 
         if widget:
-            query_dict = query_dict['args']
-        query_uid = query_dict['query_uid']
-        targets = query_dict['target_indices']
+            query_dict = query_dict["args"]
+        query_uid = query_dict["query_uid"]
+        targets = query_dict["target_indices"]
 
-        left = targets[0]['target']
-        right = targets[1]['target']
+        left = targets[0]["target"]
+        right = targets[1]["target"]
 
         # sleep for a bit to simulate response time
         ts = test_utils.response_delay()
 
         # print left
-        reward_left = true_means[left['target_id']] + numpy.random.randn()*0.5
-        reward_right = true_means[right['target_id']] + numpy.random.randn()*0.5
+        reward_left = true_means[left["target_id"]] + numpy.random.randn() * 0.5
+        reward_right = true_means[right["target_id"]] + numpy.random.randn() * 0.5
         if reward_left > reward_right:
             target_winner = left
         else:
@@ -133,20 +147,27 @@ def simulate_one_client(input_args):
 
         response_time = time.time() - ts
 
-        # test POST processAnswer
-        processAnswer_args_dict = {'args': {'query_uid': query_uid,
-                                            'response_time': response_time,
-                                            'target_winner': target_winner["target_id"]},
-                                   'exp_uid': exp_uid}
-        processAnswer_json_response, dt = test_utils.processAnswer(processAnswer_args_dict)
+        # test POST processAnswer
+        processAnswer_args_dict = {
+            "args": {
+                "query_uid": query_uid,
+                "response_time": response_time,
+                "target_winner": target_winner["target_id"],
+            },
+            "exp_uid": exp_uid,
+        }
+        processAnswer_json_response, dt = test_utils.processAnswer(
+            processAnswer_args_dict
+        )
         processAnswer_times += [dt]
 
-    r = test_utils.format_times(getQuery_times, processAnswer_times, total_pulls,
-                                participant_uid)
+    r = test_utils.format_times(
+        getQuery_times, processAnswer_times, total_pulls, participant_uid
+    )
     return r
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_api()
     # test_api(assert_200=True, num_arms=5, num_clients=10, delta=0.05,
-    #          total_pulls_per_client=10, num_experiments=1)
+    #          total_pulls_per_client=10, num_experiments=1)
diff --git a/apps/PoolBasedBinaryClassification/algs/RandomSamplingLinearLeastSquares.py b/apps/PoolBasedBinaryClassification/algs/RandomSamplingLinearLeastSquares.py
index 27527bd6..e6fc61e0 100644
--- a/apps/PoolBasedBinaryClassification/algs/RandomSamplingLinearLeastSquares.py
+++ b/apps/PoolBasedBinaryClassification/algs/RandomSamplingLinearLeastSquares.py
@@ -1,51 +1,53 @@
+from __future__ import print_function
 import time
 import numpy.random
 import next.utils as utils
 
+
 class MyAlg:
     def initExp(self, butler, n, d, failure_probability):
         # Save the number of targets, dimension, and failure_probability to algorithm storage
-        butler.algorithms.set(key='n',value= n)
-        butler.algorithms.set(key='delta',value= failure_probability)
-        butler.algorithms.set(key='d',value= d)
-
+        butler.algorithms.set(key="n", value=n)
+        butler.algorithms.set(key="delta", value=failure_probability)
+        butler.algorithms.set(key="d", value=d)
+
         # Initialize the weight to an empty list of 0's
-        butler.algorithms.set(key='weights',value=[0]*(d+1))
-        butler.algorithms.set(key='num_reported_answers', value=0)
+        butler.algorithms.set(key="weights", value=[0] * (d + 1))
+        butler.algorithms.set(key="num_reported_answers", value=0)
         return True
 
     def getQuery(self, butler, participant_uid):
         # Retrieve the number of targets and return the index of one at random
-        n = butler.algorithms.get(key='n')
+        n = butler.algorithms.get(key="n")
         idx = numpy.random.choice(n)
         return idx
 
     def processAnswer(self, butler, target_index, target_label):
         # S maintains a list of labelled items. Appending to S will create it.
-        butler.algorithms.append(key='S',value=(target_index,target_label))
+        butler.algorithms.append(key="S", value=(target_index, target_label))
         # Increment the number of reported answers by one.
-        num_reported_answers = butler.algorithms.increment(key='num_reported_answers')
+        num_reported_answers = butler.algorithms.increment(key="num_reported_answers")
 
         # Run a model update job after every d answers
-        d = butler.algorithms.get(key='d')
+        d = butler.algorithms.get(key="d")
         if num_reported_answers % int(d) == 0:
-            butler.job('full_embedding_update', {}, time_limit=30)
+            butler.job("full_embedding_update", {}, time_limit=30)
         return True
 
-
     def getModel(self, butler):
         # The model is simply the vector of weights and a record of the number of reported answers.
-        utils.debug_print(butler.algorithms.get(key=['weights','num_reported_answers']))
-        return butler.algorithms.get(key=['weights','num_reported_answers'])
-
+        utils.debug_print(
+            butler.algorithms.get(key=["weights", "num_reported_answers"])
+        )
+        return butler.algorithms.get(key=["weights", "num_reported_answers"])
 
     def full_embedding_update(self, butler, args):
         # Main function to update the model.
-        labelled_items = butler.algorithms.get(key='S')
+        labelled_items = butler.algorithms.get(key="S")
         # Get the list of targets.
         targets = butler.targets.get_targetset(butler.exp_uid)
         # Extract the features from each target and then append a bias feature.
-        target_features = [targets[i]['meta']['features'] for i in range(len(targets))]
+        target_features = [targets[i]["meta"]["features"] for i in range(len(targets))]
         for feature_vector in target_features:
             feature_vector.append(1.)
         # Build a list of feature vectors and associated labels.
@@ -57,6 +59,6 @@ def full_embedding_update(self, butler, args):
         # Convert to numpy arrays and use least squares to find the weights.
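         # numpy.linalg.lstsq solves the least-squares problem min_w ||Xw - y||_2
         # and returns (solution, residuals, rank, singular_values); the [0]
         # below keeps only the weight vector.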
         X = numpy.array(X)
         y = numpy.array(y)
-        weights = numpy.linalg.lstsq(X,y)[0]
+        weights = numpy.linalg.lstsq(X, y)[0]
         # Save the weights under the key weights.
-        butler.algorithms.set(key='weights',value=weights.tolist())
+        butler.algorithms.set(key="weights", value=weights.tolist())
diff --git a/apps/PoolBasedBinaryClassification/algs/RoundRobin.py b/apps/PoolBasedBinaryClassification/algs/RoundRobin.py
index 88798a5c..7905115b 100644
--- a/apps/PoolBasedBinaryClassification/algs/RoundRobin.py
+++ b/apps/PoolBasedBinaryClassification/algs/RoundRobin.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import numpy as np
 import next.utils as utils
 
@@ -5,48 +6,50 @@
 class MyAlg:
     def initExp(self, butler, n, d, failure_probability):
         # Save the number of targets, dimension, and initialize how many times each target has been labeled and failure_probability to algorithm storage
-        butler.algorithms.set(key='n', value=n)
-        butler.algorithms.set(key='delta', value=failure_probability)
-        butler.algorithms.set(key='d', value=d)
-        butler.algorithms.set(key='target_index', value=0)
+        butler.algorithms.set(key="n", value=n)
+        butler.algorithms.set(key="delta", value=failure_probability)
+        butler.algorithms.set(key="d", value=d)
+        butler.algorithms.set(key="target_index", value=0)
 
         # Initialize the weight to an empty list of 0's
-        butler.algorithms.set(key='weights', value=[0] * (d + 1))
-        butler.algorithms.set(key='num_reported_answers', value=0)
+        butler.algorithms.set(key="weights", value=[0] * (d + 1))
+        butler.algorithms.set(key="num_reported_answers", value=0)
         return True
 
     def getQuery(self, butler, participant_uid):
         # Return the next target index in round-robin order and advance the counter
-        idx = butler.algorithms.get(key='target_index')
-        n = butler.algorithms.get(key='n')
-        butler.algorithms.set(key='target_index', value=(idx+1) % n)
+        idx = butler.algorithms.get(key="target_index")
+        n = butler.algorithms.get(key="n")
+        butler.algorithms.set(key="target_index", value=(idx + 1) % n)
         return idx
 
     def processAnswer(self, butler, target_index, target_label):
         # S maintains a list of labelled items. Appending to S will create it.
-        butler.algorithms.append(key='S', value=(target_index, target_label))
+        butler.algorithms.append(key="S", value=(target_index, target_label))
 
         # Increment the number of reported answers by one.
-        num_reported_answers = butler.algorithms.increment(key='num_reported_answers')
+        num_reported_answers = butler.algorithms.increment(key="num_reported_answers")
 
         # Run a model update job after every d answers
-        d = butler.algorithms.get(key='d')
+        d = butler.algorithms.get(key="d")
         if num_reported_answers % int(d) == 0:
-            butler.job('full_embedding_update', {}, time_limit=30)
+            butler.job("full_embedding_update", {}, time_limit=30)
         return True
 
     def getModel(self, butler):
         # The model is simply the vector of weights and a record of the number of reported answers.
-        utils.debug_print(butler.algorithms.get(key=['weights', 'num_reported_answers']))
-        return butler.algorithms.get(key=['weights', 'num_reported_answers'])
+        utils.debug_print(
+            butler.algorithms.get(key=["weights", "num_reported_answers"])
+        )
+        return butler.algorithms.get(key=["weights", "num_reported_answers"])
 
     def full_embedding_update(self, butler, args):
         # Main function to update the model.
-        labelled_items = butler.algorithms.get(key='S')
+        labelled_items = butler.algorithms.get(key="S")
         # Get the list of targets.
         targets = butler.targets.get_targetset(butler.exp_uid)
         # Extract the features from each target and then append a bias feature.
-        target_features = [targets[i]['meta']['features'] for i in range(len(targets))]
+        target_features = [targets[i]["meta"]["features"] for i in range(len(targets))]
         for feature_vector in target_features:
             feature_vector.append(1.)
         # Build a list of feature vectors and associated labels.
@@ -60,4 +63,4 @@ def full_embedding_update(self, butler, args):
         y = np.array(y)
         weights = np.linalg.lstsq(X, y)[0]
         # Save the weights under the key weights.
-        butler.algorithms.set(key='weights', value=weights.tolist())
+        butler.algorithms.set(key="weights", value=weights.tolist())
diff --git a/apps/PoolBasedBinaryClassification/dashboard/Dashboard.py b/apps/PoolBasedBinaryClassification/dashboard/Dashboard.py
index da174c5b..b9c550d3 100644
--- a/apps/PoolBasedBinaryClassification/dashboard/Dashboard.py
+++ b/apps/PoolBasedBinaryClassification/dashboard/Dashboard.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import json
 import numpy
 import numpy.random
@@ -5,15 +6,16 @@
 from datetime import timedelta
 import next.utils as utils
 from next.apps.AppDashboard import AppDashboard
+
 # import next.database_client.DatabaseAPIHTTP as db
 # import next.logging_client.LoggerHTTP as ell
 
-class MyAppDashboard(AppDashboard):
-    def __init__(self,db,ell):
+class MyAppDashboard(AppDashboard):
+    def __init__(self, db, ell):
         AppDashboard.__init__(self, db, ell)
 
-    def test_error_multiline_plot(self,app, butler):
+    def test_error_multiline_plot(self, app, butler):
         """
         Description: Returns multiline plot where there is a one-to-one mapping from lines to
         algorithms and each line indicates the error on the validation set with respect to number of reported answers
@@ -24,64 +26,78 @@ def test_error_multiline_plot(self,app, butler):
         Expected output (in dict):
             (dict) MPLD3 plot dictionary
         """
-        args = butler.experiment.get(key='args')
-        alg_list = args['alg_list']
-        test_alg_label = alg_list[0]['test_alg_label']
+        args = butler.experiment.get(key="args")
+        alg_list = args["alg_list"]
+        test_alg_label = alg_list[0]["test_alg_label"]
 
-        test_queries = butler.db.get_docs_with_filter(app.app_id+':queries',{'exp_uid':app.exp_uid, 'alg_label':test_alg_label})
+        test_queries = butler.db.get_docs_with_filter(
+            app.app_id + ":queries",
+            {"exp_uid": app.exp_uid, "alg_label": test_alg_label},
+        )
 
-        test_S = [(query['target_index'], query['target_label'])
-                  for query in test_queries
-                  if 'target_index' in query.keys()]
+        test_S = [
+            (query["target_index"], query["target_label"])
+            for query in test_queries
+            if "target_index" in query.keys()
+        ]
 
         targets = butler.targets.get_targetset(app.exp_uid)
-        targets = sorted(targets,key=lambda x: x['target_id'])
+        targets = sorted(targets, key=lambda x: x["target_id"])
         target_features = []
 
         for target_index in range(len(targets)):
-            target_vec = targets[target_index]['meta']['features']
+            target_vec = targets[target_index]["meta"]["features"]
             target_vec.append(1.)
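             # the 1. just appended is a constant bias feature, so the learned
             # weight vector includes an intercept term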
             target_features.append(target_vec)
 
-        x_min = numpy.float('inf')
-        x_max = -numpy.float('inf')
-        y_min = numpy.float('inf')
-        y_max = -numpy.float('inf')
+        x_min = numpy.float("inf")
+        x_max = -numpy.float("inf")
+        y_min = numpy.float("inf")
+        y_max = -numpy.float("inf")
 
         list_of_alg_dicts = []
 
         for algorithm in alg_list:
-            alg_label = algorithm['alg_label']
-            list_of_log_dict = self.ell.get_logs_with_filter(app.app_id+':ALG-EVALUATION',{'exp_uid':app.exp_uid, 'alg_label':alg_label})
-            list_of_log_dict = sorted(list_of_log_dict, key=lambda item: utils.str2datetime(item['timestamp']) )
+            alg_label = algorithm["alg_label"]
+            list_of_log_dict = self.ell.get_logs_with_filter(
+                app.app_id + ":ALG-EVALUATION",
+                {"exp_uid": app.exp_uid, "alg_label": alg_label},
+            )
+            list_of_log_dict = sorted(
+                list_of_log_dict, key=lambda item: utils.str2datetime(item["timestamp"])
+            )
 
             x = []
             y = []
             for item in list_of_log_dict:
-                num_reported_answers = item['num_reported_answers']
-                weights = item['weights']
+                num_reported_answers = item["num_reported_answers"]
+                weights = item["weights"]
 
                 err = 0.
                 for q in test_S:
-                    estimated_label = numpy.sign(numpy.dot( numpy.array(target_features[q[0]]), numpy.array(weights) ))
-                    err += estimated_label*q[1]<0. #do the labels agree or not
+                    estimated_label = numpy.sign(
+                        numpy.dot(
+                            numpy.array(target_features[q[0]]), numpy.array(weights)
+                        )
+                    )
+                    err += estimated_label * q[1] < 0.  # do the labels agree or not
                 m = float(len(test_S))
-                err = err/m
+                err = err / m
                 x.append(num_reported_answers)
                 y.append(err)
 
             order = numpy.argsort(x)
             x = [x[i] for i in order]
             y = [y[i] for i in order]
-
+
             alg_dict = {}
-            alg_dict['legend_label'] = alg_label
-            alg_dict['x'] = x
-            alg_dict['y'] = y
+            alg_dict["legend_label"] = alg_label
+            alg_dict["x"] = x
+            alg_dict["y"] = y
             try:
-                x_min = min(x_min,min(x))
-                x_max = max(x_max,max(x))
-                y_min = min(y_min,min(y))
-                y_max = max(y_max,max(y))
+                x_min = min(x_min, min(x))
+                x_max = max(x_max, max(x))
+                y_min = min(y_min, min(y))
+                y_max = max(y_max, max(y))
             except:
                 pass
 
@@ -89,22 +105,20 @@ def test_error_multiline_plot(self,app, butler):
 
         import matplotlib.pyplot as plt
         import mpld3
-        fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
+
+        fig, ax = plt.subplots(subplot_kw=dict(axisbg="#EEEEEE"))
         for alg_dict in list_of_alg_dicts:
-            ax.plot(alg_dict['x'],alg_dict['y'],label=alg_dict['legend_label'])
-        ax.set_xlabel('Number of answered queries')
-        ax.set_ylabel('Error on hold-out set')
-        ax.set_xlim([x_min,x_max])
-        ax.set_ylim([y_min,y_max])
-        ax.grid(color='white', linestyle='solid')
-        ax.set_title('Test Error', size=14)
-        legend = ax.legend(loc=2,ncol=3,mode="expand")
+            ax.plot(alg_dict["x"], alg_dict["y"], label=alg_dict["legend_label"])
+        ax.set_xlabel("Number of answered queries")
+        ax.set_ylabel("Error on hold-out set")
+        ax.set_xlim([x_min, x_max])
+        ax.set_ylim([y_min, y_max])
+        ax.grid(color="white", linestyle="solid")
+        ax.set_title("Test Error", size=14)
+        legend = ax.legend(loc=2, ncol=3, mode="expand")
         for label in legend.get_texts():
-            label.set_fontsize('small')
+            label.set_fontsize("small")
         plot_dict = mpld3.fig_to_dict(fig)
         plt.close()
 
         return plot_dict
-
-
diff --git a/apps/PoolBasedBinaryClassification/myApp.py b/apps/PoolBasedBinaryClassification/myApp.py
index 1bb966e4..6786618c 100644
--- a/apps/PoolBasedBinaryClassification/myApp.py
+++ b/apps/PoolBasedBinaryClassification/myApp.py
@@ -1,49 +1,62 @@
+from __future__ import print_function
 import json
 import next.utils as utils
 import next.apps.SimpleTargetManager
 
+
 class MyApp:
-    def __init__(self,db):
-        self.app_id = 'PoolBasedBinaryClassification'
+    def __init__(self, db):
+        self.app_id = "PoolBasedBinaryClassification"
         self.TargetManager = next.apps.SimpleTargetManager.SimpleTargetManager(db)
 
     def initExp(self, butler, init_algs, args):
-        args['n'] = len(args['targets']['targetset'])
+        args["n"] = len(args["targets"]["targetset"])
         # Get the first target, extract its feature vector and save this as the dimension
         # This assumes that the feature dimension is consistent across all targets
-        args['d'] = len(args['targets']['targetset'][0]['meta']['features'])
-        targets = sorted(args['targets']['targetset'],key=lambda x: x['target_id'])
+        args["d"] = len(args["targets"]["targetset"][0]["meta"]["features"])
+        targets = sorted(args["targets"]["targetset"], key=lambda x: x["target_id"])
         self.TargetManager.set_targetset(butler.exp_uid, targets)
-        del args['targets']
-
-        alg_data = {'n': args['n'],
-                    'failure_probability': args['failure_probability'],
-                    'd': args['d']}
+        del args["targets"]
+
+        alg_data = {
+            "n": args["n"],
+            "failure_probability": args["failure_probability"],
+            "d": args["d"],
+        }
 
         init_algs(alg_data)
         return args
 
     def getQuery(self, butler, alg, args):
-        alg_response = alg({'participant_uid':args['participant_uid']})
+        alg_response = alg({"participant_uid": args["participant_uid"]})
         target = self.TargetManager.get_target_item(butler.exp_uid, alg_response)
-        del target['meta']
-        return {'target_indices':target}
+        del target["meta"]
+        return {"target_indices": target}
 
     def processAnswer(self, butler, alg, args):
-        query = butler.queries.get(uid=args['query_uid'])
-        target = query['target_indices']
-        target_label = args['target_label']
+        query = butler.queries.get(uid=args["query_uid"])
+        target = query["target_indices"]
+        target_label = args["target_label"]
 
-        num_reported_answers = butler.experiment.increment(key='num_reported_answers_for_' + query['alg_label'])
+        num_reported_answers = butler.experiment.increment(
+            key="num_reported_answers_for_" + query["alg_label"]
+        )
 
         # make a getModel call ~ every n/4 queries - note that this query will NOT be included in the predict
         experiment = butler.experiment.get()
-        d = experiment['args']['d']
-        if num_reported_answers % ((d+4)/4) == 0:
-            butler.job('getModel', json.dumps({'exp_uid':butler.exp_uid,'args':{'alg_label':query['alg_label'], 'logging':True}}))
-
-        alg({'target_index':target['target_id'],'target_label':target_label})
-        return {'target_index':target['target_id'],'target_label':target_label}
+        d = experiment["args"]["d"]
+        if num_reported_answers % ((d + 4) / 4) == 0:
+            butler.job(
+                "getModel",
+                json.dumps(
+                    {
+                        "exp_uid": butler.exp_uid,
+                        "args": {"alg_label": query["alg_label"], "logging": True},
+                    }
+                ),
+            )
+
+        alg({"target_index": target["target_id"], "target_label": target_label})
+        return {"target_index": target["target_id"], "target_label": target_label}
 
     def getModel(self, butler, alg, args):
         return alg()
-
diff --git a/apps/PoolBasedBinaryClassification/tests/test_api.py b/apps/PoolBasedBinaryClassification/tests/test_api.py
index ef27f96a..d155529f 100644
--- a/apps/PoolBasedBinaryClassification/tests/test_api.py
+++ b/apps/PoolBasedBinaryClassification/tests/test_api.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import numpy
 import numpy.random
 import random
@@ -7,74 +8,96 @@
 from scipy.linalg import norm
 from multiprocessing import Pool
 import os, sys
+
 try:
     import next.apps.test_utils as test_utils
 except:
-    file_dir = '/'.join(__file__.split('/')[:-1])
-    sys.path.append('{}/../../../next/apps'.format(file_dir))
+    file_dir = "/".join(__file__.split("/")[:-1])
+    sys.path.append("{}/../../../next/apps".format(file_dir))
     import test_utils
 
-app_id = 'PoolBasedBinaryClassification'
+app_id = "PoolBasedBinaryClassification"
+
 
-def test_api(assert_200=True, num_objects=4, desired_dimension=1,
-             total_pulls_per_client=5, num_experiments=1,
-             num_clients=7):
+def test_api(
+    assert_200=True,
+    num_objects=4,
+    desired_dimension=1,
+    total_pulls_per_client=5,
+    num_experiments=1,
+    num_clients=7,
+):
     true_weights = numpy.zeros(desired_dimension)
     true_weights[0] = 1.
     pool = Pool(processes=num_clients)
-    supported_alg_ids = ['RandomSamplingLinearLeastSquares',
-                         'RandomSamplingLinearLeastSquares',
-                         'RoundRobin']
+    supported_alg_ids = [
+        "RandomSamplingLinearLeastSquares",
+        "RandomSamplingLinearLeastSquares",
+        "RoundRobin",
+    ]
 
     alg_list = []
-    for idx,alg_id in enumerate(supported_alg_ids):
+    for idx, alg_id in enumerate(supported_alg_ids):
         alg_item = {}
-        alg_item['alg_id'] = alg_id
-        if idx==0:
-            alg_item['alg_label'] = 'Test'
+        alg_item["alg_id"] = alg_id
+        if idx == 0:
+            alg_item["alg_label"] = "Test"
         else:
-            alg_item['alg_label'] = alg_id
-        alg_item['test_alg_label'] = 'Test'
+            alg_item["alg_label"] = alg_id
+        alg_item["test_alg_label"] = "Test"
         alg_list.append(alg_item)
 
     params = []
     for algorithm in alg_list:
-        params.append({'alg_label': algorithm['alg_label'],
-                       'proportion': 1./len(alg_list)})
+        params.append(
+            {"alg_label": algorithm["alg_label"], "proportion": 1. / len(alg_list)}
+        )
     algorithm_management_settings = {}
-    algorithm_management_settings['mode'] = 'fixed_proportions'
-    algorithm_management_settings['params'] = params
+    algorithm_management_settings["mode"] = "fixed_proportions"
+    algorithm_management_settings["params"] = params
 
     targetset = []
     for i in range(num_objects):
         features = list(numpy.random.randn(desired_dimension))
-        targetset.append({'primary_description': str(features),
-                          'primary_type':'text',
-                          'alt_description':'%d' % (i),
-                          'alt_type':'text',
-                          'target_id': str(i),
-                          'meta': {'features':features}})
+        targetset.append(
+            {
+                "primary_description": str(features),
+                "primary_type": "text",
+                "alt_description": "%d" % (i),
+                "alt_type": "text",
+                "target_id": str(i),
+                "meta": {"features": features},
+            }
+        )
 
     # Test POST Experiment
-    print '\n'*2 + 'Testing POST initExp...'
+ print("\n" * 2 + "Testing POST initExp...") initExp_args_dict = {} - initExp_args_dict['app_id'] = 'PoolBasedBinaryClassification' - initExp_args_dict['args'] = {} - initExp_args_dict['args']['failure_probability'] = 0.01 - initExp_args_dict['args']['participant_to_algorithm_management'] = 'one_to_many' # 'one_to_one' #optional field - initExp_args_dict['args']['algorithm_management_settings'] = algorithm_management_settings #optional field - initExp_args_dict['args']['alg_list'] = alg_list #optional field - initExp_args_dict['args']['instructions'] = 'You want instructions, here are your test instructions' - initExp_args_dict['args']['debrief'] = 'You want a debrief, here is your test debrief' - initExp_args_dict['args']['targets'] = {'targetset': targetset} + initExp_args_dict["app_id"] = "PoolBasedBinaryClassification" + initExp_args_dict["args"] = {} + initExp_args_dict["args"]["failure_probability"] = 0.01 + initExp_args_dict["args"][ + "participant_to_algorithm_management" + ] = "one_to_many" # 'one_to_one' #optional field + initExp_args_dict["args"][ + "algorithm_management_settings" + ] = algorithm_management_settings # optional field + initExp_args_dict["args"]["alg_list"] = alg_list # optional field + initExp_args_dict["args"][ + "instructions" + ] = "You want instructions, here are your test instructions" + initExp_args_dict["args"][ + "debrief" + ] = "You want a debrief, here is your test debrief" + initExp_args_dict["args"]["targets"] = {"targetset": targetset} exp_info = [] for ell in range(num_experiments): initExp_response_dict, exp_info_ = test_utils.initExp(initExp_args_dict) exp_info += [exp_info_] - exp_uid = initExp_response_dict['exp_uid'] + exp_uid = initExp_response_dict["exp_uid"] - exp_info.append({'exp_uid':exp_uid,}) + exp_info.append({"exp_uid": exp_uid}) # Test GET Experiment initExp_response_dict = test_utils.getExp(exp_uid) @@ -85,18 +108,20 @@ def test_api(assert_200=True, num_objects=4, desired_dimension=1, participants = [] pool_args = [] for i in range(num_clients): - participant_uid = '%030x' % random.randrange(16**30) + participant_uid = "%030x" % random.randrange(16 ** 30) participants.append(participant_uid) experiment = numpy.random.choice(exp_info) - exp_uid = experiment['exp_uid'] - pool_args.append((exp_uid,participant_uid,total_pulls_per_client,true_weights,assert_200)) + exp_uid = experiment["exp_uid"] + pool_args.append( + (exp_uid, participant_uid, total_pulls_per_client, true_weights, assert_200) + ) - print "participants are", participants + print("participants are", participants) results = pool.map(simulate_one_client, pool_args) for result in results: - print result + print(result) test_utils.getModel(exp_uid, app_id, supported_alg_ids, alg_list) @@ -108,27 +133,28 @@ def simulate_one_client(input_args): processAnswer_times = [] for t in range(total_pulls): - print "participant {} had {} pulls".format(participant_uid, t) + print("participant {} had {} pulls".format(participant_uid, t)) # test POST getQuery # widget = True - getQuery_args_dict = {'args': {'participant_uid': participant_uid, - 'widget': widget}, - 'exp_uid': exp_uid} + getQuery_args_dict = { + "args": {"participant_uid": participant_uid, "widget": widget}, + "exp_uid": exp_uid, + } query_dict, dt = test_utils.getQuery(getQuery_args_dict) getQuery_times += [dt] if widget: - query_dict = query_dict['args'] - query_uid = query_dict['query_uid'] - target = query_dict['target_indices'] - x = numpy.array(eval(target['primary_description'])) + query_dict = query_dict["args"] + 
query_uid = query_dict["query_uid"] + target = query_dict["target_indices"] + x = numpy.array(eval(target["primary_description"])) # generate simulated reward # # sleep for a bit to simulate response time ts = test_utils.response_delay() - target_label = numpy.sign(numpy.dot(x,true_weights)) + target_label = numpy.sign(numpy.dot(x, true_weights)) response_time = time.time() - ts # test POST processAnswer @@ -137,18 +163,21 @@ def simulate_one_client(input_args): processAnswer_args_dict["args"] = {} processAnswer_args_dict["args"]["query_uid"] = query_uid processAnswer_args_dict["args"]["target_label"] = target_label - processAnswer_args_dict["args"]['response_time'] = response_time + processAnswer_args_dict["args"]["response_time"] = response_time - processAnswer_json_response, dt = test_utils.processAnswer(processAnswer_args_dict) + processAnswer_json_response, dt = test_utils.processAnswer( + processAnswer_args_dict + ) processAnswer_times += [dt] - return_str = test_utils.format_times(getQuery_times, processAnswer_times, - total_pulls, participant_uid) + return_str = test_utils.format_times( + getQuery_times, processAnswer_times, total_pulls, participant_uid + ) return return_str -if __name__ == '__main__': +if __name__ == "__main__": test_api() # test_api(assert_200=False, num_objects=100, desired_dimension=4, - # total_pulls_per_client=30, num_experiments=1, num_clients=10, - # delta=0.01) + # total_pulls_per_client=30, num_experiments=1, num_clients=10, + # delta=0.01) diff --git a/apps/PoolBasedTripletMDS/algs/CrowdKernel/myAlg.py b/apps/PoolBasedTripletMDS/algs/CrowdKernel/myAlg.py index 4ad3ef1d..8d890e13 100644 --- a/apps/PoolBasedTripletMDS/algs/CrowdKernel/myAlg.py +++ b/apps/PoolBasedTripletMDS/algs/CrowdKernel/myAlg.py @@ -3,160 +3,174 @@ author: Lalit Jain, kevin.g.jamieson@gmail.com last updated: 4/22/2015 """ +from __future__ import print_function import numpy import numpy.random from apps.PoolBasedTripletMDS.algs.CrowdKernel import utilsCrowdKernel import time -class MyAlg: - - def initExp(self,butler,n,d,failure_probability): - X = numpy.random.randn(n,d)*.0001 - tau = numpy.random.rand(n,n) - - butler.algorithms.set(key='n',value=n) - butler.algorithms.set(key='d',value=d) - butler.algorithms.set(key='delta',value=failure_probability) - butler.algorithms.set(key='X',value=X.tolist()) - butler.algorithms.set(key='tau',value=tau.tolist()) - # butler.algorithms.set(key='S',value=[]) # do not initialize a list that you plan to append to! When you append_list the first item it will be created automatically. 
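The commented-out `S` initialization above encodes a store convention: append-style keys are created on first append, so pre-seeding an empty list is redundant and, per the comment, should be avoided. A plain-dict analogue of that behavior, purely illustrative (this is not the real Butler API):

```python
# Illustrative only: append-on-first-use, mirrored with a plain dict.
# The first append creates the list; no empty-list initialization needed.
store = {}
store.setdefault("S", []).append([1, 2, 0])
store.setdefault("S", []).append([3, 1, 2])
assert store["S"] == [[1, 2, 0], [3, 1, 2]]
```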
- butler.algorithms.set(key='num_reported_answers',value=0) - return True - - - def getQuery(self,butler): - R = 10 - n = butler.algorithms.get(key='n') - num_reported_answers = butler.algorithms.get(key='num_reported_answers') - - if num_reported_answers == None: - num_reported_answers = 0 - - if num_reported_answers < R*n: - a = num_reported_answers/R - b = numpy.random.randint(n) - while b==a: - b = numpy.random.randint(n) - c = numpy.random.randint(n) - while c==a or c==b: - c = numpy.random.randint(n) - return [a, b, c] - - X = numpy.array(butler.algorithms.get(key='X')) - tau = numpy.array(butler.algorithms.get(key='tau')) - - # set maximum time allowed to search for a query - t_max = .05 - best_q, best_score = utilsCrowdKernel.getRandomQuery(X) - t_start = time.time() - best_entropy = -1*float('inf') - - while time.time()-t_start best_entropy: - best_q = q - best_entropy = entropy - index_center = best_q[2] - index_left = best_q[0] - index_right = best_q[1] - return [index_center, index_left, index_right] - - - def processAnswer(self,butler,center_id,left_id,right_id,target_winner): - if left_id==target_winner: - q = [left_id,right_id,center_id] - else: - q = [right_id,left_id,center_id] - - butler.algorithms.append(key='S',value=q) - - n = butler.algorithms.get(key='n') - num_reported_answers = butler.algorithms.increment(key='num_reported_answers') - if num_reported_answers % int(n) == 0: - butler.job('full_embedding_update', {}, time_limit=30) - else: - butler.job('incremental_embedding_update', {},time_limit=5) - - return True - - def getModel(self,butler): - return butler.algorithms.get(key=['X','num_reported_answers']) - - def incremental_embedding_update(self,butler,args): - verbose = False - S = butler.algorithms.get(key='S') - - X = numpy.array(butler.algorithms.get(key='X')) - - # set maximum time allowed to update embedding - t_max = 1.0 - epsilon = 0.00001 # a relative convergence criterion, see computeEmbeddingWithGD documentation - mu = .05 - - t_start = time.time() - X,emp_loss_new,hinge_loss_new,log_loss_new,acc = utilsCrowdKernel.computeEmbeddingWithGD(X,S,mu,epsilon=epsilon,max_iters=1) - k = 1 - while (time.time()-t_start<.5*t_max) and (acc > epsilon): - X,emp_loss_new,hinge_loss_new,log_loss_new,acc = utilsCrowdKernel.computeEmbeddingWithGD(X,S,mu,max_iters=2**k, epsilon=epsilon, verbose=verbose) - k+=1 - - tau = utilsCrowdKernel.getCrowdKernelTauDistribution(X,S,mu) - - butler.algorithms.set(key='X',value=X.tolist()) - butler.algorithms.set(key='tau',value=tau.tolist()) - - - def full_embedding_update(self,butler,args): - verbose = False - - n = butler.algorithms.get(key='n') - d = butler.algorithms.get(key='d') - S = butler.algorithms.get(key='S') - - X_old = numpy.array(butler.algorithms.get(key='X')) - # set maximum time allowed to update embedding - t_max = 5.0 - epsilon = 0.00001 # a relative convergence criterion, see computeEmbeddingWithGD documentation - mu = .05 - - - emp_loss_old,hinge_loss_old,log_loss_old = utilsCrowdKernel.getLoss(X_old,S) - X,tmp = utilsCrowdKernel.computeEmbeddingWithEpochSGD(n,d,S,mu,max_num_passes=16,epsilon=0,verbose=verbose) - t_start = time.time() - X,emp_loss_new,hinge_loss_new,log_loss_new,acc = utilsCrowdKernel.computeEmbeddingWithGD(X,S,mu,max_iters=1,epsilon=epsilon,verbose=verbose) - k = 1 - while (time.time()-t_start<.5*t_max) and (acc > epsilon): - X,emp_loss_new,hinge_loss_new,log_loss_new,acc = utilsCrowdKernel.computeEmbeddingWithGD(X,S,mu,max_iters=2**k,epsilon=epsilon,verbose=verbose) - k += 1 - 
emp_loss_new,hinge_loss_new,log_loss_new = utilsCrowdKernel.getLoss(X,S) - if emp_loss_old < emp_loss_new: - X = X_old - - tau = utilsCrowdKernel.getCrowdKernelTauDistribution(X,S,mu) - - butler.algorithms.set(key='X',value=X.tolist()) - butler.algorithms.set(key='tau',value=tau.tolist()) - - - - - +class MyAlg: + def initExp(self, butler, n, d, failure_probability): + X = numpy.random.randn(n, d) * .0001 + tau = numpy.random.rand(n, n) + + butler.algorithms.set(key="n", value=n) + butler.algorithms.set(key="d", value=d) + butler.algorithms.set(key="delta", value=failure_probability) + butler.algorithms.set(key="X", value=X.tolist()) + butler.algorithms.set(key="tau", value=tau.tolist()) + # butler.algorithms.set(key='S',value=[]) # do not initialize a list that you plan to append to! When you append_list the first item it will be created automatically. + butler.algorithms.set(key="num_reported_answers", value=0) + return True + + def getQuery(self, butler): + R = 10 + n = butler.algorithms.get(key="n") + num_reported_answers = butler.algorithms.get(key="num_reported_answers") + + if num_reported_answers == None: + num_reported_answers = 0 + + if num_reported_answers < R * n: + a = num_reported_answers / R + b = numpy.random.randint(n) + while b == a: + b = numpy.random.randint(n) + c = numpy.random.randint(n) + while c == a or c == b: + c = numpy.random.randint(n) + return [a, b, c] + + X = numpy.array(butler.algorithms.get(key="X")) + tau = numpy.array(butler.algorithms.get(key="tau")) + + # set maximum time allowed to search for a query + t_max = .05 + best_q, best_score = utilsCrowdKernel.getRandomQuery(X) + t_start = time.time() + best_entropy = -1 * float("inf") + + while time.time() - t_start < t_max: + q, score = utilsCrowdKernel.getRandomQuery(X) + b, c, a = q + p = 0 + for i in range(n): + p += ( + utilsCrowdKernel.getCrowdKernelTripletProbability(X[b], X[c], X[i]) + * tau[a, i] + ) + + taub = list(tau[a]) + for i in range(n): + taub[i] = taub[i] * utilsCrowdKernel.getCrowdKernelTripletProbability( + X[b], X[c], X[i] + ) + taub = taub / sum(taub) + + tauc = list(tau[a]) + for i in range(n): + tauc[i] = tauc[i] * utilsCrowdKernel.getCrowdKernelTripletProbability( + X[c], X[b], X[i] + ) + tauc = tauc / sum(tauc) + + entropy = -p * utilsCrowdKernel.getEntropy(taub) - ( + 1 - p + ) * utilsCrowdKernel.getEntropy(tauc) + + if entropy > best_entropy: + best_q = q + best_entropy = entropy + index_center = best_q[2] + index_left = best_q[0] + index_right = best_q[1] + return [index_center, index_left, index_right] + + def processAnswer(self, butler, center_id, left_id, right_id, target_winner): + if left_id == target_winner: + q = [left_id, right_id, center_id] + else: + q = [right_id, left_id, center_id] + + butler.algorithms.append(key="S", value=q) + + n = butler.algorithms.get(key="n") + num_reported_answers = butler.algorithms.increment(key="num_reported_answers") + if num_reported_answers % int(n) == 0: + butler.job("full_embedding_update", {}, time_limit=30) + else: + butler.job("incremental_embedding_update", {}, time_limit=5) + + return True + + def getModel(self, butler): + return butler.algorithms.get(key=["X", "num_reported_answers"]) + + def incremental_embedding_update(self, butler, args): + verbose = False + S = butler.algorithms.get(key="S") + + X = numpy.array(butler.algorithms.get(key="X")) + + # set maximum time allowed to update embedding + t_max = 1.0 + epsilon = ( + 0.00001 + ) # a relative convergence criterion, see computeEmbeddingWithGD documentation + mu = .05 + 
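The loop that follows (and full_embedding_update below) uses the same anytime pattern: take one gradient-descent step, then keep doubling `max_iters` while half the time budget remains and the convergence statistic `acc` is above `epsilon`. A standalone sketch of that control flow; the `step` function here is a hypothetical stand-in for `utilsCrowdKernel.computeEmbeddingWithGD`, not the real routine:

```python
import time

def anytime_optimize(step, state, t_max=1.0, epsilon=1e-5):
    # step(state, max_iters) -> (state, acc); acc is a relative-gradient
    # convergence statistic, as returned by computeEmbeddingWithGD above.
    t_start = time.time()
    state, acc = step(state, 1)           # always take one step first
    k = 1
    while time.time() - t_start < 0.5 * t_max and acc > epsilon:
        state, acc = step(state, 2 ** k)  # double the work each round
        k += 1
    return state

# toy usage: drive a scalar toward 0, halving it max_iters times per call
print(anytime_optimize(lambda x, m: (x * 0.5 ** m, x * 0.5 ** m), 1.0))
```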
+ t_start = time.time() + X, emp_loss_new, hinge_loss_new, log_loss_new, acc = utilsCrowdKernel.computeEmbeddingWithGD( + X, S, mu, epsilon=epsilon, max_iters=1 + ) + k = 1 + while (time.time() - t_start < .5 * t_max) and (acc > epsilon): + X, emp_loss_new, hinge_loss_new, log_loss_new, acc = utilsCrowdKernel.computeEmbeddingWithGD( + X, S, mu, max_iters=2 ** k, epsilon=epsilon, verbose=verbose + ) + k += 1 + + tau = utilsCrowdKernel.getCrowdKernelTauDistribution(X, S, mu) + + butler.algorithms.set(key="X", value=X.tolist()) + butler.algorithms.set(key="tau", value=tau.tolist()) + + def full_embedding_update(self, butler, args): + verbose = False + + n = butler.algorithms.get(key="n") + d = butler.algorithms.get(key="d") + S = butler.algorithms.get(key="S") + + X_old = numpy.array(butler.algorithms.get(key="X")) + # set maximum time allowed to update embedding + t_max = 5.0 + epsilon = ( + 0.00001 + ) # a relative convergence criterion, see computeEmbeddingWithGD documentation + mu = .05 + + emp_loss_old, hinge_loss_old, log_loss_old = utilsCrowdKernel.getLoss(X_old, S) + X, tmp = utilsCrowdKernel.computeEmbeddingWithEpochSGD( + n, d, S, mu, max_num_passes=16, epsilon=0, verbose=verbose + ) + t_start = time.time() + X, emp_loss_new, hinge_loss_new, log_loss_new, acc = utilsCrowdKernel.computeEmbeddingWithGD( + X, S, mu, max_iters=1, epsilon=epsilon, verbose=verbose + ) + k = 1 + while (time.time() - t_start < .5 * t_max) and (acc > epsilon): + X, emp_loss_new, hinge_loss_new, log_loss_new, acc = utilsCrowdKernel.computeEmbeddingWithGD( + X, S, mu, max_iters=2 ** k, epsilon=epsilon, verbose=verbose + ) + k += 1 + emp_loss_new, hinge_loss_new, log_loss_new = utilsCrowdKernel.getLoss(X, S) + if emp_loss_old < emp_loss_new: + X = X_old + + tau = utilsCrowdKernel.getCrowdKernelTauDistribution(X, S, mu) + + butler.algorithms.set(key="X", value=X.tolist()) + butler.algorithms.set(key="tau", value=tau.tolist()) diff --git a/apps/PoolBasedTripletMDS/algs/CrowdKernel/utilsCrowdKernel.py b/apps/PoolBasedTripletMDS/algs/CrowdKernel/utilsCrowdKernel.py index f2b6b36e..38758c1d 100644 --- a/apps/PoolBasedTripletMDS/algs/CrowdKernel/utilsCrowdKernel.py +++ b/apps/PoolBasedTripletMDS/algs/CrowdKernel/utilsCrowdKernel.py @@ -14,12 +14,13 @@ You may also consider getLoss to check how well an embedding is performing. 
""" +from __future__ import print_function from numpy import * from numpy.random import * import numpy.random from numpy.linalg import * -#eig = numpy.linalg +# eig = numpy.linalg norm = linalg.norm floor = math.floor ceil = math.ceil @@ -28,51 +29,55 @@ import time + def main(): """ Example of Usage Creates some fake data and finds an embedding """ - + # generate some fake data n = 30 d = 2 - m = int(ceil(40*n*d*log(n))) # number of labels - - p = 0.1; # error rate - + m = int(ceil(40 * n * d * log(n))) # number of labels + + p = 0.1 + # error rate + Strain = [] Stest = [] - Xtrue = randn(n,d); - for iter in range(0,m): + Xtrue = randn(n, d) + for iter in range(0, m): # get random triplet - q,score = getRandomQuery(Xtrue) + q, score = getRandomQuery(Xtrue) # align it so it agrees with Xtrue: "q[2] is more similar to q[0] than q[1]" - query_ordering_disagrees_with_Xtrue = score<0 + query_ordering_disagrees_with_Xtrue = score < 0 if query_ordering_disagrees_with_Xtrue: - q = [ q[i] for i in [1,0,2]] + q = [q[i] for i in [1, 0, 2]] # add some noise R = rand() - if R 0 then the triplet agrees with the embedding, otherwise it does not @@ -113,12 +119,12 @@ def getTripletScore(X,q): Usage: score = getTripletScore(X,[3,4,5]) """ - i,j,k = q + i, j, k = q - return dot(X[j],X[j]) -2*dot(X[j],X[k]) + 2*dot(X[i],X[k]) - dot(X[i],X[i]) + return dot(X[j], X[j]) - 2 * dot(X[j], X[k]) + 2 * dot(X[i], X[k]) - dot(X[i], X[i]) -def getLoss(X,S): +def getLoss(X, S): """ Returns loss on X with respect to list of triplets S: 1/len(S) \sum_{q in S} loss(X,q). Intuitively, q=[i,j,k] "agrees" with X if ||x_j - x_k||^2 > ||x_i - x_k||^2. @@ -133,24 +139,26 @@ def getLoss(X,S): n = X.shape[0] d = X.shape[1] - emp_loss = 0 # 0/1 loss - hinge_loss = 0 # hinge loss - log_loss = 0 #log_loss in crowd kernel model + emp_loss = 0 # 0/1 loss + hinge_loss = 0 # hinge loss + log_loss = 0 # log_loss in crowd kernel model for q in S: - loss_ijk = getTripletScore(X,q) - hinge_loss = hinge_loss + max(0,1. - loss_ijk) - log_loss = log_loss - log(getCrowdKernelTripletProbability(X[q[0]],X[q[1]],X[q[2]],mu=.01)) + loss_ijk = getTripletScore(X, q) + hinge_loss = hinge_loss + max(0, 1. - loss_ijk) + log_loss = log_loss - log( + getCrowdKernelTripletProbability(X[q[0]], X[q[1]], X[q[2]], mu=.01) + ) if loss_ijk < 0: - emp_loss = emp_loss + 1. + emp_loss = emp_loss + 1. - emp_loss = emp_loss/len(S) - hinge_loss = hinge_loss/len(S) - log_loss = log_loss/len(S) + emp_loss = emp_loss / len(S) + hinge_loss = hinge_loss / len(S) + log_loss = log_loss / len(S) return emp_loss, hinge_loss, log_loss -def getCrowdKernelTripletProbability(b,c,a,mu=0): +def getCrowdKernelTripletProbability(b, c, a, mu=0): """ Return the probability of triplet [b,c,a] where a is closer to b than c. @@ -163,9 +171,9 @@ def getCrowdKernelTripletProbability(b,c,a,mu=0): (numpy.ndarray) c : numpy array (float) mu : regularization parameter """ - ca = norm(c-a) - ba = norm(b-a) - return (mu+ca*ca)/(2*mu+ba*ba+ca*ca) + ca = norm(c - a) + ba = norm(b - a) + return (mu + ca * ca) / (2 * mu + ba * ba + ca * ca) def getEntropy(tau): @@ -181,9 +189,10 @@ def getEntropy(tau): e = 0 for i in range(len(tau)): if tau[i] > 0: - e += -1*tau[i]*log(tau[i]) + e += -1 * tau[i] * log(tau[i]) return e - + + def getCrowdKernelTauDistribution(X, S, mu=.01): """ Return the tau distributions for each point [n]. 
@@ -197,23 +206,26 @@ def getCrowdKernelTauDistribution(X, S, mu=.01): Usage: tau = getCrowdKernelDistribution(X,S) """ - n,d = X.shape - tau = zeros((n,n)) + n, d = X.shape + tau = zeros((n, n)) # Loop over each query for q in S: a = q[2] # Multiply by the amount the query contributes to each tau for i in range(n): - tau[a,i] = tau[a,i] + log( getCrowdKernelTripletProbability(X[q[0]], X[q[1]], X[i], mu=mu)) - + tau[a, i] = tau[a, i] + log( + getCrowdKernelTripletProbability(X[q[0]], X[q[1]], X[i], mu=mu) + ) + # Normalize for a in range(n): tau[a] = exp(tau[a]) s = sum(tau[a]) - tau[a] = tau[a]/s - + tau[a] = tau[a] / s + return tau - + + def getGradient(X, S, mu): """ Returns gradient of the log loss of the crowd kernel probability distribution. @@ -222,26 +234,33 @@ def getGradient(X, S, mu): Usage: G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S) """ - n,d = X.shape + n, d = X.shape m = len(S) - - G = zeros((n,d)) + + G = zeros((n, d)) log_loss = 0 for q in S: - i,j,k = q - num = max(mu + norm(X[k]-X[j])*norm(X[k]-X[j]) , realmin)#max(mu + D[k,j] , realmin) - den = max(2*mu + norm(X[k]-X[i])*norm(X[k]-X[i]) + norm(X[k]-X[j])*norm(X[k]-X[j]), realmin)#max(2*mu + D[k,i] + D[k,j], realmin) - G[i] += 2/den * (X[k]-X[i]) - G[j] += (2/num-2/den)*(X[j]-X[k]) - G[k] += 2/num * (X[k]-X[j])-2/den*(2*X[k]-X[i]-X[j]) + i, j, k = q + num = max( + mu + norm(X[k] - X[j]) * norm(X[k] - X[j]), realmin + ) # max(mu + D[k,j] , realmin) + den = max( + 2 * mu + + norm(X[k] - X[i]) * norm(X[k] - X[i]) + + norm(X[k] - X[j]) * norm(X[k] - X[j]), + realmin, + ) # max(2*mu + D[k,i] + D[k,j], realmin) + G[i] += 2 / den * (X[k] - X[i]) + G[j] += (2 / num - 2 / den) * (X[j] - X[k]) + G[k] += 2 / num * (X[k] - X[j]) - 2 / den * (2 * X[k] - X[i] - X[j]) log_loss += log(den) - log(num) - log_loss = log_loss/len(S) + log_loss = log_loss / len(S) # Remember, the loss function is the sum of log(1/p^k_ij), this leads to an extra minus sign - G = -1./len(S) * G + G = -1. / len(S) * G # compute statistics about gradient used for stopping conditions - mu = mean(X,0) + mu = mean(X, 0) avg_row_norm_sq = 0. avg_grad_row_norm_sq = 0. max_grad_row_norm_sq = 0. @@ -251,17 +270,27 @@ def getGradient(X, S, mu): row_norm_sq = 0 grad_row_norm_sq = 0 for j in range(d): - row_norm_sq += (X[i,j]-mu[j])*(X[i,j]-mu[j]) - grad_row_norm_sq += G[i,j]*G[i,j] - - avg_row_norm_sq += row_norm_sq/n - avg_grad_row_norm_sq += grad_row_norm_sq/n - max_grad_row_norm_sq = max(max_grad_row_norm_sq,grad_row_norm_sq) - - return G,log_loss,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq - - -def computeEmbedding(n, d, S, mu=.01, num_random_restarts=0,max_num_passes=0,max_norm=0, epsilon=0.01, verbose=False): + row_norm_sq += (X[i, j] - mu[j]) * (X[i, j] - mu[j]) + grad_row_norm_sq += G[i, j] * G[i, j] + + avg_row_norm_sq += row_norm_sq / n + avg_grad_row_norm_sq += grad_row_norm_sq / n + max_grad_row_norm_sq = max(max_grad_row_norm_sq, grad_row_norm_sq) + + return G, log_loss, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq + + +def computeEmbedding( + n, + d, + S, + mu=.01, + num_random_restarts=0, + max_num_passes=0, + max_norm=0, + epsilon=0.01, + verbose=False, +): """ Computes an embedding of n objects in d dimensions usin the triplets of S. 
S is a list of triplets such that for each q in S, q = [i,j,k] means that @@ -283,38 +312,47 @@ def computeEmbedding(n, d, S, mu=.01, num_random_restarts=0,max_num_passes=0,max (float) gamma : Equal to a/b where a is max row norm of the gradient matrix and b is the avg row norm of the centered embedding matrix X. This is a means to determine how close the current solution is to the "best" solution. """ - if max_num_passes==0: + if max_num_passes == 0: max_num_passes = 32 - + X_old = None - emp_loss_old = float('inf') + emp_loss_old = float("inf") num_restarts = -1 - + while num_restarts < num_random_restarts: num_restarts += 1 - - print "Epoch SGD" + + print("Epoch SGD") ts = time.time() - X,acc = computeEmbeddingWithEpochSGD(n,d,S,mu,max_num_passes=max_num_passes,epsilon=0.,verbose=verbose) - te_sgd = time.time()-ts - - print "Gradient Descent" + X, acc = computeEmbeddingWithEpochSGD( + n, d, S, mu, max_num_passes=max_num_passes, epsilon=0., verbose=verbose + ) + te_sgd = time.time() - ts + + print("Gradient Descent") ts = time.time() - X_new, emp_loss_new, log_loss_new, hinge_loss_new, acc_new = computeEmbeddingWithGD(X, S, mu, max_iters=50, max_norm=max_norm, epsilon=epsilon, verbose=verbose) - emp_loss_new,hinge_loss_new,log_loss_new = getLoss(X_new,S) - te_gd = time.time()-ts + X_new, emp_loss_new, log_loss_new, hinge_loss_new, acc_new = computeEmbeddingWithGD( + X, S, mu, max_iters=50, max_norm=max_norm, epsilon=epsilon, verbose=verbose + ) + emp_loss_new, hinge_loss_new, log_loss_new = getLoss(X_new, S) + te_gd = time.time() - ts - if emp_loss_new0 or verbose: + if epsilon > 0 or verbose: # get losses - emp_loss,hinge_loss,log_loss = getLoss(X,S) + emp_loss, hinge_loss, log_loss = getLoss(X, S) # get gradient and check stopping-time statistics - G,log_loss,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S,mu) - rel_max_grad = sqrt( max_grad_row_norm_sq / avg_row_norm_sq ) - rel_avg_grad = sqrt( avg_grad_row_norm_sq / avg_row_norm_sq ) + G, log_loss, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq = getGradient( + X, S, mu + ) + rel_max_grad = sqrt(max_grad_row_norm_sq / avg_row_norm_sq) + rel_avg_grad = sqrt(avg_grad_row_norm_sq / avg_row_norm_sq) if verbose: - print "iter=%d, emp_loss=%f, hinge_loss=%f, log_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f" % (t,emp_loss,hinge_loss,log_loss,rel_avg_grad,rel_max_grad,a) + print( + "iter=%d, emp_loss=%f, hinge_loss=%f, log_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f" + % ( + t, + emp_loss, + hinge_loss, + log_loss, + rel_avg_grad, + rel_max_grad, + a, + ) + ) if rel_max_grad < epsilon: break @@ -396,28 +449,34 @@ def computeEmbeddingWithEpochSGD(n,d,S,mu, max_num_passes=0,max_norm=0,epsilon=0 q = S[randint(m)] # take gradient step - i,j,k = q - num = max(mu + norm(X[k]-X[j])*norm(X[k]-X[j]) , realmin) - den = max(2*mu + norm(X[k]-X[i])*norm(X[k]-X[i]) + norm(X[k]-X[j])*norm(X[k]-X[j]), realmin) - grad_i = -1*(2/den * (X[k]-X[i])) - grad_j = -1*((2/num-2/den)*(X[j]-X[k])) - grad_k = -1*(2/num * (X[k]-X[j])-2/den*(2*X[k]-X[i]-X[j])) - - X[i] = X[i] - a*grad_i/len(S) - X[j] = X[j] - a*grad_j/len(S) - X[k] = X[k] - a*grad_k/len(S) - + i, j, k = q + num = max(mu + norm(X[k] - X[j]) * norm(X[k] - X[j]), realmin) + den = max( + 2 * mu + + norm(X[k] - X[i]) * norm(X[k] - X[i]) + + norm(X[k] - X[j]) * norm(X[k] - X[j]), + realmin, + ) + grad_i = -1 * (2 / den * (X[k] - X[i])) + grad_j = -1 * ((2 / num - 2 / den) * (X[j] - X[k])) + grad_k = -1 * (2 / num * (X[k] - X[j]) - 2 / den * (2 * X[k] - X[i] - 
X[j])) + + X[i] = X[i] - a * grad_i / len(S) + X[j] = X[j] - a * grad_j / len(S) + X[k] = X[k] - a * grad_k / len(S) + # project back onto ball such that norm(X[i])<=max_norm for i in q: - norm_i = norm(X[i]) - if norm_i>max_norm: - X[i] = X[i] * (max_norm / norm_i) - - return X,rel_max_grad + norm_i = norm(X[i]) + if norm_i > max_norm: + X[i] = X[i] * (max_norm / norm_i) + return X, rel_max_grad -def computeEmbeddingWithGD(X, S, mu, max_iters=0, max_norm=0, epsilon=0.01, c1=0.0001, rho=.7, verbose=False): +def computeEmbeddingWithGD( + X, S, mu, max_iters=0, max_norm=0, epsilon=0.01, c1=0.0001, rho=.7, verbose=False +): """ Performs gradient descent with step size as implemented in stochastic triplet embedding code, namely ckl_x.m See: http://homepage.tudelft.nl/19j49/ste/Stochastic_Triplet_Embedding_files/STE_Release.zip @@ -449,63 +508,79 @@ def computeEmbeddingWithGD(X, S, mu, max_iters=0, max_norm=0, epsilon=0.01, c1=0 """ m = len(S) - n,d = X.shape + n, d = X.shape - if max_iters==0: + if max_iters == 0: max_iters = 100 - if max_norm==0: - max_norm = 10*d + if max_norm == 0: + max_norm = 10 * d # check losses if verbose: - emp_loss,hinge_loss,log_loss = getLoss(X,S) - print "iter=%d, emp_loss=%f, hinge_loss=%f, log_loss=%f, a=%f" % (0,emp_loss,hinge_loss,log_loss,float('nan')) - alpha = .5*n + emp_loss, hinge_loss, log_loss = getLoss(X, S) + print( + "iter=%d, emp_loss=%f, hinge_loss=%f, log_loss=%f, a=%f" + % (0, emp_loss, hinge_loss, log_loss, float("nan")) + ) + alpha = .5 * n t = 0 - emp_loss_0 = float('inf') - hinge_loss_0 = float('inf') - log_loss_0 = float('inf') - rel_max_grad = float('inf') - + emp_loss_0 = float("inf") + hinge_loss_0 = float("inf") + log_loss_0 = float("inf") + rel_max_grad = float("inf") + while t < max_iters: - t+=1 + t += 1 # get gradient and stopping-time statistics ts = time.time() - G,log_loss,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X, S, mu) - rel_max_grad = sqrt( max_grad_row_norm_sq / avg_row_norm_sq ) - rel_avg_grad = sqrt( avg_grad_row_norm_sq / avg_row_norm_sq ) + G, log_loss, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq = getGradient( + X, S, mu + ) + rel_max_grad = sqrt(max_grad_row_norm_sq / avg_row_norm_sq) + rel_avg_grad = sqrt(avg_grad_row_norm_sq / avg_row_norm_sq) if rel_max_grad < epsilon: if verbose: - print "Exiting because of rel_max_grad=%s"%(rel_max_grad) + print("Exiting because of rel_max_grad=%s" % (rel_max_grad)) break - + # perform backtracking line search - alpha = 2*alpha + alpha = 2 * alpha ts = time.time() - emp_loss_0, hinge_loss_0, log_loss_0 = getLoss(X,S) - norm_grad_sq_0 = avg_grad_row_norm_sq*n - emp_loss_k, hinge_loss_k, log_loss_k = getLoss(X-alpha*G, S) - + emp_loss_0, hinge_loss_0, log_loss_0 = getLoss(X, S) + norm_grad_sq_0 = avg_grad_row_norm_sq * n + emp_loss_k, hinge_loss_k, log_loss_k = getLoss(X - alpha * G, S) + inner_t = 0 - while log_loss_k > log_loss_0 - c1*alpha*norm_grad_sq_0: - alpha = alpha*rho - emp_loss_k,hinge_loss_k,log_loss_k = getLoss(X-alpha*G,S) + while log_loss_k > log_loss_0 - c1 * alpha * norm_grad_sq_0: + alpha = alpha * rho + emp_loss_k, hinge_loss_k, log_loss_k = getLoss(X - alpha * G, S) inner_t += 1 - X = X - alpha*G + X = X - alpha * G # project back onto ball such that norm(X[i])<=max_norm for i in range(n): - norm_i = norm(X[i]) - if norm_i>max_norm: - X[i] = X[i] * (max_norm / norm_i) - + norm_i = norm(X[i]) + if norm_i > max_norm: + X[i] = X[i] * (max_norm / norm_i) + # check losses if verbose: - print "iter=%d, emp_loss=%f, 
hinge_loss=%f, log_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f, i_t=%d" % (t,emp_loss_k,hinge_loss_k,log_loss_k,rel_avg_grad,rel_max_grad,alpha,inner_t) - - - return X,emp_loss_0,hinge_loss_0,log_loss_0,rel_max_grad + print( + "iter=%d, emp_loss=%f, hinge_loss=%f, log_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f, i_t=%d" + % ( + t, + emp_loss_k, + hinge_loss_k, + log_loss_k, + rel_avg_grad, + rel_max_grad, + alpha, + inner_t, + ) + ) + + return X, emp_loss_0, hinge_loss_0, log_loss_0, rel_max_grad if __name__ == "__main__": diff --git a/apps/PoolBasedTripletMDS/algs/RandomSampling/myAlg.py b/apps/PoolBasedTripletMDS/algs/RandomSampling/myAlg.py index c1266fac..17ccb60d 100644 --- a/apps/PoolBasedTripletMDS/algs/RandomSampling/myAlg.py +++ b/apps/PoolBasedTripletMDS/algs/RandomSampling/myAlg.py @@ -1,84 +1,93 @@ +from __future__ import print_function import time import numpy.random from apps.PoolBasedTripletMDS.algs.RandomSampling import utilsMDS import next.utils as utils -class MyAlg: - def initExp(self,butler, n, d, failure_probability): - X = numpy.random.randn(n,d) - butler.algorithms.set(key='n',value= n) - butler.algorithms.set(key='d',value= d) - butler.algorithms.set(key='delta',value= failure_probability) - butler.algorithms.set(key='X',value= X.tolist()) - butler.algorithms.set(key='num_reported_answers', value=0) - return True - - - def getQuery(self,butler): - X = numpy.array(butler.algorithms.get(key='X')) - q,score = utilsMDS.getRandomQuery(X) - index_center = q[2] - index_left = q[0] - index_right = q[1] - return [index_center,index_left,index_right] - - - def processAnswer(self,butler,center_id,left_id,right_id,target_winner): - if left_id==target_winner: - q = [left_id,right_id,center_id] - else: - q = [right_id,left_id,center_id] - butler.algorithms.append(key='S',value=q) - n = butler.algorithms.get(key='n') - num_reported_answers = butler.algorithms.increment(key='num_reported_answers') - if num_reported_answers % int(n) == 0: - butler.job('full_embedding_update', {}, time_limit=30) - else: - butler.job('incremental_embedding_update', {},time_limit=5) - return True - - - def getModel(self, butler): - return butler.algorithms.get(key=['X','num_reported_answers']) - - - def incremental_embedding_update(self,butler,args): - S = butler.algorithms.get(key='S') - X = numpy.array(butler.algorithms.get(key='X')) - # set maximum time allowed to update embedding - t_max = 1.0 - epsilon = 0.01 # a relative convergence criterion, see computeEmbeddingWithGD documentation - # take a single gradient step - t_start = time.time() - X,emp_loss_new,hinge_loss_new,acc = utilsMDS.computeEmbeddingWithGD(X,S,max_iters=1) - k = 1 - while (time.time()-t_start<0.5*t_max) and (acc > epsilon): - X,emp_loss_new,hinge_loss_new,acc = utilsMDS.computeEmbeddingWithGD(X,S,max_iters=2**k) - k += 1 - butler.algorithms.set(key='X',value=X.tolist()) - - def full_embedding_update(self,butler,args): - n = butler.algorithms.get(key='n') - d = butler.algorithms.get(key='d') - S = butler.algorithms.get(key='S') - - X_old = numpy.array(butler.algorithms.get(key='X')) - - t_max = 5.0 - epsilon = 0.01 # a relative convergence criterion, see computeEmbeddingWithGD documentation - - emp_loss_old,hinge_loss_old = utilsMDS.getLoss(X_old,S) - X,tmp = utilsMDS.computeEmbeddingWithEpochSGD(n,d,S,max_num_passes=16,epsilon=0,verbose=False) - t_start = time.time() - X,emp_loss_new,hinge_loss_new,acc = utilsMDS.computeEmbeddingWithGD(X,S,max_iters=1) - k = 1 - while (time.time()-t_start<0.5*t_max) and (acc > epsilon): 
- X,emp_loss_new,hinge_loss_new,acc = utilsMDS.computeEmbeddingWithGD(X,S,max_iters=2**k) - k += 1 - emp_loss_new,hinge_loss_new = utilsMDS.getLoss(X,S) - if emp_loss_old < emp_loss_new: - X = X_old - butler.algorithms.set(key='X',value=X.tolist()) - - +class MyAlg: + def initExp(self, butler, n, d, failure_probability): + X = numpy.random.randn(n, d) + butler.algorithms.set(key="n", value=n) + butler.algorithms.set(key="d", value=d) + butler.algorithms.set(key="delta", value=failure_probability) + butler.algorithms.set(key="X", value=X.tolist()) + butler.algorithms.set(key="num_reported_answers", value=0) + return True + + def getQuery(self, butler): + X = numpy.array(butler.algorithms.get(key="X")) + q, score = utilsMDS.getRandomQuery(X) + index_center = q[2] + index_left = q[0] + index_right = q[1] + return [index_center, index_left, index_right] + + def processAnswer(self, butler, center_id, left_id, right_id, target_winner): + if left_id == target_winner: + q = [left_id, right_id, center_id] + else: + q = [right_id, left_id, center_id] + butler.algorithms.append(key="S", value=q) + n = butler.algorithms.get(key="n") + num_reported_answers = butler.algorithms.increment(key="num_reported_answers") + if num_reported_answers % int(n) == 0: + butler.job("full_embedding_update", {}, time_limit=30) + else: + butler.job("incremental_embedding_update", {}, time_limit=5) + return True + + def getModel(self, butler): + return butler.algorithms.get(key=["X", "num_reported_answers"]) + + def incremental_embedding_update(self, butler, args): + S = butler.algorithms.get(key="S") + X = numpy.array(butler.algorithms.get(key="X")) + # set maximum time allowed to update embedding + t_max = 1.0 + epsilon = ( + 0.01 + ) # a relative convergence criterion, see computeEmbeddingWithGD documentation + # take a single gradient step + t_start = time.time() + X, emp_loss_new, hinge_loss_new, acc = utilsMDS.computeEmbeddingWithGD( + X, S, max_iters=1 + ) + k = 1 + while (time.time() - t_start < 0.5 * t_max) and (acc > epsilon): + X, emp_loss_new, hinge_loss_new, acc = utilsMDS.computeEmbeddingWithGD( + X, S, max_iters=2 ** k + ) + k += 1 + butler.algorithms.set(key="X", value=X.tolist()) + + def full_embedding_update(self, butler, args): + n = butler.algorithms.get(key="n") + d = butler.algorithms.get(key="d") + S = butler.algorithms.get(key="S") + + X_old = numpy.array(butler.algorithms.get(key="X")) + + t_max = 5.0 + epsilon = ( + 0.01 + ) # a relative convergence criterion, see computeEmbeddingWithGD documentation + + emp_loss_old, hinge_loss_old = utilsMDS.getLoss(X_old, S) + X, tmp = utilsMDS.computeEmbeddingWithEpochSGD( + n, d, S, max_num_passes=16, epsilon=0, verbose=False + ) + t_start = time.time() + X, emp_loss_new, hinge_loss_new, acc = utilsMDS.computeEmbeddingWithGD( + X, S, max_iters=1 + ) + k = 1 + while (time.time() - t_start < 0.5 * t_max) and (acc > epsilon): + X, emp_loss_new, hinge_loss_new, acc = utilsMDS.computeEmbeddingWithGD( + X, S, max_iters=2 ** k + ) + k += 1 + emp_loss_new, hinge_loss_new = utilsMDS.getLoss(X, S) + if emp_loss_old < emp_loss_new: + X = X_old + butler.algorithms.set(key="X", value=X.tolist()) diff --git a/apps/PoolBasedTripletMDS/algs/RandomSampling/utilsMDS.py b/apps/PoolBasedTripletMDS/algs/RandomSampling/utilsMDS.py index 35db1fd2..ae604a4b 100644 --- a/apps/PoolBasedTripletMDS/algs/RandomSampling/utilsMDS.py +++ b/apps/PoolBasedTripletMDS/algs/RandomSampling/utilsMDS.py @@ -11,13 +11,14 @@ You may also consider getLoss to check how well an embedding is performing. 
""" +from __future__ import print_function from numpy import * from numpy.random import * import numpy.random from numpy.linalg import * -#eig = numpy.linalg +# eig = numpy.linalg norm = linalg.norm floor = math.floor ceil = math.ceil @@ -25,54 +26,54 @@ import time - def main(): """ Example of Usage Creates some fake data and finds an embedding """ - + # generate some fake data n = 30 d = 2 - m = int(ceil(40*n*d*log(n))) # number of labels - - p = 0.1; # error rate - + m = int(ceil(40 * n * d * log(n))) # number of labels + + p = 0.1 + # error rate + Strain = [] Stest = [] - Xtrue = randn(n,d); - for iter in range(0,m): + Xtrue = randn(n, d) + for iter in range(0, m): # get random triplet - q,score = getRandomQuery(Xtrue) + q, score = getRandomQuery(Xtrue) # align it so it agrees with Xtrue: "q[2] is more similar to q[0] than q[1]" - query_ordering_disagrees_with_Xtrue = score<0 + query_ordering_disagrees_with_Xtrue = score < 0 if query_ordering_disagrees_with_Xtrue: - q = [ q[i] for i in [1,0,2]] + q = [q[i] for i in [1, 0, 2]] # add some noise R = rand() - if R 0 then the triplet agrees with the embedding, otherwise it does not @@ -113,12 +115,12 @@ def getTripletScore(X,q): Usage: score = getTripletScore(X,[3,4,5]) """ - i,j,k = q + i, j, k = q - return dot(X[j],X[j]) -2*dot(X[j],X[k]) + 2*dot(X[i],X[k]) - dot(X[i],X[i]) + return dot(X[j], X[j]) - 2 * dot(X[j], X[k]) + 2 * dot(X[i], X[k]) - dot(X[i], X[i]) -def getLoss(X,S): +def getLoss(X, S): """ Returns loss on X with respect to list of triplets S: 1/len(S) \sum_{q in S} loss(X,q). Intuitively, q=[i,j,k] "agrees" with X if ||x_j - x_k||^2 > ||x_i - x_k||^2. @@ -133,23 +135,24 @@ def getLoss(X,S): n = X.shape[0] d = X.shape[1] - emp_loss = 0 # 0/1 loss - hinge_loss = 0 # hinge loss - + emp_loss = 0 # 0/1 loss + hinge_loss = 0 # hinge loss + for q in S: - loss_ijk = getTripletScore(X,q) + loss_ijk = getTripletScore(X, q) + + hinge_loss = hinge_loss + max(0, 1. - loss_ijk) - hinge_loss = hinge_loss + max(0,1. - loss_ijk) - if loss_ijk < 0: emp_loss = emp_loss + 1. - emp_loss = emp_loss/len(S) - hinge_loss = hinge_loss/len(S) + emp_loss = emp_loss / len(S) + hinge_loss = hinge_loss / len(S) return emp_loss, hinge_loss -def getGradient(X,S): + +def getGradient(X, S): """ Returns normalized gradient of hinge loss wrt to X and S. Intuitively, q=[i,j,k] "agrees" with X if ||x_j - x_k||^2 > ||x_i - x_k||^2. @@ -160,22 +163,22 @@ def getGradient(X,S): Usage: G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S) """ - n,d = X.shape + n, d = X.shape m = len(S) # pattern for computing gradient - H = mat([[2.,0.,-2.],[ 0., -2., 2.],[ -2., 2., 0.]]) + H = mat([[2., 0., -2.], [0., -2., 2.], [-2., 2., 0.]]) - # compute gradient - G = zeros((n,d)) + # compute gradient + G = zeros((n, d)) for q in S: - score = getTripletScore(X,q) - if 1.-score>0: - grad_partial = dot(H,X[q,:])/m - G[q,:] = G[q,:] + grad_partial + score = getTripletScore(X, q) + if 1. - score > 0: + grad_partial = dot(H, X[q, :]) / m + G[q, :] = G[q, :] + grad_partial # compute statistics about gradient used for stopping conditions - mu = mean(X,0) + mu = mean(X, 0) avg_row_norm_sq = 0. avg_grad_row_norm_sq = 0. max_grad_row_norm_sq = 0. 
@@ -184,16 +187,27 @@ def getGradient(X,S): row_norm_sq = 0 grad_row_norm_sq = 0 for j in range(d): - row_norm_sq += (X[i,j]-mu[j])*(X[i,j]-mu[j]) - grad_row_norm_sq += G[i,j]*G[i,j] - - avg_row_norm_sq += row_norm_sq/n - avg_grad_row_norm_sq += grad_row_norm_sq/n - max_grad_row_norm_sq = max(max_grad_row_norm_sq,grad_row_norm_sq) - - return G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq - -def computeEmbedding(n,d,S,num_random_restarts=0,max_num_passes=0,max_iter_GD=0,max_norm=0,epsilon=0.01,verbose=False): + row_norm_sq += (X[i, j] - mu[j]) * (X[i, j] - mu[j]) + grad_row_norm_sq += G[i, j] * G[i, j] + + avg_row_norm_sq += row_norm_sq / n + avg_grad_row_norm_sq += grad_row_norm_sq / n + max_grad_row_norm_sq = max(max_grad_row_norm_sq, grad_row_norm_sq) + + return G, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq + + +def computeEmbedding( + n, + d, + S, + num_random_restarts=0, + max_num_passes=0, + max_iter_GD=0, + max_norm=0, + epsilon=0.01, + verbose=False, +): """ Computes an embedding of n objects in d dimensions usin the triplets of S. S is a list of triplets such that for each q in S, q = [i,j,k] means that @@ -217,41 +231,61 @@ def computeEmbedding(n,d,S,num_random_restarts=0,max_num_passes=0,max_iter_GD=0, (float) gamma : Equal to a/b where a is max row norm of the gradient matrix and b is the avg row norm of the centered embedding matrix X. This is a means to determine how close the current solution is to the "best" solution. """ - if max_num_passes==0: + if max_num_passes == 0: max_num_passes_SGD = 16 else: max_num_passes_SGD = max_num_passes - - if max_iter_GD ==0: + if max_iter_GD == 0: max_iter_GD = 50 X_old = None - emp_loss_old = float('inf') + emp_loss_old = float("inf") num_restarts = -1 while num_restarts < num_random_restarts: num_restarts += 1 ts = time.time() - X,acc = computeEmbeddingWithEpochSGD(n,d,S,max_num_passes=max_num_passes_SGD,max_norm=max_norm,epsilon=epsilon,verbose=verbose) - te_sgd = time.time()-ts + X, acc = computeEmbeddingWithEpochSGD( + n, + d, + S, + max_num_passes=max_num_passes_SGD, + max_norm=max_norm, + epsilon=epsilon, + verbose=verbose, + ) + te_sgd = time.time() - ts ts = time.time() - X_new,emp_loss_new,hinge_loss_new,acc_new = computeEmbeddingWithGD(X,S,max_iters=max_iter_GD,max_norm=max_norm,epsilon=epsilon,verbose=verbose) - - te_gd = time.time()-ts - - if emp_loss_new0 or verbose: + if epsilon > 0 or verbose: # get losses - emp_loss,hinge_loss = getLoss(X,S) + emp_loss, hinge_loss = getLoss(X, S) # get gradient and check stopping-time statistics - G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S) - rel_max_grad = sqrt( max_grad_row_norm_sq / avg_row_norm_sq ) - rel_avg_grad = sqrt( avg_grad_row_norm_sq / avg_row_norm_sq ) + G, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq = getGradient( + X, S + ) + rel_max_grad = sqrt(max_grad_row_norm_sq / avg_row_norm_sq) + rel_avg_grad = sqrt(avg_grad_row_norm_sq / avg_row_norm_sq) if verbose: - print "iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f" % (t,emp_loss,hinge_loss,rel_avg_grad,rel_max_grad,a) + print( + "iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f" + % (t, emp_loss, hinge_loss, rel_avg_grad, rel_max_grad, a) + ) if rel_max_grad < epsilon: break @@ -335,21 +377,23 @@ def computeEmbeddingWithEpochSGD(n,d,S,max_num_passes=0,max_norm=0,epsilon=0.01, q = S[randint(m)] # take gradient step - score = getTripletScore(X,q) - if 1.-score>0: - grad_partial = dot(H,X[q,:]) 
- X[q,:] = X[q,:] - a*grad_partial + score = getTripletScore(X, q) + if 1. - score > 0: + grad_partial = dot(H, X[q, :]) + X[q, :] = X[q, :] - a * grad_partial # # project back onto ball such that norm(X[i])<=max_norm for i in q: norm_i = norm(X[i]) - if norm_i>max_norm: + if norm_i > max_norm: X[i] = X[i] * (max_norm / norm_i) - return X,rel_max_grad + return X, rel_max_grad -def computeEmbeddingWithGD(X,S,max_iters=0,max_norm=0,epsilon=0.01,c1=0.0001,rho=0.5,verbose=False): +def computeEmbeddingWithGD( + X, S, max_iters=0, max_norm=0, epsilon=0.01, c1=0.0001, rho=0.5, verbose=False +): """ Performs gradient descent with geometric amarijo line search (with parameter c1) @@ -380,61 +424,73 @@ def computeEmbeddingWithGD(X,S,max_iters=0,max_norm=0,epsilon=0.01,c1=0.0001,rho """ m = len(S) - n,d = X.shape + n, d = X.shape - if max_iters==0: + if max_iters == 0: max_iters = 100 - if max_norm==0: - max_norm = 10.*d + if max_norm == 0: + max_norm = 10. * d # check losses if verbose: - emp_loss,hinge_loss = getLoss(X,S) - print "iter=%d, emp_loss=%f, hinge_loss=%f, a=%f" % (0,emp_loss,hinge_loss,float('nan')) + emp_loss, hinge_loss = getLoss(X, S) + print( + "iter=%d, emp_loss=%f, hinge_loss=%f, a=%f" + % (0, emp_loss, hinge_loss, float("nan")) + ) alpha = .5 t = 0 - emp_loss_0 = float('inf') - hinge_loss_0 = float('inf') - rel_max_grad = float('inf') + emp_loss_0 = float("inf") + hinge_loss_0 = float("inf") + rel_max_grad = float("inf") while t < max_iters: - t+=1 + t += 1 # get gradient and stopping-time statistics - G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S) - rel_max_grad = sqrt( max_grad_row_norm_sq / avg_row_norm_sq ) - rel_avg_grad = sqrt( avg_grad_row_norm_sq / avg_row_norm_sq ) + G, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq = getGradient( + X, S + ) + rel_max_grad = sqrt(max_grad_row_norm_sq / avg_row_norm_sq) + rel_avg_grad = sqrt(avg_grad_row_norm_sq / avg_row_norm_sq) if rel_max_grad < epsilon: break # perform backtracking line search - alpha = 2*alpha - emp_loss_0,hinge_loss_0 = getLoss(X,S) - norm_grad_sq_0 = avg_grad_row_norm_sq*n - emp_loss_k,hinge_loss_k = getLoss(X-alpha*G,S) + alpha = 2 * alpha + emp_loss_0, hinge_loss_0 = getLoss(X, S) + norm_grad_sq_0 = avg_grad_row_norm_sq * n + emp_loss_k, hinge_loss_k = getLoss(X - alpha * G, S) inner_t = 0 - while hinge_loss_k > hinge_loss_0 - c1*alpha*norm_grad_sq_0: - alpha = alpha*rho - emp_loss_k,hinge_loss_k = getLoss(X-alpha*G,S) + while hinge_loss_k > hinge_loss_0 - c1 * alpha * norm_grad_sq_0: + alpha = alpha * rho + emp_loss_k, hinge_loss_k = getLoss(X - alpha * G, S) inner_t += 1 - X = X-alpha*G + X = X - alpha * G # project back onto ball such that norm(X[i])<=max_norm for i in range(n): norm_i = norm(X[i]) - if norm_i>max_norm: + if norm_i > max_norm: X[i] = X[i] * (max_norm / norm_i) # check losses if verbose: - print "hinge iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f, i_t=%d" % (t,emp_loss_k,hinge_loss_k,rel_avg_grad,rel_max_grad,alpha,inner_t) - - return X,emp_loss_0,hinge_loss_0,rel_max_grad - - - - + print( + "hinge iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f, i_t=%d" + % ( + t, + emp_loss_k, + hinge_loss_k, + rel_avg_grad, + rel_max_grad, + alpha, + inner_t, + ) + ) + + return X, emp_loss_0, hinge_loss_0, rel_max_grad if __name__ == "__main__": diff --git a/apps/PoolBasedTripletMDS/algs/STE/myAlg.py b/apps/PoolBasedTripletMDS/algs/STE/myAlg.py index 0c506e7e..84cc51b3 100644 --- 
a/apps/PoolBasedTripletMDS/algs/STE/myAlg.py +++ b/apps/PoolBasedTripletMDS/algs/STE/myAlg.py @@ -3,6 +3,7 @@ author: Lalit Jain, kevin.g.jamieson@gmail.com last updated: 4/22/2015 """ +from __future__ import print_function import numpy import numpy.random from apps.PoolBasedTripletMDS.algs.STE import utilsSTE @@ -12,157 +13,165 @@ import time -class MyAlg: - def initExp(self,butler,n,d,failure_probability): - X = numpy.random.randn(n,d)*.0001 - tau = numpy.random.rand(n,n) - - butler.algorithms.set(key='n',value=n) - butler.algorithms.set(key='d',value=d) - butler.algorithms.set(key='delta',value=failure_probability) - butler.algorithms.set(key='X',value=X.tolist()) - butler.algorithms.set(key='tau',value=tau.tolist()) - butler.algorithms.set(key='num_reported_answers',value=0) - return True - - - def getQuery(self,butler): - R = 10 - n = butler.algorithms.get(key='n') - num_reported_answers = butler.algorithms.get(key='num_reported_answers') - - if num_reported_answers == None: - num_reported_answers = 0 - butler.algorithms.set(key='num_reported_answers', value=0) - - if num_reported_answers < R*n: - r = random.Random() - r.seed(42) - idxs = np.arange(n).repeat(R).tolist() - r.shuffle(idxs) - a = idxs[num_reported_answers] - - b = numpy.random.randint(n) - while b==a: - b = numpy.random.randint(n) - c = numpy.random.randint(n) - while c==a or c==b: - c = numpy.random.randint(n) - return [a, b, c] - - X = numpy.array(butler.algorithms.get(key='X')) - tau = numpy.array(butler.algorithms.get(key='tau')) - - - # set maximum time allowed to search for a query - t_max = .05 - best_q, best_score = utilsSTE.getRandomQuery(X) - t_start = time.time() - best_entropy = -1*float('inf') - - while time.time()-t_start best_entropy: - best_q = q - best_entropy = entropy - index_center = best_q[2] - index_left = best_q[0] - index_right = best_q[1] - - return [index_center,index_left,index_right] - - - def processAnswer(self,butler,center_id,left_id,right_id,target_winner): - if left_id==target_winner: - q = [left_id,right_id,center_id] - else: - q = [right_id,left_id,center_id] - - butler.algorithms.append(key='S',value=q) - - n = butler.algorithms.get(key='n') - num_reported_answers = butler.algorithms.increment(key='num_reported_answers') - if num_reported_answers % int(n) == 0: - butler.job('full_embedding_update', {}, time_limit=30) - else: - butler.job('incremental_embedding_update', {},time_limit=5) - return True - - - def getModel(self,butler): - return butler.algorithms.get(key=['X','num_reported_answers']) - - - def incremental_embedding_update(self,butler,args): - verbose = False - - S = butler.algorithms.get(key='S') - - X = numpy.array(butler.algorithms.get(key='X')) - # set maximum time allowed to update embedding - t_max = 1.0 - epsilon = 0.00001 # a relative convergence criterion, see computeEmbeddingWithGD documentation - alpha = 1 - - t_start = time.time() - X,emp_loss_new,hinge_loss_new,log_loss_new,acc = utilsSTE.computeEmbeddingWithGD(X,S,alpha,max_iters=1, epsilon=epsilon,verbose=verbose) - k = 1 - while (time.time()-t_start<.5*t_max) and (acc > epsilon): - X,emp_loss_new,hinge_loss_new,log_loss_new,acc = utilsSTE.computeEmbeddingWithGD(X,S,alpha,max_iters=2**k, epsilon=epsilon,verbose=verbose) - k+=1 - - tau = utilsSTE.getSTETauDistribution(X,S,alpha) - - butler.algorithms.set(key='X',value=X.tolist()) - butler.algorithms.set(key='tau',value=tau.tolist()) - - - - def full_embedding_update(self,butler,args): - verbose = False - - n = butler.algorithms.get(key='n') - d = 
butler.algorithms.get(key='d') - S = butler.algorithms.get(key='S') - - X_old = numpy.array(butler.algorithms.get(key='X')) - # set maximum time allowed to update embedding - t_max = 5.0 - epsilon = 0.00001 # a relative convergence criterion, see computeEmbeddingWithGD documentation - alpha = 1 - - emp_loss_old,hinge_loss_old,log_loss_old = utilsSTE.getLoss(X_old,S,alpha) - X,tmp = utilsSTE.computeEmbeddingWithEpochSGD(n,d,S,alpha,max_num_passes=16,epsilon=0,verbose=verbose) - t_start = time.time() - X,emp_loss_new,hinge_loss_new,log_loss_new,acc = utilsSTE.computeEmbeddingWithGD(X,S,alpha,max_iters=1, epsilon=epsilon,verbose=verbose) - k = 1 - while (time.time()-t_start<.5*t_max) and (acc > epsilon): - X,emp_loss_new,hinge_loss_new,log_loss_new,acc = utilsSTE.computeEmbeddingWithGD(X,S,alpha,max_iters=2**k, epsilon=epsilon,verbose=verbose) - k += 1 - emp_loss_new,hinge_loss_new,log_loss_new = utilsSTE.getLoss(X,S, alpha) - if emp_loss_old < emp_loss_new: - X = X_old - - tau = utilsSTE.getSTETauDistribution(X,S,alpha) - - butler.algorithms.set(key='X',value=X.tolist()) - butler.algorithms.set(key='tau',value=tau.tolist()) - +class MyAlg: + def initExp(self, butler, n, d, failure_probability): + X = numpy.random.randn(n, d) * .0001 + tau = numpy.random.rand(n, n) + + butler.algorithms.set(key="n", value=n) + butler.algorithms.set(key="d", value=d) + butler.algorithms.set(key="delta", value=failure_probability) + butler.algorithms.set(key="X", value=X.tolist()) + butler.algorithms.set(key="tau", value=tau.tolist()) + butler.algorithms.set(key="num_reported_answers", value=0) + return True + + def getQuery(self, butler): + R = 10 + n = butler.algorithms.get(key="n") + num_reported_answers = butler.algorithms.get(key="num_reported_answers") + + if num_reported_answers == None: + num_reported_answers = 0 + butler.algorithms.set(key="num_reported_answers", value=0) + + if num_reported_answers < R * n: + r = random.Random() + r.seed(42) + idxs = np.arange(n).repeat(R).tolist() + r.shuffle(idxs) + a = idxs[num_reported_answers] + + b = numpy.random.randint(n) + while b == a: + b = numpy.random.randint(n) + c = numpy.random.randint(n) + while c == a or c == b: + c = numpy.random.randint(n) + return [a, b, c] + + X = numpy.array(butler.algorithms.get(key="X")) + tau = numpy.array(butler.algorithms.get(key="tau")) + + # set maximum time allowed to search for a query + t_max = .05 + best_q, best_score = utilsSTE.getRandomQuery(X) + t_start = time.time() + best_entropy = -1 * float("inf") + + while time.time() - t_start < t_max: + q, score = utilsSTE.getRandomQuery(X) + b, c, a = q + p = 0 + for i in range(n): + p += utilsSTE.getSTETripletProbability(X[b], X[c], X[i]) * tau[a, i] + + taub = list(tau[a]) + for i in range(n): + taub[i] = taub[i] * utilsSTE.getSTETripletProbability(X[b], X[c], X[i]) + taub = taub / sum(taub) + + tauc = list(tau[a]) + for i in range(n): + tauc[i] = tauc[i] * utilsSTE.getSTETripletProbability(X[c], X[b], X[i]) + tauc = tauc / sum(tauc) + + entropy = -p * utilsSTE.getEntropy(taub) - (1 - p) * utilsSTE.getEntropy( + tauc + ) + + if entropy > best_entropy: + best_q = q + best_entropy = entropy + index_center = best_q[2] + index_left = best_q[0] + index_right = best_q[1] + + return [index_center, index_left, index_right] + + def processAnswer(self, butler, center_id, left_id, right_id, target_winner): + if left_id == target_winner: + q = [left_id, right_id, center_id] + else: + q = [right_id, left_id, center_id] + + butler.algorithms.append(key="S", value=q) + + n = 
butler.algorithms.get(key="n") + num_reported_answers = butler.algorithms.increment(key="num_reported_answers") + if num_reported_answers % int(n) == 0: + butler.job("full_embedding_update", {}, time_limit=30) + else: + butler.job("incremental_embedding_update", {}, time_limit=5) + return True + + def getModel(self, butler): + return butler.algorithms.get(key=["X", "num_reported_answers"]) + + def incremental_embedding_update(self, butler, args): + verbose = False + + S = butler.algorithms.get(key="S") + + X = numpy.array(butler.algorithms.get(key="X")) + # set maximum time allowed to update embedding + t_max = 1.0 + epsilon = ( + 0.00001 + ) # a relative convergence criterion, see computeEmbeddingWithGD documentation + alpha = 1 + + t_start = time.time() + X, emp_loss_new, hinge_loss_new, log_loss_new, acc = utilsSTE.computeEmbeddingWithGD( + X, S, alpha, max_iters=1, epsilon=epsilon, verbose=verbose + ) + k = 1 + while (time.time() - t_start < .5 * t_max) and (acc > epsilon): + X, emp_loss_new, hinge_loss_new, log_loss_new, acc = utilsSTE.computeEmbeddingWithGD( + X, S, alpha, max_iters=2 ** k, epsilon=epsilon, verbose=verbose + ) + k += 1 + + tau = utilsSTE.getSTETauDistribution(X, S, alpha) + + butler.algorithms.set(key="X", value=X.tolist()) + butler.algorithms.set(key="tau", value=tau.tolist()) + + def full_embedding_update(self, butler, args): + verbose = False + + n = butler.algorithms.get(key="n") + d = butler.algorithms.get(key="d") + S = butler.algorithms.get(key="S") + + X_old = numpy.array(butler.algorithms.get(key="X")) + # set maximum time allowed to update embedding + t_max = 5.0 + epsilon = ( + 0.00001 + ) # a relative convergence criterion, see computeEmbeddingWithGD documentation + alpha = 1 + + emp_loss_old, hinge_loss_old, log_loss_old = utilsSTE.getLoss(X_old, S, alpha) + X, tmp = utilsSTE.computeEmbeddingWithEpochSGD( + n, d, S, alpha, max_num_passes=16, epsilon=0, verbose=verbose + ) + t_start = time.time() + X, emp_loss_new, hinge_loss_new, log_loss_new, acc = utilsSTE.computeEmbeddingWithGD( + X, S, alpha, max_iters=1, epsilon=epsilon, verbose=verbose + ) + k = 1 + while (time.time() - t_start < .5 * t_max) and (acc > epsilon): + X, emp_loss_new, hinge_loss_new, log_loss_new, acc = utilsSTE.computeEmbeddingWithGD( + X, S, alpha, max_iters=2 ** k, epsilon=epsilon, verbose=verbose + ) + k += 1 + emp_loss_new, hinge_loss_new, log_loss_new = utilsSTE.getLoss(X, S, alpha) + if emp_loss_old < emp_loss_new: + X = X_old + + tau = utilsSTE.getSTETauDistribution(X, S, alpha) + + butler.algorithms.set(key="X", value=X.tolist()) + butler.algorithms.set(key="tau", value=tau.tolist()) diff --git a/apps/PoolBasedTripletMDS/algs/STE/utilsSTE.py b/apps/PoolBasedTripletMDS/algs/STE/utilsSTE.py index 34391751..a525946a 100644 --- a/apps/PoolBasedTripletMDS/algs/STE/utilsSTE.py +++ b/apps/PoolBasedTripletMDS/algs/STE/utilsSTE.py @@ -13,12 +13,13 @@ You may also consider getLoss to check how well an embedding is performing. 
""" +from __future__ import print_function from numpy import * from numpy.random import * import numpy.random from numpy.linalg import * -#eig = numpy.linalg +# eig = numpy.linalg norm = linalg.norm floor = math.floor ceil = math.ceil @@ -27,52 +28,57 @@ import time + def main(): """ Example of Usage Creates some fake data and finds an embedding """ - + # generate some fake data n = 30 d = 2 - m = int(ceil(40*n*d*log(n))) # number of labels - - p = 0.1; # error rate - + m = int(ceil(40 * n * d * log(n))) # number of labels + + p = 0.1 + # error rate + Strain = [] Stest = [] - Xtrue = randn(n,d); - for iter in range(0,m): + Xtrue = randn(n, d) + for iter in range(0, m): # get random triplet - q,score = getRandomQuery(Xtrue) + q, score = getRandomQuery(Xtrue) # align it so it agrees with Xtrue: "q[2] is more similar to q[0] than q[1]" - query_ordering_disagrees_with_Xtrue = score<0 + query_ordering_disagrees_with_Xtrue = score < 0 if query_ordering_disagrees_with_Xtrue: - q = [ q[i] for i in [1,0,2]] + q = [q[i] for i in [1, 0, 2]] # add some noise R = rand() - if R 0 then the triplet agrees with the embedding, otherwise it does not @@ -112,12 +119,12 @@ def getTripletScore(X,q): Usage: score = getTripletScore(X,[3,4,5]) """ - i,j,k = q + i, j, k = q - return dot(X[j],X[j]) -2*dot(X[j],X[k]) + 2*dot(X[i],X[k]) - dot(X[i],X[i]) + return dot(X[j], X[j]) - 2 * dot(X[j], X[k]) + 2 * dot(X[i], X[k]) - dot(X[i], X[i]) -def getLoss(X,S,alpha): +def getLoss(X, S, alpha): """ Returns loss on X with respect to list of triplets S: 1/len(S) \sum_{q in S} loss(X,q). Intuitively, q=[i,j,k] "agrees" with X if ||x_j - x_k||^2 > ||x_i - x_k||^2. @@ -132,24 +139,26 @@ def getLoss(X,S,alpha): n = X.shape[0] d = X.shape[1] - emp_loss = 0 # 0/1 loss - hinge_loss = 0 # hinge loss - log_loss = 0 #log_loss in crowd kernel model + emp_loss = 0 # 0/1 loss + hinge_loss = 0 # hinge loss + log_loss = 0 # log_loss in crowd kernel model for q in S: - loss_ijk = getTripletScore(X,q) - hinge_loss = hinge_loss + max(0,1. - loss_ijk) - log_loss = log_loss - log(getSTETripletProbability(X[q[0]], X[q[1]], X[q[2]],alpha=1)) + loss_ijk = getTripletScore(X, q) + hinge_loss = hinge_loss + max(0, 1. - loss_ijk) + log_loss = log_loss - log( + getSTETripletProbability(X[q[0]], X[q[1]], X[q[2]], alpha=1) + ) if loss_ijk < 0: - emp_loss = emp_loss + 1. + emp_loss = emp_loss + 1. - emp_loss = emp_loss/len(S) - hinge_loss = hinge_loss/len(S) - log_loss = log_loss/len(S) + emp_loss = emp_loss / len(S) + hinge_loss = hinge_loss / len(S) + log_loss = log_loss / len(S) return emp_loss, hinge_loss, log_loss -def getSTETripletProbability(i,j,k,alpha=1): +def getSTETripletProbability(i, j, k, alpha=1): """ Return the probability of triplet [i,l,j] where a is closer to b than c. @@ -162,10 +171,12 @@ def getSTETripletProbability(i,j,k,alpha=1): (numpy.ndarray) c : numpy array (float) alpha : regularization parameter """ - ki = norm(k-i) - kj = norm(k-j) - c = -(alpha+1.)/2 - return (1 + ki*ki/alpha )**c / ( (1 + ki*ki/alpha )**c + ( 1 + kj*kj/alpha )**c ) + ki = norm(k - i) + kj = norm(k - j) + c = -(alpha + 1.) / 2 + return (1 + ki * ki / alpha) ** c / ( + (1 + ki * ki / alpha) ** c + (1 + kj * kj / alpha) ** c + ) def getEntropy(tau): @@ -181,9 +192,10 @@ def getEntropy(tau): e = 0 for i in range(len(tau)): if tau[i] > 0: - e += -1*tau[i]*log(tau[i]) + e += -1 * tau[i] * log(tau[i]) return e - + + def getSTETauDistribution(X, S, alpha=1): """ Return the tau distributions for each point [n]. 
@@ -197,23 +209,26 @@ def getSTETauDistribution(X, S, alpha=1): Usage: tau = getSTEDistribution(X,S) """ - n,d = X.shape - tau = zeros((n,n)) + n, d = X.shape + tau = zeros((n, n)) # Loop over each query for q in S: a = q[2] # Multiply by the amount the query contributes to each tau for i in range(n): - tau[a,i] = tau[a,i] + log( getSTETripletProbability(X[q[0]], X[q[1]], X[i], alpha=alpha) ) - + tau[a, i] = tau[a, i] + log( + getSTETripletProbability(X[q[0]], X[q[1]], X[i], alpha=alpha) + ) + # Normalize for a in range(n): tau[a] = exp(tau[a]) s = sum(tau[a]) - tau[a] = tau[a]/s - + tau[a] = tau[a] / s + return tau - + + def getGradient(X, S, alpha): """ Returns gradient of the log loss of the crowd kernel probability distribution. @@ -222,33 +237,42 @@ def getGradient(X, S, alpha): Usage: G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S) """ - n,d = X.shape + n, d = X.shape m = len(S) - - G = zeros((n,d)) - c = -(alpha+1.)/2 + + G = zeros((n, d)) + c = -(alpha + 1.) / 2 # By defintion d = 2c/alpha - cc = -(alpha+1.)/alpha + cc = -(alpha + 1.) / alpha log_loss = 0 for q in S: - i,j,k = q - normki = norm( X[k] - X[i] ) - normki = normki*normki - normkj = norm( X[k] - X[j] ) - normkj = normkj*normkj - num = max((1 + normki/alpha )**c , realmin) - den = max((1 + normki/alpha )**c + ( 1 + normkj/alpha )**c, realmin)#max(2*alpha + D[k,i] + D[k,j], realmin) - P = num/den - G[k] += (1-P) * cc * ( ( X[k] - X[i] )/( 1 + normki/alpha ) - ( X[k] - X[j] )/( 1 + normkj/alpha ) ) - G[i] += -(1-P) * cc * ( X[k] - X[i] )/( 1 + normki/alpha ) - G[j] += (1-P) * cc * ( X[k] - X[j] )/( 1 + normkj/alpha ) + i, j, k = q + normki = norm(X[k] - X[i]) + normki = normki * normki + normkj = norm(X[k] - X[j]) + normkj = normkj * normkj + num = max((1 + normki / alpha) ** c, realmin) + den = max( + (1 + normki / alpha) ** c + (1 + normkj / alpha) ** c, realmin + ) # max(2*alpha + D[k,i] + D[k,j], realmin) + P = num / den + G[k] += ( + (1 - P) + * cc + * ( + (X[k] - X[i]) / (1 + normki / alpha) + - (X[k] - X[j]) / (1 + normkj / alpha) + ) + ) + G[i] += -(1 - P) * cc * (X[k] - X[i]) / (1 + normki / alpha) + G[j] += (1 - P) * cc * (X[k] - X[j]) / (1 + normkj / alpha) log_loss += log(den) - log(num) - log_loss = log_loss/len(S) + log_loss = log_loss / len(S) # Remember, the loss function is the sum of log(1/p^k_ij), this leads to an extra minus sign - G = -1./len(S) * G + G = -1. / len(S) * G # compute statistics about gradient used for stopping conditions - mu = mean(X,0) + mu = mean(X, 0) avg_row_norm_sq = 0. avg_grad_row_norm_sq = 0. max_grad_row_norm_sq = 0. 
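Aside: the analytic gradient assembled in getGradient above is the gradient of the averaged log loss returned by getLoss, so the two can be cross-checked numerically. A central-difference sanity check, assuming utilsSTE is importable as a module; alpha is fixed to 1 here because getLoss hardcodes alpha=1 in its call to getSTETripletProbability, so other values would not agree:

import numpy as np
import utilsSTE  # assumed importable alongside this module

n, d, alpha = 6, 2, 1
X = np.random.randn(n, d)
S = [list(np.random.choice(n, 3, replace=False)) for _ in range(30)]

G = utilsSTE.getGradient(X, S, alpha)[0]  # G is the first of five return values
r, c, eps = 0, 0, 1e-6
Xp, Xm = X.copy(), X.copy()
Xp[r, c] += eps
Xm[r, c] -= eps
# getLoss returns (emp_loss, hinge_loss, log_loss); index 2 is the log loss
numeric = (utilsSTE.getLoss(Xp, S, alpha)[2] - utilsSTE.getLoss(Xm, S, alpha)[2]) / (2 * eps)
assert abs(numeric - G[r, c]) < 1e-4  # analytic and numeric gradients agree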
@@ -258,16 +282,26 @@ def getGradient(X, S, alpha): row_norm_sq = 0 grad_row_norm_sq = 0 for j in range(d): - row_norm_sq += (X[i,j]-mu[j])*(X[i,j]-mu[j]) - grad_row_norm_sq += G[i,j]*G[i,j] - avg_row_norm_sq += row_norm_sq/n - avg_grad_row_norm_sq += grad_row_norm_sq/n - max_grad_row_norm_sq = max(max_grad_row_norm_sq,grad_row_norm_sq) - - return G,log_loss,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq - - -def computeEmbedding(n, d, S, alpha=1, num_random_restarts=0,max_num_passes=0,max_norm=0, epsilon=0.001, verbose=False): + row_norm_sq += (X[i, j] - mu[j]) * (X[i, j] - mu[j]) + grad_row_norm_sq += G[i, j] * G[i, j] + avg_row_norm_sq += row_norm_sq / n + avg_grad_row_norm_sq += grad_row_norm_sq / n + max_grad_row_norm_sq = max(max_grad_row_norm_sq, grad_row_norm_sq) + + return G, log_loss, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq + + +def computeEmbedding( + n, + d, + S, + alpha=1, + num_random_restarts=0, + max_num_passes=0, + max_norm=0, + epsilon=0.001, + verbose=False, +): """ Computes an embedding of n objects in d dimensions usin the triplets of S. S is a list of triplets such that for each q in S, q = [i,j,k] means that @@ -289,38 +323,51 @@ def computeEmbedding(n, d, S, alpha=1, num_random_restarts=0,max_num_passes=0,ma (float) gamma : Equal to a/b where a is max row norm of the gradient matrix and b is the avg row norm of the centered embedding matrix X. This is a means to determine how close the current solution is to the "best" solution. """ - if max_num_passes==0: + if max_num_passes == 0: max_num_passes = 32 - + X_old = None - emp_loss_old = float('inf') + emp_loss_old = float("inf") num_restarts = -1 while num_restarts < num_random_restarts: num_restarts += 1 - + # print "Epoch SGD" # ts = time.time() # X,acc = computeEmbeddingWithEpochSGD(n, d, S, alpha, max_num_passes=max_num_passes, max_norm=max_norm, epsilon=0., verbose=verbose) # te_sgd = time.time()-ts - X = randn(n,d)*.0001 - + X = randn(n, d) * .0001 + # print "Gradient Descent" ts = time.time() - X_new, emp_loss_new, hinge_loss_new, log_loss_new, acc_new = computeEmbeddingWithGD(X, S, alpha, max_iters=50, max_norm=max_norm, epsilon=epsilon, verbose=verbose) - emp_loss_new,hinge_loss_new,log_loss_new = getLoss(X_new,S,alpha) - te_gd = time.time()-ts - - if emp_loss_new0 or verbose: + if epsilon > 0 or verbose: # get losses - emp_loss,hinge_loss,log_loss = getLoss(X,S,alpha) + emp_loss, hinge_loss, log_loss = getLoss(X, S, alpha) # get gradient and check stopping-time statistics - G,log_loss,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S,alpha) - rel_max_grad = sqrt( max_grad_row_norm_sq / avg_row_norm_sq ) - rel_avg_grad = sqrt( avg_grad_row_norm_sq / avg_row_norm_sq ) + G, log_loss, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq = getGradient( + X, S, alpha + ) + rel_max_grad = sqrt(max_grad_row_norm_sq / avg_row_norm_sq) + rel_avg_grad = sqrt(avg_grad_row_norm_sq / avg_row_norm_sq) if verbose: - print "iter=%d, emp_loss=%f, hinge_loss=%f, log_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f" % (t,emp_loss,hinge_loss,log_loss,rel_avg_grad,rel_max_grad,a) + print( + "iter=%d, emp_loss=%f, hinge_loss=%f, log_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f" + % ( + t, + emp_loss, + hinge_loss, + log_loss, + rel_avg_grad, + rel_max_grad, + a, + ) + ) if rel_max_grad < epsilon: break - # get random triplet uniformly at random q = S[randint(m)] - c = -(alpha+1.)/2 + c = -(alpha + 1.) 
/ 2 # By defintion d = 2c/alpha - cc = -(alpha+1.)/alpha + cc = -(alpha + 1.) / alpha # take gradient step - i,j,k = q - normki = norm( X[k] - X[i] )**2 - normkj = norm( X[k] - X[j] )**2 - num = (1 + normki/alpha )**c - den = (1 + normki/alpha )**c + ( 1 + normkj/alpha )**c - P = num/den - - grad_k = -(1-P) * cc * ( ( X[k] - X[i] )/( 1 + normki/alpha ) - ( X[k] - X[j] )/( 1 + normkj/alpha ) ) - grad_i = (1-P) * cc * ( X[k] - X[i] )/( 1 + normki/alpha ) - grad_j = -(1-P) * cc * ( X[k] - X[j] )/( 1 + normkj/alpha ) - - - X[i] = X[i] - a*grad_i/len(S) - X[j] = X[j] - a*grad_j/len(S) - X[k] = X[k] - a*grad_k/len(S) + i, j, k = q + normki = norm(X[k] - X[i]) ** 2 + normkj = norm(X[k] - X[j]) ** 2 + num = (1 + normki / alpha) ** c + den = (1 + normki / alpha) ** c + (1 + normkj / alpha) ** c + P = num / den + + grad_k = ( + -(1 - P) + * cc + * ( + (X[k] - X[i]) / (1 + normki / alpha) + - (X[k] - X[j]) / (1 + normkj / alpha) + ) + ) + grad_i = (1 - P) * cc * (X[k] - X[i]) / (1 + normki / alpha) + grad_j = -(1 - P) * cc * (X[k] - X[j]) / (1 + normkj / alpha) + + X[i] = X[i] - a * grad_i / len(S) + X[j] = X[j] - a * grad_j / len(S) + X[k] = X[k] - a * grad_k / len(S) # project back onto ball such that norm(X[i])<=max_norm for i in q: - norm_i = norm(X[i]) - if norm_i>max_norm: - X[i] = X[i] * (max_norm / norm_i) - - - return X,rel_max_grad - - - -def computeEmbeddingWithGD(X, S, alpha=1, max_iters=0, max_norm=0, epsilon=0.001, c1=0.0001, rho=.7, verbose=False): + norm_i = norm(X[i]) + if norm_i > max_norm: + X[i] = X[i] * (max_norm / norm_i) + + return X, rel_max_grad + + +def computeEmbeddingWithGD( + X, + S, + alpha=1, + max_iters=0, + max_norm=0, + epsilon=0.001, + c1=0.0001, + rho=.7, + verbose=False, +): """ Performs gradient descent with step size as implemented in stochastic triplet embedding code, namely ckl_x.m See: http://homepage.tudelft.nl/19j49/ste/Stochastic_Triplet_Embedding_files/STE_Release.zip @@ -464,62 +539,78 @@ def computeEmbeddingWithGD(X, S, alpha=1, max_iters=0, max_norm=0, epsilon=0.001 """ m = len(S) - n,d = X.shape + n, d = X.shape - if max_iters==0: - max_iters = 16*m + if max_iters == 0: + max_iters = 16 * m - if max_norm==0: - max_norm = 10*d + if max_norm == 0: + max_norm = 10 * d # check losses if verbose: - emp_loss,hinge_loss,log_loss = getLoss(X,S,alpha) - print "iter=%d, emp_loss=%f, hinge_loss=%f, log_loss=%f, a=%f" % (0,emp_loss,hinge_loss,log_loss,float('nan')) - a = .5*n + emp_loss, hinge_loss, log_loss = getLoss(X, S, alpha) + print( + "iter=%d, emp_loss=%f, hinge_loss=%f, log_loss=%f, a=%f" + % (0, emp_loss, hinge_loss, log_loss, float("nan")) + ) + a = .5 * n t = 0 - emp_loss_0 = float('inf') - hinge_loss_0 = float('inf') - log_loss_0 = float('inf') - rel_max_grad = float('inf') - + emp_loss_0 = float("inf") + hinge_loss_0 = float("inf") + log_loss_0 = float("inf") + rel_max_grad = float("inf") + while t < max_iters: - t+=1 + t += 1 # get gradient and stopping-time statistics ts = time.time() - G,log_loss,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X, S, alpha) - rel_max_grad = sqrt( max_grad_row_norm_sq / avg_row_norm_sq ) - rel_avg_grad = sqrt( avg_grad_row_norm_sq / avg_row_norm_sq ) + G, log_loss, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq = getGradient( + X, S, alpha + ) + rel_max_grad = sqrt(max_grad_row_norm_sq / avg_row_norm_sq) + rel_avg_grad = sqrt(avg_grad_row_norm_sq / avg_row_norm_sq) if rel_max_grad < epsilon: if verbose: - print "Exited on rel_max_grad %s"%(str(rel_max_grad)) + print("Exited on 
rel_max_grad %s" % (str(rel_max_grad))) break - + # perform backtracking line search - a = 2*a + a = 2 * a ts = time.time() - emp_loss_0, hinge_loss_0, log_loss_0 = getLoss(X,S,alpha) - norm_grad_sq_0 = avg_grad_row_norm_sq*n - emp_loss_k, hinge_loss_k, log_loss_k = getLoss(X-a*G, S,alpha) - + emp_loss_0, hinge_loss_0, log_loss_0 = getLoss(X, S, alpha) + norm_grad_sq_0 = avg_grad_row_norm_sq * n + emp_loss_k, hinge_loss_k, log_loss_k = getLoss(X - a * G, S, alpha) + inner_t = 0 - while log_loss_k > log_loss_0 - c1*a*norm_grad_sq_0: - a = a*rho - emp_loss_k,hinge_loss_k,log_loss_k = getLoss(X-a*G, S,alpha) + while log_loss_k > log_loss_0 - c1 * a * norm_grad_sq_0: + a = a * rho + emp_loss_k, hinge_loss_k, log_loss_k = getLoss(X - a * G, S, alpha) inner_t += 1 - X = X - a*G + X = X - a * G for i in range(n): - norm_i = norm(X[i]) - if norm_i>max_norm: - X[i] = X[i] * (max_norm / norm_i) + norm_i = norm(X[i]) + if norm_i > max_norm: + X[i] = X[i] * (max_norm / norm_i) # check losses if verbose: - print "ste iter=%d, emp_loss=%f, hinge_loss=%f, log_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f, i_t=%d" % (t,emp_loss_k,hinge_loss_k,log_loss_k,rel_avg_grad,rel_max_grad,a,inner_t) - - - return X,emp_loss_0,hinge_loss_0,log_loss_0,rel_max_grad + print( + "ste iter=%d, emp_loss=%f, hinge_loss=%f, log_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f, i_t=%d" + % ( + t, + emp_loss_k, + hinge_loss_k, + log_loss_k, + rel_avg_grad, + rel_max_grad, + a, + inner_t, + ) + ) + + return X, emp_loss_0, hinge_loss_0, log_loss_0, rel_max_grad if __name__ == "__main__": diff --git a/apps/PoolBasedTripletMDS/algs/UncertaintySampling/myAlg.py b/apps/PoolBasedTripletMDS/algs/UncertaintySampling/myAlg.py index 08f6ded1..090e6644 100644 --- a/apps/PoolBasedTripletMDS/algs/UncertaintySampling/myAlg.py +++ b/apps/PoolBasedTripletMDS/algs/UncertaintySampling/myAlg.py @@ -3,6 +3,7 @@ author: Kevin Jamieson, kevin.g.jamieson@gmail.com last updated: 1/17/2015 """ +from __future__ import print_function import numpy import numpy as np import numpy.random @@ -11,124 +12,125 @@ from apps.PoolBasedTripletMDS.algs.UncertaintySampling import utilsMDS import time -class MyAlg: - def initExp(self,butler,n,d,failure_probability): - X = numpy.random.randn(n,d) - butler.algorithms.set(key='n',value=n) - butler.algorithms.set(key='d',value=d) - butler.algorithms.set(key='delta',value=failure_probability) - butler.algorithms.set(key='X',value=X.tolist()) - butler.algorithms.set(key='num_reported_answers',value=0) - return True - - - def getQuery(self,butler): - n = butler.algorithms.get(key='n') - d = butler.algorithms.get(key='d') - # If number of reported answers is small, generate random to avoid overfitting - num_reported_answers = butler.algorithms.get(key='num_reported_answers') - if num_reported_answers == None: - num_reported_answers = 0 - R = int(1+d*numpy.log(n)) - if num_reported_answers < R*n: - # This generates the same shuffle every time this everytime - # TODO: but this in utils and call this from other algorithms (they use - # the same method). 
- r = random.Random() - r.seed(42) - idxs = np.arange(n).repeat(R).tolist() - r.shuffle(idxs) - a = idxs[num_reported_answers] - b = numpy.random.randint(n) - while b==a: - b = numpy.random.randint(n) - c = numpy.random.randint(n) - while c==a or c==b: - c = numpy.random.randint(n) - return [a, b, c] - # generate an active query - X = numpy.array(butler.algorithms.get(key='X')) - # set maximum time allowed to search for a query - t_max = 0.05 - q,signed_score = utilsMDS.getRandomQuery(X) - best_q = q - best_score = abs(signed_score) - t_start = time.time() - while time.time()-t_start epsilon): - # take a single gradient step - X,emp_loss_new,hinge_loss_new,acc = utilsMDS.computeEmbeddingWithGD(X,S,max_iters=2**k,verbose=verbose) - k += 1 - butler.algorithms.set(key='X',value=X.tolist()) - - def full_embedding_update(self,butler,args): - verbose = False - n = butler.algorithms.get(key='n') - d = butler.algorithms.get(key='d') - S = butler.algorithms.get(key='S') - X_old = numpy.array(butler.algorithms.get(key='X')) - t_max = 5.0 - epsilon = 0.01 # a relative convergence criterion, see computeEmbeddingWithGD documentation - emp_loss_old,hinge_loss_old = utilsMDS.getLoss(X_old,S) - X,tmp = utilsMDS.computeEmbeddingWithEpochSGD(n,d,S,max_num_passes=16,epsilon=0,verbose=verbose) - t_start = time.time() - X,emp_loss_new,hinge_loss_new,acc = utilsMDS.computeEmbeddingWithGD(X,S,max_iters=1,verbose=verbose) - k = 1 - while (time.time()-t_start<0.5*t_max) and (acc > epsilon): - X,emp_loss_new,hinge_loss_new,acc = utilsMDS.computeEmbeddingWithGD(X,S,max_iters=2**k,verbose=verbose) - k += 1 - emp_loss_new,hinge_loss_new = utilsMDS.getLoss(X,S) - if emp_loss_old < emp_loss_new: - X = X_old - butler.algorithms.set(key='X',value=X.tolist()) - - - - - - - - - - + t_start = time.time() + while time.time() - t_start < t_max: + q, signed_score = utilsMDS.getRandomQuery(X) + if abs(signed_score) < best_score: + best_q = q + best_score = abs(signed_score) + index_center = best_q[2] + index_left = best_q[0] + index_right = best_q[1] + return [index_center, index_left, index_right] + + def processAnswer(self, butler, center_id, left_id, right_id, target_winner): + if left_id == target_winner: + q = [left_id, right_id, center_id] + else: + q = [right_id, left_id, center_id] + butler.algorithms.append(key="S", value=q) + n = butler.algorithms.get(key="n") + num_reported_answers = butler.algorithms.increment(key="num_reported_answers") + if num_reported_answers % int(n) == 0: + butler.job("full_embedding_update", {}, time_limit=30) + else: + butler.job("incremental_embedding_update", {}, time_limit=5) + return True + + def getModel(self, butler): + return butler.algorithms.get(key=["X", "num_reported_answers"]) + + def incremental_embedding_update(self, butler, args): + verbose = False + S = butler.algorithms.get(key="S") + X = numpy.array(butler.algorithms.get(key="X")) + # set maximum time allowed to update embedding + t_max = 1.0 + epsilon = ( + 0.01 + ) # a relative convergence criterion, see computeEmbeddingWithGD documentation + # take a single gradient step + t_start = time.time() + X, emp_loss_new, hinge_loss_new, acc = utilsMDS.computeEmbeddingWithGD( + X, S, max_iters=1, verbose=verbose + ) + k = 1 + while (time.time() - t_start < 0.5 * t_max) and (acc > epsilon): + # take a single gradient step + X, emp_loss_new, hinge_loss_new, acc = utilsMDS.computeEmbeddingWithGD( + X, S, max_iters=2 ** k, verbose=verbose + ) + k += 1 + butler.algorithms.set(key="X", value=X.tolist()) + + def 
full_embedding_update(self, butler, args): + verbose = False + n = butler.algorithms.get(key="n") + d = butler.algorithms.get(key="d") + S = butler.algorithms.get(key="S") + X_old = numpy.array(butler.algorithms.get(key="X")) + t_max = 5.0 + epsilon = ( + 0.01 + ) # a relative convergence criterion, see computeEmbeddingWithGD documentation + emp_loss_old, hinge_loss_old = utilsMDS.getLoss(X_old, S) + X, tmp = utilsMDS.computeEmbeddingWithEpochSGD( + n, d, S, max_num_passes=16, epsilon=0, verbose=verbose + ) + t_start = time.time() + X, emp_loss_new, hinge_loss_new, acc = utilsMDS.computeEmbeddingWithGD( + X, S, max_iters=1, verbose=verbose + ) + k = 1 + while (time.time() - t_start < 0.5 * t_max) and (acc > epsilon): + X, emp_loss_new, hinge_loss_new, acc = utilsMDS.computeEmbeddingWithGD( + X, S, max_iters=2 ** k, verbose=verbose + ) + k += 1 + emp_loss_new, hinge_loss_new = utilsMDS.getLoss(X, S) + if emp_loss_old < emp_loss_new: + X = X_old + butler.algorithms.set(key="X", value=X.tolist()) diff --git a/apps/PoolBasedTripletMDS/algs/UncertaintySampling/utilsMDS.py b/apps/PoolBasedTripletMDS/algs/UncertaintySampling/utilsMDS.py index 35db1fd2..5385d68e 100644 --- a/apps/PoolBasedTripletMDS/algs/UncertaintySampling/utilsMDS.py +++ b/apps/PoolBasedTripletMDS/algs/UncertaintySampling/utilsMDS.py @@ -12,12 +12,13 @@ You may also consider getLoss to check how well an embedding is performing. """ +from __future__ import print_function from numpy import * from numpy.random import * import numpy.random from numpy.linalg import * -#eig = numpy.linalg +# eig = numpy.linalg norm = linalg.norm floor = math.floor ceil = math.ceil @@ -25,54 +26,54 @@ import time - def main(): """ Example of Usage Creates some fake data and finds an embedding """ - + # generate some fake data n = 30 d = 2 - m = int(ceil(40*n*d*log(n))) # number of labels - - p = 0.1; # error rate - + m = int(ceil(40 * n * d * log(n))) # number of labels + + p = 0.1 + # error rate + Strain = [] Stest = [] - Xtrue = randn(n,d); - for iter in range(0,m): + Xtrue = randn(n, d) + for iter in range(0, m): # get random triplet - q,score = getRandomQuery(Xtrue) + q, score = getRandomQuery(Xtrue) # align it so it agrees with Xtrue: "q[2] is more similar to q[0] than q[1]" - query_ordering_disagrees_with_Xtrue = score<0 + query_ordering_disagrees_with_Xtrue = score < 0 if query_ordering_disagrees_with_Xtrue: - q = [ q[i] for i in [1,0,2]] + q = [q[i] for i in [1, 0, 2]] # add some noise R = rand() - if R 0 then the triplet agrees with the embedding, otherwise it does not @@ -113,12 +115,12 @@ def getTripletScore(X,q): Usage: score = getTripletScore(X,[3,4,5]) """ - i,j,k = q + i, j, k = q - return dot(X[j],X[j]) -2*dot(X[j],X[k]) + 2*dot(X[i],X[k]) - dot(X[i],X[i]) + return dot(X[j], X[j]) - 2 * dot(X[j], X[k]) + 2 * dot(X[i], X[k]) - dot(X[i], X[i]) -def getLoss(X,S): +def getLoss(X, S): """ Returns loss on X with respect to list of triplets S: 1/len(S) \sum_{q in S} loss(X,q). Intuitively, q=[i,j,k] "agrees" with X if ||x_j - x_k||^2 > ||x_i - x_k||^2. @@ -133,23 +135,24 @@ def getLoss(X,S): n = X.shape[0] d = X.shape[1] - emp_loss = 0 # 0/1 loss - hinge_loss = 0 # hinge loss - + emp_loss = 0 # 0/1 loss + hinge_loss = 0 # hinge loss + for q in S: - loss_ijk = getTripletScore(X,q) + loss_ijk = getTripletScore(X, q) + + hinge_loss = hinge_loss + max(0, 1. - loss_ijk) - hinge_loss = hinge_loss + max(0,1. - loss_ijk) - if loss_ijk < 0: emp_loss = emp_loss + 1. 
- emp_loss = emp_loss/len(S) - hinge_loss = hinge_loss/len(S) + emp_loss = emp_loss / len(S) + hinge_loss = hinge_loss / len(S) return emp_loss, hinge_loss -def getGradient(X,S): + +def getGradient(X, S): """ Returns normalized gradient of hinge loss wrt to X and S. Intuitively, q=[i,j,k] "agrees" with X if ||x_j - x_k||^2 > ||x_i - x_k||^2. @@ -160,22 +163,22 @@ def getGradient(X,S): Usage: G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S) """ - n,d = X.shape + n, d = X.shape m = len(S) # pattern for computing gradient - H = mat([[2.,0.,-2.],[ 0., -2., 2.],[ -2., 2., 0.]]) + H = mat([[2., 0., -2.], [0., -2., 2.], [-2., 2., 0.]]) - # compute gradient - G = zeros((n,d)) + # compute gradient + G = zeros((n, d)) for q in S: - score = getTripletScore(X,q) - if 1.-score>0: - grad_partial = dot(H,X[q,:])/m - G[q,:] = G[q,:] + grad_partial + score = getTripletScore(X, q) + if 1. - score > 0: + grad_partial = dot(H, X[q, :]) / m + G[q, :] = G[q, :] + grad_partial # compute statistics about gradient used for stopping conditions - mu = mean(X,0) + mu = mean(X, 0) avg_row_norm_sq = 0. avg_grad_row_norm_sq = 0. max_grad_row_norm_sq = 0. @@ -184,16 +187,27 @@ def getGradient(X,S): row_norm_sq = 0 grad_row_norm_sq = 0 for j in range(d): - row_norm_sq += (X[i,j]-mu[j])*(X[i,j]-mu[j]) - grad_row_norm_sq += G[i,j]*G[i,j] - - avg_row_norm_sq += row_norm_sq/n - avg_grad_row_norm_sq += grad_row_norm_sq/n - max_grad_row_norm_sq = max(max_grad_row_norm_sq,grad_row_norm_sq) - - return G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq - -def computeEmbedding(n,d,S,num_random_restarts=0,max_num_passes=0,max_iter_GD=0,max_norm=0,epsilon=0.01,verbose=False): + row_norm_sq += (X[i, j] - mu[j]) * (X[i, j] - mu[j]) + grad_row_norm_sq += G[i, j] * G[i, j] + + avg_row_norm_sq += row_norm_sq / n + avg_grad_row_norm_sq += grad_row_norm_sq / n + max_grad_row_norm_sq = max(max_grad_row_norm_sq, grad_row_norm_sq) + + return G, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq + + +def computeEmbedding( + n, + d, + S, + num_random_restarts=0, + max_num_passes=0, + max_iter_GD=0, + max_norm=0, + epsilon=0.01, + verbose=False, +): """ Computes an embedding of n objects in d dimensions usin the triplets of S. S is a list of triplets such that for each q in S, q = [i,j,k] means that @@ -217,41 +231,61 @@ def computeEmbedding(n,d,S,num_random_restarts=0,max_num_passes=0,max_iter_GD=0, (float) gamma : Equal to a/b where a is max row norm of the gradient matrix and b is the avg row norm of the centered embedding matrix X. This is a means to determine how close the current solution is to the "best" solution. 
""" - if max_num_passes==0: + if max_num_passes == 0: max_num_passes_SGD = 16 else: max_num_passes_SGD = max_num_passes - - if max_iter_GD ==0: + if max_iter_GD == 0: max_iter_GD = 50 X_old = None - emp_loss_old = float('inf') + emp_loss_old = float("inf") num_restarts = -1 while num_restarts < num_random_restarts: num_restarts += 1 ts = time.time() - X,acc = computeEmbeddingWithEpochSGD(n,d,S,max_num_passes=max_num_passes_SGD,max_norm=max_norm,epsilon=epsilon,verbose=verbose) - te_sgd = time.time()-ts + X, acc = computeEmbeddingWithEpochSGD( + n, + d, + S, + max_num_passes=max_num_passes_SGD, + max_norm=max_norm, + epsilon=epsilon, + verbose=verbose, + ) + te_sgd = time.time() - ts ts = time.time() - X_new,emp_loss_new,hinge_loss_new,acc_new = computeEmbeddingWithGD(X,S,max_iters=max_iter_GD,max_norm=max_norm,epsilon=epsilon,verbose=verbose) - - te_gd = time.time()-ts - - if emp_loss_new0 or verbose: + if epsilon > 0 or verbose: # get losses - emp_loss,hinge_loss = getLoss(X,S) + emp_loss, hinge_loss = getLoss(X, S) # get gradient and check stopping-time statistics - G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S) - rel_max_grad = sqrt( max_grad_row_norm_sq / avg_row_norm_sq ) - rel_avg_grad = sqrt( avg_grad_row_norm_sq / avg_row_norm_sq ) + G, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq = getGradient( + X, S + ) + rel_max_grad = sqrt(max_grad_row_norm_sq / avg_row_norm_sq) + rel_avg_grad = sqrt(avg_grad_row_norm_sq / avg_row_norm_sq) if verbose: - print "iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f" % (t,emp_loss,hinge_loss,rel_avg_grad,rel_max_grad,a) + print( + "iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f" + % (t, emp_loss, hinge_loss, rel_avg_grad, rel_max_grad, a) + ) if rel_max_grad < epsilon: break @@ -335,21 +377,23 @@ def computeEmbeddingWithEpochSGD(n,d,S,max_num_passes=0,max_norm=0,epsilon=0.01, q = S[randint(m)] # take gradient step - score = getTripletScore(X,q) - if 1.-score>0: - grad_partial = dot(H,X[q,:]) - X[q,:] = X[q,:] - a*grad_partial + score = getTripletScore(X, q) + if 1. - score > 0: + grad_partial = dot(H, X[q, :]) + X[q, :] = X[q, :] - a * grad_partial # # project back onto ball such that norm(X[i])<=max_norm for i in q: norm_i = norm(X[i]) - if norm_i>max_norm: + if norm_i > max_norm: X[i] = X[i] * (max_norm / norm_i) - return X,rel_max_grad + return X, rel_max_grad -def computeEmbeddingWithGD(X,S,max_iters=0,max_norm=0,epsilon=0.01,c1=0.0001,rho=0.5,verbose=False): +def computeEmbeddingWithGD( + X, S, max_iters=0, max_norm=0, epsilon=0.01, c1=0.0001, rho=0.5, verbose=False +): """ Performs gradient descent with geometric amarijo line search (with parameter c1) @@ -380,61 +424,73 @@ def computeEmbeddingWithGD(X,S,max_iters=0,max_norm=0,epsilon=0.01,c1=0.0001,rho """ m = len(S) - n,d = X.shape + n, d = X.shape - if max_iters==0: + if max_iters == 0: max_iters = 100 - if max_norm==0: - max_norm = 10.*d + if max_norm == 0: + max_norm = 10. 
* d # check losses if verbose: - emp_loss,hinge_loss = getLoss(X,S) - print "iter=%d, emp_loss=%f, hinge_loss=%f, a=%f" % (0,emp_loss,hinge_loss,float('nan')) + emp_loss, hinge_loss = getLoss(X, S) + print( + "iter=%d, emp_loss=%f, hinge_loss=%f, a=%f" + % (0, emp_loss, hinge_loss, float("nan")) + ) alpha = .5 t = 0 - emp_loss_0 = float('inf') - hinge_loss_0 = float('inf') - rel_max_grad = float('inf') + emp_loss_0 = float("inf") + hinge_loss_0 = float("inf") + rel_max_grad = float("inf") while t < max_iters: - t+=1 + t += 1 # get gradient and stopping-time statistics - G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S) - rel_max_grad = sqrt( max_grad_row_norm_sq / avg_row_norm_sq ) - rel_avg_grad = sqrt( avg_grad_row_norm_sq / avg_row_norm_sq ) + G, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq = getGradient( + X, S + ) + rel_max_grad = sqrt(max_grad_row_norm_sq / avg_row_norm_sq) + rel_avg_grad = sqrt(avg_grad_row_norm_sq / avg_row_norm_sq) if rel_max_grad < epsilon: break # perform backtracking line search - alpha = 2*alpha - emp_loss_0,hinge_loss_0 = getLoss(X,S) - norm_grad_sq_0 = avg_grad_row_norm_sq*n - emp_loss_k,hinge_loss_k = getLoss(X-alpha*G,S) + alpha = 2 * alpha + emp_loss_0, hinge_loss_0 = getLoss(X, S) + norm_grad_sq_0 = avg_grad_row_norm_sq * n + emp_loss_k, hinge_loss_k = getLoss(X - alpha * G, S) inner_t = 0 - while hinge_loss_k > hinge_loss_0 - c1*alpha*norm_grad_sq_0: - alpha = alpha*rho - emp_loss_k,hinge_loss_k = getLoss(X-alpha*G,S) + while hinge_loss_k > hinge_loss_0 - c1 * alpha * norm_grad_sq_0: + alpha = alpha * rho + emp_loss_k, hinge_loss_k = getLoss(X - alpha * G, S) inner_t += 1 - X = X-alpha*G + X = X - alpha * G # project back onto ball such that norm(X[i])<=max_norm for i in range(n): norm_i = norm(X[i]) - if norm_i>max_norm: + if norm_i > max_norm: X[i] = X[i] * (max_norm / norm_i) # check losses if verbose: - print "hinge iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f, i_t=%d" % (t,emp_loss_k,hinge_loss_k,rel_avg_grad,rel_max_grad,alpha,inner_t) - - return X,emp_loss_0,hinge_loss_0,rel_max_grad - - - - + print( + "hinge iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f, i_t=%d" + % ( + t, + emp_loss_k, + hinge_loss_k, + rel_avg_grad, + rel_max_grad, + alpha, + inner_t, + ) + ) + + return X, emp_loss_0, hinge_loss_0, rel_max_grad if __name__ == "__main__": diff --git a/apps/PoolBasedTripletMDS/algs/ValidationSampling/myAlg.py b/apps/PoolBasedTripletMDS/algs/ValidationSampling/myAlg.py index 3180fc7c..e0a94279 100644 --- a/apps/PoolBasedTripletMDS/algs/ValidationSampling/myAlg.py +++ b/apps/PoolBasedTripletMDS/algs/ValidationSampling/myAlg.py @@ -1,3 +1,4 @@ +from __future__ import print_function import numpy as np import utilsMDS import time @@ -14,38 +15,40 @@ def initExp(self, butler, n, d, failure_probability): # (either one results in same behavior) X = np.random.randn(n, d) - butler.algorithms.set(key='n', value=n) - butler.algorithms.set(key='d', value=d) - butler.algorithms.set(key='delta', value=failure_probability) - butler.algorithms.set(key='X', value=X.tolist()) - butler.algorithms.set(key='num_reported_answers', value=0) + butler.algorithms.set(key="n", value=n) + butler.algorithms.set(key="d", value=d) + butler.algorithms.set(key="delta", value=failure_probability) + butler.algorithms.set(key="X", value=X.tolist()) + butler.algorithms.set(key="num_reported_answers", value=0) - params = butler.algorithms.get(key='params') # alg specific 
parameters
+        params = butler.algorithms.get(key="params")  # alg specific parameters
         if params:
-            if 'query_list' in params:
-                query_list = params['query_list']
+            if "query_list" in params:
+                query_list = params["query_list"]
                 if isinstance(query_list[0][0], (str, unicode)):
                     targetset = butler.targets.get_targetset(butler.exp_uid)
                     query_list = utils.filenames_to_ids(query_list, targetset)
-            elif 'num_tries' in params:
-                num_tries = params['num_tries']
+            elif "num_tries" in params:
+                num_tries = params["num_tries"]
                 query_list = []
                 for i in range(num_tries):
                     # generate a lot of queries.
                     q, score = utilsMDS.getRandomQuery(X)
                     query_list.append(q)
             else:
-                raise Exception("For ValidationSampling you must specifiy "
-                                "'query_list' or 'num_tries'")
+                raise Exception(
+                    "For ValidationSampling you must specify " "'query_list' or 'num_tries'"
+                )

-        butler.algorithms.set(key='do_not_ask', value=[])
-        butler.algorithms.set(key='query_list', value=query_list)
+        butler.algorithms.set(key="do_not_ask", value=[])
+        butler.algorithms.set(key="query_list", value=query_list)

         return True

     def getQuery(self, butler):
-        num_ans = butler.algorithms.get(key='num_reported_answers')
-        query_list = butler.algorithms.get(key='query_list')
+        num_ans = butler.algorithms.get(key="num_reported_answers")
+        query_list = butler.algorithms.get(key="query_list")

         i = num_ans % len(query_list)
         query = query_list[i]
@@ -56,14 +59,13 @@ def getQuery(self, butler):
         #     butler.algorithms.append(key='do_not_ask', value=query)
         return query[2], query[0], query[1]

-    def processAnswer(self, butler, center_id, left_id, right_id,
-                      target_winner):
+    def processAnswer(self, butler, center_id, left_id, right_id, target_winner):
         if left_id == target_winner:
             q = [left_id, right_id, target_winner]
         else:
             q = [left_id, left_id, target_winner]
-        butler.algorithms.append(key='S', value=q)
+        butler.algorithms.append(key="S", value=q)

         # The following lines enforce "do not ask". The query list gets shorter
         # each time this function is called (and a question is answered).
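Aside: getQuery in ValidationSampling never consults the embedding; it walks the fixed query_list in order, wrapping around via the modulo, which is what makes this sampler usable as a repeatable hold-out test set. A standalone sketch of that behavior with toy data (names and values are illustrative, not part of this patch):

query_list = [[0, 1, 2], [1, 2, 0], [2, 0, 1]]  # each entry is [left, right, center]

def next_query(num_reported_answers):
    i = num_reported_answers % len(query_list)  # wraps after one full pass
    q = query_list[i]
    return q[2], q[0], q[1]  # reordered to (center, left, right), as in getQuery

for t in range(4):
    print(t, next_query(t))
# 0 (2, 0, 1)
# 1 (0, 1, 2)
# 2 (1, 2, 0)
# 3 (2, 0, 1)   <- wrapped back to the first query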
@@ -72,25 +74,24 @@ def processAnswer(self, butler, center_id, left_id, right_id, # query_list.remove(query) # butler.participants.set(key='query_list', value=query_list) - n = butler.algorithms.get(key='n') - num_answers = butler.algorithms.increment(key='num_reported_answers') + n = butler.algorithms.get(key="n") + num_answers = butler.algorithms.increment(key="num_reported_answers") if num_answers % int(n) == 0: - args = {'task': '_full_embedding_update', 'task_args_json': {}} + args = {"task": "_full_embedding_update", "task_args_json": {}} butler.job(time_limit=30, **args) else: - args = {'task': '_incremental_embedding_update', - 'task_args_json': {}} + args = {"task": "_incremental_embedding_update", "task_args_json": {}} butler.job(time_limit=5, **args) return True def getModel(self, butler): - return butler.algorithms.get(key=['X', 'num_reported_answers']) + return butler.algorithms.get(key=["X", "num_reported_answers"]) def _incremental_embedding_update(self, butler, args): - S = butler.algorithms.get(key='S') + S = butler.algorithms.get(key="S") - X = np.array(butler.algorithms.get(key='X')) + X = np.array(butler.algorithms.get(key="X")) # set maximum time allowed to update embedding t_max = 1.0 @@ -104,21 +105,21 @@ def _incremental_embedding_update(self, butler, args): response = utilsMDS.computeEmbeddingWithGD(X, S, max_iters=1) X, emp_loss_new, hinge_loss_new, acc = response k = 1 - while (time.time() - t_start < 0.5*t_max) and (acc > epsilon): - response = utilsMDS.computeEmbeddingWithGD(X, S, max_iters=2**k) + while (time.time() - t_start < 0.5 * t_max) and (acc > epsilon): + response = utilsMDS.computeEmbeddingWithGD(X, S, max_iters=2 ** k) X, emp_loss_new, hinge_loss_new, acc = response k += 1 - butler.algorithms.set(key='X', value=X.tolist()) + butler.algorithms.set(key="X", value=X.tolist()) def _full_embedding_update(self, butler, args): verbose = False - n = butler.algorithms.get(key='n') - d = butler.algorithms.get(key='d') - S = butler.algorithms.get(key='S') + n = butler.algorithms.get(key="n") + d = butler.algorithms.get(key="d") + S = butler.algorithms.get(key="S") - X_old = np.array(butler.algorithms.get(key='X')) + X_old = np.array(butler.algorithms.get(key="X")) t_max = 5.0 # a relative convergence criterion @@ -126,21 +127,20 @@ def _full_embedding_update(self, butler, args): epsilon = 0.01 emp_loss_old, hinge_loss_old = utilsMDS.getLoss(X_old, S) - X, tmp = utilsMDS.computeEmbeddingWithEpochSGD(n, d, S, - max_num_passes=16, - epsilon=0, - verbose=verbose) + X, tmp = utilsMDS.computeEmbeddingWithEpochSGD( + n, d, S, max_num_passes=16, epsilon=0, verbose=verbose + ) t_start = time.time() response = utilsMDS.computeEmbeddingWithGD(X, S, max_iters=1) X, emp_loss_new, hinge_loss_new, acc = response k = 1 - while (time.time() - t_start < 0.5*t_max) and (acc > epsilon): - response = utilsMDS.computeEmbeddingWithGD(X, S, max_iters=2**k) + while (time.time() - t_start < 0.5 * t_max) and (acc > epsilon): + response = utilsMDS.computeEmbeddingWithGD(X, S, max_iters=2 ** k) X, emp_loss_new, hinge_loss_new, acc = response k += 1 emp_loss_new, hinge_loss_new = utilsMDS.getLoss(X, S) if emp_loss_old < emp_loss_new: X = X_old - butler.algorithms.set(key='X', value=X.tolist()) + butler.algorithms.set(key="X", value=X.tolist()) diff --git a/apps/PoolBasedTripletMDS/algs/ValidationSampling/utilsMDS.py b/apps/PoolBasedTripletMDS/algs/ValidationSampling/utilsMDS.py index b7dd3bda..87107a56 100644 --- a/apps/PoolBasedTripletMDS/algs/ValidationSampling/utilsMDS.py +++ 
b/apps/PoolBasedTripletMDS/algs/ValidationSampling/utilsMDS.py @@ -12,12 +12,13 @@ You may also consider getLoss to check how well an embedding is performing. """ +from __future__ import print_function from numpy import * from numpy.random import * import numpy.random from numpy.linalg import * -#eig = numpy.linalg +# eig = numpy.linalg norm = linalg.norm floor = math.floor ceil = math.ceil @@ -25,54 +26,54 @@ import time - def main(): """ Example of Usage Creates some fake data and finds an embedding """ - + # generate some fake data n = 30 d = 2 - m = int(ceil(40*n*d*log(n))) # number of labels - - p = 0.1; # error rate - + m = int(ceil(40 * n * d * log(n))) # number of labels + + p = 0.1 + # error rate + Strain = [] Stest = [] - Xtrue = randn(n,d); - for iter in range(0,m): + Xtrue = randn(n, d) + for iter in range(0, m): # get random triplet - q,score = getRandomQuery(Xtrue) + q, score = getRandomQuery(Xtrue) # align it so it agrees with Xtrue: "q[2] is more similar to q[0] than q[1]" - query_ordering_disagrees_with_Xtrue = score<0 + query_ordering_disagrees_with_Xtrue = score < 0 if query_ordering_disagrees_with_Xtrue: - q = [ q[i] for i in [1,0,2]] + q = [q[i] for i in [1, 0, 2]] # add some noise R = rand() - if R 0 then the triplet agrees with the embedding, otherwise it does not @@ -110,12 +112,12 @@ def getTripletScore(X,q): Usage: score = getTripletScore(X,[3,4,5]) """ - i,j,k = q + i, j, k = q - return dot(X[j],X[j]) -2*dot(X[j],X[k]) + 2*dot(X[i],X[k]) - dot(X[i],X[i]) + return dot(X[j], X[j]) - 2 * dot(X[j], X[k]) + 2 * dot(X[i], X[k]) - dot(X[i], X[i]) -def getLoss(X,S): +def getLoss(X, S): """ Returns loss on X with respect to list of triplets S: 1/len(S) \sum_{q in S} loss(X,q). Intuitively, q=[i,j,k] "agrees" with X if ||x_j - x_k||^2 > ||x_i - x_k||^2. @@ -130,23 +132,24 @@ def getLoss(X,S): n = X.shape[0] d = X.shape[1] - emp_loss = 0 # 0/1 loss - hinge_loss = 0 # hinge loss - + emp_loss = 0 # 0/1 loss + hinge_loss = 0 # hinge loss + for q in S: - loss_ijk = getTripletScore(X,q) + loss_ijk = getTripletScore(X, q) + + hinge_loss = hinge_loss + max(0, 1. - loss_ijk) - hinge_loss = hinge_loss + max(0,1. - loss_ijk) - if loss_ijk < 0: emp_loss = emp_loss + 1. - emp_loss = emp_loss/len(S) - hinge_loss = hinge_loss/len(S) + emp_loss = emp_loss / len(S) + hinge_loss = hinge_loss / len(S) return emp_loss, hinge_loss -def getGradient(X,S): + +def getGradient(X, S): """ Returns normalized gradient of hinge loss wrt to X and S. Intuitively, q=[i,j,k] "agrees" with X if ||x_j - x_k||^2 > ||x_i - x_k||^2. @@ -157,22 +160,22 @@ def getGradient(X,S): Usage: G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S) """ - n,d = X.shape + n, d = X.shape m = len(S) # pattern for computing gradient - H = mat([[2.,0.,-2.],[ 0., -2., 2.],[ -2., 2., 0.]]) + H = mat([[2., 0., -2.], [0., -2., 2.], [-2., 2., 0.]]) - # compute gradient - G = zeros((n,d)) + # compute gradient + G = zeros((n, d)) for q in S: - score = getTripletScore(X,q) - if 1.-score>0: - grad_partial = dot(H,X[q,:])/m - G[q,:] = G[q,:] + grad_partial + score = getTripletScore(X, q) + if 1. - score > 0: + grad_partial = dot(H, X[q, :]) / m + G[q, :] = G[q, :] + grad_partial # compute statistics about gradient used for stopping conditions - mu = mean(X,0) + mu = mean(X, 0) avg_row_norm_sq = 0. avg_grad_row_norm_sq = 0. max_grad_row_norm_sq = 0. 
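Aside: the pattern matrix H in getGradient above encodes the per-triplet gradient of the hinge loss. For q = [i, j, k] the score is dot(x_j, x_j) - 2*dot(x_j, x_k) + 2*dot(x_i, x_k) - dot(x_i, x_i), so wherever 1 - score > 0 the gradient of max(0, 1 - score) with respect to the stacked rows [x_i; x_j; x_k] is exactly dot(H, X[q, :]). A standalone finite-difference confirmation in plain numpy (toy data, not part of this patch):

import numpy as np

H = np.array([[2., 0., -2.], [0., -2., 2.], [-2., 2., 0.]])

def score(Xq):  # rows of Xq are x_i, x_j, x_k, as in getTripletScore
    xi, xj, xk = Xq
    return np.dot(xj, xj) - 2 * np.dot(xj, xk) + 2 * np.dot(xi, xk) - np.dot(xi, xi)

Xq = np.random.randn(3, 2)
G_analytic = np.dot(H, Xq)  # gradient of (1 - score) w.r.t. the three rows
G_numeric = np.zeros_like(Xq)
eps = 1e-6
for r in range(3):
    for col in range(2):
        Xp, Xm = Xq.copy(), Xq.copy()
        Xp[r, col] += eps
        Xm[r, col] -= eps
        G_numeric[r, col] = ((1 - score(Xp)) - (1 - score(Xm))) / (2 * eps)
assert np.allclose(G_analytic, G_numeric, atol=1e-6)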
@@ -181,16 +184,27 @@ def getGradient(X,S): row_norm_sq = 0 grad_row_norm_sq = 0 for j in range(d): - row_norm_sq += (X[i,j]-mu[j])*(X[i,j]-mu[j]) - grad_row_norm_sq += G[i,j]*G[i,j] - - avg_row_norm_sq += row_norm_sq/n - avg_grad_row_norm_sq += grad_row_norm_sq/n - max_grad_row_norm_sq = max(max_grad_row_norm_sq,grad_row_norm_sq) - - return G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq - -def computeEmbedding(n,d,S,num_random_restarts=0,max_num_passes=0,max_iter_GD=0,max_norm=0,epsilon=0.01,verbose=False): + row_norm_sq += (X[i, j] - mu[j]) * (X[i, j] - mu[j]) + grad_row_norm_sq += G[i, j] * G[i, j] + + avg_row_norm_sq += row_norm_sq / n + avg_grad_row_norm_sq += grad_row_norm_sq / n + max_grad_row_norm_sq = max(max_grad_row_norm_sq, grad_row_norm_sq) + + return G, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq + + +def computeEmbedding( + n, + d, + S, + num_random_restarts=0, + max_num_passes=0, + max_iter_GD=0, + max_norm=0, + epsilon=0.01, + verbose=False, +): """ Computes an embedding of n objects in d dimensions usin the triplets of S. S is a list of triplets such that for each q in S, q = [i,j,k] means that @@ -214,41 +228,61 @@ def computeEmbedding(n,d,S,num_random_restarts=0,max_num_passes=0,max_iter_GD=0, (float) gamma : Equal to a/b where a is max row norm of the gradient matrix and b is the avg row norm of the centered embedding matrix X. This is a means to determine how close the current solution is to the "best" solution. """ - if max_num_passes==0: + if max_num_passes == 0: max_num_passes_SGD = 16 else: max_num_passes_SGD = max_num_passes - - if max_iter_GD ==0: + if max_iter_GD == 0: max_iter_GD = 50 X_old = None - emp_loss_old = float('inf') + emp_loss_old = float("inf") num_restarts = -1 while num_restarts < num_random_restarts: num_restarts += 1 ts = time.time() - X,acc = computeEmbeddingWithEpochSGD(n,d,S,max_num_passes=max_num_passes_SGD,max_norm=max_norm,epsilon=epsilon,verbose=verbose) - te_sgd = time.time()-ts + X, acc = computeEmbeddingWithEpochSGD( + n, + d, + S, + max_num_passes=max_num_passes_SGD, + max_norm=max_norm, + epsilon=epsilon, + verbose=verbose, + ) + te_sgd = time.time() - ts ts = time.time() - X_new,emp_loss_new,hinge_loss_new,acc_new = computeEmbeddingWithGD(X,S,max_iters=max_iter_GD,max_norm=max_norm,epsilon=epsilon,verbose=verbose) - - te_gd = time.time()-ts - - if emp_loss_new0 or verbose: + if epsilon > 0 or verbose: # get losses - emp_loss,hinge_loss = getLoss(X,S) + emp_loss, hinge_loss = getLoss(X, S) # get gradient and check stopping-time statistics - G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S) - rel_max_grad = sqrt( max_grad_row_norm_sq / avg_row_norm_sq ) - rel_avg_grad = sqrt( avg_grad_row_norm_sq / avg_row_norm_sq ) + G, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq = getGradient( + X, S + ) + rel_max_grad = sqrt(max_grad_row_norm_sq / avg_row_norm_sq) + rel_avg_grad = sqrt(avg_grad_row_norm_sq / avg_row_norm_sq) if verbose: - print "iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f" % (t,emp_loss,hinge_loss,rel_avg_grad,rel_max_grad,a) + print( + "iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f" + % (t, emp_loss, hinge_loss, rel_avg_grad, rel_max_grad, a) + ) if rel_max_grad < epsilon: break @@ -332,21 +374,23 @@ def computeEmbeddingWithEpochSGD(n,d,S,max_num_passes=0,max_norm=0,epsilon=0.01, q = S[randint(m)] # take gradient step - score = getTripletScore(X,q) - if 1.-score>0: - grad_partial = dot(H,X[q,:]) 
- X[q,:] = X[q,:] - a*grad_partial + score = getTripletScore(X, q) + if 1. - score > 0: + grad_partial = dot(H, X[q, :]) + X[q, :] = X[q, :] - a * grad_partial # # project back onto ball such that norm(X[i])<=max_norm for i in q: norm_i = norm(X[i]) - if norm_i>max_norm: + if norm_i > max_norm: X[i] = X[i] * (max_norm / norm_i) - return X,rel_max_grad + return X, rel_max_grad -def computeEmbeddingWithGD(X,S,max_iters=0,max_norm=0,epsilon=0.01,c1=0.0001,rho=0.5,verbose=False): +def computeEmbeddingWithGD( + X, S, max_iters=0, max_norm=0, epsilon=0.01, c1=0.0001, rho=0.5, verbose=False +): """ Performs gradient descent with geometric amarijo line search (with parameter c1) @@ -377,57 +421,73 @@ def computeEmbeddingWithGD(X,S,max_iters=0,max_norm=0,epsilon=0.01,c1=0.0001,rho """ m = len(S) - n,d = X.shape + n, d = X.shape - if max_iters==0: + if max_iters == 0: max_iters = 100 - if max_norm==0: - max_norm = 10.*d + if max_norm == 0: + max_norm = 10. * d # check losses if verbose: - emp_loss,hinge_loss = getLoss(X,S) - print "iter=%d, emp_loss=%f, hinge_loss=%f, a=%f" % (0,emp_loss,hinge_loss,float('nan')) + emp_loss, hinge_loss = getLoss(X, S) + print( + "iter=%d, emp_loss=%f, hinge_loss=%f, a=%f" + % (0, emp_loss, hinge_loss, float("nan")) + ) alpha = .5 t = 0 - emp_loss_0 = float('inf') - hinge_loss_0 = float('inf') - rel_max_grad = float('inf') + emp_loss_0 = float("inf") + hinge_loss_0 = float("inf") + rel_max_grad = float("inf") while t < max_iters: - t+=1 + t += 1 # get gradient and stopping-time statistics - G,avg_grad_row_norm_sq,max_grad_row_norm_sq,avg_row_norm_sq = getGradient(X,S) - rel_max_grad = sqrt( max_grad_row_norm_sq / avg_row_norm_sq ) - rel_avg_grad = sqrt( avg_grad_row_norm_sq / avg_row_norm_sq ) + G, avg_grad_row_norm_sq, max_grad_row_norm_sq, avg_row_norm_sq = getGradient( + X, S + ) + rel_max_grad = sqrt(max_grad_row_norm_sq / avg_row_norm_sq) + rel_avg_grad = sqrt(avg_grad_row_norm_sq / avg_row_norm_sq) if rel_max_grad < epsilon: break # perform backtracking line search - alpha = 2*alpha - emp_loss_0,hinge_loss_0 = getLoss(X,S) - norm_grad_sq_0 = avg_grad_row_norm_sq*n - emp_loss_k,hinge_loss_k = getLoss(X-alpha*G,S) + alpha = 2 * alpha + emp_loss_0, hinge_loss_0 = getLoss(X, S) + norm_grad_sq_0 = avg_grad_row_norm_sq * n + emp_loss_k, hinge_loss_k = getLoss(X - alpha * G, S) inner_t = 0 - while hinge_loss_k > hinge_loss_0 - c1*alpha*norm_grad_sq_0: - alpha = alpha*rho - emp_loss_k,hinge_loss_k = getLoss(X-alpha*G,S) + while hinge_loss_k > hinge_loss_0 - c1 * alpha * norm_grad_sq_0: + alpha = alpha * rho + emp_loss_k, hinge_loss_k = getLoss(X - alpha * G, S) inner_t += 1 - X = X-alpha*G + X = X - alpha * G # project back onto ball such that norm(X[i])<=max_norm for i in range(n): norm_i = norm(X[i]) - if norm_i>max_norm: + if norm_i > max_norm: X[i] = X[i] * (max_norm / norm_i) # check losses if verbose: - print "hinge iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f, i_t=%d" % (t,emp_loss_k,hinge_loss_k,rel_avg_grad,rel_max_grad,alpha,inner_t) - - return X,emp_loss_0,hinge_loss_0,rel_max_grad + print( + "hinge iter=%d, emp_loss=%f, hinge_loss=%f, rel_avg_grad=%f, rel_max_grad=%f, a=%f, i_t=%d" + % ( + t, + emp_loss_k, + hinge_loss_k, + rel_avg_grad, + rel_max_grad, + alpha, + inner_t, + ) + ) + + return X, emp_loss_0, hinge_loss_0, rel_max_grad if __name__ == "__main__": diff --git a/apps/PoolBasedTripletMDS/dashboard/Dashboard.py b/apps/PoolBasedTripletMDS/dashboard/Dashboard.py index 7e2e8a03..ebc0aa38 100644 --- 
a/apps/PoolBasedTripletMDS/dashboard/Dashboard.py +++ b/apps/PoolBasedTripletMDS/dashboard/Dashboard.py @@ -1,11 +1,13 @@ +from __future__ import print_function import json import numpy import numpy.random import next.utils as utils from next.apps.AppDashboard import AppDashboard + class MyAppDashboard(AppDashboard): - def __init__(self,db,ell): + def __init__(self, db, ell): AppDashboard.__init__(self, db, ell) def test_error_multiline_plot(self, app, butler): @@ -20,70 +22,82 @@ def test_error_multiline_plot(self, app, butler): (dict) MPLD3 plot dictionary """ # get list of algorithms associated with project - args = butler.experiment.get(key='args') - test_alg_label = args['alg_list'][0]['test_alg_label'] - - test_S = butler.queries.get(pattern={'exp_uid':app.exp_uid, 'alg_label':test_alg_label}) - x_min = numpy.float('inf') - x_max = -numpy.float('inf') - y_min = numpy.float('inf') - y_max = -numpy.float('inf') + args = butler.experiment.get(key="args") + test_alg_label = args["alg_list"][0]["test_alg_label"] + + test_S = butler.queries.get( + pattern={"exp_uid": app.exp_uid, "alg_label": test_alg_label} + ) + x_min = numpy.float("inf") + x_max = -numpy.float("inf") + y_min = numpy.float("inf") + y_max = -numpy.float("inf") list_of_alg_dicts = [] - for algorithm in args['alg_list']: - alg_label = algorithm['alg_label'] - list_of_log_dict = butler.ell.get_logs_with_filter(app.app_id+':ALG-EVALUATION',{'exp_uid':app.exp_uid, 'alg_label':alg_label}) - list_of_log_dict = sorted(list_of_log_dict, key=lambda item: utils.str2datetime(item['timestamp']) ) + for algorithm in args["alg_list"]: + alg_label = algorithm["alg_label"] + list_of_log_dict = butler.ell.get_logs_with_filter( + app.app_id + ":ALG-EVALUATION", + {"exp_uid": app.exp_uid, "alg_label": alg_label}, + ) + list_of_log_dict = sorted( + list_of_log_dict, key=lambda item: utils.str2datetime(item["timestamp"]) + ) x = [] y = [] for item in list_of_log_dict: - num_reported_answers = item['num_reported_answers'] - Xd = item['X'] + num_reported_answers = item["num_reported_answers"] + Xd = item["X"] err = 0.5 - if len(test_S)>0: + if len(test_S) > 0: # compute error rate number_correct = 0. 
for query in test_S: - if 'q' in query: - i, j, k = query['q'] - score = numpy.dot(Xd[j],Xd[j]) -2*numpy.dot(Xd[j],Xd[k]) + 2*numpy.dot(Xd[i],Xd[k]) - numpy.dot(Xd[i],Xd[i]) + if "q" in query: + i, j, k = query["q"] + score = ( + numpy.dot(Xd[j], Xd[j]) + - 2 * numpy.dot(Xd[j], Xd[k]) + + 2 * numpy.dot(Xd[i], Xd[k]) + - numpy.dot(Xd[i], Xd[i]) + ) if score > 0: number_correct += 1.0 - accuracy = number_correct/len(test_S) - err = 1.0-accuracy + accuracy = number_correct / len(test_S) + err = 1.0 - accuracy x.append(num_reported_answers) y.append(err) - alg_dict = {'legend_label':alg_label, 'x':x,'y':y} + alg_dict = {"legend_label": alg_label, "x": x, "y": y} try: - x_min = min(x_min,min(x)) - x_max = max(x_max,max(x)) - y_min = min(y_min,min(y)) - y_max = max(y_max,max(y)) + x_min = min(x_min, min(x)) + x_max = max(x_max, max(x)) + y_min = min(y_min, min(y)) + y_max = max(y_max, max(y)) except: pass list_of_alg_dicts.append(alg_dict) import matplotlib.pyplot as plt import mpld3 - fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE')) + + fig, ax = plt.subplots(subplot_kw=dict(axisbg="#EEEEEE")) for alg_dict in list_of_alg_dicts: - ax.plot(alg_dict['x'],alg_dict['y'],label=alg_dict['legend_label']) - ax.set_xlabel('Number of answered triplets') - ax.set_ylabel('Error on hold-out set') - ax.set_xlim([x_min,x_max]) - ax.set_ylim([y_min,y_max]) - ax.grid(color='white', linestyle='solid') - ax.set_title('Triplet Test Error', size=14) - legend = ax.legend(loc=2,ncol=3,mode="expand") + ax.plot(alg_dict["x"], alg_dict["y"], label=alg_dict["legend_label"]) + ax.set_xlabel("Number of answered triplets") + ax.set_ylabel("Error on hold-out set") + ax.set_xlim([x_min, x_max]) + ax.set_ylim([y_min, y_max]) + ax.grid(color="white", linestyle="solid") + ax.set_title("Triplet Test Error", size=14) + legend = ax.legend(loc=2, ncol=3, mode="expand") for label in legend.get_texts(): - label.set_fontsize('small') + label.set_fontsize("small") plot_dict = mpld3.fig_to_dict(fig) plt.close() return plot_dict - - def most_current_embedding(self,app,butler,alg_label): + def most_current_embedding(self, app, butler, alg_label): """ Description: Returns embedding in the form of a list of dictionaries, which is conveneint for downstream applications @@ -103,37 +117,42 @@ def most_current_embedding(self,app,butler,alg_label): """ TargetManager = butler.targets - item = app.getModel(json.dumps({'exp_uid':app.exp_uid, 'args':{'alg_label':alg_label}})) - embedding = item['X'] + item = app.getModel( + json.dumps({"exp_uid": app.exp_uid, "args": {"alg_label": alg_label}}) + ) + embedding = item["X"] data = [] - x_min = numpy.float('inf') - x_max = -numpy.float('inf') - y_min = numpy.float('inf') - y_max = -numpy.float('inf') - for idx,target in enumerate(embedding): + x_min = numpy.float("inf") + x_max = -numpy.float("inf") + y_min = numpy.float("inf") + y_max = -numpy.float("inf") + for idx, target in enumerate(embedding): target_dict = {} - target_dict['target'] = TargetManager.get_target_item(app.exp_uid, idx) - target_dict['x'] = target[0] # this is what will actually be plotted, + target_dict["target"] = TargetManager.get_target_item(app.exp_uid, idx) + target_dict["x"] = target[0] # this is what will actually be plotted, try: - target_dict['y'] = target[1] # takes first two components, (could be replaced by PCA) + target_dict["y"] = target[ + 1 + ] # takes first two components, (could be replaced by PCA) except: - target_dict['y'] = 0. - target_dict['darray'] = target + target_dict["y"] = 0. 
+ target_dict["darray"] = target - x_min = min(x_min,target[0]) - x_max = max(x_max,target[0]) - y_min = min(y_min,target[1]) - y_max = max(y_max,target[1]) + x_min = min(x_min, target[0]) + x_max = max(x_max, target[0]) + y_min = min(y_min, target[1]) + y_max = max(y_max, target[1]) data.append(target_dict) - return_dict = {'timestamp':str(utils.datetimeNow()), - 'x_min':x_min, 'x_max':x_max, 'y_min':y_min, 'y_max':y_max, 'data':data, - 'plot_type':'scatter2d_noaxis'} + return_dict = { + "timestamp": str(utils.datetimeNow()), + "x_min": x_min, + "x_max": x_max, + "y_min": y_min, + "y_max": y_max, + "data": data, + "plot_type": "scatter2d_noaxis", + } return return_dict - - - - - diff --git a/apps/PoolBasedTripletMDS/myApp.py b/apps/PoolBasedTripletMDS/myApp.py index 1e7b8ca5..25904d81 100644 --- a/apps/PoolBasedTripletMDS/myApp.py +++ b/apps/PoolBasedTripletMDS/myApp.py @@ -1,27 +1,29 @@ +from __future__ import print_function import json import next.utils as utils import next.apps.SimpleTargetManager + class MyApp: - def __init__(self,db): - self.app_id = 'PoolBasedTripletMDS' + def __init__(self, db): + self.app_id = "PoolBasedTripletMDS" self.TargetManager = next.apps.SimpleTargetManager.SimpleTargetManager(db) def initExp(self, butler, init_algs, args): exp_uid = butler.exp_uid - if 'targetset' in args['targets'].keys(): - n = len(args['targets']['targetset']) - self.TargetManager.set_targetset(exp_uid, args['targets']['targetset']) + if "targetset" in args["targets"].keys(): + n = len(args["targets"]["targetset"]) + self.TargetManager.set_targetset(exp_uid, args["targets"]["targetset"]) else: - n = args['targets']['n'] - args['n'] = n - del args['targets'] + n = args["targets"]["n"] + args["n"] = n + del args["targets"] alg_data = {} - algorithm_keys = ['n','d','failure_probability'] + algorithm_keys = ["n", "d", "failure_probability"] for key in algorithm_keys: if key in args: - alg_data[key]=args[key] + alg_data[key] = args[key] init_algs(alg_data) return args @@ -29,36 +31,57 @@ def initExp(self, butler, init_algs, args): def getQuery(self, butler, alg, args): alg_response = alg() exp_uid = butler.exp_uid - center = self.TargetManager.get_target_item(exp_uid, alg_response[0]) - left = self.TargetManager.get_target_item(exp_uid, alg_response[1]) - right = self.TargetManager.get_target_item(exp_uid, alg_response[2]) - center['label'] = 'center' - left['label'] = 'left' - right['label'] = 'right' - return {'target_indices':[center, left, right]} + center = self.TargetManager.get_target_item(exp_uid, alg_response[0]) + left = self.TargetManager.get_target_item(exp_uid, alg_response[1]) + right = self.TargetManager.get_target_item(exp_uid, alg_response[2]) + center["label"] = "center" + left["label"] = "left" + right["label"] = "right" + return {"target_indices": [center, left, right]} def processAnswer(self, butler, alg, args): - query = butler.queries.get(uid=args['query_uid']) - targets = query['target_indices'] + query = butler.queries.get(uid=args["query_uid"]) + targets = query["target_indices"] for target in targets: - if target['label'] == 'center': - center_id = target['target_id'] - elif target['label'] == 'left': - left_id = target['target_id'] - elif target['label'] == 'right': - right_id = target['target_id'] - target_winner = args['target_winner'] + if target["label"] == "center": + center_id = target["target_id"] + elif target["label"] == "left": + left_id = target["target_id"] + elif target["label"] == "right": + right_id = target["target_id"] + target_winner = 
args["target_winner"] # make a getModel call ~ every n/4 queries - note that this query will NOT be included in the predict experiment = butler.experiment.get() - num_reported_answers = butler.experiment.increment(key='num_reported_answers_for_' + query['alg_label']) - - n = experiment['args']['n'] - if num_reported_answers % ((n+4)/4) == 0: - butler.job('getModel', json.dumps({'exp_uid':butler.exp_uid,'args':{'alg_label':query['alg_label'], 'logging':True}})) - q = [left_id, right_id,center_id] if target_winner==left_id else [right_id, left_id,center_id] + num_reported_answers = butler.experiment.increment( + key="num_reported_answers_for_" + query["alg_label"] + ) + + n = experiment["args"]["n"] + if num_reported_answers % ((n + 4) / 4) == 0: + butler.job( + "getModel", + json.dumps( + { + "exp_uid": butler.exp_uid, + "args": {"alg_label": query["alg_label"], "logging": True}, + } + ), + ) + q = ( + [left_id, right_id, center_id] + if target_winner == left_id + else [right_id, left_id, center_id] + ) - alg({'left_id':left_id, 'right_id':right_id, 'center_id':center_id, 'target_winner':target_winner}) - return {'target_winner':target_winner, 'q':q} + alg( + { + "left_id": left_id, + "right_id": right_id, + "center_id": center_id, + "target_winner": target_winner, + } + ) + return {"target_winner": target_winner, "q": q} def getModel(self, butler, alg, args): return alg() @@ -66,17 +89,26 @@ def getModel(self, butler, alg, args): def format_responses(self, responses): formatted = [] for response in responses: - if 'target_winner' not in response: + if "target_winner" not in response: continue - targets = {'target_' + target['label']: target['primary_description'] - for target in response['target_indices']} - ids = {target['label'] + '_id': target['target_id'] - for target in response['target_indices']} - winner = {t['target_id'] == response['target_winner']: (t['primary_description'], t['target_id']) - for t in response['target_indices']} - response.update({'target_winner': winner[True][0], 'winner_id': winner[True][1]}) + targets = { + "target_" + target["label"]: target["primary_description"] + for target in response["target_indices"] + } + ids = { + target["label"] + "_id": target["target_id"] + for target in response["target_indices"] + } + winner = { + t["target_id"] + == response["target_winner"]: (t["primary_description"], t["target_id"]) + for t in response["target_indices"] + } + response.update( + {"target_winner": winner[True][0], "winner_id": winner[True][1]} + ) - for key in ['q', '_id', 'target_indices']: + for key in ["q", "_id", "target_indices"]: if key in response: del response[key] response.update(targets) @@ -84,5 +116,3 @@ def format_responses(self, responses): formatted += [response] return formatted - - diff --git a/apps/PoolBasedTripletMDS/tests/test_api.py b/apps/PoolBasedTripletMDS/tests/test_api.py index 4f626cc7..6e48be7d 100644 --- a/apps/PoolBasedTripletMDS/tests/test_api.py +++ b/apps/PoolBasedTripletMDS/tests/test_api.py @@ -1,3 +1,4 @@ +from __future__ import print_function import numpy import numpy.random import random @@ -8,61 +9,85 @@ from multiprocessing import Pool import os import sys + try: import next.apps.test_utils as test_utils except: - file_dir = '/'.join(__file__.split('/')[:-1]) - sys.path.append('{}/../../../next/apps'.format(file_dir)) + file_dir = "/".join(__file__.split("/")[:-1]) + sys.path.append("{}/../../../next/apps".format(file_dir)) import test_utils -app_id = 'PoolBasedTripletMDS' +app_id = "PoolBasedTripletMDS" -def 
test_api(assert_200=True, num_objects=5, desired_dimension=2, - total_pulls_per_client=4, num_experiments=1, num_clients=6): - x = numpy.linspace(0,1,num_objects) - X_true = numpy.vstack([x,x]).transpose() +def test_api( + assert_200=True, + num_objects=5, + desired_dimension=2, + total_pulls_per_client=4, + num_experiments=1, + num_clients=6, +): + x = numpy.linspace(0, 1, num_objects) + X_true = numpy.vstack([x, x]).transpose() pool = Pool(processes=num_clients) - supported_alg_ids = ['CrowdKernel', 'RandomSampling', - 'UncertaintySampling', 'ValidationSampling', 'STE'] + supported_alg_ids = [ + "CrowdKernel", + "RandomSampling", + "UncertaintySampling", + "ValidationSampling", + "STE", + ] alg_list = [] for idx, alg_id in enumerate(supported_alg_ids): alg_item = {} - alg_item['alg_id'] = alg_id - if alg_id == 'ValidationSampling': - alg_item['alg_label'] = 'Test' - alg_item['params'] = {'query_list': [ - [q1, q2, q3] for q1 in [0, 1, 2] - for q2 in [0, 1, 2] - for q3 in [0, 1, 2] - ]} + alg_item["alg_id"] = alg_id + if alg_id == "ValidationSampling": + alg_item["alg_label"] = "Test" + alg_item["params"] = { + "query_list": [ + [q1, q2, q3] + for q1 in [0, 1, 2] + for q2 in [0, 1, 2] + for q3 in [0, 1, 2] + ] + } else: - alg_item['alg_label'] = alg_id - alg_item['test_alg_label'] = 'Test' + alg_item["alg_label"] = alg_id + alg_item["test_alg_label"] = "Test" alg_list.append(alg_item) params = [] for algorithm in alg_list: - params.append({'alg_label': algorithm['alg_label'], - 'proportion': 1./len(alg_list)}) + params.append( + {"alg_label": algorithm["alg_label"], "proportion": 1. / len(alg_list)} + ) algorithm_management_settings = {} - algorithm_management_settings['mode'] = 'fixed_proportions' - algorithm_management_settings['params'] = params + algorithm_management_settings["mode"] = "fixed_proportions" + algorithm_management_settings["params"] = params # Test POST Experiment initExp_args_dict = {} - initExp_args_dict['app_id'] = 'PoolBasedTripletMDS' - initExp_args_dict['args'] = {} - initExp_args_dict['args']['d'] = desired_dimension - initExp_args_dict['args']['failure_probability'] = 0.01 - initExp_args_dict['args']['participant_to_algorithm_management'] = 'one_to_many' # 'one_to_one' #optional field - initExp_args_dict['args']['algorithm_management_settings'] = algorithm_management_settings #optional field - initExp_args_dict['args']['alg_list'] = alg_list #optional field - initExp_args_dict['args']['instructions'] = 'You want instructions, here are your test instructions' - initExp_args_dict['args']['debrief'] = 'You want a debrief, here is your test debrief' - initExp_args_dict['args']['targets'] = {} - initExp_args_dict['args']['targets']['n'] = num_objects + initExp_args_dict["app_id"] = "PoolBasedTripletMDS" + initExp_args_dict["args"] = {} + initExp_args_dict["args"]["d"] = desired_dimension + initExp_args_dict["args"]["failure_probability"] = 0.01 + initExp_args_dict["args"][ + "participant_to_algorithm_management" + ] = "one_to_many" # 'one_to_one' #optional field + initExp_args_dict["args"][ + "algorithm_management_settings" + ] = algorithm_management_settings # optional field + initExp_args_dict["args"]["alg_list"] = alg_list # optional field + initExp_args_dict["args"][ + "instructions" + ] = "You want instructions, here are your test instructions" + initExp_args_dict["args"][ + "debrief" + ] = "You want a debrief, here is your test debrief" + initExp_args_dict["args"]["targets"] = {} + initExp_args_dict["args"]["targets"]["n"] = num_objects exp_info = [] for ell in 
range(num_experiments): @@ -73,83 +98,90 @@ def test_api(assert_200=True, num_objects=5, desired_dimension=2, participants = [] pool_args = [] for i in range(num_clients): - participant_uid = '%030x' % random.randrange(16**30) + participant_uid = "%030x" % random.randrange(16 ** 30) participants.append(participant_uid) experiment = numpy.random.choice(exp_info) - exp_uid = experiment['exp_uid'] - pool_args.append( (exp_uid,participant_uid,total_pulls_per_client,X_true,assert_200) ) + exp_uid = experiment["exp_uid"] + pool_args.append( + (exp_uid, participant_uid, total_pulls_per_client, X_true, assert_200) + ) results = pool.map(simulate_one_client, pool_args) for result in results: - print result + print(result) test_utils.getModel(exp_uid, app_id, supported_alg_ids, alg_list) -def simulate_one_client( input_args ): - exp_uid,participant_uid,total_pulls,X_true,assert_200 = input_args - +def simulate_one_client(input_args): + exp_uid, participant_uid, total_pulls, X_true, assert_200 = input_args getQuery_times = [] processAnswer_times = [] for t in range(total_pulls): - print "Participant {1} has taken {0} pulls".format(t,participant_uid) + print("Participant {1} has taken {0} pulls".format(t, participant_uid)) # test POST getQuery # - widget = random.choice([True] + 4*[False]) + widget = random.choice([True] + 4 * [False]) widget = True - getQuery_args_dict = {'args': {'participant_uid': participant_uid, - 'widget': widget}, - 'exp_uid': exp_uid} + getQuery_args_dict = { + "args": {"participant_uid": participant_uid, "widget": widget}, + "exp_uid": exp_uid, + } query_dict, dt = test_utils.getQuery(getQuery_args_dict) getQuery_times += [dt] if widget: - query_dict = query_dict['args'] - query_uid = query_dict['query_uid'] - targets = query_dict['target_indices'] + query_dict = query_dict["args"] + query_uid = query_dict["query_uid"] + targets = query_dict["target_indices"] # print targets for target in targets: - if target['label'] == 'center': - index_center = target['target_id'] - elif target['label'] == 'left': - index_left = target['target_id'] - elif target['label'] == 'right': - index_right = target['target_id'] + if target["label"] == "center": + index_center = target["target_id"] + elif target["label"] == "left": + index_left = target["target_id"] + elif target["label"] == "right": + index_right = target["target_id"] ts = test_utils.response_delay() # sleep for a bit to simulate response time - direction = norm(X_true[index_left]-X_true[index_center])-norm(X_true[index_right]-X_true[index_center]) + direction = norm(X_true[index_left] - X_true[index_center]) - norm( + X_true[index_right] - X_true[index_center] + ) r = numpy.random.rand() - if r<.1: - direction = - direction - if direction<0.: + if r < .1: + direction = -direction + if direction < 0.: target_winner = index_left else: target_winner = index_right response_time = time.time() - ts - # test POST processAnswer processAnswer_args_dict = {} processAnswer_args_dict["exp_uid"] = exp_uid processAnswer_args_dict["args"] = {} processAnswer_args_dict["args"]["query_uid"] = query_uid processAnswer_args_dict["args"]["target_winner"] = target_winner - processAnswer_args_dict["args"]['response_time'] = response_time + processAnswer_args_dict["args"]["response_time"] = response_time - processAnswer_json_response, dt = test_utils.processAnswer(processAnswer_args_dict) + processAnswer_json_response, dt = test_utils.processAnswer( + processAnswer_args_dict + ) processAnswer_times.append(dt) - r = test_utils.format_times(getQuery_times, 
processAnswer_times, total_pulls, - participant_uid) + r = test_utils.format_times( + getQuery_times, processAnswer_times, total_pulls, participant_uid + ) return r -if __name__ == '__main__': + +if __name__ == "__main__": test_api() # test_api(assert_200=False, num_objects=5, desired_dimension=2, - # total_pulls_per_client=100, num_experiments=1, - # num_clients=5, delta=0.01) + # total_pulls_per_client=100, num_experiments=1, + # num_clients=5, delta=0.01) diff --git a/apps/Tests/algs/TestAlg.py b/apps/Tests/algs/TestAlg.py index 90c974a6..6f4e9d6b 100644 --- a/apps/Tests/algs/TestAlg.py +++ b/apps/Tests/algs/TestAlg.py @@ -1,21 +1,23 @@ +from __future__ import print_function + class MyAlg: def initExp(self, butler): - butler.algorithms.set(key='algorithms_foo', value='algorithms_bar') + butler.algorithms.set(key="algorithms_foo", value="algorithms_bar") return True def getQuery(self, butler): - assert butler.experiment.get(key='experiment_foo') == 'experiment_bar' - assert butler.algorithms.get(key='algorithms_foo') == 'algorithms_bar' + assert butler.experiment.get(key="experiment_foo") == "experiment_bar" + assert butler.algorithms.get(key="algorithms_foo") == "algorithms_bar" return True def processAnswer(self, butler): - assert butler.experiment.get(key='experiment_foo') == 'experiment_bar' - assert butler.algorithms.get(key='algorithms_foo') == 'algorithms_bar' + assert butler.experiment.get(key="experiment_foo") == "experiment_bar" + assert butler.algorithms.get(key="algorithms_foo") == "algorithms_bar" return True diff --git a/apps/Tests/dashboard/Dashboard.py b/apps/Tests/dashboard/Dashboard.py index 56f6f2d4..31a4a7d3 100644 --- a/apps/Tests/dashboard/Dashboard.py +++ b/apps/Tests/dashboard/Dashboard.py @@ -1,3 +1,4 @@ +from __future__ import print_function from next.apps.AppDashboard import AppDashboard diff --git a/apps/Tests/myApp.py b/apps/Tests/myApp.py index 73f86f0b..cf8ea87b 100644 --- a/apps/Tests/myApp.py +++ b/apps/Tests/myApp.py @@ -1,15 +1,17 @@ +from __future__ import print_function import json import next.utils as utils import next.apps.SimpleTargetManager + class MyApp: - def __init__(self,db): - self.app_id = 'Tests' + def __init__(self, db): + self.app_id = "Tests" self.TargetManager = next.apps.SimpleTargetManager.SimpleTargetManager(db) def initExp(self, butler, init_algs, args): - butler.experiment.set(key='experiment_foo', value='experiment_bar') + butler.experiment.set(key="experiment_foo", value="experiment_bar") init_algs({}) @@ -17,7 +19,7 @@ def initExp(self, butler, init_algs, args): def getQuery(self, butler, alg, args): - assert butler.experiment.get(key='experiment_foo') == 'experiment_bar' + assert butler.experiment.get(key="experiment_foo") == "experiment_bar" assert alg() @@ -27,7 +29,7 @@ def processAnswer(self, butler, alg, args): assert alg({}) - assert butler.experiment.get(key='experiment_foo') == 'experiment_bar' + assert butler.experiment.get(key="experiment_foo") == "experiment_bar" return {} diff --git a/apps/Tests/tests/test_api.py b/apps/Tests/tests/test_api.py index 9f5ef92b..95c2dd3e 100644 --- a/apps/Tests/tests/test_api.py +++ b/apps/Tests/tests/test_api.py @@ -1,3 +1,4 @@ +from __future__ import print_function import numpy import numpy.random import random @@ -8,47 +9,61 @@ from multiprocessing import Pool import os import sys + try: import next.apps.test_utils as test_utils except: - file_dir = '/'.join(__file__.split('/')[:-1]) - sys.path.append('{}/../../../next/apps'.format(file_dir)) + file_dir = 
"/".join(__file__.split("/")[:-1]) + sys.path.append("{}/../../../next/apps".format(file_dir)) import test_utils -app_id = 'Tests' +app_id = "Tests" -def test_api(assert_200=True, num_objects=5, desired_dimension=2, - total_pulls_per_client=4, num_experiments=1, num_clients=6): +def test_api( + assert_200=True, + num_objects=5, + desired_dimension=2, + total_pulls_per_client=4, + num_experiments=1, + num_clients=6, +): pool = Pool(processes=num_clients) - supported_alg_ids = ['TestAlg'] + supported_alg_ids = ["TestAlg"] alg_list = [] for idx, alg_id in enumerate(supported_alg_ids): alg_item = {} - alg_item['alg_id'] = alg_id - alg_item['alg_label'] = alg_id + alg_item["alg_id"] = alg_id + alg_item["alg_label"] = alg_id alg_list.append(alg_item) params = [] for algorithm in alg_list: - params.append({'alg_label': algorithm['alg_label'], - 'proportion': 1./len(alg_list)}) + params.append( + {"alg_label": algorithm["alg_label"], "proportion": 1. / len(alg_list)} + ) algorithm_management_settings = {} - algorithm_management_settings['mode'] = 'fixed_proportions' - algorithm_management_settings['params'] = params + algorithm_management_settings["mode"] = "fixed_proportions" + algorithm_management_settings["params"] = params # Test POST Experiment initExp_args_dict = {} - initExp_args_dict['app_id'] = 'Tests' - initExp_args_dict['args'] = {} - initExp_args_dict['args']['participant_to_algorithm_management'] = 'one_to_many' - initExp_args_dict['args']['algorithm_management_settings'] = algorithm_management_settings - initExp_args_dict['args']['alg_list'] = alg_list - initExp_args_dict['args']['instructions'] = 'You want instructions, here are your test instructions' - initExp_args_dict['args']['debrief'] = 'You want a debrief, here is your test debrief' - initExp_args_dict['args']['targets'] = {} - initExp_args_dict['args']['targets']['n'] = num_objects + initExp_args_dict["app_id"] = "Tests" + initExp_args_dict["args"] = {} + initExp_args_dict["args"]["participant_to_algorithm_management"] = "one_to_many" + initExp_args_dict["args"][ + "algorithm_management_settings" + ] = algorithm_management_settings + initExp_args_dict["args"]["alg_list"] = alg_list + initExp_args_dict["args"][ + "instructions" + ] = "You want instructions, here are your test instructions" + initExp_args_dict["args"][ + "debrief" + ] = "You want a debrief, here is your test debrief" + initExp_args_dict["args"]["targets"] = {} + initExp_args_dict["args"]["targets"]["n"] = num_objects exp_info = [] for ell in range(num_experiments): @@ -59,16 +74,16 @@ def test_api(assert_200=True, num_objects=5, desired_dimension=2, participants = [] pool_args = [] for i in range(num_clients): - participant_uid = '%030x' % random.randrange(16**30) + participant_uid = "%030x" % random.randrange(16 ** 30) participants.append(participant_uid) experiment = numpy.random.choice(exp_info) - exp_uid = experiment['exp_uid'] + exp_uid = experiment["exp_uid"] pool_args.append((exp_uid, participant_uid, total_pulls_per_client, assert_200)) results = pool.map(simulate_one_client, pool_args) for result in results: - print result + print(result) test_utils.getModel(exp_uid, app_id, supported_alg_ids, alg_list) @@ -81,20 +96,21 @@ def simulate_one_client(input_args): processAnswer_times = [] for t in range(total_pulls): - print "Participant {1} has taken {0} pulls".format(t, participant_uid) + print("Participant {1} has taken {0} pulls".format(t, participant_uid)) # test POST getQuery # - widget = random.choice([True] + 4*[False]) + widget = 
random.choice([True] + 4 * [False]) widget = True - getQuery_args_dict = {'args': {'participant_uid': participant_uid, - 'widget': widget}, - 'exp_uid': exp_uid} + getQuery_args_dict = { + "args": {"participant_uid": participant_uid, "widget": widget}, + "exp_uid": exp_uid, + } query_dict, dt = test_utils.getQuery(getQuery_args_dict) getQuery_times += [dt] if widget: - query_dict = query_dict['args'] - query_uid = query_dict['query_uid'] + query_dict = query_dict["args"] + query_uid = query_dict["query_uid"] ts = test_utils.response_delay() # sleep for a bit to simulate response time @@ -106,15 +122,19 @@ def simulate_one_client(input_args): processAnswer_args_dict["exp_uid"] = exp_uid processAnswer_args_dict["args"] = {} processAnswer_args_dict["args"]["query_uid"] = query_uid - processAnswer_args_dict["args"]['response_time'] = response_time + processAnswer_args_dict["args"]["response_time"] = response_time - processAnswer_json_response, dt = test_utils.processAnswer(processAnswer_args_dict) + processAnswer_json_response, dt = test_utils.processAnswer( + processAnswer_args_dict + ) processAnswer_times.append(dt) - r = test_utils.format_times(getQuery_times, processAnswer_times, total_pulls, - participant_uid) + r = test_utils.format_times( + getQuery_times, processAnswer_times, total_pulls, participant_uid + ) return r -if __name__ == '__main__': + +if __name__ == "__main__": test_api() diff --git a/next/api/api.py b/next/api/api.py index 0287d169..e0af79ea 100644 --- a/next/api/api.py +++ b/next/api/api.py @@ -6,38 +6,38 @@ import next.constants as constants from flask import Flask + app = Flask(__name__) -app.register_blueprint(api_blueprint.api, url_prefix='/api') -app.register_blueprint(assistant, url_prefix='/assistant') -app.register_blueprint(home, url_prefix='/home') +app.register_blueprint(api_blueprint.api, url_prefix="/api") +app.register_blueprint(assistant, url_prefix="/assistant") +app.register_blueprint(home, url_prefix="/home") if constants.SITE_KEY: - dashboard_prefix = '/dashboard/{}'.format(constants.SITE_KEY) + dashboard_prefix = "/dashboard/{}".format(constants.SITE_KEY) else: - dashboard_prefix = '/dashboard' + dashboard_prefix = "/dashboard" app.register_blueprint(dashboard, url_prefix=dashboard_prefix) -app.register_blueprint(query_page, url_prefix='/query') +app.register_blueprint(query_page, url_prefix="/query") + @app.context_processor def inject_global_templatevars(): - return dict(next_git_hash=constants.GIT_HASH, - next_version=constants.VERSION) + return dict(next_git_hash=constants.GIT_HASH, next_version=constants.VERSION) + import logging import sys + # Log to standard out. 
Remember to turn off in production
 app.logger.addHandler(logging.StreamHandler(sys.stdout))
 app.logger.setLevel(logging.DEBUG)
-#Handle internal errors using a custom error message
+# Handle internal errors using a custom error message
 import json
+
+
 @app.errorhandler(404)
 def internal_system_error(error):
-    response = {
-        'meta':{
-            'status':'FAIL',
-            'code':404,
-            'message':'Resource not found'
-        }
+    response = {
+        "meta": {"status": "FAIL", "code": 404, "message": "Resource not found"}
     }
     return json.dumps(response), 404
-
diff --git a/next/api/api_blueprint.py b/next/api/api_blueprint.py
index e5447dac..a933d5b8 100644
--- a/next/api/api_blueprint.py
+++ b/next/api/api_blueprint.py
@@ -2,40 +2,44 @@
 from next.api import api_util

 # Initialize flask.Flask application and restful.api objects
-api = Blueprint('api',
-                __name__,
-                template_folder='templates',
-                static_folder='static')
+api = Blueprint("api", __name__, template_folder="templates", static_folder="static")
 api_interface = api_util.NextBackendApi(api)

 # Format: Resource Class, get url, post url (when applicable)
 from next.api.resources.experiment import Experiment
-api_interface.add_resource(Experiment,
-                           '/experiment',
-                           '/experiment/<exp_uid>')
+
+api_interface.add_resource(Experiment, "/experiment", "/experiment/<exp_uid>")

 from next.api.app_handler import AppHandler
-api_interface.add_resource(AppHandler,
-                           '/experiment/<exp_uid>/custom/function_name',
-                           '/experiment/custom/<function_name>')
+
+api_interface.add_resource(
+    AppHandler,
+    "/experiment/<exp_uid>/custom/function_name",
+    "/experiment/custom/<function_name>",
+)

 from next.api.resources.get_query import getQuery
-api_interface.add_resource(getQuery,
-                           '/experiment/<exp_uid>/getQuery',
-                           '/experiment/getQuery')
+
+api_interface.add_resource(
+    getQuery, "/experiment/<exp_uid>/getQuery", "/experiment/getQuery"
+)

 from next.api.resources.process_answer import processAnswer
-api_interface.add_resource(processAnswer, '/experiment/processAnswer')
+
+api_interface.add_resource(processAnswer, "/experiment/processAnswer")

 from next.api.resources.logs import Logs
-api_interface.add_resource(Logs,
-                           '/experiment/<exp_uid>/logs',
-                           '/experiment/<exp_uid>/logs/<log_type>')
+
+api_interface.add_resource(
+    Logs,
+    "/experiment/<exp_uid>/logs",
+    "/experiment/<exp_uid>/logs/<log_type>",
+)

 from next.api.resources.participants import Participants
-api_interface.add_resource(Participants,
-                           '/experiment/<exp_uid>/participants')
+
+api_interface.add_resource(Participants, "/experiment/<exp_uid>/participants")

 from next.api.resources.targets import Targets
-api_interface.add_resource(Targets,
-                           '/experiment/<exp_uid>/targets')
+
+api_interface.add_resource(Targets, "/experiment/<exp_uid>/targets")
diff --git a/next/api/api_util.py b/next/api/api_util.py
index 2f14ede0..af900838 100644
--- a/next/api/api_util.py
+++ b/next/api/api_util.py
@@ -4,7 +4,7 @@
 Author: Lalit Jain, lalitkumarj@gmail.com
 """
-
+from __future__ import print_function
 import time

 def timeit(f):
@@ -65,7 +65,7 @@ class NextBackendApi(Api):
     def handle_error(self, e, **kwargs):
         exc_type, exc_value, tb = sys.exc_info()
         backend_error = traceback.format_exc(tb)
-        print "backend_error", backend_error,exc_type, exc_value, tb, traceback.format_exc(tb)
+        print("backend_error", backend_error,exc_type, exc_value, tb, traceback.format_exc(tb))

         # Catch internal system errors
         code = getattr(e, 'code', 500)
diff --git a/next/api/resource_manager.py b/next/api/resource_manager.py
index a7406e9f..d1a8e744 100644
--- a/next/api/resource_manager.py
+++ b/next/api/resource_manager.py
@@ -3,10 +3,13 @@
 import next.utils as utils

 from next.database_client.DatabaseAPI import DatabaseAPI
+
 db = DatabaseAPI()
 from next.logging_client.LoggerAPI import LoggerAPI
+
 ell = LoggerAPI()
+
 class ResourceManager:
     """
     resource_manager
@@ -48,7 +51,7 @@ def get_app_ids(self):
         """
         return utils.get_supported_apps()

-    def get_app_about(self,app_id, apps_dir='apps/'):
+    def get_app_about(self, app_id, apps_dir="apps/"):
         """
         Returns a string description of the app defined by app_id
         (good for a blurb on a website perhaps)
@@ -61,11 +64,11 @@ def get_app_about(self,app_id, apps_dir='apps/'):
         Usage: ::\n
             rm.get_app_about('DuelingBanditsPureExploration')
         """
-        filename = apps_dir + '{0}/{0}.yaml'.format(app_id)
-        info = yaml.load(open(filename, 'rb'))
-        return info['initExp']['description']
+        filename = apps_dir + "{0}/{0}.yaml".format(app_id)
+        info = yaml.load(open(filename, "rb"))
+        return info["initExp"]["description"]

-    def get_app_alg_ids(self,app_id, app_dir='apps/'):
+    def get_app_alg_ids(self, app_id, app_dir="apps/"):
         """
         Returns a list of all implemented alg_id's for a particular app_id
@@ -78,12 +81,12 @@ def get_app_alg_ids(self,app_id, app_dir='apps/'):
         Usage: ::\n
             rm.get_app_alg_ids('PoolBasedTripletMDS')
         """
-        filename = app_dir + '{0}/{0}.yaml'.format(app_id)
-        exp = yaml.load(open(filename, 'rb'))
-        args = exp['initExp']['values']['args']['values']
-        return args['alg_list']['values']['values']['alg_id']['values']
+        filename = app_dir + "{0}/{0}.yaml".format(app_id)
+        exp = yaml.load(open(filename, "rb"))
+        args = exp["initExp"]["values"]["args"]["values"]
+        return args["alg_list"]["values"]["values"]["alg_id"]["values"]

-    def get_app_exp_uids(self,app_id):
+    def get_app_exp_uids(self, app_id):
         """
         Returns a list of exp_uid's for a particular app_id
@@ -96,15 +99,15 @@ def get_app_exp_uids(self,app_id):
         Usage: ::\n
             rm.get_app_exp_uids('PoolBasedTripletMDS')
         """
-        docs = db.get_docs_with_filter(app_id+':experiments',{})
+        docs = db.get_docs_with_filter(app_id + ":experiments", {})

         exp_uids = []
         for doc in docs:
-            exp_uids.append(str(doc['exp_uid']))
+            exp_uids.append(str(doc["exp_uid"]))

         return exp_uids

-    def get_app_exp_uid_start_date(self,exp_uid):
+    def get_app_exp_uid_start_date(self, exp_uid):
         """
         Returns the date, as a string, of when the experiment was initialized
@@ -118,28 +121,25 @@
             rm.get_app_exp_uid_start_date('PoolBasedTripletMDS')
         """
-        start_date = db.get('experiments_admin',exp_uid,'start_date')
+        start_date = db.get("experiments_admin", exp_uid, "start_date")
         if isinstance(start_date, datetime):
             return start_date
         else:
             return utils.str2datetime(start_date)

-
     def is_exp_retired(self, exp_uid):
         app_id = self.get_app_id(exp_uid)
-        is_retired = db.get(app_id+':experiments', exp_uid, 'retired')
+        is_retired = db.get(app_id + ":experiments", exp_uid, "retired")
         return is_retired or False

-
     def set_exp_retired(self, exp_uid, retired=True):
         app_id = self.get_app_id(exp_uid)
-        db.set(app_id+':experiments', exp_uid, 'retired', retired)
-
+        db.set(app_id + ":experiments", exp_uid, "retired", retired)

-    def get_experiment(self,exp_uid):
+    def get_experiment(self, exp_uid):
         """
         Gets an experiment from an exp_uid. Returns none if the exp_uid is not found.
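# A minimal usage sketch combining the ResourceManager accessors above,
# assuming a running NEXT backend with a configured database; the exp_uid
# value below is hypothetical.
from next.api.resource_manager import ResourceManager

rm = ResourceManager()
exp_uid = "b5242319c78df48f4ff31e78de5857"  # hypothetical experiment id
if not rm.is_exp_retired(exp_uid):
    app_id = rm.get_app_id(exp_uid)  # e.g. 'PoolBasedTripletMDS'
    print(app_id, rm.get_app_exp_uid_start_date(exp_uid))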
@@ -158,14 +158,14 @@ def get_experiment(self,exp_uid): if app_id == None: return None - docs = db.get_docs_with_filter(app_id+':experiments',{'exp_uid':exp_uid}) + docs = db.get_docs_with_filter(app_id + ":experiments", {"exp_uid": exp_uid}) - if len(docs)>0: + if len(docs) > 0: return docs[0] else: return None - def get_app_id(self,exp_uid): + def get_app_id(self, exp_uid): """ Gets an app_id from an exp_uid. Returns none if the exp_uid is not found. This should be coming from cache so it should be very fast @@ -180,10 +180,9 @@ def get_app_id(self,exp_uid): app_id = rm.get_app_id('b5242319c78df48f4ff31e78de5857') """ - return db.get('experiments_admin',exp_uid,'app_id') + return db.get("experiments_admin", exp_uid, "app_id") - - def get_algs_doc_for_exp_uid(self,exp_uid): + def get_algs_doc_for_exp_uid(self, exp_uid): """ Returns the algorithm docs used in exp_uid @@ -199,9 +198,9 @@ def get_algs_doc_for_exp_uid(self,exp_uid): alg_list = rm.get_algs_doc_for_exp_uid('b5242319c78df48f4ff31e78de5857') """ app_id = self.get_app_id(exp_uid) - return db.get_docs_with_filter(app_id+':algorithms',{'exp_uid':exp_uid}) + return db.get_docs_with_filter(app_id + ":algorithms", {"exp_uid": exp_uid}) - def get_algs_for_exp_uid(self,exp_uid): + def get_algs_for_exp_uid(self, exp_uid): """ Returns a list of algs' data used in exp_uid @@ -217,17 +216,17 @@ def get_algs_for_exp_uid(self,exp_uid): alg_list = rm.get_algs_for_exp_uid('b5242319c78df48f4ff31e78de5857') """ app_id = self.get_app_id(exp_uid) - args = db.get(app_id+':experiments',exp_uid,'args') + args = db.get(app_id + ":experiments", exp_uid, "args") alg_list = [] - for alg in args['alg_list']: + for alg in args["alg_list"]: tmp = {} - tmp['alg_id'] = alg['alg_id'] - tmp['alg_label'] = alg['alg_label'] + tmp["alg_id"] = alg["alg_id"] + tmp["alg_label"] = alg["alg_label"] alg_list.append(tmp) return alg_list - def get_git_hash_for_exp_uid(self,exp_uid): + def get_git_hash_for_exp_uid(self, exp_uid): """ Returns git_hash of when exp_uid was initialized @@ -240,9 +239,9 @@ def get_git_hash_for_exp_uid(self,exp_uid): """ app_id = self.get_app_id(exp_uid) - return db.get(app_id+':experiments',exp_uid,'git_hash') + return db.get(app_id + ":experiments", exp_uid, "git_hash") - def get_participant_uids(self,exp_uid): + def get_participant_uids(self, exp_uid): """ Given an exp_uid, returns list of participant_uid's involved with experiment @@ -256,15 +255,17 @@ def get_participant_uids(self,exp_uid): participant_uids = resource_manager.get_participant_uids(exp_uid) """ app_id = self.get_app_id(exp_uid) - participants = db.get_docs_with_filter(app_id+':participants',{'exp_uid':exp_uid}) + participants = db.get_docs_with_filter( + app_id + ":participants", {"exp_uid": exp_uid} + ) participant_uid_list = [] for participant in participants: - participant_uid = participant['participant_uid'] + participant_uid = participant["participant_uid"] participant_uid_list.append(participant_uid) return participant_uid_list - def get_participant_data(self,participant_uid, exp_uid): + def get_participant_data(self, participant_uid, exp_uid): """ Given a participant_id and an exp_uid, returns the associated set of responses. 
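# A sketch of collecting every participant's responses with the two methods
# above, mirroring what the Participants resource further below does; the
# exp_uid is again hypothetical.
from next.api.resource_manager import ResourceManager

rm = ResourceManager()
exp_uid = "b5242319c78df48f4ff31e78de5857"  # hypothetical experiment id
responses = {}
for participant_uid in rm.get_participant_uids(exp_uid):
    # each value is the list of query documents answered by that participant
    responses[participant_uid] = rm.get_participant_data(participant_uid, exp_uid)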
@@ -278,11 +279,12 @@ def get_participant_data(self,participant_uid, exp_uid): responses = resource_manager.get_participant_data(participant_uid,exp_uid) """ app_id = self.get_app_id(exp_uid) - queries = db.get_docs_with_filter(app_id+':queries',{'participant_uid':participant_uid}) + queries = db.get_docs_with_filter( + app_id + ":queries", {"participant_uid": participant_uid} + ) return queries - - def get_experiment_logs(self,exp_uid): + def get_experiment_logs(self, exp_uid): """ Given an exp_uid, returns all logs associated with the experiment. @@ -298,16 +300,18 @@ def get_experiment_logs(self,exp_uid): app_id = self.get_app_id(exp_uid) - log_types = ['APP-EXCEPTION','ALG-DURATION','ALG-EVALUATION'] + log_types = ["APP-EXCEPTION", "ALG-DURATION", "ALG-EVALUATION"] all_logs = [] for log_type in log_types: - logs = ell.get_logs_with_filter(app_id+':'+log_type,{'exp_uid':exp_uid}) + logs = ell.get_logs_with_filter( + app_id + ":" + log_type, {"exp_uid": exp_uid} + ) all_logs.extend(logs) return all_logs - def get_experiment_logs_of_type(self,exp_uid,log_type): + def get_experiment_logs_of_type(self, exp_uid, log_type): """ Given an exp_uid, returns all logs associated with the experiment. @@ -323,8 +327,11 @@ def get_experiment_logs_of_type(self,exp_uid,log_type): app_id = self.get_app_id(exp_uid) - log_types = ['APP-CALL','APP-RESPONSE','APP-EXCEPTION','ALG-DURATION','ALG-EVALUATION'] - return ell.get_logs_with_filter(app_id+':'+log_type,{'exp_uid':exp_uid}) - - - + log_types = [ + "APP-CALL", + "APP-RESPONSE", + "APP-EXCEPTION", + "ALG-DURATION", + "ALG-EVALUATION", + ] + return ell.get_logs_with_filter(app_id + ":" + log_type, {"exp_uid": exp_uid}) diff --git a/next/api/resources/experiment.py b/next/api/resources/experiment.py index 197cfd78..952f1b37 100644 --- a/next/api/resources/experiment.py +++ b/next/api/resources/experiment.py @@ -8,6 +8,7 @@ from next.api.resource_manager import ResourceManager from next.api.api_util import * from next.api.api_util import APIArgument + resource_manager = ResourceManager() broker = next.broker.broker.JobBroker() @@ -18,64 +19,58 @@ # Custom errors for GET and POST verbs on experiment resource meta_error = { - 'ExpDoesNotExistError': { - 'message': "No experiment with the specified experiment ID exists.", - 'code': 400, - 'status': 'FAIL' + "ExpDoesNotExistError": { + "message": "No experiment with the specified experiment ID exists.", + "code": 400, + "status": "FAIL", }, - - 'InitExpError': { - 'message': "Failed to initialize experiment. Please verify that you have specified the correct application specific parameters.", - 'code': 400, - 'status': 'FAIL' + "InitExpError": { + "message": "Failed to initialize experiment. 
Please verify that you have specified the correct application specific parameters.", + "code": 400, + "status": "FAIL", }, } -meta_success = { - 'code': 200, - 'status': 'OK' -} +meta_success = {"code": 200, "status": "OK"} + class Experiment(Resource): def get(self, exp_uid): get_parser = exp_parser.copy() - get_parser.add_argument('exp_uid', type=str, required=True ) - get_parser.add_argument('args', type=dict, required=False ) + get_parser.add_argument("exp_uid", type=str, required=True) + get_parser.add_argument("args", type=dict, required=False) # Fetch experiment data from resource manager experiment = resource_manager.get_experiment(exp_uid) algorithms = resource_manager.get_algs_doc_for_exp_uid(exp_uid) - experiment['algorithms'] = algorithms + experiment["algorithms"] = algorithms # Throw error if no such experiment exists if not experiment: - return attach_meta({}, meta_error['ExpDoesNotExistError']), 400 + return attach_meta({}, meta_error["ExpDoesNotExistError"]), 400 else: return attach_meta(experiment, meta_success), 200 def post(self): post_parser = exp_parser.copy() - post_parser.add_argument('app_id', type=str, required=True) - post_parser.add_argument('args', type=dict, required=True) + post_parser.add_argument("app_id", type=str, required=True) + post_parser.add_argument("args", type=dict, required=True) # Validate args with post_parser args_data = post_parser.parse_args() - app_id = args_data['app_id'] + app_id = args_data["app_id"] # Create and set exp_uid - exp_uid = '%030x' % random.randrange(16**30) + exp_uid = "%030x" % random.randrange(16 ** 30) # Args from dict to json type args_json = json.dumps(args_data) # Execute initExp through the broker - response_json,didSucceed,message = broker.applyAsync(app_id, - exp_uid, - 'initExp', - json.dumps(args_data)) - - if not didSucceed: - return attach_meta({}, meta_error['InitExpError'], backend_error=message), 400 - - return attach_meta({'exp_uid':exp_uid}, meta_success), 200 - - - + response_json, didSucceed, message = broker.applyAsync( + app_id, exp_uid, "initExp", json.dumps(args_data) + ) + if not didSucceed: + return ( + attach_meta({}, meta_error["InitExpError"], backend_error=message), + 400, + ) + return attach_meta({"exp_uid": exp_uid}, meta_success), 200 diff --git a/next/api/resources/get_query.py b/next/api/resources/get_query.py index 45294867..4b6e9108 100644 --- a/next/api/resources/get_query.py +++ b/next/api/resources/get_query.py @@ -12,64 +12,73 @@ import next.utils as utils from next.api.resource_manager import ResourceManager from jinja2 import Environment, FileSystemLoader + resource_manager = ResourceManager() broker = next.broker.broker.JobBroker() # Request parser. Checks that necessary dictionary keys are available in a given resource. -# We rely on learningLib functions to ensure that all necessary arguments are available and parsed. +# We rely on learningLib functions to ensure that all necessary arguments are available and parsed. post_parser = reqparse.RequestParser(argument_class=APIArgument) # Custom errors for GET and POST verbs on experiment resource meta_error = { - 'ExpDoesNotExistError': { - 'message': "No experiment with the specified experiment ID exists.", - 'code': 400, - 'status':'FAIL' + "ExpDoesNotExistError": { + "message": "No experiment with the specified experiment ID exists.", + "code": 400, + "status": "FAIL", }, - 'QueryGenerationError': { - 'message': "Failed to generate query. 
Please verify that you have specified the correct application specific query parameters.", - 'code': 400, - 'status': 'FAIL', + "QueryGenerationError": { + "message": "Failed to generate query. Please verify that you have specified the correct application specific query parameters.", + "code": 400, + "status": "FAIL", }, } -meta_success = { - 'code': 200, - 'status': 'OK' -} +meta_success = {"code": 200, "status": "OK"} # Query resource class class getQuery(Resource): def post(self): - post_parser.add_argument('exp_uid', type=str, required=True) - post_parser.add_argument('args', type=dict, required=False) + post_parser.add_argument("exp_uid", type=str, required=True) + post_parser.add_argument("args", type=dict, required=False) # Validate args with post_parser args_data = post_parser.parse_args() # Pull app_id and exp_uid from parsed args - exp_uid = args_data['exp_uid'] + exp_uid = args_data["exp_uid"] # Fetch app_id data from resource manager app_id = resource_manager.get_app_id(exp_uid) # Standardized participant_uid - if 'participant_uid' in args_data['args'].keys(): - args_data['args']['participant_uid'] = exp_uid+"_" + \ - str(args_data['args']['participant_uid']) + if "participant_uid" in args_data["args"].keys(): + args_data["args"]["participant_uid"] = ( + exp_uid + "_" + str(args_data["args"]["participant_uid"]) + ) - render_widget = args_data['args'].get('widget',False) + render_widget = args_data["args"].get("widget", False) - # Execute getQuery - response_json,didSucceed,message = broker.applyAsync(app_id,exp_uid,"getQuery", json.dumps(args_data)) + # Execute getQuery + response_json, didSucceed, message = broker.applyAsync( + app_id, exp_uid, "getQuery", json.dumps(args_data) + ) response_dict = json.loads(response_json) if not didSucceed: - return attach_meta({},meta_error['QueryGenerationError'], backend_error=message) + return attach_meta( + {}, meta_error["QueryGenerationError"], backend_error=message + ) if render_widget: - TEMPLATES_DIRECTORY = 'apps/{}/widgets'.format(resource_manager.get_app_id(exp_uid)) + TEMPLATES_DIRECTORY = "apps/{}/widgets".format( + resource_manager.get_app_id(exp_uid) + ) env = Environment(loader=FileSystemLoader(TEMPLATES_DIRECTORY)) - template=env.get_template("getQuery_widget.html") - return {'html':template.render(query=response_dict), 'args':response_dict}, 200, {'Access-Control-Allow-Origin':'*', 'Content-Type':'application/json'} - - return attach_meta(response_dict,meta_success), 200 - - + template = env.get_template("getQuery_widget.html") + return ( + {"html": template.render(query=response_dict), "args": response_dict}, + 200, + { + "Access-Control-Allow-Origin": "*", + "Content-Type": "application/json", + }, + ) + return attach_meta(response_dict, meta_success), 200 diff --git a/next/api/resources/logs.py b/next/api/resources/logs.py index 09ad0f01..c74ceac1 100644 --- a/next/api/resources/logs.py +++ b/next/api/resources/logs.py @@ -4,11 +4,11 @@ Logs resource for all logs associated with a specified experiment. """ -''' +""" example use: get a tripletMDS query: curl -X GET http://localhost:8001/api/experiment/[exp_uid]/logs -''' +""" from flask import Flask, request, send_file from flask_restful import Resource, reqparse @@ -24,26 +24,22 @@ resource_manager = ResourceManager() # Request parser. Checks that necessary dictionary keys are available in a given resource. -# We rely on learningLib functions to ensure that all necessary arguments are available and parsed. 
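# A client-side sketch of the two routes this Logs resource serves, assuming
# a backend at localhost:8001 as in the curl example above; the exp_uid is
# hypothetical and requests is a third-party HTTP client.
import requests

exp_uid = "b5242319c78df48f4ff31e78de5857"  # hypothetical experiment id
base = "http://localhost:8001/api/experiment/{}/logs".format(exp_uid)

all_logs = requests.get(base).json()  # every log type for the experiment
durations = requests.get(base + "/ALG-DURATION").json()  # one log type only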
+# We rely on learningLib functions to ensure that all necessary arguments are available and parsed. post_parser = reqparse.RequestParser(argument_class=APIArgument) # Custom errors for GET and POST verbs on experiment resource meta_error = { - 'ExpDoesNotExistError': { - 'message': "No experiment with the specified experiment ID exists.", - 'code': 400, - 'status':'FAIL' - }, + "ExpDoesNotExistError": { + "message": "No experiment with the specified experiment ID exists.", + "code": 400, + "status": "FAIL", + } } -meta_success = { - 'code': 200, - 'status': 'OK' -} +meta_success = {"code": 200, "status": "OK"} # Logs resource class class Logs(Resource): - def get(self, exp_uid, log_type=None): """ .. http:get:: /experiment/<exp_uid>/logs/<log_type> @@ -77,35 +73,35 @@ def get(self, exp_uid, log_type=None): :statuscode 200: Logs successfully returned :statuscode 400: Logs failed to be generated - """ + """ zip_true = False - if request.args.get('zip'): try: - zip_true = eval(request.args.get('zip')) except: pass - # Get logs for exp_uid from resource_manager if log_type: - experiment_logs = resource_manager.get_experiment_logs_of_type(exp_uid, - log_type) - all_logs = {'log_data': experiment_logs} - return attach_meta(all_logs,meta_success), 200 + experiment_logs = resource_manager.get_experiment_logs_of_type( + exp_uid, log_type + ) + all_logs = {"log_data": experiment_logs} + return attach_meta(all_logs, meta_success), 200 else: experiment_logs = resource_manager.get_experiment_logs(exp_uid) - all_logs = {'log_data': experiment_logs} + all_logs = {"log_data": experiment_logs} if zip_true: zip_logs = BytesIO() - with zipfile.ZipFile(zip_logs, 'w') as zf: - zf.writestr('logs.json', json.dumps(all_logs)) zip_logs.seek(0) - return send_file(zip_logs, - attachment_filename='logs.zip', - as_attachment='True') + with zipfile.ZipFile(zip_logs, "w") as zf: + zf.writestr("logs.json", json.dumps(all_logs)) zip_logs.seek(0) + return send_file( zip_logs, attachment_filename="logs.zip", as_attachment="True" ) else: - return attach_meta(all_logs,meta_success), 200 + return attach_meta(all_logs, meta_success), 200 if not experiment_logs: - return attach_meta({'message':'No logs to report.'},meta_success), 200 + return attach_meta({"message": "No logs to report."}, meta_success), 200 diff --git a/next/api/resources/participants.py b/next/api/resources/participants.py index 6041304a..2166995b 100644 --- a/next/api/resources/participants.py +++ b/next/api/resources/participants.py @@ -4,11 +4,11 @@ Resource for accessing all participant data related to a resource """ -''' +""" example use: get a tripletMDS query: curl -X GET http://localhost:8001/api/experiment/[exp_uid]/participants -''' +""" from StringIO import StringIO import pandas as pd from flask import Flask, send_file, request, abort @@ -16,7 +16,7 @@ import traceback import json -from io import BytesIO + from io import BytesIO import zipfile import next.utils @@ -25,29 +25,28 @@ from next.api.api_util import APIArgument from next.api.resource_manager import ResourceManager from next.database_client.DatabaseAPI import DatabaseAPI + db = DatabaseAPI() from next.logging_client.LoggerAPI import LoggerAPI + ell = LoggerAPI() resource_manager = ResourceManager() # Request parser. Checks that necessary dictionary keys are available in a given resource. -# We rely on learningLib functions to ensure that all necessary arguments are available and parsed. 
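# A client-side sketch of the csv/zip flags this resource parses below,
# assuming the same hypothetical backend and exp_uid; any value in
# true_values enables a flag.
import requests

exp_uid = "b5242319c78df48f4ff31e78de5857"  # hypothetical experiment id
url = "http://localhost:8001/api/experiment/{}/participants".format(exp_uid)

everything = requests.get(url).json()  # JSON of all participant responses

# "1" is in true_values, so this downloads responses.csv.zip instead
archive = requests.get(url, params={"zip": "1", "csv": "1"})
with open("responses.csv.zip", "wb") as f:
    f.write(archive.content)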
+# We rely on learningLib functions to ensure that all necessary arguments are available and parsed. post_parser = reqparse.RequestParser(argument_class=APIArgument) # Custom errors for GET and POST verbs on experiment resource meta_error = { - 'ExpDoesNotExistError': { - 'message': "No experiment with the specified experiment ID exists.", - 'code': 400, - 'status':'FAIL' - }, + "ExpDoesNotExistError": { + "message": "No experiment with the specified experiment ID exists.", + "code": 400, + "status": "FAIL", + } } -meta_success = { - 'code': 200, - 'status': 'OK' -} +meta_success = {"code": 200, "status": "OK"} # Participants resource class class Participants(Resource): @@ -85,13 +84,13 @@ def get(self, exp_uid): :statuscode 200: Participants responses successfully returned :statuscode 400: Participants responses failed to be generated """ - true_values ={1, '1', 'True', 'true'} + true_values = {1, "1", "True", "true"} zip_true = False - if 'zip' in request.args.keys(): - zip_true = True if request.args.get('zip') in true_values else False + if "zip" in request.args.keys(): + zip_true = True if request.args.get("zip") in true_values else False csv = False - if 'csv' in request.args.keys(): - csv = True if request.args.get('csv') in true_values else False + if "csv" in request.args.keys(): + csv = True if request.args.get("csv") in true_values else False # Get all participants for exp_uid from resource_manager participant_uids = resource_manager.get_participant_uids(exp_uid) @@ -99,16 +98,14 @@ def get(self, exp_uid): # Iterate through list of all participants for specified exp_uid for participant in participant_uids: - response = resource_manager.get_participant_data(participant, - exp_uid) + response = resource_manager.get_participant_data(participant, exp_uid) # Append participant query responses to list participant_responses[participant] = response if csv: responses = [] for participant in participant_uids: - response = resource_manager.get_participant_data(participant, - exp_uid) + response = resource_manager.get_participant_data(participant, exp_uid) for r in response: responses += [r] @@ -116,44 +113,52 @@ def get(self, exp_uid): response_file = parse_responses(responses) except ValueError as e: message = str(e) - message += '\n\n' + str(traceback.format_exc()) + message += "\n\n" + str(traceback.format_exc()) utils.debug_print(message) return message - all_responses = {'participant_responses': participant_responses} + all_responses = {"participant_responses": participant_responses} if zip_true: - filename, content = ('responses.json', json.dumps(all_responses)) - if request.args.get('csv'): - filename, content = ('responses.csv', response_file.getvalue()) + filename, content = ("responses.json", json.dumps(all_responses)) + if request.args.get("csv"): + filename, content = ("responses.csv", response_file.getvalue()) zip_responses = BytesIO() - with zipfile.ZipFile(zip_responses, 'w', - compression=zipfile.ZIP_DEFLATED) as zf: + with zipfile.ZipFile( + zip_responses, "w", compression=zipfile.ZIP_DEFLATED + ) as zf: zf.writestr(filename, content) zip_responses.seek(0) - return send_file(zip_responses, - attachment_filename=filename + '.zip', - as_attachment='True') + return send_file( + zip_responses, + attachment_filename=filename + ".zip", + as_attachment="True", + ) else: return api_util.attach_meta(all_responses, meta_success), 200 + def parse_responses(responses): if len(responses) == 0: - raise ValueError('ERROR: responses have not been recorded') - exp_uid = 
responses[0]['exp_uid'] + exp_uid = responses[0]["exp_uid"] app_id = resource_manager.get_app_id(exp_uid) myApp = utils.get_app(app_id, exp_uid, db, ell).myApp - if not hasattr(myApp, 'format_responses'): - raise ValueError('ERROR: myApp.format_responses does not exist for {}'.format(app_id)) + if not hasattr(myApp, "format_responses"): + raise ValueError( + "ERROR: myApp.format_responses does not exist for {}".format(app_id) + ) r = myApp.format_responses(responses) if type(r) != list and type(r[0]) != dict: - raise ValueError('ERROR: myApp.format_responses should return a list of dictionaries') + raise ValueError( + "ERROR: myApp.format_responses should return a list of dictionaries" + ) df = pd.DataFrame(r) str_file = StringIO() - df.to_csv(str_file, encoding='utf-8') + df.to_csv(str_file, encoding="utf-8") return str_file diff --git a/next/api/resources/process_answer.py b/next/api/resources/process_answer.py index 6dbb539a..859ca952 100644 --- a/next/api/resources/process_answer.py +++ b/next/api/resources/process_answer.py @@ -3,14 +3,15 @@ author: Christopher Fernandez, Lalit Jain Answer resource for restful answering of experiment queries in next_backend. """ -''' +""" example usage: answer a tripletMDS query: curl -H "Content-Type: application/json" \ -d '{"exp_uid": "DFPDISJFSA", "app_id": "PoolBasedTripletMDS", "args": {"alg_uid": "33f6d2c3f898cc5b4c528002bfe1351f", "target_indices": [{"index": 6, "flag": 0, "label": "center"}, {"index": 8, "flag": 0, "label": "left"}, {"index": 5, "flag": 0, "label": "right"}], "index_winner": 8, "timestamp_query_generated": "2015-02-11 14:59:20.494993"} }' \ -X POST http://localhost:8001/experiment/answer -''' +""" +from __future__ import print_function from flask import Flask from flask_restful import Resource, reqparse @@ -28,49 +29,46 @@ broker = next.broker.broker.JobBroker() # Request parser. Checks that necessary dictionary keys are available in a given resource. -# We rely on learningLib functions to ensure that all necessary arguments are available and parsed. +# We rely on learningLib functions to ensure that all necessary arguments are available and parsed. post_parser = reqparse.RequestParser(argument_class=APIArgument) # Custom errors for GET and POST verbs on experiment resource custom_errors = { - 'ReportAnswerError': { - 'message': "Failed to report answer to the specified experiment. Please verify that you have specified the correct application specific query parameters.", - 'code': 400, - 'status': 'FAIL', - }, + "ReportAnswerError": { + "message": "Failed to report answer to the specified experiment. 
Please verify that you have specified the correct application specific query parameters.", + "code": 400, + "status": "FAIL", + } } -meta_success = { - 'code':200, - 'status':'OK' - } +meta_success = {"code": 200, "status": "OK"} # Answer resource class class processAnswer(Resource): def post(self): - post_parser.add_argument('exp_uid', type=str, required=True) - post_parser.add_argument('args', type=dict, required=True) + post_parser.add_argument("exp_uid", type=str, required=True) + post_parser.add_argument("args", type=dict, required=True) # Validate args with post_parser args_data = post_parser.parse_args() # Pull app_id and exp_uid from parsed args exp_uid = args_data["exp_uid"] - args_data['args']['response_time'] = float(args_data['args']['response_time']) + args_data["args"]["response_time"] = float(args_data["args"]["response_time"]) # Fetch app_id data from resource manager app_id = resource_manager.get_app_id(exp_uid) # Parse out a target_winner. If the argument doesn't exist, return a meta dictionary error. - args_json = json.dumps(args_data) - # Execute processAnswer - response_json,didSucceed,message = broker.applyAsync(app_id, - exp_uid, - 'processAnswer', - args_json) + args_json = json.dumps(args_data) + # Execute processAnswer + response_json, didSucceed, message = broker.applyAsync( + app_id, exp_uid, "processAnswer", args_json + ) if didSucceed: return attach_meta(eval(response_json), meta_success), 200 else: - print "Failed to processAnswer", message - return attach_meta({},custom_errors['ReportAnswerError'], backend_error=message) - + print("Failed to processAnswer", message) + return attach_meta( + {}, custom_errors["ReportAnswerError"], backend_error=message + ) diff --git a/next/api/resources/targets.py b/next/api/resources/targets.py index 8f8501aa..e12bb3d7 100644 --- a/next/api/resources/targets.py +++ b/next/api/resources/targets.py @@ -16,6 +16,7 @@ from next.database_client.DatabaseAPI import DatabaseAPI from next.logging_client.LoggerAPI import LoggerAPI from next.apps.App import App + db = DatabaseAPI() ell = LoggerAPI() @@ -27,17 +28,14 @@ # Custom errors for GET and POST verbs on experiment resource meta_error = { - 'ExpDoesNotExistError': { - 'message': "No experiment with the specified experiment ID exists.", - 'code': 400, - 'status':'FAIL' - }, + "ExpDoesNotExistError": { + "message": "No experiment with the specified experiment ID exists.", + "code": 400, + "status": "FAIL", + } } -meta_success = { - 'code': 200, - 'status': 'OK' -} +meta_success = {"code": 200, "status": "OK"} # Participants resource class class Targets(Resource): diff --git a/next/apps/AppDashboard.py b/next/apps/AppDashboard.py index b0af4bed..c06db08d 100644 --- a/next/apps/AppDashboard.py +++ b/next/apps/AppDashboard.py @@ -6,73 +6,82 @@ import next.utils as utils import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt import mpld3 - MAX_SAMPLES_PER_PLOT = 100 -class AppDashboard(object): - def __init__(self, db, ell): - self.db = db - self.ell = ell +class AppDashboard(object): + def __init__(self, db, ell): + self.db = db + self.ell = ell - def basic_info(self,app,butler): - """ + def basic_info(self, app, butler): + """ returns basic statistics like number of queries, participants, etc. 
""" - experiment_dict = butler.experiment.get() + experiment_dict = butler.experiment.get() - #git_hash = rm.get_git_hash_for_exp_uid(exp_uid) - git_hash = experiment_dict.get('git_hash','None') + # git_hash = rm.get_git_hash_for_exp_uid(exp_uid) + git_hash = experiment_dict.get("git_hash", "None") - # start_date = utils.str2datetime(butler.admin.get(uid=app.exp_uid)['start_date']) - start_date = experiment_dict.get('start_date','Unknown')+' UTC' + # start_date = utils.str2datetime(butler.admin.get(uid=app.exp_uid)['start_date']) + start_date = experiment_dict.get("start_date", "Unknown") + " UTC" - # participant_uids = rm.get_participant_uids(exp_uid) - participants = butler.participants.get(pattern={'exp_uid':app.exp_uid}) - num_participants = len(participants) + # participant_uids = rm.get_participant_uids(exp_uid) + participants = butler.participants.get(pattern={"exp_uid": app.exp_uid}) + num_participants = len(participants) - queries = butler.queries.get(pattern={'exp_uid':app.exp_uid}) - num_queries = len(queries) + queries = butler.queries.get(pattern={"exp_uid": app.exp_uid}) + num_queries = len(queries) - return_dict = {'git_hash':git_hash, - 'exp_start_data':start_date, - 'num_participants':num_participants, - 'num_queries':num_queries, - 'meta':{'last_dashboard_update':'<1 minute ago'}} - return return_dict + return_dict = { + "git_hash": git_hash, + "exp_start_data": start_date, + "num_participants": num_participants, + "num_queries": num_queries, + "meta": {"last_dashboard_update": "<1 minute ago"}, + } + return return_dict - def api_activity_histogram(self, app, butler): - """ + def api_activity_histogram(self, app, butler): + """ Description: returns the data to plot all API activity (for all algorithms) in a histogram with respect to time for any task in {getQuery,processAnswer,predict} Expected output (in dict): (dict) MPLD3 plot dictionary """ - queries = butler.queries.get(pattern={'exp_uid':app.exp_uid}) - #self.db.get_docs_with_filter(app_id+':queries',{'exp_uid':exp_uid}) - start_date = utils.str2datetime(butler.admin.get(uid=app.exp_uid)['start_date']) - numerical_timestamps = [(utils.str2datetime(item['timestamp_query_generated'])-start_date).total_seconds() - for item in queries] - fig, ax = plt.subplots(subplot_kw=dict(axisbg='#FFFFFF'),figsize=(12,1.5)) - ax.hist(numerical_timestamps,min(int(1+4*numpy.sqrt(len(numerical_timestamps))),300),alpha=0.5,color='black') - ax.set_frame_on(False) - ax.get_xaxis().set_ticks([]) - ax.get_yaxis().set_ticks([]) - ax.get_yaxis().set_visible(False) - ax.set_xlim(0, max(numerical_timestamps)) - plot_dict = mpld3.fig_to_dict(fig) - plt.close() - return plot_dict - - - - def compute_duration_multiline_plot(self, app, butler, task): - """ + queries = butler.queries.get(pattern={"exp_uid": app.exp_uid}) + # self.db.get_docs_with_filter(app_id+':queries',{'exp_uid':exp_uid}) + start_date = utils.str2datetime(butler.admin.get(uid=app.exp_uid)["start_date"]) + numerical_timestamps = [ + ( + utils.str2datetime(item["timestamp_query_generated"]) - start_date + ).total_seconds() + for item in queries + ] + fig, ax = plt.subplots(subplot_kw=dict(axisbg="#FFFFFF"), figsize=(12, 1.5)) + ax.hist( + numerical_timestamps, + min(int(1 + 4 * numpy.sqrt(len(numerical_timestamps))), 300), + alpha=0.5, + color="black", + ) + ax.set_frame_on(False) + ax.get_xaxis().set_ticks([]) + ax.get_yaxis().set_ticks([]) + ax.get_yaxis().set_visible(False) + ax.set_xlim(0, max(numerical_timestamps)) + plot_dict = mpld3.fig_to_dict(fig) + plt.close() + return 
plot_dict + + def compute_duration_multiline_plot(self, app, butler, task): + """ Description: Returns multiline plot where there is a one-to-one mapping lines to algorithms and each line indicates the durations to complete the task (wrt to the api call) @@ -83,85 +92,92 @@ def compute_duration_multiline_plot(self, app, butler, task): (dict) MPLD3 plot dictionary """ - alg_list = butler.experiment.get(key='args')['alg_list'] - x_min = numpy.float('inf') - x_max = -numpy.float('inf') - y_min = numpy.float('inf') - y_max = -numpy.float('inf') - list_of_alg_dicts = [] - - for algorithm in alg_list: - alg_label = algorithm['alg_label'] - list_of_log_dict = butler.ell.get_logs_with_filter(app.app_id+':ALG-DURATION', - {'exp_uid':app.exp_uid,'alg_label':alg_label,'task':task}) - list_of_log_dict = sorted(list_of_log_dict, key=lambda item: utils.str2datetime(item['timestamp']) ) - - x = [] - y = [] - t = [] - k=0 - for item in list_of_log_dict: - k+=1 - x.append(k) - y.append( item.get('app_duration',0.) + item.get('duration_enqueued',0.) ) - t.append(str(item['timestamp'])[:-3]) - - x = numpy.array(x) - y = numpy.array(y) - t = numpy.array(t) - num_items = len(list_of_log_dict) - multiplier = min(num_items,MAX_SAMPLES_PER_PLOT) - incr_inds = [ r*num_items/multiplier for r in range(multiplier)] - max_inds = list(numpy.argsort(-y)[0:multiplier]) - final_inds = sorted(set(incr_inds + max_inds)) - x = list(x[final_inds]) - y = list(y[final_inds]) - t = list(t[final_inds]) - - alg_dict = {} - alg_dict['legend_label'] = alg_label - alg_dict['x'] = x - alg_dict['y'] = y - alg_dict['t'] = t - try: - x_min = min(x_min,min(x)) - x_max = max(x_max,max(x)) - y_min = min(y_min,min(y)) - y_max = max(y_max,max(y)) - except: - pass - - list_of_alg_dicts.append(alg_dict) - - return_dict = {} - return_dict['data'] = list_of_alg_dicts - return_dict['plot_type'] = 'multi_line_plot' - return_dict['x_label'] = 'API Call' - return_dict['x_min'] = x_min - return_dict['x_max'] = x_max - return_dict['y_label'] = 'Duration (s)' - return_dict['y_min'] = y_min - return_dict['y_max'] = y_max - - fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE')) - for alg_dict in list_of_alg_dicts: - ax.plot(alg_dict['x'],alg_dict['y'],label=alg_dict['legend_label']) - ax.set_xlabel('API Call') - ax.set_ylabel('Duration (s)') - ax.set_xlim([x_min,x_max]) - ax.set_ylim([y_min,y_max]) - ax.grid(color='white', linestyle='solid') - ax.set_title(task, size=14) - legend = ax.legend(loc=2,ncol=3,mode="expand") - for label in legend.get_texts(): - label.set_fontsize('small') - plot_dict = mpld3.fig_to_dict(fig) - plt.close() - return plot_dict - - - def compute_duration_detailed_stacked_area_plot(self,app,butler,task,alg_label,detailedDB=False): - """ + alg_list = butler.experiment.get(key="args")["alg_list"] + x_min = numpy.float("inf") + x_max = -numpy.float("inf") + y_min = numpy.float("inf") + y_max = -numpy.float("inf") + list_of_alg_dicts = [] + + for algorithm in alg_list: + alg_label = algorithm["alg_label"] + list_of_log_dict = butler.ell.get_logs_with_filter( + app.app_id + ":ALG-DURATION", + {"exp_uid": app.exp_uid, "alg_label": alg_label, "task": task}, + ) + list_of_log_dict = sorted( + list_of_log_dict, key=lambda item: utils.str2datetime(item["timestamp"]) + ) + + x = [] + y = [] + t = [] + k = 0 + for item in list_of_log_dict: + k += 1 + x.append(k) + y.append( + item.get("app_duration", 0.) + item.get("duration_enqueued", 0.) 
+ ) + t.append(str(item["timestamp"])[:-3]) + + x = numpy.array(x) + y = numpy.array(y) + t = numpy.array(t) + num_items = len(list_of_log_dict) + multiplier = min(num_items, MAX_SAMPLES_PER_PLOT) + incr_inds = [r * num_items / multiplier for r in range(multiplier)] + max_inds = list(numpy.argsort(-y)[0:multiplier]) + final_inds = sorted(set(incr_inds + max_inds)) + x = list(x[final_inds]) + y = list(y[final_inds]) + t = list(t[final_inds]) + + alg_dict = {} + alg_dict["legend_label"] = alg_label + alg_dict["x"] = x + alg_dict["y"] = y + alg_dict["t"] = t + try: + x_min = min(x_min, min(x)) + x_max = max(x_max, max(x)) + y_min = min(y_min, min(y)) + y_max = max(y_max, max(y)) + except: + pass + + list_of_alg_dicts.append(alg_dict) + + return_dict = {} + return_dict["data"] = list_of_alg_dicts + return_dict["plot_type"] = "multi_line_plot" + return_dict["x_label"] = "API Call" + return_dict["x_min"] = x_min + return_dict["x_max"] = x_max + return_dict["y_label"] = "Duration (s)" + return_dict["y_min"] = y_min + return_dict["y_max"] = y_max + + fig, ax = plt.subplots(subplot_kw=dict(axisbg="#EEEEEE")) + for alg_dict in list_of_alg_dicts: + ax.plot(alg_dict["x"], alg_dict["y"], label=alg_dict["legend_label"]) + ax.set_xlabel("API Call") + ax.set_ylabel("Duration (s)") + ax.set_xlim([x_min, x_max]) + ax.set_ylim([y_min, y_max]) + ax.grid(color="white", linestyle="solid") + ax.set_title(task, size=14) + legend = ax.legend(loc=2, ncol=3, mode="expand") + for label in legend.get_texts(): + label.set_fontsize("small") + plot_dict = mpld3.fig_to_dict(fig) + plt.close() + return plot_dict + + def compute_duration_detailed_stacked_area_plot( + self, app, butler, task, alg_label, detailedDB=False + ): + """ Description: Returns stacked area plot for a particular algorithm and task where the durations are broken down into compute,db_set,db_get (for cpu, database_set, database_get) @@ -172,78 +188,90 @@ def compute_duration_detailed_stacked_area_plot(self,app,butler,task,alg_label,d Expected output (in dict): (dict) MPLD3 plot dictionary """ - list_of_log_dict = butler.ell.get_logs_with_filter(app.app_id+':ALG-DURATION', - {'exp_uid':app.exp_uid,'alg_label':alg_label,'task':task}) - list_of_log_dict = sorted(list_of_log_dict, key=lambda item: utils.str2datetime(item['timestamp']) ) - - y = [] - for item in list_of_log_dict: - y.append( item.get('app_duration',0.) + item.get('duration_enqueued',0.) ) - y = numpy.array(y) - num_items = len(list_of_log_dict) - multiplier = min(num_items,MAX_SAMPLES_PER_PLOT) - incr_inds = [ k*num_items/multiplier for k in range(multiplier)] - max_inds = list(numpy.argsort(-y)[0:multiplier]) - final_inds = sorted(set(incr_inds + max_inds)) - - x = [] - t = [] - enqueued = [] - admin = [] - dbGet = [] - dbSet = [] - compute = [] - - max_y_value = 0. - min_y_value = float('inf') - for idx in final_inds: - item = list_of_log_dict[idx] - x.append(idx+1) - t.append(str(item.get('timestamp',''))) - - _alg_duration = item.get('duration',0.) - _alg_duration_dbGet = item.get('duration_dbGet',0.) - _alg_duration_dbSet = item.get('duration_dbSet',0.) - _duration_enqueued = item.get('duration_enqueued',0.) - _app_duration = item.get('app_duration',0.) 
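# How the stacked components computed below decompose one API call: the fields
# read above give app_duration (the whole backend call), duration (the
# algorithm's share of it), and duration_dbGet/duration_dbSet (the algorithm's
# database time), so
#
#     admin   = app_duration - duration
#     compute = duration - duration_dbGet - duration_dbSet
#
# and enqueued + admin + dbGet + dbSet + compute sums back to
# app_duration + duration_enqueued, the total height plotted.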
- - if (_app_duration+_duration_enqueued) > max_y_value: - max_y_value = _app_duration + _duration_enqueued - if (_app_duration+_duration_enqueued) < min_y_value: - min_y_value = _app_duration + _duration_enqueued - - enqueued.append(_duration_enqueued) - admin.append(_app_duration-_alg_duration) - dbSet.append(_alg_duration_dbSet) - dbGet.append(_alg_duration_dbGet) - compute.append( _alg_duration - _alg_duration_dbSet - _alg_duration_dbGet ) - - try: - min_x = min(x) - max_x = max(x) - except: - min_x = 0. - max_x = 0. - - fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE')) - stack_coll = ax.stackplot(x,compute,dbGet,dbSet,admin,enqueued, alpha=.5) - ax.set_xlabel('API Call') - ax.set_ylabel('Duration (s)') - ax.set_xlim([min_x,max_x]) - ax.set_ylim([0.,max_y_value]) - ax.grid(color='white', linestyle='solid') - ax.set_title(alg_label+' - '+task, size=14) - proxy_rects = [plt.Rectangle((0, 0), 1, 1, alpha=.5,fc=pc.get_facecolor()[0]) for pc in stack_coll] - legend = ax.legend(proxy_rects, ['compute','dbGet','dbSet','admin','enqueued'],loc=2,ncol=3,mode="expand") - for label in legend.get_texts(): - label.set_fontsize('small') - plot_dict = mpld3.fig_to_dict(fig) - plt.close() - return plot_dict - - - def response_time_histogram(self,app,butler,alg_label): - """ + list_of_log_dict = butler.ell.get_logs_with_filter( + app.app_id + ":ALG-DURATION", + {"exp_uid": app.exp_uid, "alg_label": alg_label, "task": task}, + ) + list_of_log_dict = sorted( + list_of_log_dict, key=lambda item: utils.str2datetime(item["timestamp"]) + ) + + y = [] + for item in list_of_log_dict: + y.append(item.get("app_duration", 0.) + item.get("duration_enqueued", 0.)) + y = numpy.array(y) + num_items = len(list_of_log_dict) + multiplier = min(num_items, MAX_SAMPLES_PER_PLOT) + incr_inds = [k * num_items / multiplier for k in range(multiplier)] + max_inds = list(numpy.argsort(-y)[0:multiplier]) + final_inds = sorted(set(incr_inds + max_inds)) + + x = [] + t = [] + enqueued = [] + admin = [] + dbGet = [] + dbSet = [] + compute = [] + + max_y_value = 0. + min_y_value = float("inf") + for idx in final_inds: + item = list_of_log_dict[idx] + x.append(idx + 1) + t.append(str(item.get("timestamp", ""))) + + _alg_duration = item.get("duration", 0.) + _alg_duration_dbGet = item.get("duration_dbGet", 0.) + _alg_duration_dbSet = item.get("duration_dbSet", 0.) + _duration_enqueued = item.get("duration_enqueued", 0.) + _app_duration = item.get("app_duration", 0.) + + if (_app_duration + _duration_enqueued) > max_y_value: + max_y_value = _app_duration + _duration_enqueued + if (_app_duration + _duration_enqueued) < min_y_value: + min_y_value = _app_duration + _duration_enqueued + + enqueued.append(_duration_enqueued) + admin.append(_app_duration - _alg_duration) + dbSet.append(_alg_duration_dbSet) + dbGet.append(_alg_duration_dbGet) + compute.append(_alg_duration - _alg_duration_dbSet - _alg_duration_dbGet) + + try: + min_x = min(x) + max_x = max(x) + except: + min_x = 0. + max_x = 0. 
+ + fig, ax = plt.subplots(subplot_kw=dict(axisbg="#EEEEEE")) + stack_coll = ax.stackplot(x, compute, dbGet, dbSet, admin, enqueued, alpha=.5) + ax.set_xlabel("API Call") + ax.set_ylabel("Duration (s)") + ax.set_xlim([min_x, max_x]) + ax.set_ylim([0., max_y_value]) + ax.grid(color="white", linestyle="solid") + ax.set_title(alg_label + " - " + task, size=14) + proxy_rects = [ + plt.Rectangle((0, 0), 1, 1, alpha=.5, fc=pc.get_facecolor()[0]) + for pc in stack_coll + ] + legend = ax.legend( + proxy_rects, + ["compute", "dbGet", "dbSet", "admin", "enqueued"], + loc=2, + ncol=3, + mode="expand", + ) + for label in legend.get_texts(): + label.set_fontsize("small") + plot_dict = mpld3.fig_to_dict(fig) + plt.close() + return plot_dict + + def response_time_histogram(self, app, butler, alg_label): + """ Description: returns the data to plot response time histogram of processAnswer for each algorithm Expected input: @@ -252,28 +280,36 @@ def response_time_histogram(self,app,butler,alg_label): Expected output (in dict): (dict) MPLD3 plot dictionary """ - list_of_query_dict = self.db.get_docs_with_filter(app.app_id+':queries',{'exp_uid':app.exp_uid,'alg_label':alg_label}) - t = [] - for item in list_of_query_dict: - try: - t.append(item['response_time']) - except: - pass - - fig, ax = plt.subplots(subplot_kw=dict(axisbg='#FFFFFF')) - ax.hist(t, bins=min(len(t), MAX_SAMPLES_PER_PLOT), range=(0,30),alpha=0.5,color='black') - ax.set_xlim(0, 30) - ax.set_axis_off() - ax.set_xlabel('Durations (s)') - ax.set_ylabel('Count') - ax.set_title(alg_label + " - response time", size=14) - plot_dict = mpld3.fig_to_dict(fig) - plt.close() - - return plot_dict - - def network_delay_histogram(self, app, butler, alg_label): - """ + list_of_query_dict = self.db.get_docs_with_filter( + app.app_id + ":queries", {"exp_uid": app.exp_uid, "alg_label": alg_label} + ) + t = [] + for item in list_of_query_dict: + try: + t.append(item["response_time"]) + except: + pass + + fig, ax = plt.subplots(subplot_kw=dict(axisbg="#FFFFFF")) + ax.hist( + t, + bins=min(len(t), MAX_SAMPLES_PER_PLOT), + range=(0, 30), + alpha=0.5, + color="black", + ) + ax.set_xlim(0, 30) + ax.set_axis_off() + ax.set_xlabel("Durations (s)") + ax.set_ylabel("Count") + ax.set_title(alg_label + " - response time", size=14) + plot_dict = mpld3.fig_to_dict(fig) + plt.close() + + return plot_dict + + def network_delay_histogram(self, app, butler, alg_label): + """ Description: returns the data to network delay histogram of the time it takes to getQuery+processAnswer for each algorithm Expected input: @@ -282,24 +318,25 @@ def network_delay_histogram(self, app, butler, alg_label): Expected output (in dict): (dict) MPLD3 plot dictionary """ - list_of_query_dict = self.db.get_docs_with_filter(app.app_id+':queries',{'exp_uid':app.exp_uid,'alg_label':alg_label}) - - t = [] - for item in list_of_query_dict: - try: - t.append(item['network_delay']) - except: - pass - - fig, ax = plt.subplots(subplot_kw=dict(axisbg='#FFFFFF')) - ax.hist(t,MAX_SAMPLES_PER_PLOT,range=(0,5),alpha=0.5,color='black') - ax.set_xlim(0, 5) - ax.set_axis_off() - ax.set_xlabel('Durations (s)') - ax.set_ylabel('Count') - ax.set_title(alg_label + " - network delay", size=14) - plot_dict = mpld3.fig_to_dict(fig) - plt.close() - - return plot_dict - + list_of_query_dict = self.db.get_docs_with_filter( + app.app_id + ":queries", {"exp_uid": app.exp_uid, "alg_label": alg_label} + ) + + t = [] + for item in list_of_query_dict: + try: + t.append(item["network_delay"]) + except: + pass + + fig, ax = 
plt.subplots(subplot_kw=dict(axisbg="#FFFFFF")) + ax.hist(t, MAX_SAMPLES_PER_PLOT, range=(0, 5), alpha=0.5, color="black") + ax.set_xlim(0, 5) + ax.set_axis_off() + ax.set_xlabel("Durations (s)") + ax.set_ylabel("Count") + ax.set_title(alg_label + " - network delay", size=14) + plot_dict = mpld3.fig_to_dict(fig) + plt.close() + + return plot_dict diff --git a/next/apps/Butler.py b/next/apps/Butler.py index cfb7a84b..4989d9d7 100644 --- a/next/apps/Butler.py +++ b/next/apps/Butler.py @@ -9,23 +9,29 @@ class Memory(object): - def __init__(self, collection='', exp_uid='', uid_prefix=''): + def __init__(self, collection="", exp_uid="", uid_prefix=""): self.key_prefix = collection + uid_prefix.format(exp_uid=exp_uid) self.cache = None self.max_entry_size = 500000000 # 500MB def check_prefix(self): - if self.key_prefix == '': - utils.debug_print("butler.memory is deprecated." - " Change to butler.experiment.memory or butler.algorithm.memory, etc." - " wherever appropriate") + if self.key_prefix == "": + utils.debug_print( + "butler.memory is deprecated." + " Change to butler.experiment.memory or butler.algorithm.memory, etc." + " wherever appropriate" + ) def ensure_connection(self): try: if self.cache is None: - self.cache = redis.StrictRedis(host=constants.MINIONREDIS_HOST, port=constants.MINIONREDIS_PORT) + self.cache = redis.StrictRedis( + host=constants.MINIONREDIS_HOST, port=constants.MINIONREDIS_PORT + ) except Exception as e: - raise Exception("Butler.Collection.Memory could not connect with RedisDB: {}".format(e)) + raise Exception( + "Butler.Collection.Memory could not connect with RedisDB: {}".format(e) + ) def num_entries(self, size): if size % self.max_entry_size == 0: @@ -43,7 +49,9 @@ def set(self, key, value): utils.debug_print("Setting {} in {} entries".format(l, n)) for i in range(n): k = key + ":" + str(i) - self.cache.set(k, value[i*self.max_entry_size:(i+1)*self.max_entry_size]) + self.cache.set( + k, value[i * self.max_entry_size : (i + 1) * self.max_entry_size] + ) return self.cache.set(key, "{}:{}".format(str(n), str(l))) except Exception as e: utils.debug_print("Butler.Collection.Memory.set exception: {}".format(e)) @@ -65,7 +73,9 @@ def set_file(self, key, f): self.cache.set(k, v) return self.cache.set(key, "{}:{}".format(str(n), str(l))) except Exception as e: - utils.debug_print("Butler.Collection.Memory.set_file exception: {}".format(e)) + utils.debug_print( + "Butler.Collection.Memory.set_file exception: {}".format(e) + ) return False def get(self, key): @@ -104,7 +114,9 @@ def get_file(self, key): f.seek(0, 0) return f except Exception as e: - utils.debug_print("Butler.Collection.Memory.get_file exception: {}".format(e)) + utils.debug_print( + "Butler.Collection.Memory.get_file exception: {}".format(e) + ) return None def lock(self, name, **kwargs): @@ -137,44 +149,49 @@ def __init__(self, collection, uid_prefix, exp_uid, db, timing=True): self.timing = timing self.memory = Memory(collection, exp_uid, uid_prefix) - def timed(op_type='set'): + def timed(op_type="set"): def decorator(f): @wraps(f) def wrapper(self, *args, **kwargs): result, dt = utils.timeit(f)(self, *args, **kwargs) - if op_type == 'set': + if op_type == "set": self.set_durations += dt - elif op_type == 'get': + elif op_type == "get": self.get_durations += dt return result + return wrapper - return decorator + return decorator - @timed(op_type='set') + @timed(op_type="set") def set(self, uid="", key=None, value=None, exp=None): """ Set an object in the collection, or an entry in an object in the 
collection. * key == None: collection[uid] = value * key != None: collection[uid][key] = value """ - uid = (self.uid_prefix+uid).format(exp_uid=(self.exp_uid if exp is None else exp)) + uid = (self.uid_prefix + uid).format( + exp_uid=(self.exp_uid if exp is None else exp) + ) if not key: self.db.set_doc(self.collection, uid, value) else: self.db.set(self.collection, uid, key, value) - @timed(op_type='set') + @timed(op_type="set") def set_many(self, uid="", key_value_dict=None, exp=None): """ For each key in key_value_dict, sets value by key_value_dict[key] """ - uid = (self.uid_prefix+uid).format(exp_uid=(self.exp_uid if exp is None else exp)) + uid = (self.uid_prefix + uid).format( + exp_uid=(self.exp_uid if exp is None else exp) + ) return self.db.set_many(self.collection, uid, key_value_dict) - @timed(op_type='get') + @timed(op_type="get") def get(self, uid="", key=None, pattern=None, exp=None): """ Get an object from the collection (possibly by pattern), or an entry (or entries) from an object in the collection. @@ -183,7 +200,9 @@ def get(self, uid="", key=None, pattern=None, exp=None): * key != None and pattern == None and type(key) == list: return {k: collection[uid][k] for k in key} * pattern != None: return collection[uid] matching pattern """ - uid = (self.uid_prefix+uid).format(exp_uid=(self.exp_uid if exp is None else exp)) + uid = (self.uid_prefix + uid).format( + exp_uid=(self.exp_uid if exp is None else exp) + ) if key is None and pattern is None: return self.db.get_doc(self.collection, uid) elif key: @@ -194,24 +213,28 @@ def get(self, uid="", key=None, pattern=None, exp=None): else: return self.db.get_docs_with_filter(self.collection, pattern) - @timed(op_type='get') + @timed(op_type="get") def get_and_delete(self, uid="", key=None, exp=None): """ Get a value from the collection corresponding to the key and then delete the (key,value). """ - uid = (self.uid_prefix+uid).format(exp_uid=(self.exp_uid if exp is None else exp)) + uid = (self.uid_prefix + uid).format( + exp_uid=(self.exp_uid if exp is None else exp) + ) value = self.db.get_and_delete(self.collection, uid, key) return value - @timed(op_type='get') - def exists(self, uid="", key='_id', exp=None): + @timed(op_type="get") + def exists(self, uid="", key="_id", exp=None): """ Check if an object with the specified uid exists """ - uid = (self.uid_prefix+uid).format(exp_uid=(self.exp_uid if exp is None else exp)) + uid = (self.uid_prefix + uid).format( + exp_uid=(self.exp_uid if exp is None else exp) + ) return self.db.exists(self.collection, uid, key) - @timed(op_type='get') + @timed(op_type="get") def increment(self, uid="", key=None, exp=None, value=1): """ Increment a value (or values) in the collection. @@ -219,28 +242,34 @@ def increment(self, uid="", key=None, exp=None, value=1): * value: How much the value should be incremented by. """ - uid = (self.uid_prefix+uid).format(exp_uid=(self.exp_uid if exp is None else exp)) + uid = (self.uid_prefix + uid).format( + exp_uid=(self.exp_uid if exp is None else exp) + ) return self.db.increment(self.collection, uid, key, value) - @timed(op_type='get') + @timed(op_type="get") def increment_many(self, uid="", key_value_dict=None, exp=None): """ For each key in key_value_dict, increments value by key_value_dict[key] * values: How much the value should be incremented by. 
""" - uid = (self.uid_prefix+uid).format(exp_uid=(self.exp_uid if exp is None else exp)) + uid = (self.uid_prefix + uid).format( + exp_uid=(self.exp_uid if exp is None else exp) + ) return self.db.increment_many(self.collection, uid, key_value_dict) - @timed(op_type='set') + @timed(op_type="set") def append(self, uid="", key=None, value=None, exp=None): """ Append a value to collection[uid][key] (which is assumed to be a list) """ - uid = (self.uid_prefix+uid).format(exp_uid=(self.exp_uid if exp == None else exp)) + uid = (self.uid_prefix + uid).format( + exp_uid=(self.exp_uid if exp == None else exp) + ) self.db.append_list(self.collection, uid, key, value) - @timed(op_type='get') + @timed(op_type="get") def pop(self, uid="", key=None, value=-1, exp=None): """ Pop a value from collection[uid][key] (which is assumed to be a list) @@ -248,14 +277,19 @@ def pop(self, uid="", key=None, value=-1, exp=None): value=0 pops the first element of the list Other values for "value" will throw error and return a None (not supported in Mongo) """ - uid = (self.uid_prefix+uid).format(exp_uid=(self.exp_uid if exp == None else exp)) + uid = (self.uid_prefix + uid).format( + exp_uid=(self.exp_uid if exp == None else exp) + ) return self.db.pop_list(self.collection, uid, key, value) def getDurations(self): """ For book keeping purposes only """ - return {'duration_dbSet': self.set_durations, 'duration_dbGet': self.get_durations} + return { + "duration_dbSet": self.set_durations, + "duration_dbGet": self.get_durations, + } class Butler(object): @@ -271,29 +305,51 @@ def __init__(self, app_id, exp_uid, targets, db, ell, alg_label=None, alg_id=Non if self.targets.db is None: self.targets.db = self.db - self.queries = Collection(self.app_id+":queries", "", self.exp_uid, db) + self.queries = Collection(self.app_id + ":queries", "", self.exp_uid, db) self.admin = Collection("experiments_admin", "", self.exp_uid, db) - self.experiment = Collection(self.app_id+":experiments", "{exp_uid}", self.exp_uid, db) + self.experiment = Collection( + self.app_id + ":experiments", "{exp_uid}", self.exp_uid, db + ) if alg_label is None: - self.algorithms = Collection(self.app_id+":algorithms", "{exp_uid}_", self.exp_uid, db) + self.algorithms = Collection( + self.app_id + ":algorithms", "{exp_uid}_", self.exp_uid, db + ) else: - self.algorithms = Collection(self.app_id+":algorithms", "{exp_uid}_"+alg_label, self.exp_uid, db) - self.participants = Collection(self.app_id+":participants", "", self.exp_uid, db) - self.dashboard = Collection(self.app_id+":dashboard", "", self.exp_uid, db) - self.other = Collection(self.app_id+":other", "{exp_uid}_", self.exp_uid, db) + self.algorithms = Collection( + self.app_id + ":algorithms", "{exp_uid}_" + alg_label, self.exp_uid, db + ) + self.participants = Collection( + self.app_id + ":participants", "", self.exp_uid, db + ) + self.dashboard = Collection(self.app_id + ":dashboard", "", self.exp_uid, db) + self.other = Collection(self.app_id + ":other", "{exp_uid}_", self.exp_uid, db) def log(self, log_name, log_value): - self.ell.log(self.app_id+":"+log_name, log_value) + self.ell.log(self.app_id + ":" + log_name, log_value) def job(self, task, task_args_json, ignore_result=True, time_limit=0): if self.alg_label: - res = self.db.submit_job(self.app_id, self.exp_uid, - task, task_args_json, - self.exp_uid + '_' + self.alg_label, - ignore_result, time_limit, - alg_id=self.alg_id, alg_label=self.alg_label) + res = self.db.submit_job( + self.app_id, + self.exp_uid, + task, + task_args_json, + 
self.exp_uid + "_" + self.alg_label, + ignore_result, + time_limit, + alg_id=self.alg_id, + alg_label=self.alg_label, + ) else: - res = self.db.submit_job(self.app_id, self.exp_uid, task, task_args_json, None, ignore_result, time_limit) + res = self.db.submit_job( + self.app_id, + self.exp_uid, + task, + task_args_json, + None, + ignore_result, + time_limit, + ) if not ignore_result: return res diff --git a/next/apps/SimpleTargetManager.py b/next/apps/SimpleTargetManager.py index 9aed39a3..9c6e59ba 100644 --- a/next/apps/SimpleTargetManager.py +++ b/next/apps/SimpleTargetManager.py @@ -1,17 +1,18 @@ import next.utils as utils + class SimpleTargetManager(object): - def __init__(self,db): - self.bucket_id = 'targets' + def __init__(self, db): + self.bucket_id = "targets" self.db = db def set_targetset(self, exp_uid, targetset): """ Update the default target docs in the DB if a user uploads a target set. """ - for i,target in enumerate(targetset): - target['target_id'] = i - target['exp_uid'] = exp_uid + for i, target in enumerate(targetset): + target["target_id"] = i + target["exp_uid"] = exp_uid try: self.db.set_doc(self.bucket_id, None, target) @@ -22,7 +23,7 @@ def get_targetset(self, exp_uid): """ Gets the entire targetset for a given experiment as a list of dictionaries. """ - targetset = self.db.get_docs_with_filter(self.bucket_id, {'exp_uid': exp_uid}) + targetset = self.db.get_docs_with_filter(self.bucket_id, {"exp_uid": exp_uid}) if targetset is None: raise Exception("Target set for experiment {} is empty".format(targetset)) # targetset = mongotized_target_blob.pop(0) @@ -34,9 +35,9 @@ def get_target_item(self, exp_uid, target_id): """ # Get an individual target form the DB given exp_uid and index try: - got_target = self.db.get_docs_with_filter(self.bucket_id, - {'exp_uid': exp_uid, - 'target_id': target_id}) + got_target = self.db.get_docs_with_filter( + self.bucket_id, {"exp_uid": exp_uid, "target_id": target_id} + ) except Exception as e: raise Exception("Failed to get_target_item: " + str(e)) @@ -45,28 +46,36 @@ def get_target_item(self, exp_uid, target_id): target = got_target.pop(0) except: # targets are numbers - target = {'target_id':target_id, - 'primary_description':str(target_id), - 'primary_type':'text', - 'alt_description':str(target_id), - 'alt_type':'text'} + target = { + "target_id": target_id, + "primary_description": str(target_id), + "primary_type": "text", + "alt_description": str(target_id), + "alt_type": "text", + } # This line might fail; only tested under the except: statement above - #del target['exp_uid'] + # del target['exp_uid'] return target def get_target_mapping(self, exp_uid): # Get all docs for specified exp_uid - mongotized_target_blob = self.db.get_docs_with_filter(self.bucket_id, {'exp_uid': exp_uid}) + mongotized_target_blob = self.db.get_docs_with_filter( + self.bucket_id, {"exp_uid": exp_uid} + ) # If no docs with exp_uid can be retreived, throw an error if mongotized_target_blob is None: - raise DatabaseException("No documents with exp_uid {} could be retrieved".format(exp_uid)) + raise DatabaseException( + "No documents with exp_uid {} could be retrieved".format(exp_uid) + ) # Pop target_blob_dict out of list for i in range(len(mongotized_target_blob)): - if 'targetless' in mongotized_target_blob[i].keys(): + if "targetless" in mongotized_target_blob[i].keys(): mongotized_target_blob.pop(i) break try: - mongotized_target_blob = sorted(mongotized_target_blob,key = lambda x: x.get('target_id',0)) + mongotized_target_blob = sorted( + 
diff --git a/next/apps/test_utils.py b/next/apps/test_utils.py
index 2997ea1a..d7009fe7 100644
--- a/next/apps/test_utils.py
+++ b/next/apps/test_utils.py
@@ -1,129 +1,169 @@
 """
 This file is to provide a unified testing framework for NEXT.
 """
+from __future__ import print_function
 import requests
 import os
 import json
 import time
 import numpy as np
 
-HOSTNAME = os.environ.get('NEXT_BACKEND_GLOBAL_HOST', 'localhost') \
-           + ':' + os.environ.get('NEXT_BACKEND_GLOBAL_PORT', '8000')
+HOSTNAME = (
+    os.environ.get("NEXT_BACKEND_GLOBAL_HOST", "localhost")
+    + ":"
+    + os.environ.get("NEXT_BACKEND_GLOBAL_PORT", "8000")
+)
 
 
 def initExp(initExp_args_dict, assert_200=True):
-    url = "http://"+HOSTNAME+"/api/experiment"
-    response = requests.post(url, json.dumps(initExp_args_dict),
-                             headers={'content-type':'application/json'})
-    print("POST initExp response =",response.text, response.status_code)
-    if assert_200: assert response.status_code is 200
+    url = "http://" + HOSTNAME + "/api/experiment"
+    response = requests.post(
+        url, json.dumps(initExp_args_dict), headers={"content-type": "application/json"}
+    )
+    print("POST initExp response =", response.text, response.status_code)
+    if assert_200:
+        assert response.status_code == 200
     initExp_response_dict = json.loads(response.text)
-    exp_uid = initExp_response_dict['exp_uid']
+    exp_uid = initExp_response_dict["exp_uid"]
 
     #################################################
     # Test initExperiment
     #################################################
-    url = "http://"+HOSTNAME+"/api/experiment/"+exp_uid
+    url = "http://" + HOSTNAME + "/api/experiment/" + exp_uid
    response = requests.get(url)
-    print "GET experiment response =", response.text, response.status_code
-    if assert_200: assert response.status_code is 200
+    print("GET experiment response =", response.text, response.status_code)
+    if assert_200:
+        assert response.status_code == 200
    initExp_response_dict = json.loads(response.text)
 
-    return initExp_response_dict, {'exp_uid': exp_uid}
+    return initExp_response_dict, {"exp_uid": exp_uid}
 
 
 def getQuery(getQuery_args_dict, assert_200=True, verbose=False):
-    url = 'http://'+HOSTNAME+'/api/experiment/getQuery'
-    response,dt = timeit(requests.post)(url, json.dumps(getQuery_args_dict),headers={'content-type':'application/json'})
+    url = "http://" + HOSTNAME + "/api/experiment/getQuery"
+    response, dt = timeit(requests.post)(
+        url,
+        json.dumps(getQuery_args_dict),
+        headers={"content-type": "application/json"},
+    )
     if verbose:
-        print "POST getQuery response = ", response.text, response.status_code
-    if assert_200: assert response.status_code is 200
+        print("POST getQuery response = ", response.text, response.status_code)
+    if assert_200:
+        assert response.status_code == 200
     if verbose:
-        print "POST getQuery duration = ", dt
+        print("POST getQuery duration = ", dt)
     query_dict = json.loads(response.text)
     return query_dict, dt
 
 
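These helpers are meant to be chained into an end-to-end smoke test against a running backend. A minimal sketch of such a loop; the initExp argument schema and the answer payload (query_uid, target_winner) are app-specific, so treat them as illustrative:

    def run_smoke_test(initExp_args_dict, n_queries=10):
        _, meta = initExp(initExp_args_dict)  # returns (response_dict, {'exp_uid': ...})
        exp_uid = meta['exp_uid']
        for _ in range(n_queries):
            query, _ = getQuery({'exp_uid': exp_uid,
                                 'args': {'participant_uid': 'smoke_test'}})
            response_delay()  # simulate a human pausing before answering
            processAnswer({'exp_uid': exp_uid,
                           'args': {'query_uid': query['query_uid'],
                                    'target_winner': 0}})
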
"application/json"}, + ) if verbose: - print "POST processAnswer response", response.text, response.status_code - if assert_200: assert response.status_code is 200 + print("POST processAnswer response", response.text, response.status_code) + if assert_200: + assert response.status_code is 200 if verbose: - print "POST processAnswer duration = ", dt - print + print("POST processAnswer duration = ", dt) + print() processAnswer_json_response = eval(response.text) return processAnswer_json_response, dt + def timeit(f): """ Refer to next.utils.timeit for further documentation """ + def timed(*args, **kw): ts = time.time() result = f(*args, **kw) te = time.time() - if type(result)==tuple: - return result + ((te-ts),) + if type(result) == tuple: + return result + ((te - ts),) else: - return result,(te-ts) + return result, (te - ts) + return timed def getModel(exp_uid, app_id, supported_alg_ids, alg_list, assert_200=True): # Test loading the dashboard - dashboard_url = ("http://" + HOSTNAME + "/dashboard" - "/experiment_dashboard/{}/{}".format(exp_uid, app_id)) + dashboard_url = ( + "http://" + HOSTNAME + "/dashboard" + "/experiment_dashboard/{}/{}".format(exp_uid, app_id) + ) response = requests.get(dashboard_url) - if assert_200: assert response.status_code is 200 - - stats_url = ("http://" + HOSTNAME + "/dashboard" - "/get_stats".format(exp_uid, app_id)) - - args = {'exp_uid': exp_uid, 'args': {'params': {'alg_label': - supported_alg_ids[0]}}} - args = {'exp_uid': exp_uid, 'args': {'params': {}}} - alg_label = alg_list[0]['alg_label'] - params = {'api_activity_histogram': {}, - 'compute_duration_multiline_plot': {'task': 'getQuery'}, - 'compute_duration_detailed_stacked_area_plot': {'alg_label': alg_label, 'task': 'getQuery'}, - 'response_time_histogram': {'alg_label': alg_label}, - 'network_delay_histogram': {'alg_label': alg_label}} - for stat_id in ['api_activity_histogram', - 'compute_duration_multiline_plot', - 'compute_duration_detailed_stacked_area_plot', - 'response_time_histogram', - 'network_delay_histogram']: - args['args']['params'] = params[stat_id] - args['args']['stat_id'] = stat_id + if assert_200: + assert response.status_code is 200 + + stats_url = "http://" + HOSTNAME + "/dashboard" "/get_stats".format(exp_uid, app_id) + + args = {"exp_uid": exp_uid, "args": {"params": {"alg_label": supported_alg_ids[0]}}} + args = {"exp_uid": exp_uid, "args": {"params": {}}} + alg_label = alg_list[0]["alg_label"] + params = { + "api_activity_histogram": {}, + "compute_duration_multiline_plot": {"task": "getQuery"}, + "compute_duration_detailed_stacked_area_plot": { + "alg_label": alg_label, + "task": "getQuery", + }, + "response_time_histogram": {"alg_label": alg_label}, + "network_delay_histogram": {"alg_label": alg_label}, + } + for stat_id in [ + "api_activity_histogram", + "compute_duration_multiline_plot", + "compute_duration_detailed_stacked_area_plot", + "response_time_histogram", + "network_delay_histogram", + ]: + args["args"]["params"] = params[stat_id] + args["args"]["stat_id"] = stat_id response = requests.post(stats_url, json=args) - if assert_200: assert response.status_code is 200 + if assert_200: + assert response.status_code is 200 def getExp(exp_uid, assert_200=True): - url = "http://"+HOSTNAME+"/api/experiment/"+exp_uid + url = "http://" + HOSTNAME + "/api/experiment/" + exp_uid response = requests.get(url) - print "GET experiment response =",response.text, response.status_code - if assert_200: assert response.status_code is 200 + print("GET experiment response =", 
 def getExp(exp_uid, assert_200=True):
-    url = "http://"+HOSTNAME+"/api/experiment/"+exp_uid
+    url = "http://" + HOSTNAME + "/api/experiment/" + exp_uid
     response = requests.get(url)
-    print "GET experiment response =",response.text, response.status_code
-    if assert_200: assert response.status_code is 200
+    print("GET experiment response =", response.text, response.status_code)
+    if assert_200:
+        assert response.status_code == 200
     initExp_response_dict = json.loads(response.text)
     return initExp_response_dict
 
 
-def format_times(getQuery_times, processAnswer_times, total_pulls,
-                 participant_uid):
+
+def format_times(getQuery_times, processAnswer_times, total_pulls, participant_uid):
     processAnswer_times.sort()
     getQuery_times.sort()
-    return_str = '%s \n\t getQuery\t : %f (5), %f (50), %f (95)\n\t processAnswer\t : %f (5), %f (50), %f (95)\n' % (participant_uid,getQuery_times[int(.05*total_pulls)],getQuery_times[int(.50*total_pulls)],getQuery_times[int(.95*total_pulls)],processAnswer_times[int(.05*total_pulls)],processAnswer_times[int(.50*total_pulls)],processAnswer_times[int(.95*total_pulls)])
+    return_str = (
+        "%s \n\t getQuery\t : %f (5), %f (50), %f (95)\n\t processAnswer\t : %f (5), %f (50), %f (95)\n"
+        % (
+            participant_uid,
+            getQuery_times[int(.05 * total_pulls)],
+            getQuery_times[int(.50 * total_pulls)],
+            getQuery_times[int(.95 * total_pulls)],
+            processAnswer_times[int(.05 * total_pulls)],
+            processAnswer_times[int(.50 * total_pulls)],
+            processAnswer_times[int(.95 * total_pulls)],
+        )
+    )
     return return_str
 
 
 def response_delay(std=0.05, mean=0.1):
     ts = time.time()
-    sleep_time = np.abs(np.random.randn()*std + mean)
+    sleep_time = np.abs(np.random.randn() * std + mean)
     time.sleep(sleep_time)
     return ts
-
diff --git a/next/apps/tests/test_collection.py b/next/apps/tests/test_collection.py
index 5d649957..39565d80 100644
--- a/next/apps/tests/test_collection.py
+++ b/next/apps/tests/test_collection.py
@@ -1,108 +1,123 @@
 import pytest
-from next.database_client.DatabaseAPI import to_db_fmt, from_db_fmt, DatabaseAPI, DatabaseException
+from next.database_client.DatabaseAPI import (
+    to_db_fmt,
+    from_db_fmt,
+    DatabaseAPI,
+    DatabaseException,
+)
 from next.apps.Butler import Collection
 
 # IMPORTANT NOTE: only uses the `test_data` database; it gets cleared after each test session
-MONGO_HOST, MONGO_PORT = 'localhost', 27017
-MONGO_DB = 'test_data'
+MONGO_HOST, MONGO_PORT = "localhost", 27017
+MONGO_DB = "test_data"
 
 # === fixtures ===
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def db():
     db = DatabaseAPI(MONGO_HOST, MONGO_PORT, MONGO_DB)
     yield db
     db.client.drop_database(MONGO_DB)
     db.client.close()
 
+
 # === tests ===
 def test_uid_prefix(db):
-    B = 'test_uid_prefix'
-    c = Collection(B, '', 'exp_uid', db)
-    c.set('obj', value={'f': 2})
-    assert db.get_doc(B, 'obj')['f'] == 2
+    B = "test_uid_prefix"
+    c = Collection(B, "", "exp_uid", db)
+    c.set("obj", value={"f": 2})
+    assert db.get_doc(B, "obj")["f"] == 2
+
+    c = Collection(B, "asdf:", "exp_uid", db)
+    c.set("obj", value={"f": 2})
+    assert db.get_doc(B, "asdf:obj")["f"] == 2
 
-    c = Collection(B, 'asdf:', 'exp_uid', db)
-    c.set('obj', value={'f': 2})
-    assert db.get_doc(B, 'asdf:obj')['f'] == 2
+    c = Collection(B, "asdf_{exp_uid}:", "exp_uid", db)
+    c.set("obj", value={"f": 2})
+    assert db.get_doc(B, "asdf_exp_uid:obj")["f"] == 2
 
-    c = Collection(B, 'asdf_{exp_uid}:', 'exp_uid', db)
-    c.set('obj', value={'f': 2})
-    assert db.get_doc(B, 'asdf_exp_uid:obj')['f'] == 2
+
 
 def test_set(db):
-    B = 'test_set'
-    c = Collection(B, '', '', db)
-    c.set('c', value={'x': 2})
-    c.set('c', 'y', 3)
-    assert db.get(B, 'c', 'x') == 2
-    assert db.get(B, 'c', 'y') == 3
+    B = "test_set"
+    c = Collection(B, "", "", db)
+    c.set("c", value={"x": 2})
+    c.set("c", "y", 3)
+    assert db.get(B, "c", "x") == 2
+    assert db.get(B, "c", "y") == 3
+
 
 def test_get(db):
-    B = 'test_get'
-    c = 
Collection(B, '', '', db) - db.set_doc(B, 'd', {'x': 2, 'z': 4}) - assert c.get('d')['x'] == 2 and c.get('d')['z'] == 4 - assert c.get('d', 'x') == 2 and c.get('d', 'z') == 4 + B = "test_get" + c = Collection(B, "", "", db) + db.set_doc(B, "d", {"x": 2, "z": 4}) + assert c.get("d")["x"] == 2 and c.get("d")["z"] == 4 + assert c.get("d", "x") == 2 and c.get("d", "z") == 4 + def test_get_and_delete(db): - B = 'test_get_and_delete' - c = Collection(B, '', '', db) - db.set_doc(B, 'asdf', {'a': 3}) - assert c.get_and_delete('asdf', 'a') == 3 - assert db.get(B, 'asdf', 'a') is None + B = "test_get_and_delete" + c = Collection(B, "", "", db) + db.set_doc(B, "asdf", {"a": 3}) + assert c.get_and_delete("asdf", "a") == 3 + assert db.get(B, "asdf", "a") is None + def test_exists(db): - B = 'test_exists' - c = Collection(B, '', '', db) - assert not c.exists('f') - db.set_doc(B, 'f', {}) - assert c.exists('f') + B = "test_exists" + c = Collection(B, "", "", db) + assert not c.exists("f") + db.set_doc(B, "f", {}) + assert c.exists("f") + def test_increment(db): - B = 'test_increment' - c = Collection(B, '', '', db) - db.set_doc(B, 'f', {'a': 0}) - c.increment('f', 'a') - assert db.get(B, 'f', 'a') == 1 - c.increment('f', 'a', value=2) - assert db.get(B, 'f', 'a') == 3 + B = "test_increment" + c = Collection(B, "", "", db) + db.set_doc(B, "f", {"a": 0}) + c.increment("f", "a") + assert db.get(B, "f", "a") == 1 + c.increment("f", "a", value=2) + assert db.get(B, "f", "a") == 3 + def test_increment_many(db): - B = 'test_increment_many' - c = Collection(B, '', '', db) - db.set_doc(B, 'f', {'a': 0, 'b': 1}) - c.increment_many('f', {'a': -1, 'b': 2}) - assert db.get(B, 'f', 'a') == -1 and db.get(B, 'f', 'b') == 3 + B = "test_increment_many" + c = Collection(B, "", "", db) + db.set_doc(B, "f", {"a": 0, "b": 1}) + c.increment_many("f", {"a": -1, "b": 2}) + assert db.get(B, "f", "a") == -1 and db.get(B, "f", "b") == 3 + def test_append(db): - B = 'test_append' - c = Collection(B, '', '', db) - db.set_doc(B, 'f', {'a': [1,3]}) - c.append('f', 'a', 10) - assert db.get(B, 'f', 'a') == [1,3,10] + B = "test_append" + c = Collection(B, "", "", db) + db.set_doc(B, "f", {"a": [1, 3]}) + c.append("f", "a", 10) + assert db.get(B, "f", "a") == [1, 3, 10] + def test_pop(db): - B = 'test_pop' - c = Collection(B, '', '', db) - db.set_doc(B, 'f', {'a': [1,3,10]}) + B = "test_pop" + c = Collection(B, "", "", db) + db.set_doc(B, "f", {"a": [1, 3, 10]}) # pop one by one and check that everything is as expected - assert c.pop('f', 'a') == 10 - assert db.get(B, 'f', 'a') == [1,3] - assert c.pop('f', 'a') == 3 - assert db.get(B, 'f', 'a') == [1] - assert c.pop('f', 'a') == 1 - assert db.get(B, 'f', 'a') == [] + assert c.pop("f", "a") == 10 + assert db.get(B, "f", "a") == [1, 3] + assert c.pop("f", "a") == 3 + assert db.get(B, "f", "a") == [1] + assert c.pop("f", "a") == 1 + assert db.get(B, "f", "a") == [] with pytest.raises(IndexError): - c.pop('f', 'a') + c.pop("f", "a") # TODO: test pop from head + def test_timing(db): - B = 'test_timing' - c = Collection(B, '', '', db) + B = "test_timing" + c = Collection(B, "", "", db) assert c.get_durations == 0 and c.set_durations == 0 - c.set('f', 'a', 'aafhjk') + c.set("f", "a", "aafhjk") assert c.set_durations > 0 and c.get_durations == 0 - c.get('f', 'a') + c.get("f", "a") assert c.set_durations > 0 and c.get_durations > 0 diff --git a/next/assistant/assistant_blueprint.py b/next/assistant/assistant_blueprint.py index a210155e..bc0e1547 100644 --- a/next/assistant/assistant_blueprint.py 
+++ b/next/assistant/assistant_blueprint.py @@ -14,43 +14,57 @@ import sys import json -assistant = Blueprint('assistant', - __name__, - template_folder='../lib/pijemont/templates', - static_folder='../lib/pijemont/static') +assistant = Blueprint( + "assistant", + __name__, + template_folder="../lib/pijemont/templates", + static_folder="../lib/pijemont/static", +) assistant_api = Api(assistant) broker = JobBroker() -@assistant.route('/init//form') + +@assistant.route("/init//form") def init_form(app_id=None): if app_id: - filename = '{0}/{0}.yaml'.format(app_id) + filename = "{0}/{0}.yaml".format(app_id) + + api, _ = verifier.load_doc(filename, "apps/") + return render_template( + "form.html", + api_doc=api, + submit="/api/experiment", + function_name="initExp", + base_dir="/assistant/static", + ) - api,_ = verifier.load_doc(filename, 'apps/') - return render_template('form.html',api_doc=api, submit="/api/experiment", function_name="initExp", base_dir="/assistant/static") + message = "Welcome to the next.discovery system.\n " "Available apps {}".format( + ", ".join(utils.get_supported_apps()) + ) - message = ('Welcome to the next.discovery system.\n ' - 'Available apps {}'.format(', '.join(utils.get_supported_apps()))) + return render_template("raw.html", doc=message) - return render_template('raw.html',doc=message) -@assistant.route('/init') +@assistant.route("/init") def init_file(app_id=None): - return render_template('file.html', target="/assistant/init/experiment", base_dir="/assistant/static") + return render_template( + "file.html", target="/assistant/init/experiment", base_dir="/assistant/static" + ) + class ExperimentAssistant(Resource): def deserialise(self, data): - start = data.find('\n') - s = data[:start].decode('ascii') + start = data.find("\n") + s = data[:start].decode("ascii") # print('s',s) - d = [x.split(':') for x in s.split(';')] + d = [x.split(":") for x in s.split(";")] # print('d',d) start += 1 ans = {} - for arg,size in d: + for arg, size in d: size = int(size) # print('a,s',arg,size) - ans[arg] = data[start:start+size] + ans[arg] = data[start : start + size] start += size return ans @@ -60,96 +74,109 @@ def post(self): # Unpacking the YAML/ZIP file for key in args: - if key not in {'bucket_id', 'key_id', 'secret_key'}: - comma_idx = args[key].find(',') - args[key] = args[key][comma_idx + 1:] - if args[key] in {'True', 'False'}: - args[key] = True if args[key] == 'True' else False + if key not in {"bucket_id", "key_id", "secret_key"}: + comma_idx = args[key].find(",") + args[key] = args[key][comma_idx + 1 :] + if args[key] in {"True", "False"}: + args[key] = True if args[key] == "True" else False else: args[key] = base64.decodestring(args[key]) - if all([key not in args for key in ['bucket_id', 'key_id', 'secret_key']]): - args['upload'] = False + if all([key not in args for key in ["bucket_id", "key_id", "secret_key"]]): + args["upload"] = False else: - args['upload'] = True + args["upload"] = True - args['args'] = yaml.load(args['args']) + args["args"] = yaml.load(args["args"]) try: - init_exp_args = args['args'] - if 'targets' in args.keys(): - target_zipfile = args['targets'] - if args.get('upload', True): - bucket_id = args['bucket_id'] - key_id = args['key_id'] - secret_key = args['secret_key'] - - targets = target_unpacker.unpack(target_zipfile, key_id, - secret_key, bucket_id) + init_exp_args = args["args"] + if "targets" in args.keys(): + target_zipfile = args["targets"] + if args.get("upload", True): + bucket_id = args["bucket_id"] + key_id = 
args["key_id"] + secret_key = args["secret_key"] + + targets = target_unpacker.unpack( + target_zipfile, key_id, secret_key, bucket_id + ) else: filenames = target_unpacker.get_filenames_from_zip(target_zipfile) if len(filenames) != 1: - raise ValueError('Specify exactly one file in the ZIP file') + raise ValueError("Specify exactly one file in the ZIP file") filename = filenames[0] - extension = filename.split('.')[-1] - targets = target_unpacker.unpack_text_file(target_zipfile, - kind=extension) - init_exp_args['args']['targets'] = {'targetset': targets} + extension = filename.split(".")[-1] + targets = target_unpacker.unpack_text_file( + target_zipfile, kind=extension + ) + init_exp_args["args"]["targets"] = {"targetset": targets} - if 'keys_for_all_targets' in init_exp_args['args']: - pairs = init_exp_args['args']['keys_for_all_targets'] + if "keys_for_all_targets" in init_exp_args["args"]: + pairs = init_exp_args["args"]["keys_for_all_targets"] for pair in pairs: - map(lambda target: target.update({pair['key']: pair['value']}), - init_exp_args['args']['targets']['targetset']) - + map( + lambda target: target.update({pair["key"]: pair["value"]}), + init_exp_args["args"]["targets"]["targetset"], + ) # Init the experiment: - app_id = init_exp_args['app_id'] - exp_uid = '%030x' % random.randrange(16**30) + app_id = init_exp_args["app_id"] + exp_uid = "%030x" % random.randrange(16 ** 30) - r = broker.applyAsync(app_id, exp_uid, 'initExp', - json.dumps(init_exp_args)) + r = broker.applyAsync(app_id, exp_uid, "initExp", json.dumps(init_exp_args)) response_json, didSucceed, message = r if not didSucceed: raise ValueError(message) except: tb = traceback.format_exc() info = sys.exc_info() - if hasattr(info[1], 'message') and len(info[1].message) > 0: + if hasattr(info[1], "message") and len(info[1].message) > 0: message = info[1].message - if 'time' in message: - message += ("\nNOTE: error has to do with time; try " - "restarting docker, more detail at " - "https://stackoverflow.com/questions/27674968/amazon-s3-docker-403-forbidden-the-difference-between-the-request-time-and") + if "time" in message: + message += ( + "\nNOTE: error has to do with time; try " + "restarting docker, more detail at " + "https://stackoverflow.com/questions/27674968/amazon-s3-docker-403-forbidden-the-difference-between-the-request-time-and" + ) else: message = str(info[1]) + str(info[-1]) - message = '\n'.join(tb.split('\n')[-5:]) - message = message + '\n\nDetails:\n' + tb + message = "\n".join(tb.split("\n")[-5:]) + message = message + "\n\nDetails:\n" + tb + + return {"success": False, "message": message, "exp_uid": None} + + return { + "success": didSucceed, + "message": message, + "exp_uid": exp_uid, + "app_id": args["args"]["app_id"], + } - return {'success': False, 'message': message, 'exp_uid': None} - return {'success': didSucceed, 'message': message, 'exp_uid': exp_uid, - 'app_id': args['args']['app_id']} +assistant_api.add_resource(ExperimentAssistant, "/init/experiment") -assistant_api.add_resource(ExperimentAssistant,'/init/experiment') -@assistant.route('/doc//') -def docs(app_id=None,form="raw"): +@assistant.route("/doc//") +def docs(app_id=None, form="raw"): if app_id: - filename = '{0}/myApp.yaml'.format(app_id) + filename = "{0}/myApp.yaml".format(app_id) utils.debug_print(filename) - api,blank,pretty = doc_gen.get_docs(filename,'apps/') + api, blank, pretty = doc_gen.get_docs(filename, "apps/") if form == "pretty": - return render_template('doc.html',doc_string=pretty, base_dir="/assistant/static") 
+ return render_template( + "doc.html", doc_string=pretty, base_dir="/assistant/static" + ) elif form == "blank": - return render_template('raw.html',doc=blank) + return render_template("raw.html", doc=blank) elif form == "raw": - return render_template('raw.html',doc=api) + return render_template("raw.html", doc=api) - message = ('Welcome to the next.discovery system.\n ' - 'Available apps {}'.format(', '.join(utils.get_supported_apps()))) + message = "Welcome to the next.discovery system.\n " "Available apps {}".format( + ", ".join(utils.get_supported_apps()) + ) - return render_template('raw.html',doc=message) + return render_template("raw.html", doc=message) diff --git a/next/assistant/pijemont/verifier.py b/next/assistant/pijemont/verifier.py index f3f76c51..bb5f8e4a 100644 --- a/next/assistant/pijemont/verifier.py +++ b/next/assistant/pijemont/verifier.py @@ -6,93 +6,102 @@ import next.utils as utils -DICT = {'dict','dictionary','map'} -LIST = {'list'} -TUPLE = {'tuple'} -ONEOF = {'oneof'} - -NUM = {'num','number','float'} -STRING = {'str','string','multiline'} -ANY = {'any','stuff'} -FILE = {'file'} -BOOL = {'boolean','bool'} - -def load_doc(filename,base_path): +DICT = {"dict", "dictionary", "map"} +LIST = {"list"} +TUPLE = {"tuple"} +ONEOF = {"oneof"} + +NUM = {"num", "number", "float"} +STRING = {"str", "string", "multiline"} +ANY = {"any", "stuff"} +FILE = {"file"} +BOOL = {"boolean", "bool"} + + +def load_doc(filename, base_path): errs = [] with open(filename) as f: ref = yaml.load(f.read()) ds = [] - for ext in ref.pop('extends',[]): - r,e = load_doc(base_path+ext,base_path) + for ext in ref.pop("extends", []): + r, e = load_doc(base_path + ext, base_path) ds += [r] errs += e for d in ds: ref = merge_dict(ref, d) - errs = check_format(ref,'args' in ref[list(ref.keys())[0]]) - return ref,errs + errs = check_format(ref, "args" in ref[list(ref.keys())[0]]) + return ref, errs -def merge_dict(d1,d2,prefer=1): + +def merge_dict(d1, d2, prefer=1): for k in d2: if k in d1: if type(d1[k]) == dict: - d1[k] = merge_dict(d1[k],d2[k]) + d1[k] = merge_dict(d1[k], d2[k]) if prefer == 2: d1[k] = d2[k] else: d1[k] = d2[k] return d1 -def check_format(doc,rets=True): + +def check_format(doc, rets=True): errs = [] if rets: for x in doc: - if 'args' in doc[x]: - errs += check_format_helper({'type':'dict','values':doc[x]['args']},'args/'+x) - if 'rets' in doc[x]: - errs += check_format_helper({'type':'dict','values':doc[x]['rets']},'rets/'+x) + if "args" in doc[x]: + errs += check_format_helper( + {"type": "dict", "values": doc[x]["args"]}, "args/" + x + ) + if "rets" in doc[x]: + errs += check_format_helper( + {"type": "dict", "values": doc[x]["rets"]}, "rets/" + x + ) else: for x in doc: - errs += check_format_helper(doc[x],x) + errs += check_format_helper(doc[x], x) return errs -def check_format_helper(doc,name): + +def check_format_helper(doc, name): errs = [] - - if not 'type' in doc: + + if not "type" in doc: errs += ['{}: "type" key missing'.format(name)] - - diff = set(doc.keys()) - {'type','description','values','optional','default'} + + diff = set(doc.keys()) - {"type", "description", "values", "optional", "default"} if len(diff) > 0: - errs += ["{}: extra keys in spec: {}".format(name,", ".join(list(diff)))] - - if not doc['type'] in DICT | LIST | TUPLE | ONEOF | NUM | STRING | BOOL | ANY | FILE: - errs += ['{}: invlid type: {}'.format(name, doc['type'])] - - if doc['type'] in DICT | LIST | TUPLE | ONEOF and not 'values' in doc: + errs += ["{}: extra keys in spec: {}".format(name, ", 
".join(list(diff)))] + + if ( + not doc["type"] + in DICT | LIST | TUPLE | ONEOF | NUM | STRING | BOOL | ANY | FILE + ): + errs += ["{}: invlid type: {}".format(name, doc["type"])] + + if doc["type"] in DICT | LIST | TUPLE | ONEOF and not "values" in doc: errs += ['{}: requires "values" key'.format(name)] if len(errs) > 0: return errs - - if doc['type'] in DICT: - for x in doc['values']: - errs += check_format_helper(doc['values'][x],'{}/{}'.format(name,x)) - - elif doc['type'] in LIST: - errs += check_format_helper(doc['values'],'{}/values'.format(name)) - - elif doc['type'] in TUPLE: - for x in doc['values']: - errs += check_format_helper(doc['values'][x],'{}/{}'.format(name,str(x))) - - elif doc['type'] in ONEOF: - for x in doc['values']: - errs += check_format_helper(doc['values'][x],'{}/{}'.format(name,str(x))) - + + if doc["type"] in DICT: + for x in doc["values"]: + errs += check_format_helper(doc["values"][x], "{}/{}".format(name, x)) + + elif doc["type"] in LIST: + errs += check_format_helper(doc["values"], "{}/values".format(name)) + + elif doc["type"] in TUPLE: + for x in doc["values"]: + errs += check_format_helper(doc["values"][x], "{}/{}".format(name, str(x))) + + elif doc["type"] in ONEOF: + for x in doc["values"]: + errs += check_format_helper(doc["values"][x], "{}/{}".format(name, str(x))) + return errs - - - + def verify(input_dict, reference_dict): """ @@ -103,18 +112,21 @@ def verify(input_dict, reference_dict): - success is a boolean true if there were no problems and false otherwise - list_of_errors is as in verify_helper """ - input_dict, messages = verify_helper("", input_dict, {'type':'dict','values':reference_dict}) + input_dict, messages = verify_helper( + "", input_dict, {"type": "dict", "values": reference_dict} + ) try: - if len(messages)>0: - raise Exception("Failed to verify: {}".format(messages)) - else: - return input_dict + if len(messages) > 0: + raise Exception("Failed to verify: {}".format(messages)) + else: + return input_dict except Exception: - exc_type, exc_value, exc_traceback = sys.exc_info() - print("Exception: {} {}".format(error, traceback.format_exc())) - traceback.print_tb(exc_traceback) - raise Exception(error) + exc_type, exc_value, exc_traceback = sys.exc_info() + print("Exception: {} {}".format(error, traceback.format_exc())) + traceback.print_tb(exc_traceback) + raise Exception(error) + def verify_helper(name, input_element, reference_dict): """ @@ -126,80 +138,121 @@ def verify_helper(name, input_element, reference_dict): - list_of_errors is: [{name: name, message: ...}, ...] 
""" ans = [] - if reference_dict['type'] in DICT: + if reference_dict["type"] in DICT: if not isinstance(input_element, (dict)): - ans += [{"name":name, "message":"invalid dict"}] + ans += [{"name": name, "message": "invalid dict"}] else: - l1,l2 = compare_dict_keys(input_element, reference_dict['values']) + l1, l2 = compare_dict_keys(input_element, reference_dict["values"]) if len(l1) > 0: - ans += [{"name":name, "message":"extra keys in input: " + ",".join(l1)}] + ans += [ + {"name": name, "message": "extra keys in input: " + ",".join(l1)} + ] else: ok = True for k in l2: - if 'default' in reference_dict['values'][k]: - input_element[k] = reference_dict['values'][k]['default'] - if reference_dict['values'][k]['type'] in NUM: + if "default" in reference_dict["values"][k]: + input_element[k] = reference_dict["values"][k]["default"] + if reference_dict["values"][k]["type"] in NUM: input_element[k] = float(input_element[k]) - elif (not 'optional' in reference_dict['values'][k]) or reference_dict['values'][k]['optional'] == False: - ans += [{"name":name+'/'+k, "message":"required key is absent"}] + elif ( + not "optional" in reference_dict["values"][k] + ) or reference_dict["values"][k]["optional"] == False: + ans += [ + { + "name": name + "/" + k, + "message": "required key is absent", + } + ] ok = False - if(ok): + if ok: for k in input_element: - input_element[k], temp_ans = verify_helper(name + '/' + k, input_element[k], reference_dict['values'][str(k)]) + input_element[k], temp_ans = verify_helper( + name + "/" + k, + input_element[k], + reference_dict["values"][str(k)], + ) ans += temp_ans - elif reference_dict['type'] in LIST: + elif reference_dict["type"] in LIST: if not isinstance(input_element, (list)): - ans += [{"name":name, "message":"invalid list"}] + ans += [{"name": name, "message": "invalid list"}] else: for i in range(len(input_element)): - input_element[i],temp_ans = verify_helper(name+'/'+str(i), input_element[i], reference_dict['values']) + input_element[i], temp_ans = verify_helper( + name + "/" + str(i), input_element[i], reference_dict["values"] + ) ans += temp_ans - elif reference_dict['type'] in TUPLE: - if not isinstance(input_element, (list,tuple)): - ans += [{"name":name, "message":"invalid tuple"}] + elif reference_dict["type"] in TUPLE: + if not isinstance(input_element, (list, tuple)): + ans += [{"name": name, "message": "invalid tuple"}] else: new_tuple = list(input_element) for i in range(len(input_element)): - new_tuple[i], temp_ans = verify_helper(name+'/'+str(i), input_element[i], reference_dict['values'][i]) + new_tuple[i], temp_ans = verify_helper( + name + "/" + str(i), input_element[i], reference_dict["values"][i] + ) ans += temp_ans new_tuple = tuple(new_tuple) - elif reference_dict['type'] in BOOL: + elif reference_dict["type"] in BOOL: if not isinstance(input_element, (bool)): - ans += [{"name":name, "message":"invalid boolean"}] + ans += [{"name": name, "message": "invalid boolean"}] - elif reference_dict['type'] in NUM: + elif reference_dict["type"] in NUM: if not isinstance(input_element, (int, long, float)): - ans += [{"name":name, "message":"invalid number"}] + ans += [{"name": name, "message": "invalid number"}] - elif reference_dict['type'] in STRING: + elif reference_dict["type"] in STRING: if not isinstance(input_element, (str, unicode)): - ans += [{"name":name, "message":"expected a string, got {}".format(type(input_element))}] - elif 'values' in reference_dict and not input_element in reference_dict['values']: - ans += [{"name":name, 
"message":"argument must be one of the specified strings: "+", ".join(reference_dict['values'])}] + ans += [ + { + "name": name, + "message": "expected a string, got {}".format(type(input_element)), + } + ] + elif ( + "values" in reference_dict and not input_element in reference_dict["values"] + ): + ans += [ + { + "name": name, + "message": "argument must be one of the specified strings: " + + ", ".join(reference_dict["values"]), + } + ] - elif reference_dict['type'] in ONEOF: + elif reference_dict["type"] in ONEOF: count = 0 - for k in reference_dict['values']: + for k in reference_dict["values"]: if k in input_element: count += 1 if count > 1: - ans += [{"name":name+"/"+k,"message":"More than one argument specified for 'oneof arg: " + name}] + ans += [ + { + "name": name + "/" + k, + "message": "More than one argument specified for 'oneof arg: " + + name, + } + ] if count == 0: - if 'default' in reference_dict: - input_element = reference_dict['default'] + if "default" in reference_dict: + input_element = reference_dict["default"] else: - ans += [{"name":name, "message":"no argument provided for 'oneof' arg"}] + ans += [ + {"name": name, "message": "no argument provided for 'oneof' arg"} + ] - elif reference_dict['type'] in ANY | FILE: + elif reference_dict["type"] in ANY | FILE: pass else: - ans += [{"name":name, "message":"invalid type: {}".format(reference_dict['type'])}] + ans += [ + {"name": name, "message": "invalid type: {}".format(reference_dict["type"])} + ] + + return input_element, ans - return input_element,ans def compare_dict_keys(d1, d2): """ @@ -207,13 +260,13 @@ def compare_dict_keys(d1, d2): """ return [k for k in d1 if not k in d2], [k for k in d2 if not k in d1] -if __name__ == '__main__': + +if __name__ == "__main__": if len(sys.argv) > 1: - r,e = load_doc(sys.argv[1]) - print('doc',r) - print('errs',e) + r, e = load_doc(sys.argv[1]) + print("doc", r) + print("errs", e) if len(sys.argv) > 2: - i,e = verify(sys.argv[2],r) - print("Errors",e) - print("Verified input",i) - + i, e = verify(sys.argv[2], r) + print("Errors", e) + print("Verified input", i) diff --git a/next/assistant/s3.py b/next/assistant/s3.py index 08a9070b..95a49e8f 100644 --- a/next/assistant/s3.py +++ b/next/assistant/s3.py @@ -21,9 +21,10 @@ def get_bucket(AWS_BUCKET_NAME, AWS_ID, AWS_KEY): bucket = conn.get_bucket(AWS_BUCKET_NAME, validate=False) return bucket + def upload(filename, file_object, bucket): k = Key(bucket) k.key = filename k.set_contents_from_file(file_object) - k.set_acl('public-read') + k.set_acl("public-read") return k.generate_url(expires_in=0, query_auth=False, force_http=True) diff --git a/next/assistant/target_unpacker.py b/next/assistant/target_unpacker.py index 3640a417..826ba816 100644 --- a/next/assistant/target_unpacker.py +++ b/next/assistant/target_unpacker.py @@ -10,7 +10,7 @@ from collections import OrderedDict if __name__ == "__main__": - sys.path.append('../..') + sys.path.append("../..") import next.utils as utils import next.assistant.s3 as s3 @@ -28,43 +28,59 @@ def zipfile_to_dictionary(zip_file): result: the returned dictionary """ filenames = zip_file.namelist() - filenames = [f for f in filenames if not any([ignore in f.lower() for ignore in - ['ds_store', 'icon', '__macosx']])] - filenames = [f for f in filenames if len(f.split('/')[-1]) > 0] + filenames = [ + f + for f in filenames + if not any([ignore in f.lower() for ignore in ["ds_store", "icon", "__macosx"]]) + ] + filenames = [f for f in filenames if len(f.split("/")[-1]) > 0] filenames = 
sorted(filenames) files = OrderedDict() for filename in filenames: f = zip_file.read(filename) - name = filename.split('/')[-1] + name = filename.split("/")[-1] files[name] = f return files -def upload_target(filename, file_obj, bucket_name, aws_key, aws_secret_key, - i=None, get_bucket=True): + +def upload_target( + filename, file_obj, bucket_name, aws_key, aws_secret_key, i=None, get_bucket=True +): if get_bucket: bucket = s3.get_bucket(bucket_name, aws_key, aws_secret_key) else: bucket = s3.create_bucket(bucket_name, aws_key, aws_secret_key) - utils.debug_print('Uploading target: {}'.format(filename)) - url = s3.upload(filename, StringIO(file_obj), bucket) - target_types = {'png': 'image', 'jpeg': 'image', 'jpg': 'image', 'gif': 'image', - 'mp4': 'video', 'mov': 'video', - 'txt': 'text', 'csv': 'text'} - filetype = filename.split('.')[-1] + utils.debug_print("Uploading target: {}".format(filename)) + url = s3.upload(filename, StringIO(file_obj), bucket) + target_types = { + "png": "image", + "jpeg": "image", + "jpg": "image", + "gif": "image", + "mp4": "video", + "mov": "video", + "txt": "text", + "csv": "text", + } + filetype = filename.split(".")[-1] if filetype not in target_types: - msg = ('Target not recognized (extension: "{}"). ' - 'Available extensions: {}').format(filetype, list(target_types.keys())) + msg = ( + 'Target not recognized (extension: "{}"). ' "Available extensions: {}" + ).format(filetype, list(target_types.keys())) raise ValueError(msg) - utils.debug_print('Done uploading target: {}'.format(filename)) + utils.debug_print("Done uploading target: {}".format(filename)) + + return { + "target_id": str(i), + "primary_type": target_types[filetype], + "primary_description": url, + "alt_type": "text", + "alt_description": filename, + } - return {'target_id': str(i), - 'primary_type': target_types[filetype], - 'primary_description': url, - 'alt_type': 'text', - 'alt_description': filename} def get_filenames_from_zip(s): base64_zip = io.BytesIO(s) @@ -72,8 +88,7 @@ def get_filenames_from_zip(s): return zip_file.namelist() -def unpack(s, aws_key, aws_secret_key, bucket_name, n_jobs=None, - get_bucket=True): +def unpack(s, aws_key, aws_secret_key, bucket_name, n_jobs=None, get_bucket=True): base64_zip = io.BytesIO(s) zip_file = zipfile.ZipFile(base64_zip) files = zipfile_to_dictionary(zip_file) @@ -82,28 +97,32 @@ def unpack(s, aws_key, aws_secret_key, bucket_name, n_jobs=None, n_jobs = min(len(files), 50) if not bucket_name: - bucket_name = '{}{}'.format(aws_key.lower(), utils.random_string(length=20)) + bucket_name = "{}{}".format(aws_key.lower(), utils.random_string(length=20)) # TODO: trim here for JSON object to append to dictionaries # TODO: manage CSV targets here # TODO: how come creating a S3 bucket isn't working for me? - utils.debug_print('=== Starting upload of targets to S3 ===') + utils.debug_print("=== Starting upload of targets to S3 ===") try: - targets = Parallel(n_jobs=n_jobs, backend='threading') \ - (delayed(upload_target, check_pickle=False) - (name, file, bucket_name, aws_key, aws_secret_key, - i=i, get_bucket=True) - for i, (name, file) in enumerate(files.items())) + targets = Parallel(n_jobs=n_jobs, backend="threading")( + delayed(upload_target, check_pickle=False)( + name, file, bucket_name, aws_key, aws_secret_key, i=i, get_bucket=True + ) + for i, (name, file) in enumerate(files.items()) + ) except: - utils.debug_print('Whoops, parallel S3 upload failed. 
Trying serially.')
-        targets = [upload_target(name, file, bucket_name, aws_key, aws_secret_key,
-                                 i=i, get_bucket=True)
-                   for i, (name, file) in enumerate(files.items())]
+        utils.debug_print("Whoops, parallel S3 upload failed. Trying serially.")
+        targets = [
+            upload_target(
+                name, file, bucket_name, aws_key, aws_secret_key, i=i, get_bucket=True
+            )
+            for i, (name, file) in enumerate(files.items())
+        ]

     return targets


-def unpack_text_file(s, kind='csv'):
+def unpack_text_file(s, kind="csv"):
     kind = kind.lower() # always lower case extension
     base64_zip = io.BytesIO(s)
     zip_file = zipfile.ZipFile(base64_zip)
@@ -111,30 +130,34 @@
     # files has at least one key (tested before the call in assistant_blueprint.py)
     file_str = files[files.keys()[0]]

-    if kind in {'csv', 'txt'}:
-        strings = file_str.split('\n') # -1 because last newline
+    if kind in {"csv", "txt"}:
+        strings = file_str.split("\n")  # a trailing newline yields an empty string, filtered below
         strings = list(filter(lambda x: len(x) > 0, strings))
-        targets = [{'target_id': str(i),
-                    'primary_type': 'text',
-                    'primary_description': string,
-                    'alt_type': 'text',
-                    'alt_description': string}
-                   for i, string in enumerate(strings)]
+        targets = [
+            {
+                "target_id": str(i),
+                "primary_type": "text",
+                "primary_description": string,
+                "alt_type": "text",
+                "alt_description": string,
+            }
+            for i, string in enumerate(strings)
+        ]
         return targets
-    elif kind in {'json'}:
+    elif kind in {"json"}:
         return json.loads(file_str)
     else:
-        raise ValueError('`kind` not regonized in `unpack_text_file`')
+        raise ValueError("`kind` not recognized in `unpack_text_file`")


 if __name__ == "__main__":
     from pprint import pprint

-    aws_key = os.environ.get('KEY')
-    aws_secret_access_key = os.environ.get('ACCESS_KEY')
-    with open('../../examples/strange_fruit_triplet/strangefruit30.zip', 'rb') as f:
+    aws_key = os.environ.get("KEY")
+    aws_secret_access_key = os.environ.get("ACCESS_KEY")
+
+    with open("../../examples/strange_fruit_triplet/strangefruit30.zip", "rb") as f:
         s = f.read()
     print(s)
-    targets = unpack(s, aws_key, aws_secret_access_key,
-                     bucket_name='scott_test')
+    targets = unpack(s, aws_key, aws_secret_access_key, bucket_name="scott_test")
     pprint(targets)
diff --git a/next/broker/broker.py b/next/broker/broker.py
index 1eaee0e1..a5c1eac6 100644
--- a/next/broker/broker.py
+++ b/next/broker/broker.py
@@ -1,6 +1,6 @@
 import next.utils as utils

-from datetime import datetime,timedelta
+from datetime import datetime, timedelta

 import celery
 from next.broker.celery_app import tasks as tasks
@@ -15,6 +15,7 @@
 import time
 import next.utils as utils

+
 class JobBroker:

     # Initialization method for the broker
@@ -23,7 +24,11 @@ def __init__(self):
         self.hostname = None

         # location of hashes
-        self.r = redis.StrictRedis(host=next.constants.RABBITREDIS_HOSTNAME, port=next.constants.RABBITREDIS_PORT, db=0)
+        self.r = redis.StrictRedis(
+            host=next.constants.RABBITREDIS_HOSTNAME,
+            port=next.constants.RABBITREDIS_PORT,
+            db=0,
+        )

     def applyAsync(self, app_id, exp_uid, task_name, args, ignore_result=False):
         """
@@ -37,22 +42,20 @@ def applyAsync(self, app_id, exp_uid, task_name, args, ignore_result=False):
             task_name(app_id, exp_id, args)
         """
-        submit_timestamp = utils.datetimeNow('string')
-        domain = self.__get_domain_for_job(app_id+"_"+exp_uid)
+        submit_timestamp = utils.datetimeNow("string")
+        domain = self.__get_domain_for_job(app_id + "_" + exp_uid)
         if next.constants.CELERY_ON:
-            result = tasks.apply.apply_async(args=[app_id,
-                                                   exp_uid,
-                                                   task_name,
-                                                   args,
-                                                   submit_timestamp],
-                                             exchange='async@'+domain,
-                                             routing_key='async@'+domain)
+            result = tasks.apply.apply_async(
+                args=[app_id, exp_uid, task_name, args, submit_timestamp],
+                exchange="async@" + domain,
+                routing_key="async@" + domain,
+            )
             if ignore_result:
                 return True
             else:
                 return result.get(interval=0.001)
         else:
-            result = tasks.apply(app_id,exp_uid,task_name, args, submit_timestamp)
+            result = tasks.apply(app_id, exp_uid, task_name, args, submit_timestamp)
             if ignore_result:
                 return True
             else:
@@ -70,28 +73,37 @@ def dashboardAsync(self, app_id, exp_uid, args, ignore_result=False):
             task_name(app_id, exp_id, args)
         """
-        submit_timestamp = utils.datetimeNow('string')
-        domain = self.__get_domain_for_job(app_id+"_"+exp_uid)
+        submit_timestamp = utils.datetimeNow("string")
+        domain = self.__get_domain_for_job(app_id + "_" + exp_uid)
         if next.constants.CELERY_ON:
-            result = tasks.apply_dashboard.apply_async(args=[app_id,
-                                                             exp_uid,
-                                                             args,
-                                                             submit_timestamp],
-                                                       exchange='dashboard@'+domain,
-                                                       routing_key='dashboard@'+domain)
+            result = tasks.apply_dashboard.apply_async(
+                args=[app_id, exp_uid, args, submit_timestamp],
+                exchange="dashboard@" + domain,
+                routing_key="dashboard@" + domain,
+            )
             if ignore_result:
                 return True
             else:
                 return result.get(interval=0.001)
         else:
-            result = tasks.apply_dashboard(app_id,exp_uid, args, submit_timestamp)
+            result = tasks.apply_dashboard(app_id, exp_uid, args, submit_timestamp)
             if ignore_result:
                 return True
             else:
                 return result

-
-    def applySyncByNamespace(self, app_id, exp_uid, alg_id, alg_label, task_name, args, namespace=None, ignore_result=False,time_limit=0):
+    def applySyncByNamespace(
+        self,
+        app_id,
+        exp_uid,
+        alg_id,
+        alg_label,
+        task_name,
+        args,
+        namespace=None,
+        ignore_result=False,
+        time_limit=0,
+    ):
         """
         Run a task (task_name) on a set of args with a given app_id, and exp_uid asynchronously.
         Waits for computation to finish and returns the answer unless ignore_result=True, in which case it's a non-blocking call. 
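# [Editor's aside; illustrative only, not part of the patch] A minimal sketch of
# how a caller drives the JobBroker methods reformatted above. The app_id,
# exp_uid, and task name below are hypothetical placeholders, and a live call
# needs the RabbitMQ/Redis services named in next.constants to be reachable.
import json

from next.broker.broker import JobBroker

broker = JobBroker()
task_args = json.dumps({"args": {"participant_uid": "participant_0"}})
# applyAsync blocks on the Celery result unless ignore_result=True is passed.
response = broker.applyAsync(
    "PoolBasedTripletMDS", "hypothetical_exp_uid", "getQuery", task_args
)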
@@ -101,28 +113,28 @@ def applySyncByNamespace(self, app_id, exp_uid, alg_id, alg_label, task_name, ar
             (string) app_id, (string) exp_id, (string) task_name, (json) args
         """
-        submit_timestamp = utils.datetimeNow('string')
-        if namespace==None:
-            namespace=exp_uid
-        domain = self.__get_domain_for_job(app_id+"_"+exp_uid)
+        submit_timestamp = utils.datetimeNow("string")
+        if namespace is None:
+            namespace = exp_uid
+        domain = self.__get_domain_for_job(app_id + "_" + exp_uid)
         num_queues = next.constants.CELERY_SYNC_WORKER_COUNT

         # assign namespaces to queues (with worker of concurrency 1) in round-robin
         try:
-            namespace_cnt = int(self.r.get(namespace+"_cnt"))
+            namespace_cnt = int(self.r.get(namespace + "_cnt"))
         except:
             pipe = self.r.pipeline(True)
             while 1:
                 try:
-                    pipe.watch(namespace+"_cnt","namespace_counter")
-                    if not pipe.exists(namespace+"_cnt"):
-                        if not pipe.exists('namespace_counter'):
+                    pipe.watch(namespace + "_cnt", "namespace_counter")
+                    if not pipe.exists(namespace + "_cnt"):
+                        if not pipe.exists("namespace_counter"):
                             namespace_counter = 0
                         else:
-                            namespace_counter = pipe.get('namespace_counter')
+                            namespace_counter = pipe.get("namespace_counter")
                         pipe.multi()
-                        pipe.set(namespace+"_cnt",int(namespace_counter)+1)
-                        pipe.set('namespace_counter',int(namespace_counter)+1)
+                        pipe.set(namespace + "_cnt", int(namespace_counter) + 1)
+                        pipe.set("namespace_counter", int(namespace_counter) + 1)
                         pipe.execute()
                     else:
                         pipe.unwatch()
@@ -131,10 +143,10 @@ def applySyncByNamespace(self, app_id, exp_uid, alg_id, alg_label, task_name, ar
                 continue
             finally:
                 pipe.reset()
-            namespace_cnt = int(self.r.get(namespace+"_cnt"))
+            namespace_cnt = int(self.r.get(namespace + "_cnt"))
         queue_number = (namespace_cnt % num_queues) + 1

-        queue_name = 'sync_queue_'+str(queue_number)+'@'+domain
+        queue_name = "sync_queue_" + str(queue_number) + "@" + domain
         job_uid = utils.getNewUID()
         if time_limit == 0:
             soft_time_limit = None
@@ -143,20 +155,40 @@ def applySyncByNamespace(self, app_id, exp_uid, alg_id, alg_label, task_name, ar
             soft_time_limit = time_limit
             hard_time_limit = time_limit + .01
         if next.constants.CELERY_ON:
-            result = tasks.apply_sync_by_namespace.apply_async(args=[app_id,exp_uid,
-                                                                     alg_id,alg_label,
-                                                                     task_name, args,
-                                                                     namespace, job_uid,
-                                                                     submit_timestamp, time_limit],
-                                                               queue=queue_name,
-                                                               soft_time_limit=soft_time_limit,
-                                                               time_limit=hard_time_limit)
+            result = tasks.apply_sync_by_namespace.apply_async(
+                args=[
+                    app_id,
+                    exp_uid,
+                    alg_id,
+                    alg_label,
+                    task_name,
+                    args,
+                    namespace,
+                    job_uid,
+                    submit_timestamp,
+                    time_limit,
+                ],
+                queue=queue_name,
+                soft_time_limit=soft_time_limit,
+                time_limit=hard_time_limit,
+            )
             if ignore_result:
                 return True
             else:
                 return result.get(interval=.001)
         else:
-            result = tasks.apply_sync_by_namespace(app_id,exp_uid,alg_id,alg_label,task_name, args, namespace, job_uid, submit_timestamp, time_limit)
+            result = tasks.apply_sync_by_namespace(
+                app_id,
+                exp_uid,
+                alg_id,
+                alg_label,
+                task_name,
+                args,
+                namespace,
+                job_uid,
+                submit_timestamp,
+                time_limit,
+            )
             if ignore_result:
                 return True
             else:
@@ -172,21 +204,30 @@ def __get_domain_for_job(self, job_id):
        This implementation assumes just a single master node and no workers
        so only a single hostname (e.g. localhost) has celery workers. 
""" - if self.r.exists('MINIONWORKER_HOSTNAME'): - self.hostname = self.r.get('MINIONWORKER_HOSTNAME') - utils.debug_print('Found hostname: {} (Redis)'.format(self.hostname)) + if self.r.exists("MINIONWORKER_HOSTNAME"): + self.hostname = self.r.get("MINIONWORKER_HOSTNAME") + utils.debug_print("Found hostname: {} (Redis)".format(self.hostname)) else: - with open('/etc/hosts', 'r') as fid: + with open("/etc/hosts", "r") as fid: for line in fid: - if 'MINIONWORKER' in line: - self.hostname = line.split('\t')[1].split(' ')[1] - self.r.set('MINIONWORKER_HOSTNAME', self.hostname, ex=360) # expire after 10 minutes - utils.debug_print('Found hostname: {} (/etc/hosts)'.format(self.hostname)) + if "MINIONWORKER" in line: + self.hostname = line.split("\t")[1].split(" ")[1] + self.r.set( + "MINIONWORKER_HOSTNAME", self.hostname, ex=360 + ) # expire after 10 minutes + utils.debug_print( + "Found hostname: {} (/etc/hosts)".format(self.hostname) + ) break if self.hostname is None: import socket + self.hostname = socket.gethostname() - self.r.set('MINIONWORKER_HOSTNAME', self.hostname, ex=360) # expire after 10 minutes - utils.debug_print('Found hostname: {} (socket.gethostname())'.format(self.hostname)) + self.r.set( + "MINIONWORKER_HOSTNAME", self.hostname, ex=360 + ) # expire after 10 minutes + utils.debug_print( + "Found hostname: {} (socket.gethostname())".format(self.hostname) + ) return self.hostname diff --git a/next/broker/celery_app/celery_broker.py b/next/broker/celery_app/celery_broker.py index c3578021..b53715ad 100644 --- a/next/broker/celery_app/celery_broker.py +++ b/next/broker/celery_app/celery_broker.py @@ -2,11 +2,10 @@ from celery import Celery -app = Celery('celery', - include=['next.broker.celery_app.tasks']) +app = Celery("celery", include=["next.broker.celery_app.tasks"]) # Configuration file for the worker. The default values can be tnitialized from salt module -app.config_from_object('next.constants') +app.config_from_object("next.constants") -if __name__ == '__main__': +if __name__ == "__main__": app.start() diff --git a/next/broker/celery_app/tasks.py b/next/broker/celery_app/tasks.py index a44795d4..57a9e30c 100644 --- a/next/broker/celery_app/tasks.py +++ b/next/broker/celery_app/tasks.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import absolute_import, print_function from .celery_broker import app import celery.signals import os @@ -113,7 +113,7 @@ def apply(app_id, exp_uid, task_name, args_in_json, enqueue_timestamp): ell.log( app_id+':ALG-DURATION', log_entry_durations ) else: return_value = (args_out_json,didSucceed,message) - print '#### Finished %s, time_enqueued=%s, execution_time=%s ####' % (task_name,time_enqueued,dt) + print('#### Finished %s, time_enqueued=%s, execution_time=%s ####' % (task_name,time_enqueued,dt)) return return_value def apply_dashboard(app_id, exp_uid, args_in_json, enqueue_timestamp): @@ -183,7 +183,7 @@ def apply_sync_by_namespace(app_id, exp_uid, alg_id, alg_label, task_name, args, time_enqueued = delta_datetime.seconds + delta_datetime.microseconds/1000000. 
try: - print '>>>>>>>> Starting namespace:%s, job_uid=%s, time_enqueued=%s <<<<<<<<<' % (namespace,job_uid,time_enqueued) + print('>>>>>>>> Starting namespace:%s, job_uid=%s, time_enqueued=%s <<<<<<<<<' % (namespace,job_uid,time_enqueued)) # get stateless app next_app = next.utils.get_app(app_id, exp_uid, db, ell) target_manager = next_app.myApp.TargetManager @@ -196,11 +196,11 @@ def apply_sync_by_namespace(app_id, exp_uid, alg_id, alg_label, task_name, args, log_entry_durations['duration_enqueued'] = time_enqueued log_entry_durations['timestamp'] = next.utils.datetimeNow() ell.log( app_id+':ALG-DURATION', log_entry_durations) - print '########## Finished namespace:%s, job_uid=%s, time_enqueued=%s, execution_time=%s ##########' % (namespace,job_uid,time_enqueued,dt) + print('########## Finished namespace:%s, job_uid=%s, time_enqueued=%s, execution_time=%s ##########' % (namespace,job_uid,time_enqueued,dt)) return except Exception, error: exc_type, exc_value, exc_traceback = sys.exc_info() - print "tasks Exception: {} {}".format(error, traceback.format_exc()) + print("tasks Exception: {} {}".format(error, traceback.format_exc())) traceback.print_tb(exc_traceback) # error = traceback.format_exc() diff --git a/next/constants.py b/next/constants.py index bf3c8f67..6632efe0 100644 --- a/next/constants.py +++ b/next/constants.py @@ -15,79 +15,82 @@ ### NEXT version number ### # Remember to edit this each release! -VERSION = '1.1.1' +VERSION = "1.1.1" # Variable to enable sites. This allows you to build clients and sites on the # NEXT system. SITES_ENABLED = False -DEBUG_ON = os.environ.get('DEBUG_ON', '') +DEBUG_ON = os.environ.get("DEBUG_ON", "") -DASHBOARD_STALENESS_IN_SECONDS = 60*30 +DASHBOARD_STALENESS_IN_SECONDS = 60 * 30 # Backend Host Url -NEXT_BACKEND_GLOBAL_HOST = os.environ.get('NEXT_BACKEND_GLOBAL_HOST', 'localhost') -NEXT_BACKEND_GLOBAL_PORT = os.environ.get('NEXT_BACKEND_GLOBAL_PORT', '8000') +NEXT_BACKEND_GLOBAL_HOST = os.environ.get("NEXT_BACKEND_GLOBAL_HOST", "localhost") +NEXT_BACKEND_GLOBAL_PORT = os.environ.get("NEXT_BACKEND_GLOBAL_PORT", "8000") -AWS_ACCESS_ID = os.environ.get('AWS_ACCESS_ID', '') -AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', '') +AWS_ACCESS_ID = os.environ.get("AWS_ACCESS_ID", "") +AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "") -SITE_KEY = os.environ.get('SITE_KEY', None) -if SITE_KEY==None or SITE_KEY=='None': - SITE_KEY=None +SITE_KEY = os.environ.get("SITE_KEY", None) +if SITE_KEY == None or SITE_KEY == "None": + SITE_KEY = None -GIT_HASH = os.environ.get('GIT_HASH', '') -if GIT_HASH=='': +GIT_HASH = os.environ.get("GIT_HASH", "") +if GIT_HASH == "": import subprocess + try: - GIT_HASH = subprocess.check_output(['git', 'rev-parse', 'HEAD'])[0:-1] + GIT_HASH = subprocess.check_output(["git", "rev-parse", "HEAD"])[0:-1] except: - GIT_HASH = '' + GIT_HASH = "" -MINIONREDIS_HOST = os.environ.get('MINIONREDIS_PORT_6379_TCP_ADDR', 'localhost') -MINIONREDIS_PORT = int(os.environ.get('MINIONREDIS_PORT_6379_TCP_PORT', 6379)) -MINIONREDIS_PASS = os.environ.get('MINIONREDIS_ENV_REDIS_PASS', '') +MINIONREDIS_HOST = os.environ.get("MINIONREDIS_PORT_6379_TCP_ADDR", "localhost") +MINIONREDIS_PORT = int(os.environ.get("MINIONREDIS_PORT_6379_TCP_PORT", 6379)) +MINIONREDIS_PASS = os.environ.get("MINIONREDIS_ENV_REDIS_PASS", "") # PermStore constants -MONGODB_HOST = os.environ.get('MONGODB_PORT_27017_TCP_ADDR','localhost') -MONGODB_PORT = int(os.environ.get('MONGODB_PORT_27017_TCP_PORT', 27017) ) +MONGODB_HOST = 
os.environ.get("MONGODB_PORT_27017_TCP_ADDR", "localhost") +MONGODB_PORT = int(os.environ.get("MONGODB_PORT_27017_TCP_PORT", 27017)) # Database client constants -app_data_database_id = 'app_data' -logs_database_id = 'logs' +app_data_database_id = "app_data" +logs_database_id = "logs" maxStringLengthInInspectDatabase = 200 -RABBIT_HOSTNAME = os.environ.get('RABBIT_PORT_5672_TCP_ADDR', 'localhost') -RABBIT_PORT= int(os.environ.get('RABBIT_PORT_5672_TCP_PORT', 5672)) +RABBIT_HOSTNAME = os.environ.get("RABBIT_PORT_5672_TCP_ADDR", "localhost") +RABBIT_PORT = int(os.environ.get("RABBIT_PORT_5672_TCP_PORT", 5672)) -BROKER_URL = 'amqp://{user}:{password}@{hostname}:{port}/{vhost}/'.format( - user=os.environ.get('RABBIT_ENV_RABBITMQ_USER', 'guest'), - password=os.environ.get('RABBIT_ENV_RABBITMQ_PASS', 'guest'), +BROKER_URL = "amqp://{user}:{password}@{hostname}:{port}/{vhost}/".format( + user=os.environ.get("RABBIT_ENV_RABBITMQ_USER", "guest"), + password=os.environ.get("RABBIT_ENV_RABBITMQ_PASS", "guest"), hostname=RABBIT_HOSTNAME, port=RABBIT_PORT, - vhost=os.environ.get('RABBIT_ENV_VHOST', '')) + vhost=os.environ.get("RABBIT_ENV_VHOST", ""), +) -RABBITREDIS_HOSTNAME = os.environ.get('RABBITREDIS_PORT_6379_TCP_ADDR', 'localhost') -RABBITREDIS_PORT = int(os.environ.get('RABBITREDIS_PORT_6379_TCP_PORT', 6379)) +RABBITREDIS_HOSTNAME = os.environ.get("RABBITREDIS_PORT_6379_TCP_ADDR", "localhost") +RABBITREDIS_PORT = int(os.environ.get("RABBITREDIS_PORT_6379_TCP_PORT", 6379)) # https://github.com/celery/celery/issues/1909 describes the tradeoffs of redis and rabbitmq for results backend -CELERY_RESULT_BACKEND = 'redis://{hostname}:{port}/{db}/'.format( +CELERY_RESULT_BACKEND = "redis://{hostname}:{port}/{db}/".format( hostname=RABBITREDIS_HOSTNAME, port=RABBITREDIS_PORT, - db=os.environ.get('RABBITREDIS_DB', '0')) + db=os.environ.get("RABBITREDIS_DB", "0"), +) # CELERY_RESULT_BACKEND = BROKER_URL -CELERY_TASK_RESULT_EXPIRES=60 -CELERY_TASK_SERIALIZER='json' -CELERY_ACCEPT_CONTENT=['json'] # Ignore other content -CELERY_RESULT_SERIALIZER='json' +CELERY_TASK_RESULT_EXPIRES = 60 +CELERY_TASK_SERIALIZER = "json" +CELERY_ACCEPT_CONTENT = ["json"] # Ignore other content +CELERY_RESULT_SERIALIZER = "json" -CELERY_ON = eval(os.environ.get('CELERY_ON','True')) +CELERY_ON = eval(os.environ.get("CELERY_ON", "True")) -CELERY_SYNC_WORKER_COUNT = int(os.environ.get('CELERY_SYNC_WORKER_COUNT',1)) +CELERY_SYNC_WORKER_COUNT = int(os.environ.get("CELERY_SYNC_WORKER_COUNT", 1)) # from kombu import Exchange, Queue # exchange_name = 'sync@{hostname}'.format( @@ -101,4 +104,3 @@ # all_queues += (Queue(name=queue_name,exchange=sync_exchange),) # CELERY_QUEUES = all_queues - diff --git a/next/dashboard/dashboard.py b/next/dashboard/dashboard.py index 2624678d..6f6608bd 100644 --- a/next/dashboard/dashboard.py +++ b/next/dashboard/dashboard.py @@ -5,6 +5,7 @@ Flask controller for dashboards. 
""" +from __future__ import print_function import os import json import yaml @@ -20,33 +21,40 @@ import next.utils as utils # Declare this as the dashboard blueprint -dashboard = Blueprint('dashboard', - __name__, - template_folder='templates', - static_folder='static') +dashboard = Blueprint( + "dashboard", __name__, template_folder="templates", static_folder="static" +) rm = ResourceManager() broker = next.broker.broker.JobBroker() import next.apps.Butler as Butler + Butler = Butler.Butler # add database commands dashboard_interface = api_util.NextBackendApi(dashboard) from next.dashboard.database import DatabaseBackup, DatabaseRestore -dashboard_interface.add_resource(DatabaseBackup,'/database/databasebackup', endpoint='databasebackup') -dashboard_interface.add_resource(DatabaseRestore,'/database/databaserestore', endpoint='databaserestore') + +dashboard_interface.add_resource( + DatabaseBackup, "/database/databasebackup", endpoint="databasebackup" +) +dashboard_interface.add_resource( + DatabaseRestore, "/database/databaserestore", endpoint="databaserestore" +) if constants.SITE_KEY: - DASHBOARD_URL = '/dashboard/{}'.format(constants.SITE_KEY) + DASHBOARD_URL = "/dashboard/{}".format(constants.SITE_KEY) else: - DASHBOARD_URL = '/dashboard' + DASHBOARD_URL = "/dashboard" + @dashboard.context_processor def inject_to_templates(): return dict(dashboard_url=DASHBOARD_URL) -@dashboard.route('/experiment_list') + +@dashboard.route("/experiment_list") def experiment_list(): """ Endpoint that renders a page with a simple list of all experiments. @@ -57,61 +65,70 @@ def experiment_list(): for exp_uid in rm.get_app_exp_uids(app_id): start_date = rm.get_app_exp_uid_start_date(exp_uid) try: - experiments.append({'exp_uid': exp_uid, - 'app_id': app_id, - 'start_date': start_date, - 'num_participants':len(rm.get_participant_uids(exp_uid)), - 'retired': rm.is_exp_retired(exp_uid), - }) + experiments.append( + { + "exp_uid": exp_uid, + "app_id": app_id, + "start_date": start_date, + "num_participants": len(rm.get_participant_uids(exp_uid)), + "retired": rm.is_exp_retired(exp_uid), + } + ) except IndexError as e: - print e + print(e) pass - return render_template('experiment_list.html', - experiments=sorted(experiments, - key=lambda e: e['start_date'], - reverse=True)) + return render_template( + "experiment_list.html", + experiments=sorted(experiments, key=lambda e: e["start_date"], reverse=True), + ) -@dashboard.route('/get_stats', methods=['POST']) + +@dashboard.route("/get_stats", methods=["POST"]) def get_stats(): args_dict = request.json - exp_uid = args_dict['exp_uid'] + exp_uid = args_dict["exp_uid"] app_id = rm.get_app_id(exp_uid) - response_json, didSucceed, message = broker.dashboardAsync(app_id,exp_uid,args_dict) - response_dict = json.loads(response_json,parse_float=lambda o:round(float(o),4)) + response_json, didSucceed, message = broker.dashboardAsync( + app_id, exp_uid, args_dict + ) + response_dict = json.loads(response_json, parse_float=lambda o: round(float(o), 4)) response_json = json.dumps(response_dict) return response_json -@dashboard.route('/system_monitor') +@dashboard.route("/system_monitor") def system_monitor(): """ Endpoint that renders a page with a simple list of all monitoring. 
""" - host_url = 'http://{}:{}'.format(constants.NEXT_BACKEND_GLOBAL_HOST, - constants.NEXT_BACKEND_GLOBAL_PORT) - - rabbit_url = 'http://{}:{}'.format(constants.NEXT_BACKEND_GLOBAL_HOST, - 15672) - cadvisor_url = 'http://{}:{}'.format(constants.NEXT_BACKEND_GLOBAL_HOST, - 8888) - mongodb_url = 'http://{}:{}'.format(constants.NEXT_BACKEND_GLOBAL_HOST, - 28017) - return render_template('system_monitor.html', - rabbit_url=rabbit_url, - cadvisor_url=cadvisor_url, - mongodb_url=mongodb_url) - -@dashboard.route('/experiment//retire', methods=['POST']) + host_url = "http://{}:{}".format( + constants.NEXT_BACKEND_GLOBAL_HOST, constants.NEXT_BACKEND_GLOBAL_PORT + ) + + rabbit_url = "http://{}:{}".format(constants.NEXT_BACKEND_GLOBAL_HOST, 15672) + cadvisor_url = "http://{}:{}".format(constants.NEXT_BACKEND_GLOBAL_HOST, 8888) + mongodb_url = "http://{}:{}".format(constants.NEXT_BACKEND_GLOBAL_HOST, 28017) + return render_template( + "system_monitor.html", + rabbit_url=rabbit_url, + cadvisor_url=cadvisor_url, + mongodb_url=mongodb_url, + ) + + +@dashboard.route("/experiment//retire", methods=["POST"]) def retire_exp(exp_uid): - retired = request.form.get('retired', default=True, - type=flask_restful.inputs.boolean) + retired = request.form.get( + "retired", default=True, type=flask_restful.inputs.boolean + ) rm.set_exp_retired(exp_uid, retired) - return '', 200 + return "", 200 + -@dashboard.route('/experiment_dashboard//') +@dashboard.route("/experiment_dashboard//") def experiment_dashboard(exp_uid, app_id): """ Endpoint that renders the experiment dashboard. @@ -120,32 +137,44 @@ def experiment_dashboard(exp_uid, app_id): (string) exp_uid, exp_uid for a current experiment. """ - simple_flag = int(request.args.get('simple',0)) - force_recompute = int(request.args.get('force_recompute',1)) + simple_flag = int(request.args.get("simple", 0)) + force_recompute = int(request.args.get("force_recompute", 1)) if rm.get_experiment(exp_uid) is None: - return render_template('exp_404.html', exp_uid=exp_uid), 404 + return render_template("exp_404.html", exp_uid=exp_uid), 404 # Not a particularly good way to do this. alg_label_list = rm.get_algs_for_exp_uid(exp_uid) - alg_list = [{'alg_label':alg['alg_label'], - 'alg_label_clean':'_'.join(alg['alg_label'].split())} - for alg in alg_label_list] + alg_list = [ + { + "alg_label": alg["alg_label"], + "alg_label_clean": "_".join(alg["alg_label"].split()), + } + for alg in alg_label_list + ] # -- Directly use Jinja2 to load and render the app-specific dashboard template. - env = Environment(loader=ChoiceLoader([PackageLoader('apps.{}'.format(app_id), - 'dashboard'), - PackageLoader('next.dashboard', - 'templates')])) - template = env.get_template('myAppDashboard.html'.format(app_id)) # looks for /next/apps/{{ app_id }}/dashboard/{{ app_id }}.html + env = Environment( + loader=ChoiceLoader( + [ + PackageLoader("apps.{}".format(app_id), "dashboard"), + PackageLoader("next.dashboard", "templates"), + ] + ) + ) + template = env.get_template( + "myAppDashboard.html".format(app_id) + ) # looks for /next/apps/{{ app_id }}/dashboard/{{ app_id }}.html # The context we pass to the dashboard template. 
- ctx = dict(app_id=app_id, - exp_uid=exp_uid, - alg_list=alg_list, - exceptions_present=False,#exceptions_present(exp_uid), - url_for=url_for, - simple_flag=int(simple_flag), - force_recompute=int(force_recompute)) + ctx = dict( + app_id=app_id, + exp_uid=exp_uid, + alg_list=alg_list, + exceptions_present=False, # exceptions_present(exp_uid), + url_for=url_for, + simple_flag=int(simple_flag), + force_recompute=int(force_recompute), + ) # Inject standard Flask context + context processors current_app.update_template_context(ctx) @@ -154,8 +183,7 @@ def experiment_dashboard(exp_uid, app_id): def exceptions_present(exp_uid): - url = '/api/experiment/{}/logs/APP-EXCEPTION'.format(exp_uid) + url = "/api/experiment/{}/logs/APP-EXCEPTION".format(exp_uid) r = requests.get(url) - logs = yaml.load(r.content)['log_data'] + logs = yaml.load(r.content)["log_data"] return True if len(logs) > 0 else False - diff --git a/next/dashboard/database.py b/next/dashboard/database.py index 06b21960..84340b14 100644 --- a/next/dashboard/database.py +++ b/next/dashboard/database.py @@ -3,6 +3,7 @@ author: Christopher Fernandez, Lalit Jain Logs resource for all logs associated with a specified experiment. """ +from __future__ import print_function from flask import Response, request, redirect from flask_restful import Resource, reqparse @@ -21,10 +22,7 @@ # Request parser. Checks that necessary dictionary keys are available. # learningLib functions ensure that all necessary arguments are available. post_parser = reqparse.RequestParser(argument_class=APIArgument) -meta_success = { - 'code': 200, - 'status': 'OK' -} +meta_success = {"code": 200, "status": "OK"} # Logs resource class class DatabaseBackup(Resource): @@ -51,16 +49,17 @@ def get(self): :statuscode 200: Database backup successfully returned :statuscode 400: database backup failed to be generated """ - exp_uid_list = request.args.getlist('exp_uid') ## returns a list - print exp_uid_list - name = '{}.{}'.format(str(next.utils.datetimeNow().strftime("%Y-%m-%d_%H:%M:%S")), - 'tar.gz') - location = make_mongodump(name,exp_uid_list) + exp_uid_list = request.args.getlist("exp_uid") ## returns a list + name = "{}.{}".format( + str(next.utils.datetimeNow().strftime("%Y-%m-%d_%H:%M:%S")), "tar.gz" + ) + location = make_mongodump(name, exp_uid_list) zip_file = file(location) - return Response(zip_file, - mimetype='application/octet-stream', - headers={'Content-Disposition': - 'attachment;filename={}'.format(name)}) + return Response( + zip_file, + mimetype="application/octet-stream", + headers={"Content-Disposition": "attachment;filename={}".format(name)}, + ) class DatabaseRestore(Resource): @@ -87,16 +86,16 @@ def post(self): :statuscode 200: Database backup successfully returned :statuscode 400: database backup failed to be generated """ - zip_file = request.files['primary_file'] + zip_file = request.files["primary_file"] # zip_file is a file object - subprocess.call('mkdir -p /dump',shell=True) - filename = '/dump/mongo_dump_restore.tar.gz' + subprocess.call("mkdir -p /dump", shell=True) + filename = "/dump/mongo_dump_restore.tar.gz" zip_file.save(filename) restore_mongodump(filename) - subprocess.call('rm '+filename,shell=True) + subprocess.call("rm " + filename, shell=True) if constants.SITE_KEY: - dashboard_prefix = '/dashboard/{}'.format(constants.SITE_KEY) + dashboard_prefix = "/dashboard/{}".format(constants.SITE_KEY) else: - dashboard_prefix = '/dashboard' - return redirect(dashboard_prefix + '/experiment_list') + dashboard_prefix = "/dashboard" + return 
redirect(dashboard_prefix + "/experiment_list") diff --git a/next/database/daemon_database_backup.py b/next/database/daemon_database_backup.py index 9c1bb7e9..0bde4f00 100755 --- a/next/database/daemon_database_backup.py +++ b/next/database/daemon_database_backup.py @@ -7,8 +7,9 @@ where {hostname} and {port} are as they are below """ - +from __future__ import print_function import sys + sys.path.append("/next_backend") import time @@ -20,18 +21,10 @@ import next.constants as constants import os -while(1): - - timestamp = utils.datetimeNow() - print "[ %s ] Calling database daemon..." % str(timestamp) - subprocess.call('python ./next/database/database_backup.py',shell=True) - - time.sleep(3600*6) # once every 6 hours - - - - - +while 1: + timestamp = utils.datetimeNow() + print("[ %s ] Calling database daemon..." % str(timestamp)) + subprocess.call("python ./next/database/database_backup.py", shell=True) - + time.sleep(3600 * 6) # once every 6 hours diff --git a/next/database/database_backup.py b/next/database/database_backup.py index 07717ff9..fd98a60c 100755 --- a/next/database/database_backup.py +++ b/next/database/database_backup.py @@ -7,7 +7,9 @@ where {hostname} and {port} are as they are below """ +from __future__ import print_function import sys + sys.path.append("/next_backend") import time @@ -21,47 +23,48 @@ import os -NEXT_BACKEND_GLOBAL_HOST = os.environ.get('NEXT_BACKEND_GLOBAL_HOST', 'localhost') -AWS_BUCKET_NAME = os.environ.get('AWS_BUCKET_NAME','next-database-backups') +NEXT_BACKEND_GLOBAL_HOST = os.environ.get("NEXT_BACKEND_GLOBAL_HOST", "localhost") +AWS_BUCKET_NAME = os.environ.get("AWS_BUCKET_NAME", "next-database-backups") timestamp = utils.datetimeNow() -print "[ %s ] starting backup of MongoDB to S3..." % str(timestamp) +print("[ %s ] starting backup of MongoDB to S3..." % str(timestamp)) + +print("[ %s ] constants.AWS_ACCESS_ID = %s" % (str(timestamp), constants.AWS_ACCESS_ID)) -print "[ %s ] constants.AWS_ACCESS_ID = %s" % (str(timestamp),constants.AWS_ACCESS_ID) - -tar_file = '' +tar_file = "" try: - tar_file = sys.argv[1] + tar_file = sys.argv[1] except: - tar_file = 'mongo_dump_{hostname}_{timestamp}.tar.gz'.format( hostname=NEXT_BACKEND_GLOBAL_HOST, timestamp= timestamp.strftime("%Y-%m-%d_%H:%M:%S") ) + tar_file = "mongo_dump_{hostname}_{timestamp}.tar.gz".format( + hostname=NEXT_BACKEND_GLOBAL_HOST, + timestamp=timestamp.strftime("%Y-%m-%d_%H:%M:%S"), + ) tar_filename = db_lib.make_mongodump(tar_file) from boto.s3.connection import S3Connection from boto.s3.key import Key import boto + # boto.set_stream_logger('boto') try: - conn = S3Connection(constants.AWS_ACCESS_ID,constants.AWS_SECRET_ACCESS_KEY) - b = conn.get_bucket(AWS_BUCKET_NAME) - - k = Key(b) - k.key = tar_file - bytes_saved = k.set_contents_from_filename( tar_filename ) - - timestamp = utils.datetimeNow() - print "[ %s ] done with backup of MongoDB to S3... %d bytes saved" % (str(timestamp),bytes_saved) + conn = S3Connection(constants.AWS_ACCESS_ID, constants.AWS_SECRET_ACCESS_KEY) + b = conn.get_bucket(AWS_BUCKET_NAME) + + k = Key(b) + k.key = tar_file + bytes_saved = k.set_contents_from_filename(tar_filename) + + timestamp = utils.datetimeNow() + print( + "[ %s ] done with backup of MongoDB to S3... %d bytes saved" + % (str(timestamp), bytes_saved) + ) except: - error = traceback.format_exc() - timestamp = utils.datetimeNow() - print "[ %s ] FAILED TO CONNECT TO S3... 
saving locally" % str(timestamp) - print error - -subprocess.call('rm {tar_filename}'.format(tar_filename=tar_filename),shell=True) - - - - + error = traceback.format_exc() + timestamp = utils.datetimeNow() + print("[ %s ] FAILED TO CONNECT TO S3... saving locally" % str(timestamp)) + print(error) - +subprocess.call("rm {tar_filename}".format(tar_filename=tar_filename), shell=True) diff --git a/next/database/database_lib.py b/next/database/database_lib.py index 0000dee7..ecb9822e 100644 --- a/next/database/database_lib.py +++ b/next/database/database_lib.py @@ -8,51 +8,77 @@ import next.constants as constants from pymongo import MongoClient -def make_mongodump(name,exp_uid_list=[]): + +def make_mongodump(name, exp_uid_list=[]): tmp_dir = tempfile.mkdtemp() - if len(exp_uid_list)==0: - subprocess.call(('/usr/bin/mongodump -vvvvv --host {hostname}:{port} ' - '--out {path}').format(hostname=constants.MONGODB_HOST, - port=constants.MONGODB_PORT, - path=tmp_dir), - shell=True) + if len(exp_uid_list) == 0: + subprocess.call( + ( + "/usr/bin/mongodump -vvvvv --host {hostname}:{port} " "--out {path}" + ).format( + hostname=constants.MONGODB_HOST, + port=constants.MONGODB_PORT, + path=tmp_dir, + ), + shell=True, + ) else: - exp_uid_list_str = '["'+'","'.join(exp_uid_list)+'"]' + exp_uid_list_str = '["' + '","'.join(exp_uid_list) + '"]' - query_str = '\'{ $or: [ {"exp_uid":{$in:%s}}, {"object_id":{$in:%s}} ] }\'' % (exp_uid_list_str,exp_uid_list_str) + query_str = '\'{ $or: [ {"exp_uid":{$in:%s}}, {"object_id":{$in:%s}} ] }\'' % ( + exp_uid_list_str, + exp_uid_list_str, + ) client = MongoClient(constants.MONGODB_HOST, constants.MONGODB_PORT) for db in client.database_names(): - for col in client[db].collection_names(): - subprocess.call(('/usr/bin/mongodump -vvvvv --host {hostname}:{port} ' - '--out {path} -d '+str(db)+' -c '+str(col)+' --query {query_str}').format(hostname=constants.MONGODB_HOST, - port=constants.MONGODB_PORT, - path=tmp_dir, - query_str=query_str), - shell=True) - subprocess.call('mkdir -p /dump',shell=True) - subprocess.call(('tar czf /dump/{name} ' - '-C {path} .').format(name=name,path=tmp_dir), - shell=True) - + for col in client[db].collection_names(): + subprocess.call( + ( + "/usr/bin/mongodump -vvvvv --host {hostname}:{port} " + "--out {path} -d " + + str(db) + + " -c " + + str(col) + + " --query {query_str}" + ).format( + hostname=constants.MONGODB_HOST, + port=constants.MONGODB_PORT, + path=tmp_dir, + query_str=query_str, + ), + shell=True, + ) + subprocess.call("mkdir -p /dump", shell=True) + subprocess.call( + ("tar czf /dump/{name} " "-C {path} .").format(name=name, path=tmp_dir), + shell=True, + ) + shutil.rmtree(tmp_dir) - return '/dump/{}'.format(name) + return "/dump/{}".format(name) + - def remove_mongodump(name): - subprocess.call(('rm /dump/{name}').format(name=name), - shell=True) + subprocess.call(("rm /dump/{name}").format(name=name), shell=True) + def restore_mongodump(src_filename): tmp_dir = tempfile.mkdtemp() - subprocess.call(('tar -xvf {src_filename} -C {dst_path}').format(src_filename=src_filename,dst_path=tmp_dir),shell=True) - subprocess.call(('/usr/bin/mongorestore --host {hostname} --port {port} ' - '{path}').format(hostname=constants.MONGODB_HOST, - port=constants.MONGODB_PORT, - path=tmp_dir), - shell=True) + subprocess.call( + ("tar -xvf {src_filename} -C {dst_path}").format( + src_filename=src_filename, dst_path=tmp_dir + ), + shell=True, + ) + subprocess.call( + ("/usr/bin/mongorestore --host {hostname} --port {port} " "{path}").format( + 
hostname=constants.MONGODB_HOST, port=constants.MONGODB_PORT, path=tmp_dir
+        ),
+        shell=True,
+    )
     shutil.rmtree(tmp_dir)

 # import next.database.database_lib as db_lib
-# >>> db_lib.make_mongodump('test_619')
\ No newline at end of file
+# >>> db_lib.make_mongodump('test_619')
diff --git a/next/database/database_restore.py b/next/database/database_restore.py
index 91b42a38..41e3ede4 100755
--- a/next/database/database_restore.py
+++ b/next/database/database_restore.py
@@ -8,6 +8,7 @@
 """

 import sys
+
 sys.path.append("/next_backend")

 import subprocess
@@ -15,30 +16,28 @@
 import next.database.database_lib as db_lib
 import os

-AWS_BUCKET_NAME = os.environ['AWS_BUCKET_NAME']
+AWS_BUCKET_NAME = os.environ["AWS_BUCKET_NAME"]

 try:
-    dump_filename = sys.argv[1]
+    dump_filename = sys.argv[1]
 except:
-    "Must provide a filename from the 'next-database-backups' bucket.\n python daemon_database_restore.py mongo_dump_next-test1.discovery.wisc.edu_2015-04-21_04:50:38.tar.gz"
+    print("Must provide a filename from the 'next-database-backups' bucket.\n python daemon_database_restore.py mongo_dump_next-test1.discovery.wisc.edu_2015-04-21_04:50:38.tar.gz")
+    sys.exit(1)  # without a filename, the restore below would fail with a NameError

 from boto.s3.connection import S3Connection
 from boto.s3.key import Key
 import boto
+
 # boto.set_stream_logger('boto')

-conn = S3Connection(constants.AWS_ACCESS_ID,constants.AWS_SECRET_ACCESS_KEY)
+conn = S3Connection(constants.AWS_ACCESS_ID, constants.AWS_SECRET_ACCESS_KEY)
 b = conn.get_bucket(AWS_BUCKET_NAME)

 k = Key(b)
-k.key = dump_filename #'mongo_dump_next-test1.discovery.wisc.edu_2015-04-21_04:50:38.tar.gz'
-filename = 'mongo_dump.tar.gz'
+k.key = (
+    dump_filename
+)  # 'mongo_dump_next-test1.discovery.wisc.edu_2015-04-21_04:50:38.tar.gz'
+filename = "mongo_dump.tar.gz"
 k.get_contents_to_filename(filename)

 db_lib.restore_mongodump(filename)
-subprocess.call('rm '+filename,shell=True)
-
-
-
-
-
+subprocess.call("rm " + filename, shell=True)
diff --git a/next/database_client/DatabaseAPI.py b/next/database_client/DatabaseAPI.py
index 1b5469ce..b5e194e3 100644
--- a/next/database_client/DatabaseAPI.py
+++ b/next/database_client/DatabaseAPI.py
@@ -2,6 +2,7 @@
 Layer for interfacing with Mongo.
 """
+from __future__ import print_function
 import cPickle
 import traceback
 from datetime import datetime
@@ -21,12 +22,14 @@
 try:
     import next.broker.broker
 except:
-    print "Warning: you will not be able to submit jobs to the broker"
+    print("Warning: you will not be able to submit jobs to the broker")
     pass

+
 class DatabaseException(BaseException):
     pass

+
 def to_db_fmt(x):
     # leave None as is
     if x is None:
@@ -60,6 +63,7 @@ def to_db_fmt(x):
     # pickle everything else, wrap in MongoDB `Binary`
     return Binary(cPickle.dumps(x, protocol=2))

+
 def from_db_fmt(x):
     # recursive descent through lists
     if isinstance(x, list):
@@ -84,6 +88,7 @@ def from_db_fmt(x):
     # not a datatype we need to deserialize! just pass it out
     return x

+
 class DatabaseAPI(object):
     """
     Serves as an API object that can be passed around. 
See above for usage @@ -92,8 +97,12 @@ class DatabaseAPI(object): client : PyMongo client """ - def __init__(self, mongo_host=constants.MONGODB_HOST, mongo_port=constants.MONGODB_PORT, - database_name=constants.app_data_database_id): + def __init__( + self, + mongo_host=constants.MONGODB_HOST, + mongo_port=constants.MONGODB_PORT, + database_name=constants.app_data_database_id, + ): self.client = None self.connect_mongo(mongo_host, mongo_port) @@ -119,7 +128,7 @@ def close(): def is_connected(self): try: # The `ismaster` command is very cheap and does not require authentication - self.client.admin.command('ismaster') + self.client.admin.command("ismaster") return True except ConnectionFailure: return False @@ -127,45 +136,61 @@ def is_connected(self): def _bucket(self, bucket_id): return self.client[self.db_name][bucket_id] - def exists(self,bucket_id,doc_uid,key): + def exists(self, bucket_id, doc_uid, key): # if the document isn't found, just set doc to an empty dict, # so that any .get(key) call returns None - doc = self._bucket(bucket_id).find_one({"_id":doc_uid}, - projection={key: True}) or {} + doc = ( + self._bucket(bucket_id).find_one({"_id": doc_uid}, projection={key: True}) + or {} + ) return doc.get(key) is not None - def get(self,bucket_id,doc_uid,key): + def get(self, bucket_id, doc_uid, key): val = self._bucket(bucket_id).find_one({"_id": doc_uid}, {key: True}).get(key) return from_db_fmt(val) - def get_many(self,bucket_id,doc_uid,key_list): + def get_many(self, bucket_id, doc_uid, key_list): projection = {k: True for k in key_list} doc = self._bucket(bucket_id).find_one({"_id": doc_uid}, projection) val = {k: doc.get(k) for k in key_list} return from_db_fmt(val) - def get_and_delete(self,bucket_id,doc_uid,key): - doc = self._bucket(bucket_id).find_one_and_update({"_id": doc_uid}, - update={'$unset': {key: ''}}, projection={key: True}) + def get_and_delete(self, bucket_id, doc_uid, key): + doc = self._bucket(bucket_id).find_one_and_update( + {"_id": doc_uid}, update={"$unset": {key: ""}}, projection={key: True} + ) return from_db_fmt(doc.get(key)) - def increment(self,bucket_id,doc_uid,key,value=1): - return self._bucket(bucket_id).find_one_and_update({"_id": doc_uid}, - update={'$inc': {key: value}}, projection={key: True}, - new=True, upsert=True).get(key) - - def increment_many(self,bucket_id,doc_uid,key_value_dict): + def increment(self, bucket_id, doc_uid, key, value=1): + return ( + self._bucket(bucket_id) + .find_one_and_update( + {"_id": doc_uid}, + update={"$inc": {key: value}}, + projection={key: True}, + new=True, + upsert=True, + ) + .get(key) + ) + + def increment_many(self, bucket_id, doc_uid, key_value_dict): projection = {k: True for k in key_value_dict.keys()} values = {k: v for k, v in key_value_dict.items() if v != 0} - new_doc = self._bucket(bucket_id).find_one_and_update({"_id": doc_uid}, - update={'$inc': values}, projection=projection, new=True, upsert=True) + new_doc = self._bucket(bucket_id).find_one_and_update( + {"_id": doc_uid}, + update={"$inc": values}, + projection=projection, + new=True, + upsert=True, + ) return {k: new_doc.get(k) for k in key_value_dict.keys()} - def get_list(self,bucket_id,doc_uid,key): + def get_list(self, bucket_id, doc_uid, key): return self.get(bucket_id, doc_uid, key) def pop_list(self, bucket_id, doc_uid, key, end): @@ -175,67 +200,103 @@ def pop_list(self, bucket_id, doc_uid, key, end): elif end == -1: mongo_idx = 1 else: - raise IndexError("Can only pop first (index=0) or last (index=-1) element of list!") + raise 
IndexError( + "Can only pop first (index=0) or last (index=-1) element of list!" + ) - val = self._bucket(bucket_id).find_and_modify({"_id": doc_uid}, - {'$pop': {key: mongo_idx}}).get(key) + val = ( + self._bucket(bucket_id) + .find_and_modify({"_id": doc_uid}, {"$pop": {key: mongo_idx}}) + .get(key) + ) try: return from_db_fmt(val[end]) except IndexError: raise IndexError("Cannot pop from empty list!") - def append_list(self,bucket_id,doc_uid,key,value): - return self._bucket(bucket_id).find_one_and_update({"_id": doc_uid}, - {'$push': {key: to_db_fmt(value)}}, new=True, upsert=True).get(key) - - def set_list(self,bucket_id,doc_uid,key,value): + def append_list(self, bucket_id, doc_uid, key, value): + return ( + self._bucket(bucket_id) + .find_one_and_update( + {"_id": doc_uid}, + {"$push": {key: to_db_fmt(value)}}, + new=True, + upsert=True, + ) + .get(key) + ) + + def set_list(self, bucket_id, doc_uid, key, value): self.set(bucket_id, doc_uid, key, value) - def set_doc(self,bucket_id,doc_uid,doc): + def set_doc(self, bucket_id, doc_uid, doc): if doc_uid is not None: - doc['_id'] = doc_uid - self._bucket(bucket_id).replace_one({"_id": doc_uid}, to_db_fmt(doc), upsert=True) + doc["_id"] = doc_uid + self._bucket(bucket_id).replace_one( + {"_id": doc_uid}, to_db_fmt(doc), upsert=True + ) else: self._bucket(bucket_id).insert_one(to_db_fmt(doc)) - def get_doc(self,bucket_id,doc_uid): + def get_doc(self, bucket_id, doc_uid): return from_db_fmt(self._bucket(bucket_id).find_one({"_id": doc_uid})) - def get_docs_with_filter(self,bucket_id,pattern_dict): + def get_docs_with_filter(self, bucket_id, pattern_dict): docs_cursor = self._bucket(bucket_id).find(pattern_dict) return [from_db_fmt(doc) for doc in docs_cursor] - def set(self,bucket_id,doc_uid,key,value): - self._bucket(bucket_id).update_one({"_id": doc_uid}, - {'$set': {key: to_db_fmt(value)}}, upsert=True) + def set(self, bucket_id, doc_uid, key, value): + self._bucket(bucket_id).update_one( + {"_id": doc_uid}, {"$set": {key: to_db_fmt(value)}}, upsert=True + ) - def set_many(self,bucket_id,doc_uid,key_value_dict): - self._bucket(bucket_id).update_one({"_id": doc_uid}, - {'$set': to_db_fmt(key_value_dict)}) + def set_many(self, bucket_id, doc_uid, key_value_dict): + self._bucket(bucket_id).update_one( + {"_id": doc_uid}, {"$set": to_db_fmt(key_value_dict)} + ) - def delete(self,bucket_id,doc_uid,key): - self._bucket(bucket_id).update_one({"_id": doc_uid}, - {'$unset': {key: True}}) + def delete(self, bucket_id, doc_uid, key): + self._bucket(bucket_id).update_one({"_id": doc_uid}, {"$unset": {key: True}}) - def ensure_index(self,bucket_id,index_dict): + def ensure_index(self, bucket_id, index_dict): self._bucket(bucket_id).create_index(index_dict.items()) - def drop_all_indexes(self,bucket_id): + def drop_all_indexes(self, bucket_id): self._bucket(bucket_id).drop_indexes() - def delete_docs_with_filter(self,bucket_id,pattern_dict): + def delete_docs_with_filter(self, bucket_id, pattern_dict): self._bucket(bucket_id).delete_many(pattern_dict) - def submit_job(self,app_id,exp_uid,task,task_args_json,namespace=None,ignore_result=True,time_limit=0, alg_id=None, alg_label=None): + def submit_job( + self, + app_id, + exp_uid, + task, + task_args_json, + namespace=None, + ignore_result=True, + time_limit=0, + alg_id=None, + alg_label=None, + ): if self.broker is None: self.broker = next.broker.broker.JobBroker() if namespace is None: - result = self.broker.applyAsync(app_id,exp_uid,task,task_args_json,ignore_result=ignore_result) + result = 
self.broker.applyAsync( + app_id, exp_uid, task, task_args_json, ignore_result=ignore_result + ) else: - result = self.broker.applySyncByNamespace(app_id,exp_uid, - alg_id, alg_label, - task,task_args_json,namespace=namespace, - ignore_result=ignore_result,time_limit=time_limit) + result = self.broker.applySyncByNamespace( + app_id, + exp_uid, + alg_id, + alg_label, + task, + task_args_json, + namespace=namespace, + ignore_result=ignore_result, + time_limit=time_limit, + ) return result diff --git a/next/database_client/test_databaseapi.py b/next/database_client/test_databaseapi.py index c2eb96af..5f029870 100644 --- a/next/database_client/test_databaseapi.py +++ b/next/database_client/test_databaseapi.py @@ -2,214 +2,247 @@ import pymongo -from next.database_client.DatabaseAPI import to_db_fmt, from_db_fmt, DatabaseAPI, DatabaseException +from next.database_client.DatabaseAPI import ( + to_db_fmt, + from_db_fmt, + DatabaseAPI, + DatabaseException, +) # IMPORTANT NOTE: only uses the `test_data` database; it gets cleared after each test session -MONGO_HOST, MONGO_PORT = 'localhost', 27017 -MONGO_DB = 'test_data' +MONGO_HOST, MONGO_PORT = "localhost", 27017 +MONGO_DB = "test_data" # === fixtures === -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def db(): db = DatabaseAPI(MONGO_HOST, MONGO_PORT, MONGO_DB) yield db db.client.drop_database(MONGO_DB) db.client.close() + # === basic tests === def test_connect(db): assert db.is_connected() + def test_reconnection(db): db.connect_mongo(MONGO_HOST, MONGO_PORT) assert db.is_connected() + # === test db functions === def test__bucket(db): - assert db._bucket('foo') == db.client[MONGO_DB]['foo'] + assert db._bucket("foo") == db.client[MONGO_DB]["foo"] + def test_exists(db): - B = 'test_exists' + B = "test_exists" + + doc_uid = db._bucket(B).insert_one({"a_key": 2}).inserted_id - doc_uid = db._bucket(B).insert_one({'a_key': 2}).inserted_id + assert db.exists(B, doc_uid, "a_key") + assert not db.exists(B, doc_uid, "a_nonexistent_key") + assert not db.exists(B, "ashkjfdbkjfns", "a_key") - assert db.exists(B, doc_uid, 'a_key') - assert not db.exists(B, doc_uid, 'a_nonexistent_key') - assert not db.exists(B, 'ashkjfdbkjfns', 'a_key') def test_get(db): - B = 'test_get' + B = "test_get" + + doc_uid = ( + db._bucket(B).insert_one({"a_key": 2, "another_key": [1.0, "f"]}).inserted_id + ) - doc_uid = db._bucket(B).insert_one({'a_key': 2, 'another_key': [1.0, 'f']}).inserted_id + assert db.get(B, doc_uid, "a_key") == 2 + assert db.get(B, doc_uid, "another_key") == [1.0, "f"] + assert not db.get(B, doc_uid, "a_nonexistant_key") - assert db.get(B, doc_uid, 'a_key') == 2 - assert db.get(B, doc_uid, 'another_key') == [1.0, 'f'] - assert not db.get(B, doc_uid, 'a_nonexistant_key') def test_get_many(db): - B = 'test_get_many' - doc = {'a_key': 2, 'another_key': [1.0, 'f'], 'third_key': 'baz'} + B = "test_get_many" + doc = {"a_key": 2, "another_key": [1.0, "f"], "third_key": "baz"} doc_uid = db._bucket(B).insert_one(doc).inserted_id - assert db.get_many(B, doc_uid, ['another_key']) \ - == {'another_key': [1.0, 'f']} - assert db.get_many(B, doc_uid, ['another_key', 'third_key']) \ - == {'another_key': [1.0, 'f'], 'third_key': 'baz'} + assert db.get_many(B, doc_uid, ["another_key"]) == {"another_key": [1.0, "f"]} + assert db.get_many(B, doc_uid, ["another_key", "third_key"]) == { + "another_key": [1.0, "f"], + "third_key": "baz", + } + + assert db.get_many(B, doc_uid, ["totally_nonexistent_key", "another_key"]) == { + "totally_nonexistent_key": None, + 
"another_key": [1.0, "f"], + } - assert db.get_many(B, doc_uid, ['totally_nonexistent_key', 'another_key']) \ - == {'totally_nonexistent_key': None, 'another_key': [1.0, 'f']} def test_get_and_delete(db): - B = 'test_get_and_delete' - doc = {'a': 2, 'b': [1.0, 'f'], 'c': 'baz'} + B = "test_get_and_delete" + doc = {"a": 2, "b": [1.0, "f"], "c": "baz"} doc_uid = db._bucket(B).insert_one(doc).inserted_id - assert db.exists(B, doc_uid, 'b') - assert db.get_and_delete(B, doc_uid, 'b') == [1.0, 'f'] - assert not db.exists(B, doc_uid, 'b') + assert db.exists(B, doc_uid, "b") + assert db.get_and_delete(B, doc_uid, "b") == [1.0, "f"] + assert not db.exists(B, doc_uid, "b") + def test_increment(db): - B = 'test_increment' + B = "test_increment" + + doc_uid = db._bucket(B).insert_one({"a": 2}).inserted_id - doc_uid = db._bucket(B).insert_one({'a': 2}).inserted_id + assert db.increment(B, doc_uid, "a") == 3 + assert db.get(B, doc_uid, "a") == 3 + assert db.increment(B, doc_uid, "a", -2) == 1 + assert db.get(B, doc_uid, "a") == 1 - assert db.increment(B, doc_uid, 'a') == 3 - assert db.get(B, doc_uid, 'a') == 3 - assert db.increment(B, doc_uid, 'a', -2) == 1 - assert db.get(B, doc_uid, 'a') == 1 def test_increment_many(db): - B = 'test_increment_many' - doc = {'a': 0, 'b': 0, 'c': 0} + B = "test_increment_many" + doc = {"a": 0, "b": 0, "c": 0} doc_uid = db._bucket(B).insert_one(doc).inserted_id - assert db.increment_many(B, doc_uid, {'a': 1, 'b': 5, 'c': -7}) \ - == {'a': 1, 'b': 5, 'c': -7} - assert db.get_many(B, doc_uid, ['a', 'b', 'c']) \ - == {'a': 1, 'b': 5, 'c': -7} + assert db.increment_many(B, doc_uid, {"a": 1, "b": 5, "c": -7}) == { + "a": 1, + "b": 5, + "c": -7, + } + assert db.get_many(B, doc_uid, ["a", "b", "c"]) == {"a": 1, "b": 5, "c": -7} + def test_pop_list(db): - B = 'test_pop_list' - doc = {'a': range(0, 10+1)} + B = "test_pop_list" + doc = {"a": range(0, 10 + 1)} doc_uid = db._bucket(B).insert_one(doc).inserted_id - assert db.pop_list(B, doc_uid, 'a', -1) == 10 - assert db.get(B, doc_uid, 'a') == range(0, 9+1) + assert db.pop_list(B, doc_uid, "a", -1) == 10 + assert db.get(B, doc_uid, "a") == range(0, 9 + 1) - assert db.pop_list(B, doc_uid, 'a', 0) == 0 - assert db.get(B, doc_uid, 'a') == range(1, 9+1) + assert db.pop_list(B, doc_uid, "a", 0) == 0 + assert db.get(B, doc_uid, "a") == range(1, 9 + 1) # popping from an empty list should raise an exception - db.set(B, doc_uid, 'a', []) + db.set(B, doc_uid, "a", []) with pytest.raises(IndexError): - db.pop_list(B, doc_uid, 'a', 0) + db.pop_list(B, doc_uid, "a", 0) + def test_append_list(db): - B = 'test_append_list' + B = "test_append_list" - doc_uid = db._bucket(B).insert_one({'a': [1, 2, 3, 4]}).inserted_id + doc_uid = db._bucket(B).insert_one({"a": [1, 2, 3, 4]}).inserted_id + + assert db.append_list(B, doc_uid, "a", 10) == [1, 2, 3, 4, 10] + assert db.get(B, doc_uid, "a") == [1, 2, 3, 4, 10] - assert db.append_list(B, doc_uid, 'a', 10) == [1, 2, 3, 4, 10] - assert db.get(B, doc_uid, 'a') == [1, 2, 3, 4, 10] def test_set(db): - B = 'test_set_list' + B = "test_set_list" doc_uid = db._bucket(B).insert_one({}).inserted_id - assert db.get(B, doc_uid, 'a') is None - db.set(B, doc_uid, 'a', [1,2,3,4]) - assert db.get(B, doc_uid, 'a') == [1,2,3,4] + assert db.get(B, doc_uid, "a") is None + db.set(B, doc_uid, "a", [1, 2, 3, 4]) + assert db.get(B, doc_uid, "a") == [1, 2, 3, 4] # alias of db.set() - db.set_list(B, doc_uid, 'a', [5,6,7,8]) - assert db.get(B, doc_uid, 'a') == [5,6,7,8] + db.set_list(B, doc_uid, "a", [5, 6, 7, 8]) + assert 
db.get(B, doc_uid, "a") == [5, 6, 7, 8] + def test_set_many(db): - B = 'test_set_many' + B = "test_set_many" - doc_uid = db._bucket(B).insert_one({'a': 3, 'x': 'bar'}).inserted_id + doc_uid = db._bucket(B).insert_one({"a": 3, "x": "bar"}).inserted_id - assert db.get(B, doc_uid, 'a') == 3 - assert db.get(B, doc_uid, 'x') == 'bar' + assert db.get(B, doc_uid, "a") == 3 + assert db.get(B, doc_uid, "x") == "bar" # db.set_many() takes a dict and sets multiple keys - db.set_many(B, doc_uid, {'a': 4, 'b': 'foo'}) - assert db.get_doc(B, doc_uid) == {'_id': str(doc_uid), 'a': 4, 'b': 'foo', 'x': 'bar'} + db.set_many(B, doc_uid, {"a": 4, "b": "foo"}) + assert db.get_doc(B, doc_uid) == { + "_id": str(doc_uid), + "a": 4, + "b": "foo", + "x": "bar", + } + def test_set_doc(db): - B = 'test_set_doc' + B = "test_set_doc" doc_uid = db._bucket(B).insert_one({}).inserted_id # replace an existing document - assert db.get_doc(B, doc_uid) == {'_id': str(doc_uid)} - db.set_doc(B, doc_uid, {'a': 5, 'b': 'foo'}) - assert db.get_doc(B, doc_uid) == {'_id': str(doc_uid), - 'a': 5, 'b': 'foo'} + assert db.get_doc(B, doc_uid) == {"_id": str(doc_uid)} + db.set_doc(B, doc_uid, {"a": 5, "b": "foo"}) + assert db.get_doc(B, doc_uid) == {"_id": str(doc_uid), "a": 5, "b": "foo"} # add a new document with _id='asdf' - db.set_doc(B, 'asdf', {'a': 3, 'b': 'baz'}) - assert db.get_doc(B, 'asdf') == {'_id': 'asdf', - 'a': 3, 'b': 'baz'} + db.set_doc(B, "asdf", {"a": 3, "b": "baz"}) + assert db.get_doc(B, "asdf") == {"_id": "asdf", "a": 3, "b": "baz"} + def test_get_doc(db): - B = 'test_get_doc' + B = "test_get_doc" + + doc_uid = db._bucket(B).insert_one({"a": 3}).inserted_id - doc_uid = db._bucket(B).insert_one({'a': 3}).inserted_id + assert db.get_doc(B, doc_uid) == {"_id": str(doc_uid), "a": 3} - assert db.get_doc(B, doc_uid) == {'_id': str(doc_uid), 'a': 3} def test_get_docs_with_filter(db): - B = 'test_get_doc' + B = "test_get_doc" - db._bucket(B).insert_many([ - {'a': 3, 'b': 2}, - {'a': 5, 'b': 2}, - {'a': 1, 'b': 3}]) + db._bucket(B).insert_many([{"a": 3, "b": 2}, {"a": 5, "b": 2}, {"a": 1, "b": 3}]) - retrieved_docs = db.get_docs_with_filter(B, {'b': 2}) + retrieved_docs = db.get_docs_with_filter(B, {"b": 2}) # remove `_id`s for asserts - retrieved_docs = [{k: v for k, v in r.items() if k != '_id'} - for r in retrieved_docs] - assert {'a': 3, 'b': 2} in retrieved_docs - assert {'a': 5, 'b': 2} in retrieved_docs - assert {'a': 1, 'b': 3} not in retrieved_docs + retrieved_docs = [ + {k: v for k, v in r.items() if k != "_id"} for r in retrieved_docs + ] + assert {"a": 3, "b": 2} in retrieved_docs + assert {"a": 5, "b": 2} in retrieved_docs + assert {"a": 1, "b": 3} not in retrieved_docs + def test_delete(db): - B = 'test_delete' + B = "test_delete" + + doc_uid = db._bucket(B).insert_one({"a": 3}).inserted_id - doc_uid = db._bucket(B).insert_one({'a': 3}).inserted_id + assert db.get(B, doc_uid, "a") == 3 + db.delete(B, doc_uid, "a") + assert db.get(B, doc_uid, "a") is None - assert db.get(B, doc_uid, 'a') == 3 - db.delete(B, doc_uid, 'a') - assert db.get(B, doc_uid, 'a') is None def test_indexes(db): - B = 'test_indexes' + B = "test_indexes" # index a key, 'a'. we should see that index when listing indexes. - db.ensure_index(B, {'a': pymongo.ASCENDING}) + db.ensure_index(B, {"a": pymongo.ASCENDING}) indexes = list(db._bucket(B).list_indexes()) - assert any([i.get('key').get('a') is not None for i in indexes]) + assert any([i.get("key").get("a") is not None for i in indexes]) # drop indexes. we shouldn't see an index on 'a' now. 
db.drop_all_indexes(B) indexes = list(db._bucket(B).list_indexes()) - assert all([i.get('key').get('a') is None for i in indexes]) + assert all([i.get("key").get("a") is None for i in indexes]) + def test_delete_docs_with_filter(db): - B = 'test_delete_docs_with_filter' + B = "test_delete_docs_with_filter" - db._bucket(B).insert_many([{'a': 2}, {'a': 2, 'b': 3}, {'a': 6}]) + db._bucket(B).insert_many([{"a": 2}, {"a": 2, "b": 3}, {"a": 6}]) - db.delete_docs_with_filter(B, {'a': 2}) + db.delete_docs_with_filter(B, {"a": 2}) + + docs = [{k: v for k, v in d.items() if k != "_id"} for d in db._bucket(B).find()] + assert docs == [{"a": 6}] - docs = [{k:v for k, v in d.items() if k != '_id'} for d in db._bucket(B).find()] - assert docs == [{'a': 6}] # === test utils === def test_to_db_fmt(): @@ -220,20 +253,21 @@ def test_to_db_fmt(): # standard types should be passed through assert to_db_fmt(1) == 1 assert to_db_fmt(4.2) == 4.2 - assert to_db_fmt('foobarbaz') == 'foobarbaz' - assert to_db_fmt(1+2j) == 1+2j + assert to_db_fmt("foobarbaz") == "foobarbaz" + assert to_db_fmt(1 + 2j) == 1 + 2j # lists and dicts should be recursively formatted - assert to_db_fmt([1, 1+2j, 'foo', [1,2.3]]) == [1, 1+2j, 'foo', [1,2.3]] - assert to_db_fmt({'a': 1, 'b': ['foo', 2]}) == {'a': 1, 'b': ['foo', 2]} + assert to_db_fmt([1, 1 + 2j, "foo", [1, 2.3]]) == [1, 1 + 2j, "foo", [1, 2.3]] + assert to_db_fmt({"a": 1, "b": ["foo", 2]}) == {"a": 1, "b": ["foo", 2]} # numpy arrays should be converted to lists - assert to_db_fmt(np.array([1,2,3])) == [1,2,3] + assert to_db_fmt(np.array([1, 2, 3])) == [1, 2, 3] # objects should be pickled x = object() assert to_db_fmt(x) == Binary(cPickle.dumps(x, protocol=2)) + def test_from_db_fmt(): import cPickle import numpy as np @@ -244,12 +278,12 @@ def does_invert(x): # standard types should invert to the original assert does_invert(1) assert does_invert(4.2) - assert does_invert('foobarbaz') - assert does_invert(1+2j) + assert does_invert("foobarbaz") + assert does_invert(1 + 2j) # lists and dicts should invert - assert does_invert([1, 1+2j, 'foo', [1,2.3]]) - assert does_invert({'a': 1, 'b': ['foo', 2]}) + assert does_invert([1, 1 + 2j, "foo", [1, 2.3]]) + assert does_invert({"a": 1, "b": ["foo", 2]}) # numpy arrays invert to lists - assert from_db_fmt(to_db_fmt(np.array([1,2,3]))) == [1,2,3] + assert from_db_fmt(to_db_fmt(np.array([1, 2, 3]))) == [1, 2, 3] diff --git a/next/home.py b/next/home.py index c2e03ebd..1a537aa9 100644 --- a/next/home.py +++ b/next/home.py @@ -3,11 +3,15 @@ from next.lib.pijemont import doc as doc_gen from next.lib.pijemont import verifier -home = Blueprint('home', __name__, - template_folder='../dashboard/templates', - static_folder='../dashboard/static') +home = Blueprint( + "home", + __name__, + template_folder="../dashboard/templates", + static_folder="../dashboard/static", +) -@home.route('/') + +@home.route("/") def redirect_form(): available_apps = utils.get_supported_apps() - return render_template('home.html', available_apps=available_apps) + return render_template("home.html", available_apps=available_apps) diff --git a/next/lib/docopt.py b/next/lib/docopt.py index 7c6a52df..43230542 100644 --- a/next/lib/docopt.py +++ b/next/lib/docopt.py @@ -10,8 +10,8 @@ import re -__all__ = ['docopt'] -__version__ = '0.6.2' +__all__ = ["docopt"] +__version__ = "0.6.2" class DocoptLanguageError(Exception): @@ -23,14 +23,13 @@ class DocoptExit(SystemExit): """Exit in case user invoked program with incorrect arguments.""" - usage = '' + usage = "" - def 
__init__(self, message=''): - SystemExit.__init__(self, (message + '\n' + self.usage).strip()) + def __init__(self, message=""): + SystemExit.__init__(self, (message + "\n" + self.usage).strip()) class Pattern(object): - def __eq__(self, other): return repr(self) == repr(other) @@ -44,11 +43,11 @@ def fix(self): def fix_identities(self, uniq=None): """Make pattern-tree tips point to same object if they are equal.""" - if not hasattr(self, 'children'): + if not hasattr(self, "children"): return self uniq = list(set(self.flat())) if uniq is None else uniq for i, child in enumerate(self.children): - if not hasattr(child, 'children'): + if not hasattr(child, "children"): assert child in uniq self.children[i] = uniq[uniq.index(child)] else: @@ -104,7 +103,7 @@ def __init__(self, name, value=None): self.name, self.value = name, value def __repr__(self): - return '%s(%r, %r)' % (self.__class__.__name__, self.name, self.value) + return "%s(%r, %r)" % (self.__class__.__name__, self.name, self.value) def flat(self, *types): return [self] if not types or type(self) in types else [] @@ -114,14 +113,13 @@ def match(self, left, collected=None): pos, match = self.single_match(left) if match is None: return False, left, collected - left_ = left[:pos] + left[pos + 1:] + left_ = left[:pos] + left[pos + 1 :] same_name = [a for a in collected if a.name == self.name] if type(self.value) in (int, list): if type(self.value) is int: increment = 1 else: - increment = ([match.value] if type(match.value) is str - else match.value) + increment = [match.value] if type(match.value) is str else match.value if not same_name: match.value = increment return True, left_, collected + [match] @@ -138,8 +136,10 @@ def __init__(self, *children): self.children = list(children) def __repr__(self): - return '%s(%s)' % (self.__class__.__name__, - ', '.join(repr(a) for a in self.children)) + return "%s(%s)" % ( + self.__class__.__name__, + ", ".join(repr(a) for a in self.children), + ) def flat(self, *types): if type(self) in types: @@ -148,7 +148,6 @@ def flat(self, *types): class Argument(LeafPattern): - def single_match(self, left): for n, pattern in enumerate(left): if type(pattern) is Argument: @@ -157,13 +156,12 @@ def single_match(self, left): @classmethod def parse(class_, source): - name = re.findall('(<\S*?>)', source)[0] - value = re.findall('\[default: (.*)\]', source, flags=re.I) + name = re.findall("(<\S*?>)", source)[0] + value = re.findall("\[default: (.*)\]", source, flags=re.I) return class_(name, value[0] if value else None) class Command(Argument): - def __init__(self, name, value=False): self.name, self.value = name, value @@ -178,7 +176,6 @@ def single_match(self, left): class Option(LeafPattern): - def __init__(self, short=None, long=None, argcount=0, value=False): assert argcount in (0, 1) self.short, self.long, self.argcount = short, long, argcount @@ -187,17 +184,17 @@ def __init__(self, short=None, long=None, argcount=0, value=False): @classmethod def parse(class_, option_description): short, long, argcount, value = None, None, 0, False - options, _, description = option_description.strip().partition(' ') - options = options.replace(',', ' ').replace('=', ' ') + options, _, description = option_description.strip().partition(" ") + options = options.replace(",", " ").replace("=", " ") for s in options.split(): - if s.startswith('--'): + if s.startswith("--"): long = s - elif s.startswith('-'): + elif s.startswith("-"): short = s else: argcount = 1 if argcount: - matched = re.findall('\[default: (.*)\]', 
description, flags=re.I) + matched = re.findall("\[default: (.*)\]", description, flags=re.I) value = matched[0] if matched else None return class_(short, long, argcount, value) @@ -212,12 +209,15 @@ def name(self): return self.long or self.short def __repr__(self): - return 'Option(%r, %r, %r, %r)' % (self.short, self.long, - self.argcount, self.value) + return "Option(%r, %r, %r, %r)" % ( + self.short, + self.long, + self.argcount, + self.value, + ) class Required(BranchPattern): - def match(self, left, collected=None): collected = [] if collected is None else collected l = left @@ -230,7 +230,6 @@ def match(self, left, collected=None): class Optional(BranchPattern): - def match(self, left, collected=None): collected = [] if collected is None else collected for pattern in self.children: @@ -244,7 +243,6 @@ class OptionsShortcut(Optional): class OneOrMore(BranchPattern): - def match(self, left, collected=None): assert len(self.children) == 1 collected = [] if collected is None else collected @@ -266,7 +264,6 @@ def match(self, left, collected=None): class Either(BranchPattern): - def match(self, left, collected=None): collected = [] if collected is None else collected outcomes = [] @@ -280,15 +277,14 @@ def match(self, left, collected=None): class Tokens(list): - def __init__(self, source, error=DocoptExit): - self += source.split() if hasattr(source, 'split') else source + self += source.split() if hasattr(source, "split") else source self.error = error @staticmethod def from_pattern(source): - source = re.sub(r'([\[\]\(\)\|]|\.\.\.)', r' \1 ', source) - source = [s for s in re.split('\s+|(\S*<.*?>)', source) if s] + source = re.sub(r"([\[\]\(\)\|]|\.\.\.)", r" \1 ", source) + source = [s for s in re.split("\s+|(\S*<.*?>)", source) if s] return Tokens(source, error=DocoptLanguageError) def move(self): @@ -300,31 +296,34 @@ def current(self): def parse_long(tokens, options): """long ::= '--' chars [ ( ' ' | '=' ) chars ] ;""" - long, eq, value = tokens.move().partition('=') - assert long.startswith('--') - value = None if eq == value == '' else value + long, eq, value = tokens.move().partition("=") + assert long.startswith("--") + value = None if eq == value == "" else value similar = [o for o in options if o.long == long] if tokens.error is DocoptExit and similar == []: # if no exact match similar = [o for o in options if o.long and o.long.startswith(long)] if len(similar) > 1: # might be simply specified ambiguously 2+ times? - raise tokens.error('%s is not a unique prefix: %s?' % - (long, ', '.join(o.long for o in similar))) + raise tokens.error( + "%s is not a unique prefix: %s?" 
+ % (long, ", ".join(o.long for o in similar)) + ) elif len(similar) < 1: - argcount = 1 if eq == '=' else 0 + argcount = 1 if eq == "=" else 0 o = Option(None, long, argcount) options.append(o) if tokens.error is DocoptExit: o = Option(None, long, argcount, value if argcount else True) else: - o = Option(similar[0].short, similar[0].long, - similar[0].argcount, similar[0].value) + o = Option( + similar[0].short, similar[0].long, similar[0].argcount, similar[0].value + ) if o.argcount == 0: if value is not None: - raise tokens.error('%s must not have an argument' % o.long) + raise tokens.error("%s must not have an argument" % o.long) else: if value is None: - if tokens.current() in [None, '--']: - raise tokens.error('%s requires argument' % o.long) + if tokens.current() in [None, "--"]: + raise tokens.error("%s requires argument" % o.long) value = tokens.move() if tokens.error is DocoptExit: o.value = value if value is not None else True @@ -334,32 +333,32 @@ def parse_long(tokens, options): def parse_shorts(tokens, options): """shorts ::= '-' ( chars )* [ [ ' ' ] chars ] ;""" token = tokens.move() - assert token.startswith('-') and not token.startswith('--') - left = token.lstrip('-') + assert token.startswith("-") and not token.startswith("--") + left = token.lstrip("-") parsed = [] - while left != '': - short, left = '-' + left[0], left[1:] + while left != "": + short, left = "-" + left[0], left[1:] similar = [o for o in options if o.short == short] if len(similar) > 1: - raise tokens.error('%s is specified ambiguously %d times' % - (short, len(similar))) + raise tokens.error( + "%s is specified ambiguously %d times" % (short, len(similar)) + ) elif len(similar) < 1: o = Option(short, None, 0) options.append(o) if tokens.error is DocoptExit: o = Option(short, None, 0, True) else: # why copying is necessary here? - o = Option(short, similar[0].long, - similar[0].argcount, similar[0].value) + o = Option(short, similar[0].long, similar[0].argcount, similar[0].value) value = None if o.argcount != 0: - if left == '': - if tokens.current() in [None, '--']: - raise tokens.error('%s requires argument' % short) + if left == "": + if tokens.current() in [None, "--"]: + raise tokens.error("%s requires argument" % short) value = tokens.move() else: value = left - left = '' + left = "" if tokens.error is DocoptExit: o.value = value if value is not None else True parsed.append(o) @@ -370,17 +369,17 @@ def parse_pattern(source, options): tokens = Tokens.from_pattern(source) result = parse_expr(tokens, options) if tokens.current() is not None: - raise tokens.error('unexpected ending: %r' % ' '.join(tokens)) + raise tokens.error("unexpected ending: %r" % " ".join(tokens)) return Required(*result) def parse_expr(tokens, options): """expr ::= seq ( '|' seq )* ;""" seq = parse_seq(tokens, options) - if tokens.current() != '|': + if tokens.current() != "|": return seq result = [Required(*seq)] if len(seq) > 1 else seq - while tokens.current() == '|': + while tokens.current() == "|": tokens.move() seq = parse_seq(tokens, options) result += [Required(*seq)] if len(seq) > 1 else seq @@ -390,9 +389,9 @@ def parse_expr(tokens, options): def parse_seq(tokens, options): """seq ::= ( atom [ '...' 
] )* ;""" result = [] - while tokens.current() not in [None, ']', ')', '|']: + while tokens.current() not in [None, "]", ")", "|"]: atom = parse_atom(tokens, options) - if tokens.current() == '...': + if tokens.current() == "...": atom = [OneOrMore(*atom)] tokens.move() result += atom @@ -405,21 +404,21 @@ def parse_atom(tokens, options): """ token = tokens.current() result = [] - if token in '([': + if token in "([": tokens.move() - matching, pattern = {'(': [')', Required], '[': [']', Optional]}[token] + matching, pattern = {"(": [")", Required], "[": ["]", Optional]}[token] result = pattern(*parse_expr(tokens, options)) if tokens.move() != matching: raise tokens.error("unmatched '%s'" % token) return [result] - elif token == 'options': + elif token == "options": tokens.move() return [OptionsShortcut()] - elif token.startswith('--') and token != '--': + elif token.startswith("--") and token != "--": return parse_long(tokens, options) - elif token.startswith('-') and token not in ('-', '--'): + elif token.startswith("-") and token not in ("-", "--"): return parse_shorts(tokens, options) - elif token.startswith('<') and token.endswith('>') or token.isupper(): + elif token.startswith("<") and token.endswith(">") or token.isupper(): return [Argument(tokens.move())] else: return [Command(tokens.move())] @@ -436,11 +435,11 @@ def parse_argv(tokens, options, options_first=False): """ parsed = [] while tokens.current() is not None: - if tokens.current() == '--': + if tokens.current() == "--": return parsed + [Argument(None, v) for v in tokens] - elif tokens.current().startswith('--'): + elif tokens.current().startswith("--"): parsed += parse_long(tokens, options) - elif tokens.current().startswith('-') and tokens.current() != '-': + elif tokens.current().startswith("-") and tokens.current() != "-": parsed += parse_shorts(tokens, options) elif options_first: return parsed + [Argument(None, v) for v in tokens] @@ -451,40 +450,42 @@ def parse_argv(tokens, options, options_first=False): def parse_defaults(doc): defaults = [] - for s in parse_section('options:', doc): + for s in parse_section("options:", doc): # FIXME corner case "bla: options: --foo" - _, _, s = s.partition(':') # get rid of "options:" - split = re.split('\n[ \t]*(-\S+?)', '\n' + s)[1:] + _, _, s = s.partition(":") # get rid of "options:" + split = re.split("\n[ \t]*(-\S+?)", "\n" + s)[1:] split = [s1 + s2 for s1, s2 in zip(split[::2], split[1::2])] - options = [Option.parse(s) for s in split if s.startswith('-')] + options = [Option.parse(s) for s in split if s.startswith("-")] defaults += options return defaults def parse_section(name, source): - pattern = re.compile('^([^\n]*' + name + '[^\n]*\n?(?:[ \t].*?(?:\n|$))*)', - re.IGNORECASE | re.MULTILINE) + pattern = re.compile( + "^([^\n]*" + name + "[^\n]*\n?(?:[ \t].*?(?:\n|$))*)", + re.IGNORECASE | re.MULTILINE, + ) return [s.strip() for s in pattern.findall(source)] def formal_usage(section): - _, _, section = section.partition(':') # drop "usage:" + _, _, section = section.partition(":") # drop "usage:" pu = section.split() - return '( ' + ' '.join(') | (' if s == pu[0] else s for s in pu[1:]) + ' )' + return "( " + " ".join(") | (" if s == pu[0] else s for s in pu[1:]) + " )" def extras(help, version, options, doc): - if help and any((o.name in ('-h', '--help')) and o.value for o in options): + if help and any((o.name in ("-h", "--help")) and o.value for o in options): print(doc.strip("\n")) sys.exit() - if version and any(o.name == '--version' and o.value for o in options): + 
if version and any(o.name == "--version" and o.value for o in options): print(version) sys.exit() class Dict(dict): def __repr__(self): - return '{%s}' % ',\n '.join('%r: %r' % i for i in sorted(self.items())) + return "{%s}" % ",\n ".join("%r: %r" % i for i in sorted(self.items())) def docopt(doc, argv=None, help=True, version=None, options_first=False): @@ -552,7 +553,7 @@ def docopt(doc, argv=None, help=True, version=None, options_first=False): """ argv = sys.argv[1:] if argv is None else argv - usage_sections = parse_section('usage:', doc) + usage_sections = parse_section("usage:", doc) if len(usage_sections) == 0: raise DocoptLanguageError('"usage:" (case-insensitive) not found.') if len(usage_sections) > 1: @@ -562,7 +563,7 @@ def docopt(doc, argv=None, help=True, version=None, options_first=False): options = parse_defaults(doc) pattern = parse_pattern(formal_usage(DocoptExit.usage), options) # [default] syntax for argument is disabled - #for a in pattern.flat(Argument): + # for a in pattern.flat(Argument): # same_name = [d for d in arguments if d.name == a.name] # if same_name: # a.value = same_name[0].value @@ -571,7 +572,7 @@ def docopt(doc, argv=None, help=True, version=None, options_first=False): for options_shortcut in pattern.flat(OptionsShortcut): doc_options = parse_defaults(doc) options_shortcut.children = list(set(doc_options) - pattern_options) - #if any_options: + # if any_options: # options_shortcut.children += [Option(o.short, o.long, o.argcount) # for o in argv if type(o) is Option] extras(help, version, argv, doc) diff --git a/next/lib/pijemont/condition.py b/next/lib/pijemont/condition.py index 62a785fc..a0d8fbef 100644 --- a/next/lib/pijemont/condition.py +++ b/next/lib/pijemont/condition.py @@ -1,23 +1,24 @@ import ply.yacc as yacc import ply.lex as lex + class condition_lexer: def __init__(self): self.lexer = lex.lex(module=self) - tokens = ('LT', 'GT', 'EQ', 'AND', 'OR', 'NOT', 'NUMBER', 'LPAREN','RPAREN') - - t_LT = r'<|lt' - t_GT = r'>|gt' - t_EQ = r'=|eq' - t_AND = r'&|and' - t_OR = r'\||or' - t_NOT = r'!|not' - t_LPAREN = r'\(' - t_RPAREN = r'\)' - - def t_NUMBER(self,t): - r'\-?\d+\.?(\d+)?' + tokens = ("LT", "GT", "EQ", "AND", "OR", "NOT", "NUMBER", "LPAREN", "RPAREN") + + t_LT = r"<|lt" + t_GT = r">|gt" + t_EQ = r"=|eq" + t_AND = r"&|and" + t_OR = r"\||or" + t_NOT = r"!|not" + t_LPAREN = r"\(" + t_RPAREN = r"\)" + + def t_NUMBER(self, t): + r"\-?\d+\.?(\d+)?" 
try: t.value = float(t.value) except ValueError: @@ -27,80 +28,85 @@ def t_NUMBER(self,t): # Ignored characters t_ignore = " \t" - - def t_error(self,t): + + def t_error(self, t): raise Exception("Illegal character '%s'" % t.value[0]) - - def tokenize(self,data): + + def tokenize(self, data): self.lexer.input(data) while True: tok = self.lexer.token() if tok: yield tok else: - break + break + class condition_parser: def __init__(self): self.lexer = condition_lexer() self.tokens = self.lexer.tokens - self.parser = yacc.yacc(module=self,write_tables=0,debug=False) + self.parser = yacc.yacc(module=self, write_tables=0, debug=False) self.the_num = 0 - - def parse(self,data): + + def parse(self, data): if data: - return self.parser.parse(data,self.lexer.lexer,0,0,None) + return self.parser.parse(data, self.lexer.lexer, 0, 0, None) else: - return [] + return [] def p_statement(self, t): - 'statement : NUMBER set_the_num expr' + "statement : NUMBER set_the_num expr" t[0] = t[3] - + def p_set_the_num(self, t): - 'set_the_num :' + "set_the_num :" self.the_num = t[-1] - + def p_expr_paren(self, t): - 'expr : LPAREN expr RPAREN' + "expr : LPAREN expr RPAREN" t[0] = t[2] def p_expr_op(self, t): - '''expr : expr OR expr + """expr : expr OR expr | expr AND expr - | NOT expr''' - if t[1] == '!': t[0] = not t[2] - elif t[2] == '&': t[0] = t[1] and t[3] - elif t[2] == '|': t[0] = t[1] or t[3] + | NOT expr""" + if t[1] == "!": + t[0] = not t[2] + elif t[2] == "&": + t[0] = t[1] and t[3] + elif t[2] == "|": + t[0] = t[1] or t[3] def p_expr(self, t): - '''expr : GT NUMBER + """expr : GT NUMBER | LT NUMBER | GT EQ NUMBER | LT EQ NUMBER | NOT EQ NUMBER - | EQ NUMBER''' - if t[1] == '!' and t[2] == '=': - t[0] = (self.the_num != t[3]) - elif t[1] == '>' and t[2] == '=': - t[0] = (self.the_num >= t[3]) - elif t[1] == '<' and t[2] == '=': - t[0] = (self.the_num <= t[3]) - elif t[1] == '=': - t[0] = (self.the_num == t[2]) - elif t[1] == '>': - t[0] = (self.the_num > t[2]) - elif t[1] == '<': - t[0] = (self.the_num < t[2]) + | EQ NUMBER""" + if t[1] == "!" 
and t[2] == "=": + t[0] = self.the_num != t[3] + elif t[1] == ">" and t[2] == "=": + t[0] = self.the_num >= t[3] + elif t[1] == "<" and t[2] == "=": + t[0] = self.the_num <= t[3] + elif t[1] == "=": + t[0] = self.the_num == t[2] + elif t[1] == ">": + t[0] = self.the_num > t[2] + elif t[1] == "<": + t[0] = self.the_num < t[2] def p_error(self, t): raise Exception("Syntax error at '%s'" % t.value) + if __name__ == "__main__": p = condition_parser() - print(p.parse('2 > 3')) - print(p.parse('2 ( > 1 & > -1.5 )')) + print(p.parse("2 > 3")) + print(p.parse("2 ( > 1 & > -1.5 )")) try: - print(p.parse('2 ( > 1 & > -1a5 )')) + print(p.parse("2 ( > 1 & > -1a5 )")) except Exception as exc: - print("oops",exc) + print("oops", exc) diff --git a/next/lib/pijemont/doc.py b/next/lib/pijemont/doc.py index 8d2b88cb..6bc8666a 100644 --- a/next/lib/pijemont/doc.py +++ b/next/lib/pijemont/doc.py @@ -1,76 +1,130 @@ import json, sys, yaml, verifier -def get_docs(filename,base_path): - api,errs = verifier.load_doc(filename,base_path) + +def get_docs(filename, base_path): + api, errs = verifier.load_doc(filename, base_path) if len(errs) > 0: raise Exception("Failed to verify: {}".format(errs)) - - return api,blank_gen(api),doc_gen(api) + + return api, blank_gen(api), doc_gen(api) + # def print_docs(api_url): # api = json.loads(urllib2.urlopen(api_url).read())['api'] # print(doc_gen(api)) + def blank_gen(api): return {} + def doc_gen(api): - return "\n\n".join(["### `{func}({shortargs}) : {shortrets}`\n\n{desc}\n\n#### Arguments:\n{longargs}\n\n#### Returns:\n{longrets}".format( - func=f, - shortargs=", ".join(["" + k for k in (api[f]['args'] if 'args' in api[f] else {})]), - shortrets=args_summary(api[f]['rets']) if 'rets' in api[f] else "None", - desc = api[f]['description'] if 'description' in api[f] else "", - longargs = "".join(["\n* `" + k + "` = " + args_gen(api[f]['args'][k],1) for k in (api[f]['args'] if 'args' in api[f] else {})]), - longrets = args_gen(api[f]['rets'],1) if 'rets' in api[f] else "None" - ) for f in api]) + return "\n\n".join( + [ + "### `{func}({shortargs}) : {shortrets}`\n\n{desc}\n\n#### Arguments:\n{longargs}\n\n#### Returns:\n{longrets}".format( + func=f, + shortargs=", ".join( + ["" + k for k in (api[f]["args"] if "args" in api[f] else {})] + ), + shortrets=args_summary(api[f]["rets"]) if "rets" in api[f] else "None", + desc=api[f]["description"] if "description" in api[f] else "", + longargs="".join( + [ + "\n* `" + k + "` = " + args_gen(api[f]["args"][k], 1) + for k in (api[f]["args"] if "args" in api[f] else {}) + ] + ), + longrets=args_gen(api[f]["rets"], 1) if "rets" in api[f] else "None", + ) + for f in api + ] + ) + def args_summary(api): - if(api["type"] == "list"): + if api["type"] == "list": return "[{}]".format(args_summary(api["values"])) - elif(api["type"] == "dict"): - return "{{{}}}".format(", ".join(["{}: {}".format(k, args_summary(api["values"][k])) for k in api["values"]])) - elif(api["type"] == "tuple"): + elif api["type"] == "dict": + return "{{{}}}".format( + ", ".join( + [ + "{}: {}".format(k, args_summary(api["values"][k])) + for k in api["values"] + ] + ) + ) + elif api["type"] == "tuple": return ", ".join([args_summary(api["values"][k]) for k in api["values"]]) else: return api["type"] + def args_gen(api, depth): - indent = " "*depth - if(api["type"] == "list"): - return "List, all of whose elements are as follows: \n{indent} * {elements}\n".format(indent=indent, elements=args_gen(api['values'], depth+2)) - elif(api["type"] == "dict"): + indent = " " * depth 
+ if api["type"] == "list": + return "List, all of whose elements are as follows: \n{indent} * {elements}\n".format( + indent=indent, elements=args_gen(api["values"], depth + 2) + ) + elif api["type"] == "dict": return "Dictionary with the following keys:\n{keys}\n{indent}".format( indent=indent, - keys="\n".join(["{indent}`{key}`:{value} {desc}".format(indent=indent + "* ", - key=k, - value=args_gen(api['values'][k], depth+1), - desc=("\n "+indent+api['values'][k]['description'] if 'description' in api['values'][k] else "")) - for k in api['values']])) - - elif(api["type"] == "tuple"): + keys="\n".join( + [ + "{indent}`{key}`:{value} {desc}".format( + indent=indent + "* ", + key=k, + value=args_gen(api["values"][k], depth + 1), + desc=( + "\n " + indent + api["values"][k]["description"] + if "description" in api["values"][k] + else "" + ), + ) + for k in api["values"] + ] + ), + ) + + elif api["type"] == "tuple": return "Tuple with the following values:\n{values}\n{indent}".format( indent=indent, - keys="\n".join(["{indent}`{key}`:{value} {desc}".format(indent=indent + "* ", - key=str(k), - value=args_gen(api['values'][k], depth+1), - desc=("\n "+indent+api['values'][k]['description'] if 'description' in api['values'][k] else "")) - for k in api['values']])) - elif(api["type"] in {"str","string","multiline"}): - if("values" in api and len(api['values'])>0): - return "`"+" | ".join(["\"" + k + "\"" for k in api["values"]])+"`" + keys="\n".join( + [ + "{indent}`{key}`:{value} {desc}".format( + indent=indent + "* ", + key=str(k), + value=args_gen(api["values"][k], depth + 1), + desc=( + "\n " + indent + api["values"][k]["description"] + if "description" in api["values"][k] + else "" + ), + ) + for k in api["values"] + ] + ), + ) + elif api["type"] in {"str", "string", "multiline"}: + if "values" in api and len(api["values"]) > 0: + return "`" + " | ".join(['"' + k + '"' for k in api["values"]]) + "`" else: - return "`string`{}".format(", "+api["description"] if "description" in api else "") + return "`string`{}".format( + ", " + api["description"] if "description" in api else "" + ) - elif(api["type"] in {"num","number"}): - if("values" in api and len(api['values'])>0): - return "`"+" | ".join([str(k) for k in api["values"]])+"`" + elif api["type"] in {"num", "number"}: + if "values" in api and len(api["values"]) > 0: + return "`" + " | ".join([str(k) for k in api["values"]]) + "`" else: - return "`num`{}".format(", "+api["description"] if "description" in api else "") - elif(api["type"] == "file"): + return "`num`{}".format( + ", " + api["description"] if "description" in api else "" + ) + elif api["type"] == "file": return "`file`" - elif(api["type"] == "oneof"): - return " | ".join([args_gen(api['values'][k], depth+1) for k in api["values"]]) + elif api["type"] == "oneof": + return " | ".join( + [args_gen(api["values"][k], depth + 1) for k in api["values"]] + ) else: return "`{type}`".format(type=api["type"]) - diff --git a/next/lib/pijemont/server.py b/next/lib/pijemont/server.py index f9a2253d..4149d1e8 100644 --- a/next/lib/pijemont/server.py +++ b/next/lib/pijemont/server.py @@ -1,30 +1,37 @@ from flask import Flask, request, render_template import json, sys, verifier import doc as doc_gen -app = Flask(__name__, static_url_path='/static') -@app.route('/doc/') +app = Flask(__name__, static_url_path="/static") + + +@app.route("/doc/") def doc(form="raw"): - api,blank,pretty = doc_gen.get_docs('example.yaml','.') - + api, blank, pretty = doc_gen.get_docs("example.yaml", ".") + if form == 
"pretty": - return render_template('doc.html',doc_string=pretty, base_dir="/static") + return render_template("doc.html", doc_string=pretty, base_dir="/static") elif form == "blank": - return render_template('raw.html',doc=blank) + return render_template("raw.html", doc=blank) elif form == "raw": - return render_template('raw.html',doc=api) + return render_template("raw.html", doc=api) return json.dumps(api) -@app.route('/form/') + +@app.route("/form/") def form(fn="excite"): - api,_ = verifier.load_doc('example.yaml','.') - return render_template('form.html',api_doc=api, submit="/submit", function_name=fn, base_dir="/static") + api, _ = verifier.load_doc("example.yaml", ".") + return render_template( + "form.html", api_doc=api, submit="/submit", function_name=fn, base_dir="/static" + ) -@app.route('/submit', methods=["POST"]) + +@app.route("/submit", methods=["POST"]) def submit(): print(json.dumps(request.data)) return "done" - -if __name__ == '__main__': - app.run(host='0.0.0.0',port=int(sys.argv[1]), debug=True) + + +if __name__ == "__main__": + app.run(host="0.0.0.0", port=int(sys.argv[1]), debug=True) diff --git a/next/lib/pijemont/tests/test_all.py b/next/lib/pijemont/tests/test_all.py index 71e49e21..efed9b67 100644 --- a/next/lib/pijemont/tests/test_all.py +++ b/next/lib/pijemont/tests/test_all.py @@ -15,19 +15,20 @@ import pytest from .. import verifier + def verify_yaml(test_name, test): """ This function does the actual work of verifying the YAML. It reads the spec in, formats the args (loaded from test_files/{test_name}) then returns both """ dir_path = os.path.dirname(os.path.realpath(__file__)) - api, errs = verifier.load_doc(test['spec'], os.path.join(dir_path,'specs/')) + api, errs = verifier.load_doc(test["spec"], os.path.join(dir_path, "specs/")) if len(errs) > 0: return None, None, errs - fn = test['inputs'][test_name]['function'] - args = test['inputs'][test_name]['args'] - verified = verifier.verify(args, api[fn]['args']) - expected_out = test['inputs'][test_name]['verified'] + fn = test["inputs"][test_name]["function"] + args = test["inputs"][test_name]["args"] + verified = verifier.verify(args, api[fn]["args"]) + expected_out = test["inputs"][test_name]["verified"] return verified, expected_out, None @@ -39,34 +40,40 @@ def run_test(filename): """ dir_path = os.path.dirname(os.path.realpath(__file__)) - with open(os.path.join(dir_path,'test_files/{}'.format(filename))) as f: + with open(os.path.join(dir_path, "test_files/{}".format(filename))) as f: test = yaml.load(f.read()) - for test_name in test['inputs']: - print(' {}'.format(test_name)) - if test['load_errors']: + for test_name in test["inputs"]: + print(" {}".format(test_name)) + if test["load_errors"]: args, out, load_errs = verify_yaml(test_name, test) assert (not load_errs is None) and len(load_errs) > 0 else: try: args, expected_out, load_errs = verify_yaml(test_name, test) assert expected_out == args - assert (not 'errors' in test['inputs'][test_name]) or test['inputs'][test_name]['errors'] == False + assert (not "errors" in test["inputs"][test_name]) or test["inputs"][ + test_name + ]["errors"] == False except: - assert 'errors' in test['inputs'][test_name] and test['inputs'][test_name]['errors'] == True + assert ( + "errors" in test["inputs"][test_name] + and test["inputs"][test_name]["errors"] == True + ) def test_all(): """ Loop over all the files in tests/test_files and run all of them. 
""" - print('\n') + print("\n") dir_path = os.path.dirname(os.path.realpath(__file__)) - dir_ = os.path.join(dir_path,'test_files/') + dir_ = os.path.join(dir_path, "test_files/") for yaml_filename in os.listdir(dir_): - if 'DS_Store' in yaml_filename: + if "DS_Store" in yaml_filename: continue - print('Testing YAML file {}'.format(yaml_filename)) + print("Testing YAML file {}".format(yaml_filename)) run_test(yaml_filename) + if __name__ == "__main__": test_all() diff --git a/next/lib/pijemont/verifier.py b/next/lib/pijemont/verifier.py index d913c67b..c06b3b04 100644 --- a/next/lib/pijemont/verifier.py +++ b/next/lib/pijemont/verifier.py @@ -5,95 +5,102 @@ import os from .condition import condition_parser -DICT = {'dict','dictionary','map'} -LIST = {'list'} -TUPLE = {'tuple'} -ONEOF = {'oneof'} - -NUM = {'num','number','float'} -STRING = {'str','string','multiline'} -ANY = {'any','stuff'} -FILE = {'file'} -BOOL = {'boolean','bool'} - -def load_doc(filename,base_path): +DICT = {"dict", "dictionary", "map"} +LIST = {"list"} +TUPLE = {"tuple"} +ONEOF = {"oneof"} + +NUM = {"num", "number", "float"} +STRING = {"str", "string", "multiline"} +ANY = {"any", "stuff"} +FILE = {"file"} +BOOL = {"boolean", "bool"} + + +def load_doc(filename, base_path): errs = [] - with open(os.path.join(base_path,filename)) as f: + with open(os.path.join(base_path, filename)) as f: ref = yaml.load(f.read()) ds = [] - for ext in ref.pop('extends',[]): - r,e = load_doc(ext,base_path) + for ext in ref.pop("extends", []): + r, e = load_doc(ext, base_path) ds += [r] errs += e for d in ds: ref = merge_dict(ref, d) errs = check_format(ref) - return ref,errs + return ref, errs + -def merge_dict(d1,d2,prefer=1): +def merge_dict(d1, d2, prefer=1): for k in d2: if k in d1: if type(d1[k]) == dict: - d1[k] = merge_dict(d1[k],d2[k]) + d1[k] = merge_dict(d1[k], d2[k]) if prefer == 2: d1[k] = d2[k] else: d1[k] = d2[k] return d1 + def check_format(doc): errs = [] for x in doc: - if 'args' in doc[x]: - errs += check_format_helper({'type':'dict','values':doc[x]['args']},'args/'+x) - if 'rets' in doc[x]: - errs += check_format_helper(doc[x]['rets'],'rets/'+x) + if "args" in doc[x]: + errs += check_format_helper( + {"type": "dict", "values": doc[x]["args"]}, "args/" + x + ) + if "rets" in doc[x]: + errs += check_format_helper(doc[x]["rets"], "rets/" + x) return errs -def check_format_helper(doc,name): + +def check_format_helper(doc, name): errs = [] - if not 'type' in doc: + if not "type" in doc: errs += ['{}: "type" key missing'.format(name)] - diff = set(doc.keys()) - {'type','description','values','optional','default'} + diff = set(doc.keys()) - {"type", "description", "values", "optional", "default"} if len(diff) > 0: - errs += ["{}: extra keys in spec: {}".format(name,", ".join(list(diff)))] + errs += ["{}: extra keys in spec: {}".format(name, ", ".join(list(diff)))] - if not 'type' in doc or not 'values' in doc: + if not "type" in doc or not "values" in doc: return errs - if not doc['type'] in DICT | LIST | TUPLE | ONEOF | NUM | STRING | BOOL | ANY | FILE: - errs += ['{}: invlid type: {}'.format(name, doc['type'])] + if ( + not doc["type"] + in DICT | LIST | TUPLE | ONEOF | NUM | STRING | BOOL | ANY | FILE + ): + errs += ["{}: invlid type: {}".format(name, doc["type"])] - if doc['type'] in DICT | LIST | TUPLE | ONEOF and not 'values' in doc: + if doc["type"] in DICT | LIST | TUPLE | ONEOF and not "values" in doc: errs += ['{}: requires "values" key'.format(name)] if len(errs) > 0: return errs - if doc['type'] in DICT: - for x in 
doc['values']: - errs += check_format_helper(doc['values'][x],'{}/{}'.format(name,x)) + if doc["type"] in DICT: + for x in doc["values"]: + errs += check_format_helper(doc["values"][x], "{}/{}".format(name, x)) - elif doc['type'] in LIST: - errs += check_format_helper(doc['values'],'{}/values'.format(name)) + elif doc["type"] in LIST: + errs += check_format_helper(doc["values"], "{}/values".format(name)) - elif doc['type'] in TUPLE: - for x in doc['values']: - errs += check_format_helper(doc['values'][x],'{}/{}'.format(name,str(x))) + elif doc["type"] in TUPLE: + for x in doc["values"]: + errs += check_format_helper(doc["values"][x], "{}/{}".format(name, str(x))) - elif doc['type'] in ONEOF: - for x in doc['values']: - errs += check_format_helper(doc['values'][x],'{}/{}'.format(name,str(x))) + elif doc["type"] in ONEOF: + for x in doc["values"]: + errs += check_format_helper(doc["values"][x], "{}/{}".format(name, str(x))) return errs - - def verify(input_dict, reference_dict): """ Returns: modified_input, success, list_of_errors @@ -103,18 +110,21 @@ def verify(input_dict, reference_dict): - success is a boolean true if there were no problems and false otherwise - list_of_errors is as in verify_helper """ - input_dict, messages = verify_helper("", input_dict, {'type':'dict','values':reference_dict}) + input_dict, messages = verify_helper( + "", input_dict, {"type": "dict", "values": reference_dict} + ) try: - if len(messages)>0: - raise Exception("Failed to verify: {}".format(messages)) - else: - return input_dict + if len(messages) > 0: + raise Exception("Failed to verify: {}".format(messages)) + else: + return input_dict except Exception as error: - exc_type, exc_value, exc_traceback = sys.exc_info() - print("Exception: {} {}".format(error, traceback.format_exc())) - traceback.print_tb(exc_traceback) - raise Exception(error.args[0]) + exc_type, exc_value, exc_traceback = sys.exc_info() + print("Exception: {} {}".format(error, traceback.format_exc())) + traceback.print_tb(exc_traceback) + raise Exception(error.args[0]) + def verify_helper(name, input_element, reference_dict): """ @@ -126,95 +136,140 @@ def verify_helper(name, input_element, reference_dict): - list_of_errors is: [{name: name, message: ...}, ...] 
""" ans = [] - if reference_dict['type'] in DICT: + if reference_dict["type"] in DICT: if not isinstance(input_element, (dict)): - ans += [{"name":name, "message":"invalid dict"}] + ans += [{"name": name, "message": "invalid dict"}] else: - l1,l2 = compare_dict_keys(input_element, reference_dict['values']) + l1, l2 = compare_dict_keys(input_element, reference_dict["values"]) if len(l1) > 0: - ans += [{"name":name, "message":"extra keys in input: " + ",".join(l1)}] + ans += [ + {"name": name, "message": "extra keys in input: " + ",".join(l1)} + ] else: ok = True for k in l2: - if 'default' in reference_dict['values'][k]: - input_element[k] = reference_dict['values'][k]['default'] - if reference_dict['values'][k]['type'] in NUM: + if "default" in reference_dict["values"][k]: + input_element[k] = reference_dict["values"][k]["default"] + if reference_dict["values"][k]["type"] in NUM: input_element[k] = float(input_element[k]) - elif (not 'optional' in reference_dict['values'][k]) or reference_dict['values'][k]['optional'] == False: - ans += [{"name":name+'/'+k, "message":"required key is absent"}] + elif ( + not "optional" in reference_dict["values"][k] + ) or reference_dict["values"][k]["optional"] == False: + ans += [ + { + "name": name + "/" + k, + "message": "required key is absent", + } + ] ok = False - if(ok): + if ok: for k in input_element: - input_element[k], temp_ans = verify_helper(name + '/' + k, input_element[k], reference_dict['values'][str(k)]) + input_element[k], temp_ans = verify_helper( + name + "/" + k, + input_element[k], + reference_dict["values"][str(k)], + ) ans += temp_ans - elif reference_dict['type'] in LIST: + elif reference_dict["type"] in LIST: if not isinstance(input_element, (list)): - ans += [{"name":name, "message":"invalid list"}] + ans += [{"name": name, "message": "invalid list"}] else: for i in range(len(input_element)): - input_element[i],temp_ans = verify_helper(name+'/'+str(i), input_element[i], reference_dict['values']) + input_element[i], temp_ans = verify_helper( + name + "/" + str(i), input_element[i], reference_dict["values"] + ) ans += temp_ans - elif reference_dict['type'] in TUPLE: - if not isinstance(input_element, (list,tuple)): - ans += [{"name":name, "message":"invalid tuple"}] + elif reference_dict["type"] in TUPLE: + if not isinstance(input_element, (list, tuple)): + ans += [{"name": name, "message": "invalid tuple"}] else: new_tuple = list(input_element) for i in range(len(input_element)): - new_tuple[i], temp_ans = verify_helper(name+'/'+str(i), input_element[i], reference_dict['values'][i]) + new_tuple[i], temp_ans = verify_helper( + name + "/" + str(i), input_element[i], reference_dict["values"][i] + ) ans += temp_ans new_tuple = tuple(new_tuple) - elif reference_dict['type'] in BOOL: + elif reference_dict["type"] in BOOL: if not isinstance(input_element, (bool)): - ans += [{"name":name, "message":"invalid boolean"}] + ans += [{"name": name, "message": "invalid boolean"}] - elif reference_dict['type'] in NUM: + elif reference_dict["type"] in NUM: ok = True if not isinstance(input_element, (int, float, long)): if isinstance(input_element, (str, unicode)): try: input_element = float(input_element) except: - ans += [{"name":name, "message":"invalid number"}] + ans += [{"name": name, "message": "invalid number"}] ok = False else: - ans += [{"name":name, "message":"invalid number"}] + ans += [{"name": name, "message": "invalid number"}] ok = False if ok: - if 'values' in reference_dict: + if "values" in reference_dict: try: - 
condition_parser().parse("{} {}".format(str(input_element),str(reference_dict['values']))) + condition_parser().parse( + "{} {}".format( + str(input_element), str(reference_dict["values"]) + ) + ) except Exception as exc: - ans += [{"name":name, "message":str(exc)}] + ans += [{"name": name, "message": str(exc)}] - elif reference_dict['type'] in STRING: + elif reference_dict["type"] in STRING: if not isinstance(input_element, (str, unicode)): - ans += [{"name":name, "message":"expected a string, got {}".format(type(input_element))}] - elif 'values' in reference_dict and not input_element in reference_dict['values']: - ans += [{"name":name, "message":"argument must be one of the specified strings: "+", ".join(reference_dict['values'])}] - - elif reference_dict['type'] in ONEOF: + ans += [ + { + "name": name, + "message": "expected a string, got {}".format(type(input_element)), + } + ] + elif ( + "values" in reference_dict and not input_element in reference_dict["values"] + ): + ans += [ + { + "name": name, + "message": "argument must be one of the specified strings: " + + ", ".join(reference_dict["values"]), + } + ] + + elif reference_dict["type"] in ONEOF: count = 0 - for k in reference_dict['values']: + for k in reference_dict["values"]: if k in input_element: count += 1 if count > 1: - ans += [{"name":name+"/"+k,"message":"More than one argument specified for 'oneof arg: " + name}] + ans += [ + { + "name": name + "/" + k, + "message": "More than one argument specified for 'oneof arg: " + + name, + } + ] if count == 0: - if 'default' in reference_dict: - input_element = reference_dict['default'] + if "default" in reference_dict: + input_element = reference_dict["default"] else: - ans += [{"name":name, "message":"no argument provided for 'oneof' arg"}] + ans += [ + {"name": name, "message": "no argument provided for 'oneof' arg"} + ] - elif reference_dict['type'] in ANY | FILE: + elif reference_dict["type"] in ANY | FILE: pass else: - ans += [{"name":name, "message":"invalid type: {}".format(reference_dict['type'])}] + ans += [ + {"name": name, "message": "invalid type: {}".format(reference_dict["type"])} + ] + + return input_element, ans - return input_element,ans def compare_dict_keys(d1, d2): """ @@ -222,13 +277,13 @@ def compare_dict_keys(d1, d2): """ return [k for k in d1 if not k in d2], [k for k in d2 if not k in d1] -if __name__ == '__main__': + +if __name__ == "__main__": if len(sys.argv) > 1: - r,e = load_doc(sys.argv[1]) - print('doc',r) - print('errs',e) + r, e = load_doc(sys.argv[1]) + print("doc", r) + print("errs", e) if len(sys.argv) > 2: - i,e = verify(sys.argv[2],r) - print("Errors",e) - print("Verified input",i) - + i, e = verify(sys.argv[2], r) + print("Errors", e) + print("Verified input", i) diff --git a/next/logging_client/LoggerAPI.py b/next/logging_client/LoggerAPI.py index b22d356b..b69f8700 100644 --- a/next/logging_client/LoggerAPI.py +++ b/next/logging_client/LoggerAPI.py @@ -8,19 +8,23 @@ import datetime from next.database_client.DatabaseAPI import DatabaseAPI + class LoggerAPI(DatabaseAPI): - def __init__(self, mongo_host=constants.MONGODB_HOST, mongo_port=constants.MONGODB_PORT, - database_name=constants.logs_database_id): + def __init__( + self, + mongo_host=constants.MONGODB_HOST, + mongo_port=constants.MONGODB_PORT, + database_name=constants.logs_database_id, + ): super(LoggerAPI, self).__init__(mongo_host, mongo_port, database_name) - def _normalize_logentry(self, log): - if log.get('timestamp') and isinstance(log.get('timestamp'), datetime.datetime): - 
log['timestamp'] = str(log['timestamp']) + if log.get("timestamp") and isinstance(log.get("timestamp"), datetime.datetime): + log["timestamp"] = str(log["timestamp"]) return log - def log(self,bucket_id,log_dict): + def log(self, bucket_id, log_dict): """ Saves log_dict to PermStore as an individual document for later recall. @@ -29,17 +33,19 @@ def log(self,bucket_id,log_dict): """ self.set_doc(bucket_id, None, log_dict) - def get_logs_with_filter(self,bucket_id,pattern_dict): + def get_logs_with_filter(self, bucket_id, pattern_dict): """ Retrieves all logs in bucket_id that match (i.e. contain) pattern_dict Inputs: (string) bucket_id, (dict of string values) pattern_dict """ - return [self._normalize_logentry(d) - for d in self.get_docs_with_filter(bucket_id, pattern_dict)] + return [ + self._normalize_logentry(d) + for d in self.get_docs_with_filter(bucket_id, pattern_dict) + ] - def delete_logs_with_filter(self,bucket_id,pattern_dict): + def delete_logs_with_filter(self, bucket_id, pattern_dict): """ Deletes all logs in bucket_id that match (i.e. contain) pattern_dict diff --git a/next/logging_client/test_loggerapi.py b/next/logging_client/test_loggerapi.py index b1781c82..6b1e0328 100644 --- a/next/logging_client/test_loggerapi.py +++ b/next/logging_client/test_loggerapi.py @@ -5,44 +5,47 @@ import next.utils as utils # IMPORTANT NOTE: only uses the `test_logger_data` database; it gets cleared after each test session -MONGO_HOST, MONGO_PORT = 'localhost', 27017 -MONGO_DB = 'test_logger_data' +MONGO_HOST, MONGO_PORT = "localhost", 27017 +MONGO_DB = "test_logger_data" # === fixtures === -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def lapi(): lapi = LoggerAPI(MONGO_HOST, MONGO_PORT, MONGO_DB) yield lapi lapi.client.drop_database(MONGO_DB) lapi.client.close() + # === tests === def test_log(lapi): - B = 'test_log' + B = "test_log" now = utils.datetimeNow() - log_entry = {'a': 2, 'timestamp': now} + log_entry = {"a": 2, "timestamp": now} lapi.log(B, log_entry) - assert lapi._bucket(B).find_one({'timestamp': now}).get('a') == 2 + assert lapi._bucket(B).find_one({"timestamp": now}).get("a") == 2 + def test_get_logs_with_filter(lapi): - B = 'test_get_logs_with_filter' + B = "test_get_logs_with_filter" now = utils.datetimeNow() - log_entry = {'a': 2, 'timestamp': now} + log_entry = {"a": 2, "timestamp": now} lapi.log(B, log_entry) - retrieved_entry = lapi.get_logs_with_filter(B, {'timestamp': now}) + retrieved_entry = lapi.get_logs_with_filter(B, {"timestamp": now}) assert len(retrieved_entry) == 1 - assert retrieved_entry[0].get('a') == 2 + assert retrieved_entry[0].get("a") == 2 + def test_delete_logs_with_filter(lapi): - B = 'test_delete_logs_with_filter' + B = "test_delete_logs_with_filter" - lapi._bucket(B).insert_many([{'a': 2}, {'a': 2, 'b': 3}, {'a': 6}]) + lapi._bucket(B).insert_many([{"a": 2}, {"a": 2, "b": 3}, {"a": 6}]) - lapi.delete_logs_with_filter(B, {'a': 2}) + lapi.delete_logs_with_filter(B, {"a": 2}) - logs = [{k:v for k, v in d.items() if k != '_id'} for d in lapi._bucket(B).find()] - assert logs == [{'a': 6}] \ No newline at end of file + logs = [{k: v for k, v in d.items() if k != "_id"} for d in lapi._bucket(B).find()] + assert logs == [{"a": 6}] diff --git a/next/query_page/query_page.py b/next/query_page/query_page.py index 5ce69509..a16e983c 100644 --- a/next/query_page/query_page.py +++ b/next/query_page/query_page.py @@ -7,28 +7,34 @@ import next.utils as utils resource_manager = ResourceManager() -query_page = Blueprint('query_page', - __name__, 
-                       template_folder='templates',
-                       static_folder='static')
+query_page = Blueprint(
+    "query_page", __name__, template_folder="templates", static_folder="static"
+)

-@query_page.route('/query_page/<page>')
-@query_page.route('/query_page/<page>/<exp_uid>')
+
+@query_page.route("/query_page/<page>")
+@query_page.route("/query_page/<page>/<exp_uid>")
 def load_page(page, exp_uid=None):
     experiment = resource_manager.get_experiment(exp_uid)
-    app_template = page+'.html'
+    app_template = page + ".html"
     if constants.NEXT_BACKEND_GLOBAL_HOST:
-        host_url = 'http://{}:{}'.format(constants.NEXT_BACKEND_GLOBAL_HOST,
-                                         constants.NEXT_BACKEND_GLOBAL_PORT)
+        host_url = "http://{}:{}".format(
+            constants.NEXT_BACKEND_GLOBAL_HOST, constants.NEXT_BACKEND_GLOBAL_PORT
+        )
     else:
-        host_url = ''
+        host_url = ""

-    part_id = request.args.get('participant', None)
+    part_id = request.args.get("participant", None)
     participant_uid = str(part_id) if part_id else part_id
-
-    return render_template(app_template, host_url=host_url, exp_uid=exp_uid,
-                           experiment=experiment,
-                           participant_uid=participant_uid), \
-           200, {'Cache-Control':'private, max-age=0, no-cache, no-store'}
-
+    return (
+        render_template(
+            app_template,
+            host_url=host_url,
+            exp_uid=exp_uid,
+            experiment=experiment,
+            participant_uid=participant_uid,
+        ),
+        200,
+        {"Cache-Control": "private, max-age=0, no-cache, no-store"},
+    )
diff --git a/next/utils.py b/next/utils.py
index 10a7fccd..1973726c 100644
--- a/next/utils.py
+++ b/next/utils.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import yaml
 import random
 import sys
@@ -5,14 +6,16 @@
 from decorator import decorator
 from line_profiler import LineProfiler

-color_ansi = {'yellow': '\x1b[33m',
-              'red': '\x1b[31m',
-              'blue': '\x1b[34m',
-              'green': '\x1b[32m',
-              'white': '\x1b[37m',
-              'black': '\x1b[30m',
-              'purple': '\x1b[35m',
-              'reset all': '\x1b[0m'}
+color_ansi = {
+    "yellow": "\x1b[33m",
+    "red": "\x1b[31m",
+    "blue": "\x1b[34m",
+    "green": "\x1b[32m",
+    "white": "\x1b[37m",
+    "black": "\x1b[30m",
+    "purple": "\x1b[35m",
+    "reset all": "\x1b[0m",
+}


 @decorator
@@ -27,85 +30,95 @@ def profile_each_line(func, *args, **kwargs):
     return retval


-def get_supported_apps(apps_path='apps/'):
-    """
+def get_supported_apps(apps_path="apps/"):
+    """
    Returns a list of strings corresponding to the app_ids that are fully
    operational in the learning library.

    Usage: ::\n
        app_id_list = utils.get_supported_apps()
-        print app_id_list
+        print(app_id_list)
        >>> ['StochasticBanditsPureExploration', 'DuelingBanditsPureExploration', 'StochasticLinearBanditsExploreExploit', 'PoolBasedTripletMDS']
    """
-    import os
-    return [d for d in next(os.walk(os.path.dirname(apps_path)))[1] if d[0] not in {'.', '_'}]
+    import os
+
+    return [
+        d
+        for d in next(os.walk(os.path.dirname(apps_path)))[1]
+        if d[0] not in {".", "_"}
+    ]


 def get_app(app_id, exp_uid, db, ell):
-    """
+    """
    Returns an object corresponding to the app_id that contains methods like
    initExp, getQuery, etc.

    Usage: ::\n
        app = utils.get_app(app_id)
-        print app
+        print(app)
        >>> <next.apps.App.App object>
    """
-    app_id = str(app_id) # soemtimes input is unicode formatted which causes error
-    next_path = 'next.apps.App'
-    app_module = __import__(next_path,fromlist=[''])
-    app_class = getattr(app_module, 'App')
-    return app_class(app_id, exp_uid, db, ell)
+    app_id = str(app_id)  # sometimes the input is unicode formatted, which causes errors
+    next_path = "next.apps.App"
+    app_module = __import__(next_path, fromlist=[""])
+    app_class = getattr(app_module, "App")
+    return app_class(app_id, exp_uid, db, ell)
+

-def get_app_alg(app_id,alg_id):
-    """
+def get_app_alg(app_id, alg_id):
+    """
    Returns an object corresponding to the alg_id that contains methods like
    initExp, getQuery, etc. Note that each algorithm (with an alg_id) is a
    child of an app (with an app_id), hence the app_id input

    Usage: ::\n
        alg = utils.get_app_alg(app_id,alg_id)
-        print alg
+        print(alg)
        >>> <MyAlg object>
    """
-    app_id = str(app_id) # soemtimes input is unicode formatted which causes error
-    alg_id = str(alg_id) # soemtimes input is unicode formatted which causes error
-    next_path = 'apps.{}.algs.{}'.format(app_id, alg_id, alg_id)
-    alg_module = __import__(next_path, fromlist=[''])
-    alg_class = getattr(alg_module, 'MyAlg')
-    return alg_class()
+    app_id = str(app_id)  # sometimes the input is unicode formatted, which causes errors
+    alg_id = str(alg_id)  # sometimes the input is unicode formatted, which causes errors
+    next_path = "apps.{}.algs.{}".format(app_id, alg_id)
+    alg_module = __import__(next_path, fromlist=[""])
+    alg_class = getattr(alg_module, "MyAlg")
+    return alg_class()


-def getDocUID(exp_uid,alg_uid=None):
-    """
+def getDocUID(exp_uid, alg_uid=None):
+    """
    Each instance of an app (with an (app_id,exp_uid) pair) and an algorithm
    (with an (app_id,exp_uid,alg_id,alg_uid) tuple) gets its own namespace.
    This method defines that namespace given the exp_uid, or (exp_uid,alg_uid)

    Usage::\n
-        print utils.getDocUID(exp_uid)
+        print(utils.getDocUID(exp_uid))
        >>> 'eee9d58c61d580029113ba593446d23a'

-        print utils.getDocUID(exp_uid,alg_uid)
+        print(utils.getDocUID(exp_uid,alg_uid))
        >>> 'eee9d58c61d580029113ba593446d23a-f081d374abac6c009f5a74877f8b9f3c'
    """
-    if alg_uid==None:
-        return exp_uid
-    else:
-        return exp_uid + "-" + alg_uid
+    if alg_uid is None:
+        return exp_uid
+    else:
+        return exp_uid + "-" + alg_uid

 import os
+
+
 def getNewUID():
-    """
+    """
    Returns a length-32 string of random hex that is generated from machine
    state - good enough for cryptography. Probability of collision is 1 in
    340282366920938463463374607431768211456. Used for unique identifiers all
    over the system.
    """
-    uid = os.urandom(16).encode('hex')
-    return uid
-
+    uid = os.urandom(16).encode("hex")
+    return uid

 from datetime import datetime

-def datetimeNow(format='datetime'):
-    """
+
+
+def datetimeNow(format="datetime"):
+    """
    Returns the current datetime in the format used throughout the system.
    For consistency, one should ALWAYS call this method, do not make your own
    call to datetime.
@@ -113,27 +126,29 @@
        utils.datetimeNow()
        >>> datetime.datetime(2015, 2, 17, 11, 5, 56, 27822)
    """
-    date = datetime.now()
-    if format=='string':
-        return datetime2str(date)
-    else:
-        return date
+    date = datetime.now()
+    if format == "string":
+        return datetime2str(date)
+    else:
+        return date
+

 def datetime2str(obj_datetime):
-    """
+    """
    Converts a datetime object into the string format used throughout the
    system. For consistency, one should never use their own method of
    converting to string, always use this method.

    Usage: ::\n
        date = utils.datetimeNow()
        date_str = utils.datetime2str(date)
-        print date_str
+        print(date_str)
        >>> '2015-02-17 11:11:07.489925'
    """
-    return str(obj_datetime)
+    return str(obj_datetime)
+

 def str2datetime(str_time):
-    """
+    """
    Converts a datetime string back into the datetime object used throughout
    the system. For consistency, one should never use their own method of
    parsing, always use this method.
@@ -142,14 +157,14 @@ def str2datetime(str_time):
        date_str = utils.datetime2str(date)
        utils.str2datetime(date_str)
    """
-    try:
-        return datetime.strptime(str_time,'%Y-%m-%d %H:%M:%S.%f')
-    except:
-        return datetime.strptime(str_time,'%Y-%m-%d %H:%M:%S')
+    try:
+        return datetime.strptime(str_time, "%Y-%m-%d %H:%M:%S.%f")
+    except:
+        return datetime.strptime(str_time, "%Y-%m-%d %H:%M:%S")


 def _get_filename(target):
-    return target['alt_description']
+    return target["alt_description"]


 def filenames_to_ids(filenames, targets):
@@ -162,45 +177,50 @@ def filenames_to_ids(filenames, targets):
     if isinstance(filenames[0], dict):
         return {k: _to_ids(v, targets) for k, v in filenames.items()}

-    ids = {_get_filename(target): target['target_id'] for target in targets}
+    ids = {_get_filename(target): target["target_id"] for target in targets}

     not_in_targets = set(filenames) - set(ids)
     if len(not_in_targets) > 0:
-        msg = 'Filenames specified in init.yaml "{}" in the not found the list of targets'
+        msg = (
+            'Filenames "{}" specified in init.yaml were not found in the list of targets'
+        )
         raise ValueError(msg.format(not_in_targets))

     return [ids[filename] for filename in filenames]


 def debug_print(*args, **kwargs):
-    color = kwargs.get('color', 'yellow')
+    color = kwargs.get("color", "yellow")
     for a in args:
         if type(a) in {str}:
-            lines = a.split('\n')
+            lines = a.split("\n")
             for line in lines:
-                pprint_arg = pprint.pformat(line).split('\n')
+                pprint_arg = pprint.pformat(line).split("\n")
                 for line2 in pprint_arg:
-                    print '{}{}{}'.format(color_ansi[color],
-                                          line2,
-                                          color_ansi['reset all'])
+                    print(
+                        "{}{}{}".format(
+                            color_ansi[color], line2, color_ansi["reset all"]
+                        )
+                    )
         else:
-            pprint_a = pprint.pformat(a).split('\n')
+            pprint_a = pprint.pformat(a).split("\n")
             for line in pprint_a:
-                print '{}{}{}'.format(color_ansi[color],
-                                      line,
-                                      color_ansi['reset all'])
-    print ''
+                print("{}{}{}".format(color_ansi[color], line, color_ansi["reset all"]))
+    print("")
+

 def random_string(length=20):
-    letters = list('qwertyuiopasdfghkjlzxcvbnm')
+    letters = list("qwertyuiopasdfghkjlzxcvbnm")
     s = [random.choice(letters) for _ in range(length)]
-    s = ''.join(s)
+    s = "".join(s)
     return s

 import time
+
+
 def timeit(f):
-    """
+    """
    Utility used to time the duration of code execution. This decorator can
    be composed with any other function.

    Usage::\n
@@ -213,13 +233,15 @@ def g(n):
        answer0,dt = timeit(f)(3)
        answer1,answer2,dt = timeit(g)(3)
    """
-    def timed(*args, **kw):
-        ts = time.time()
-        result = f(*args, **kw)
-        te = time.time()
-        # TODO: delete these three lines. Use
-        # `grep -Hnri ,.*,.* = .*utils.timeit` to find all locations this function
-        # is are used (typically in `a, b, c, dt = utils.timeit(...)(...)`. We want
-        # `a, dt = utils.timeit(...)(...)`.
-        return result, (te-ts)
-    return timed
+
+    def timed(*args, **kw):
+        ts = time.time()
+        result = f(*args, **kw)
+        te = time.time()
+        # TODO: delete these three lines. Use
+        # `grep -Hnri ,.*,.* = .*utils.timeit` to find all locations this function
+        # is used (typically `a, b, c, dt = utils.timeit(...)(...)`); we want
+        # `a, dt = utils.timeit(...)(...)`.
+        return result, (te - ts)
+
+    return timed
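As a usage reference for the reformatted decorator above, here is a minimal, self-contained sketch. The workload function `square` is hypothetical; the `(result, dt)` return shape follows directly from `timed`:

    import time


    def timeit(f):
        # same shape as next.utils.timeit: wrap f and return (result, elapsed seconds)
        def timed(*args, **kw):
            ts = time.time()
            result = f(*args, **kw)
            te = time.time()
            return result, (te - ts)

        return timed


    def square(n):
        # hypothetical workload to be timed
        time.sleep(0.1)
        return n * n


    answer, dt = timeit(square)(3)
    print(answer, dt)  # 9, and roughly 0.1 (seconds)

Note that if the wrapped function returns a tuple, the whole tuple comes back as a single `result` value, which is why the TODO above wants call sites of the form `a, dt = utils.timeit(...)(...)` rather than `a, b, c, dt = ...`.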