diff --git a/.gitignore b/.gitignore index d646835b..3253a2d2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.pyc __pycache__/ +Data diff --git a/Pilot1/Attn/README.md b/Pilot1/Attn/README.md new file mode 100644 index 00000000..95cc7970 --- /dev/null +++ b/Pilot1/Attn/README.md @@ -0,0 +1,133 @@ +The Pilot1 Attn Benchmark requires an HDF5 file specified by the hyperparameter "in"; the default file name is top_21_1fold_001.h5 + +The benchmark automatically downloads the file below: +http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_1fold_001.h5 (~4GB) + +Any file of the form `top_21_1fold_"ijk".h5` can be used as input + +## Sample run: + +``` +python attn_baseline_keras2.py +Params: {'model_name': 'attn', 'dense': [2000, 600], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'categorical_crossentropy', 'optimizer': 'sgd', 'drop': 0.2, 'learning_rate': 1e-05, 'momentum': 0.7, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'in': 'top_21_1fold_001.h5', 'save_path': 'candle_save', 'save_dir': './save/001/', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/nfs2/jain/Benchmarks/Pilot1/Attn/Output/EXP000/RUN000'} +... +... +processing h5 in file top_21_1fold_001.h5 + +x_train shape: (271915, 6212) +x_test shape: (33989, 6212) +Examples: +Total: 339893 +Positive: 12269 (3.61% of total) + +X_train shape: (271915, 6212) +X_test shape: (33989, 6212) +Y_train shape: (271915, 2) +Y_test shape: (33989, 2) +Instructions for updating: +If using Keras pass *_constraint arguments to layers. 
+Model: "model_1" +__________________________________________________________________________________________________ +Layer (type) Output Shape Param # Connected to +================================================================================================== +input_1 (InputLayer) (None, 6212) 0 +__________________________________________________________________________________________________ +dense_1 (Dense) (None, 1000) 6213000 input_1[0][0] +__________________________________________________________________________________________________ +batch_normalization_1 (BatchNor (None, 1000) 4000 dense_1[0][0] +__________________________________________________________________________________________________ +dense_2 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] +__________________________________________________________________________________________________ +batch_normalization_2 (BatchNor (None, 1000) 4000 dense_2[0][0] +__________________________________________________________________________________________________ +dense_3 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] +__________________________________________________________________________________________________ +multiply_1 (Multiply) (None, 1000) 0 batch_normalization_2[0][0] + dense_3[0][0] +__________________________________________________________________________________________________ +dense_4 (Dense) (None, 500) 500500 multiply_1[0][0] +__________________________________________________________________________________________________ +batch_normalization_3 (BatchNor (None, 500) 2000 dense_4[0][0] +__________________________________________________________________________________________________ +dropout_1 (Dropout) (None, 500) 0 batch_normalization_3[0][0] +__________________________________________________________________________________________________ +dense_5 (Dense) (None, 250) 125250 dropout_1[0][0] 
+__________________________________________________________________________________________________ +batch_normalization_4 (BatchNor (None, 250) 1000 dense_5[0][0] +__________________________________________________________________________________________________ +dropout_2 (Dropout) (None, 250) 0 batch_normalization_4[0][0] +__________________________________________________________________________________________________ +dense_6 (Dense) (None, 125) 31375 dropout_2[0][0] +__________________________________________________________________________________________________ +batch_normalization_5 (BatchNor (None, 125) 500 dense_6[0][0] +__________________________________________________________________________________________________ +dropout_3 (Dropout) (None, 125) 0 batch_normalization_5[0][0] +__________________________________________________________________________________________________ +dense_7 (Dense) (None, 60) 7560 dropout_3[0][0] +__________________________________________________________________________________________________ +batch_normalization_6 (BatchNor (None, 60) 240 dense_7[0][0] +__________________________________________________________________________________________________ +dropout_4 (Dropout) (None, 60) 0 batch_normalization_6[0][0] +__________________________________________________________________________________________________ +dense_8 (Dense) (None, 30) 1830 dropout_4[0][0] +__________________________________________________________________________________________________ +batch_normalization_7 (BatchNor (None, 30) 120 dense_8[0][0] +__________________________________________________________________________________________________ +dropout_5 (Dropout) (None, 30) 0 batch_normalization_7[0][0] +__________________________________________________________________________________________________ +dense_9 (Dense) (None, 2) 62 dropout_5[0][0] +================================================================================================== + 
+Total params: 8,893,437 +Trainable params: 8,887,507 +Non-trainable params: 5,930 +.. +.. +271915/271915 [==============================] - 631s 2ms/step - loss: 0.8681 - acc: 0.5548 - tf_auc: 0.5371 - val_loss: 0.6010 - val_acc: 0.8365 - val_tf_auc: 0.5743 +Current time ....631.567 + +Epoch 00001: val_loss improved from inf to 0.60103, saving model to ./save/001/Agg_attn_bin.autosave.model.h5 +creating table of predictions +creating figure 1 at ./save/001/Agg_attn_bin.auroc.pdf +creating figure 2 at ./save/001/Agg_attn_bin.auroc2.pdf +f1=0.234 auroc=0.841 aucpr=0.990 +creating figure 3 at ./save/001/Agg_attn_bin.aurpr.pdf +creating figure 4 at ./save/001/Agg_attn_bin.confusion_without_norm.pdf +Confusion matrix, without normalization +[[27591 5190][ 360 848]] +Confusion matrix, without normalization +[[27591 5190][ 360 848]] +Normalized confusion matrix +[[0.84 0.16][0.3 0.7 ]] +Examples: +Total: 339893 +Positive: 12269 (3.61% of total) + +0.7718316679565835 +0.7718316679565836 +precision recall f1-score support + + 0 0.99 0.84 0.91 32781 + 1 0.14 0.70 0.23 1208 + +micro avg 0.84 0.84 0.84 33989 +macro avg 0.56 0.77 0.57 33989 +weighted avg 0.96 0.84 0.88 33989 + +[[27591 5190][ 360 848]] +score +[0.5760348070144456, 0.8367118835449219, 0.5936741828918457] +Test val_loss: 0.5760348070144456 +Test accuracy: 0.8367118835449219 +Saved model to disk +Loaded json model from disk +json Validation loss: 0.560062773128295 +json Validation accuracy: 0.8367118835449219 +json accuracy: 83.67% +Loaded yaml model from disk +yaml Validation loss: 0.560062773128295 +yaml Validation accuracy: 0.8367118835449219 +yaml accuracy: 83.67% +Yaml_train_shape: (271915, 2) +Yaml_test_shape: (33989, 2) +``` diff --git a/Pilot1/Attn/attn.py b/Pilot1/Attn/attn.py new file mode 100644 index 00000000..aaeb9526 --- /dev/null +++ b/Pilot1/Attn/attn.py @@ -0,0 +1,214 @@ +from __future__ import print_function + +import os +import sys +import logging + +import pandas as pd +import numpy as np + 
+from sklearn.metrics import mean_squared_error +from sklearn.metrics import r2_score +from scipy.stats.stats import pearsonr + +file_path = os.path.dirname(os.path.realpath(__file__)) +#lib_path = os.path.abspath(os.path.join(file_path, '..')) +#sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path2) + +import candle + +logger = logging.getLogger(__name__) +candle.set_parallelism_threads() + +additional_definitions = [ +{'name':'latent_dim', + 'action':'store', + 'type': int, + 'help':'latent dimensions'}, +{'name':'residual', + 'type': candle.str2bool, + 'default': False, + 'help':'add skip connections to the layers'}, +{'name':'reduce_lr', + 'type': candle.str2bool, + 'default': False, + 'help':'reduce learning rate on plateau'}, +{'name':'warmup_lr', + 'type': candle.str2bool, + 'default': False, + 'help':'gradually increase learning rate on start'}, +{'name':'base_lr', + 'type': float, + 'help':'base learning rate'}, +{'name':'epsilon_std', + 'type': float, + 'help':'epsilon std for sampling latent noise'}, +{'name':'use_cp', + 'type': candle.str2bool, + 'default': False, + 'help':'checkpoint models with best val_loss'}, +#{'name':'shuffle', + #'type': candle.str2bool, + #'default': False, + #'help':'shuffle data'}, +{'name':'use_tb', + 'type': candle.str2bool, + 'default': False, + 'help':'use tensorboard'}, +{'name':'tsne', + 'type': candle.str2bool, + 'default': False, + 'help':'generate tsne plot of the latent representation'} +] + +required = [ + 'activation', + 'batch_size', + 'dense', + 'dropout', + 'epochs', + 'initialization', + 'learning_rate', + 'loss', + 'optimizer', + 'rng_seed', + 'scaling', + 'val_split', + 'latent_dim', + 'batch_normalization', + 'epsilon_std', + 'timeout' + ] + +class BenchmarkAttn(candle.Benchmark): + + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. 
+ - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. + """ + + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + + +def extension_from_parameters(params, framework=''): + """Construct string for saving model with annotation of parameters""" + ext = framework + for i, n in enumerate(params['dense']): + if n: + ext += '.D{}={}'.format(i+1, n) + ext += '.A={}'.format(params['activation'][0]) + ext += '.B={}'.format(params['batch_size']) + ext += '.E={}'.format(params['epochs']) + ext += '.L={}'.format(params['latent_dim']) + ext += '.LR={}'.format(params['learning_rate']) + ext += '.S={}'.format(params['scaling']) + + if params['epsilon_std'] != 1.0: + ext += '.EPS={}'.format(params['epsilon_std']) + if params['dropout']: + ext += '.DR={}'.format(params['dropout']) + if params['batch_normalization']: + ext += '.BN' + if params['warmup_lr']: + ext += '.WU_LR' + if params['reduce_lr']: + ext += '.Re_LR' + if params['residual']: + ext += '.Res' + + return ext +def load_data(params, seed): + + # start change # + if params['train_data'].endswith('h5') or params['train_data'].endswith('hdf5'): + print ('processing h5 in file {}'.format(params['train_data'])) + + url = params['data_url'] + file_train = params['train_data'] + train_file = candle.get_file(file_train, url+file_train, cache_subdir='Pilot1') + + df_x_train_0 = pd.read_hdf(train_file, 'x_train_0').astype(np.float32) + df_x_train_1 = pd.read_hdf(train_file, 'x_train_1').astype(np.float32) + X_train = pd.concat([df_x_train_0, df_x_train_1], axis=1, sort=False) + del df_x_train_0, df_x_train_1 + + df_x_test_0 = pd.read_hdf(train_file, 'x_test_0').astype(np.float32) + df_x_test_1 = pd.read_hdf(train_file, 'x_test_1').astype(np.float32) + X_test = pd.concat([df_x_test_0, df_x_test_1], axis=1, sort=False) + del df_x_test_0, df_x_test_1 + + df_x_val_0 = 
pd.read_hdf(train_file, 'x_val_0').astype(np.float32) + df_x_val_1 = pd.read_hdf(train_file, 'x_val_1').astype(np.float32) + X_val = pd.concat([df_x_val_0, df_x_val_1], axis=1, sort=False) + del df_x_val_0, df_x_val_1 + + Y_train = pd.read_hdf(train_file, 'y_train') + Y_test = pd.read_hdf(train_file, 'y_test') + Y_val = pd.read_hdf(train_file, 'y_val') + + # assumes AUC is in the third column at index 2 + # df_y = df['AUC'].astype('int') + # df_x = df.iloc[:,3:].astype(np.float32) + + # assumes dataframe has already been scaled + # scaler = StandardScaler() + # df_x = scaler.fit_transform(df_x) + else: + print ('expecting in file file suffix h5') + sys.exit() + + + print('x_train shape:', X_train.shape) + print('x_test shape:', X_test.shape) + + return X_train, Y_train, X_val, Y_val, X_test, Y_test + + # start change # + if train_file.endswith('h5') or train_file.endswith('hdf5'): + print ('processing h5 in file {}'.format(train_file)) + + df_x_train_0 = pd.read_hdf(train_file, 'x_train_0').astype(np.float32) + df_x_train_1 = pd.read_hdf(train_file, 'x_train_1').astype(np.float32) + X_train = pd.concat([df_x_train_0, df_x_train_1], axis=1, sort=False) + del df_x_train_0, df_x_train_1 + + df_x_test_0 = pd.read_hdf(train_file, 'x_test_0').astype(np.float32) + df_x_test_1 = pd.read_hdf(train_file, 'x_test_1').astype(np.float32) + X_test = pd.concat([df_x_test_0, df_x_test_1], axis=1, sort=False) + del df_x_test_0, df_x_test_1 + + df_x_val_0 = pd.read_hdf(train_file, 'x_val_0').astype(np.float32) + df_x_val_1 = pd.read_hdf(train_file, 'x_val_1').astype(np.float32) + X_val = pd.concat([df_x_val_0, df_x_val_1], axis=1, sort=False) + del df_x_val_0, df_x_val_1 + + Y_train = pd.read_hdf(train_file, 'y_train') + Y_test = pd.read_hdf(train_file, 'y_test') + Y_val = pd.read_hdf(train_file, 'y_val') + + # assumes AUC is in the third column at index 2 + # df_y = df['AUC'].astype('int') + # df_x = df.iloc[:,3:].astype(np.float32) + + # assumes dataframe has already been scaled + 
# scaler = StandardScaler() + # df_x = scaler.fit_transform(df_x) + + else: + print ('expecting in file file suffix h5') + sys.exit() + + + print('x_train shape:', X_train.shape) + print('x_test shape:', X_test.shape) + + return X_train, Y_train, X_val, Y_val, X_test, Y_test + + diff --git a/Pilot1/Attn/attn_abs_default_model.txt b/Pilot1/Attn/attn_abs_default_model.txt new file mode 100644 index 00000000..442c5c7d --- /dev/null +++ b/Pilot1/Attn/attn_abs_default_model.txt @@ -0,0 +1,27 @@ +[Global_Params] +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +train_data='top_21_1fold_001.h5' +model_name='attn_abs' +dense=[1000, 1000, 1000, 500, 250, 125, 60, 30, 2] +batch_size=32 +epochs=2 +activation=['relu', 'relu', 'softmax', 'relu', 'relu', 'relu', 'relu', 'relu', 'softmax'] +loss='categorical_crossentropy' +optimizer='sgd' +dropout=0.2 +learning_rate=0.00001 +momentum=0.9 +val_split=0.1 +rng_seed=2017 +use_cp=False +early_stop=True +reduce_lr=True +feature_subsample=0 +output_dir='save_abs/EXP01/' +experiment_id='01' +run_id='1' +save_path='save_abs/EXP01/' +target_abs_acc=0.85 + +[Monitor_Params] +timeout=3600 diff --git a/Pilot1/Attn/attn_abstention_keras2.py b/Pilot1/Attn/attn_abstention_keras2.py new file mode 100644 index 00000000..3d492141 --- /dev/null +++ b/Pilot1/Attn/attn_abstention_keras2.py @@ -0,0 +1,499 @@ +from __future__ import print_function + +#import itertools +import pandas as pd +import numpy as np +import os +import sys +import gzip +import argparse +import sklearn + +import tensorflow as tf + +import keras as ke +from keras import backend as K + +from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization +from keras.optimizers import SGD, Adam, RMSprop, Adadelta +from keras.models import Sequential, Model, model_from_json, model_from_yaml +from keras.utils import np_utils, multi_gpu_model + +from keras.callbacks import Callback, ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping, 
TensorBoard + +from sklearn.utils.class_weight import compute_class_weight +from sklearn.model_selection import train_test_split +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, roc_auc_score, confusion_matrix, balanced_accuracy_score, classification_report +from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler +from sklearn.metrics import recall_score, auc, roc_curve, f1_score, precision_recall_curve + +import attn +import candle + +import attn_viz_utils as attnviz + +from attn_baseline_keras2 import build_attention_model + +np.set_printoptions(precision=4) + +additional_definitions = [ +{'name':'target_abs_acc', + 'type': float, + 'default': 0.7, + 'help':'target abstention accuracy'}, +{'name':'abs_scale_factor', + 'type': float, + 'default': 0.9, + 'help':'factor to increase or decrease weight for abstention term in cost function'} +] + +required = [ + 'activation', + 'batch_size', + 'dense', + 'dropout', + 'epochs', + 'learning_rate', + 'loss', + 'optimizer', + 'rng_seed', + 'val_split', + 'timeout', + 'target_abs_acc'] + + +class BenchmarkAttnAbs(candle.Benchmark): + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. 
+ """ + + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + attn.additional_definitions + + + +def tf_auc(y_true, y_pred): + auc = tf.metrics.auc(y_true, y_pred)[1] + K.get_session().run(tf.local_variables_initializer()) + return auc + + +def auroc( y_true, y_pred ) : + score = tf.py_func( lambda y_true, y_pred : roc_auc_score( y_true, y_pred, average='macro', sample_weight=None).astype('float32'), + [y_true, y_pred], + 'float32', + stateful=False, + name='sklearnAUC' ) + return score + + +class MetricHistory(Callback): + def on_epoch_begin(self, epoch, logs=None): + print("\n") + + def on_epoch_end(self, epoch, logs=None): + y_pred = self.model.predict(self.validation_data[0]) + r2 = r2_score(self.validation_data[1], y_pred) + corr, _ = pearsonr(self.validation_data[1].flatten(), y_pred.flatten()) + print("\nval_r2:", r2) + print(y_pred.shape) + print("\nval_corr:", corr, "val_r2:", r2) + print("\n") + + +def build_type_classifier(x_train, y_train, x_test, y_test): + y_train = np.argmax(y_train, axis=1) + y_test = np.argmax(y_test, axis=1) + from xgboost import XGBClassifier + clf = XGBClassifier(max_depth=6, n_estimators=100) + clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)], verbose=False) + y_pred = clf.predict(x_test) + acc = accuracy_score(y_test, y_pred) + print(acc) + return clf + + +def initialize_parameters(default_model = 'attn_abs_default_model.txt'): + + # Build benchmark object + attnAbsBmk = BenchmarkAttnAbs(attn.file_path, default_model, 'keras', + prog='attention_abstention', desc='Attention model with abstention - Pilot 1 Benchmark') + + # Initialize parameters + gParameters = candle.finalize_parameters(attnAbsBmk) + #attn.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def save_cache(cache_file, x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels): + with h5py.File(cache_file, 
'w') as hf: + hf.create_dataset("x_train", data=x_train) + hf.create_dataset("y_train", data=y_train) + hf.create_dataset("x_val", data=x_val) + hf.create_dataset("y_val", data=y_val) + hf.create_dataset("x_test", data=x_test) + hf.create_dataset("y_test", data=y_test) + hf.create_dataset("x_labels", (len(x_labels), 1), 'S100', data=[x.encode("ascii", "ignore") for x in x_labels]) + hf.create_dataset("y_labels", (len(y_labels), 1), 'S100', data=[x.encode("ascii", "ignore") for x in y_labels]) + + +def load_cache(cache_file): + with h5py.File(cache_file, 'r') as hf: + x_train = hf['x_train'][:] + y_train = hf['y_train'][:] + x_val = hf['x_val'][:] + y_val = hf['y_val'][:] + x_test = hf['x_test'][:] + y_test = hf['y_test'][:] + x_labels = [x[0].decode('unicode_escape') for x in hf['x_labels'][:]] + y_labels = [x[0].decode('unicode_escape') for x in hf['y_labels'][:]] + return x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels + +def extension_from_parameters(params, framework=''): + """Construct string for saving model with annotation of parameters""" + ext = framework + '.abs' + for i, n in enumerate(params['dense']): + if n: + ext += '.D{}={}'.format(i+1, n) + ext += '.A={}'.format(params['activation'][0]) + ext += '.B={}'.format(params['batch_size']) + ext += '.E={}'.format(params['epochs']) + ext += '.LR={}'.format(params['learning_rate']) + + if params['dropout']: + ext += '.DR={}'.format(params['dropout']) + if params['warmup_lr']: + ext += '.WU_LR' + if params['reduce_lr']: + ext += '.Re_LR' + if params['residual']: + ext += '.Res' + + return ext + + +def run(params): + args = candle.ArgumentStruct(**params) + seed = args.rng_seed + candle.set_seed(seed) + + # Construct extension to save model + ext = extension_from_parameters(params, 'keras') + candle.verify_path(params['save_path']) + prefix = '{}{}'.format(params['save_path'], ext) + logfile = params['logfile'] if params['logfile'] else prefix+'.log' + root_fname = 'Agg_attn_abs_bin' + 
candle.set_up_logger(logfile, attn.logger, params['verbose']) + attn.logger.info('Params: {}'.format(params)) + + # Get default parameters for initialization and optimizer functions + keras_defaults = candle.keras_default_config() + + ## + X_train, _Y_train, X_val, _Y_val, X_test, _Y_test = attn.load_data(params, seed) + + # move this inside the load_data function + Y_train = _Y_train['AUC'] + Y_test = _Y_test['AUC'] + Y_val = _Y_val['AUC'] + + Y_train_neg, Y_train_pos = np.bincount(Y_train) + Y_test_neg, Y_test_pos = np.bincount(Y_test) + Y_val_neg, Y_val_pos = np.bincount(Y_val) + + Y_train_total = Y_train_neg + Y_train_pos + Y_test_total = Y_test_neg + Y_test_pos + Y_val_total = Y_val_neg + Y_val_pos + + total = Y_train_total + Y_test_total + Y_val_total + neg = Y_train_neg + Y_test_neg + Y_val_neg + pos = Y_train_pos + Y_test_pos + Y_val_pos + + print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( + total, pos, 100 * pos / total)) + + nb_classes = params['dense'][-1] + + # Convert classes to categorical with an extra slot for the abstaining class + Y_train, Y_test, Y_val = candle.modify_labels(nb_classes+1, Y_train, Y_test, Y_val) + + # Disable class weight (for initial testing of the abstention classifier) + #y_integers = np.argmax(Y_train, axis=1) + #class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers) + #d_class_weights = dict(enumerate(class_weights)) + + print('X_train shape:', X_train.shape) + print('X_test shape:', X_test.shape) + + print('Y_train shape:', Y_train.shape) + print('Y_test shape:', Y_test.shape) + + PS = X_train.shape[1] + model = build_attention_model(params, PS) + model = candle.add_model_output(model, mode='abstain', num_add=1, activation='sigmoid') + print('Model after modifying layer for abstention') + model.summary() + + # Configure abstention model + mask_ = np.zeros(nb_classes+1) + mask_[-1] = 1 + mu0 = 0.5 # In the long term this is not as important since mu auto tunes, however it 
may require a large number of epochs to converge if set far away from target + + candle.abstention_variable_initialization(mu0, mask_, nb_classes) + + #parallel_model = multi_gpu_model(model, gpus=4) + #parallel_model.compile(loss='mean_squared_error', + # optimizer=SGD(lr=0.0001, momentum=0.9), + # metrics=['mae',r2]) + kerasDefaults = candle.keras_default_config() + if params['momentum']: + kerasDefaults['momentum_sgd'] = params['momentum'] + + optimizer = candle.build_optimizer(params['optimizer'], params['learning_rate'], kerasDefaults) + + # compile model with abstention loss + model.compile(loss=candle.abstention_loss, optimizer=optimizer, metrics=['acc',tf_auc,candle.abs_acc,candle.acc_class1,candle.abs_acc_class1]) + + + # set up a bunch of callbacks to do work during model training.. + checkpointer = ModelCheckpoint(filepath=params['save_path'] + root_fname + '.autosave.model.h5', verbose=1, save_weights_only=False, save_best_only=True) + csv_logger = CSVLogger('{}/{}.training.log'.format(params['save_path'], root_fname)) + reduce_lr = ReduceLROnPlateau(monitor='val_tf_auc', factor=0.20, patience=40, verbose=1, mode='auto', min_delta=0.0001, cooldown=3, min_lr=0.000000001) + early_stop = EarlyStopping(monitor='val_tf_auc', patience=200, verbose=1, mode='auto') + candle_monitor = candle.CandleRemoteMonitor(params=params) + + candle_monitor = candle.CandleRemoteMonitor(params=params) + timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + tensorboard = TensorBoard(log_dir="tb/tb{}".format(ext)) + + history_logger = candle.LoggingCallback(attn.logger.debug) + + abstention_cbk = candle.AbstentionAdapt_Callback(monitor='val_abs_acc_class1', scale_factor=params['abs_scale_factor'], target_acc=params['target_abs_acc']) + + callbacks = [candle_monitor, timeout_monitor, csv_logger, history_logger, abstention_cbk] + + if params['reduce_lr']: + callbacks.append(reduce_lr) + + if params['use_cp']: + callbacks.append(checkpointer) + if params['use_tb']: + 
callbacks.append(tensorboard) + if params['early_stop']: + callbacks.append(early_stop) + + epochs = params['epochs'] + batch_size=params['batch_size'] + history = model.fit(X_train, Y_train, #class_weight=d_class_weights, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(X_val, Y_val), + callbacks = callbacks) + + # diagnostic plots + if 'loss' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'loss') + if 'acc' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'acc') + if 'abs_acc' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'abs_acc') + # Plot mu evolution + fname = params['save_path'] + root_fname + '.mu.png' + xlabel='Epochs' + ylabel='Abstention Weight mu' + title='mu Evolution' + attnviz.plot_array(abstention_cbk.muvalues, xlabel, ylabel, title, fname) + + # Evaluate model + score = model.evaluate(X_test, Y_test, verbose=0) + Y_predict = model.predict(X_test) + evaluate_abstention(params, root_fname, nb_classes, Y_test, _Y_test, Y_predict, pos, total, score) + + save_and_test_saved_model(params, model, root_fname, X_train, X_test, Y_test) + + attn.logger.handlers = [] + + return history + + +def evaluate_abstention(params, root_fname, nb_classes, Y_test, _Y_test, Y_predict, pos, total, score): + Y_pred_int = np.argmax(Y_predict, axis=1).astype(np.int) + Y_test_int = np.argmax(Y_test, axis=1).astype(np.int) + + # Get samples where it abstains from predicting + Y_pred_abs = (Y_pred_int == nb_classes).astype(np.int) + + abs0 = 0 + abs1 = 0 + print ('creating table of predictions (with abstention)') + f = open(params['save_path'] + root_fname + '.predictions.tsv', 'w') + + for index, row in _Y_test.iterrows(): + + if row['AUC'] == 1: + if Y_pred_abs[index] == 1: # abstaining in this sample + call='ABS1' + abs1 += 1 + else: # Prediction is made (no abstention) + if Y_pred_int[index] == 1: + call='TP' + else: + 
call='FN' + if row['AUC'] == 0: + if Y_pred_abs[index] == 1: # abstaining in this sample + call='ABS0' + abs0 += 1 + else: # Prediction is made (no abstention) + if Y_pred_int[index] == 0: + call = 'TN' + else: + call = 'FP' + + print(index, "\t", call, "\t", Y_pred_int[index], "\t", row['AUC'], "\t", Y_pred_abs[index], "\t", row['Sample'], "\t", row['Drug1'], file=f) + + f.close() + + # Filtering samples by predictions made (i.e. leave just the predicted samples where there is NO abstention) + index_pred_noabs = (Y_pred_int < nb_classes) + Y_test_noabs = Y_test[index_pred_noabs,:2] + Y_test_int_noabs = Y_test_int[index_pred_noabs] + Y_pred_noabs = Y_predict[index_pred_noabs,:2] / np.sum(Y_predict[index_pred_noabs,:2], axis=1, keepdims=True) + Y_pred_int_noabs = Y_pred_int[index_pred_noabs] + false_pos_rate, true_pos_rate, thresholds = roc_curve(Y_test_noabs[:,0], Y_pred_noabs[:,0]) + + roc_auc = auc(false_pos_rate, true_pos_rate) + + auc_keras = roc_auc + fpr_keras = false_pos_rate + tpr_keras = true_pos_rate + + # ROC plots + fname = params['save_path'] + root_fname + '.auroc.pdf' + print ('creating figure at ', fname) + add_lbl = ' (after removing abstained samples) ' + attnviz.plot_ROC(fpr_keras, tpr_keras, auc_keras, fname, xlabel_add=add_lbl, ylabel_add=add_lbl) + # Zoom in view of the upper left corner. 
+ fname = params['save_path'] + root_fname + '.auroc_zoom.pdf' + print ('creating figure at ', fname) + attnviz.plot_ROC(fpr_keras, tpr_keras, auc_keras, fname, xlabel_add=add_lbl, ylabel_add=add_lbl, zoom=True) + + f1 = f1_score(Y_test_int_noabs, Y_pred_int_noabs) + precision, recall, thresholds = precision_recall_curve(Y_test_noabs[:,0], Y_pred_noabs[:,0]) + pr_auc = auc(recall, precision) + pr_keras = pr_auc + precision_keras = precision + recall_keras = recall + print('f1=%.3f auroc=%.3f aucpr=%.3f' % (f1, auc_keras, pr_keras)) + # Plot RF + fname = params['save_path'] + root_fname + '.aurpr.pdf' + print ('creating figure at ', fname) + no_skill = len(Y_test_int_noabs[Y_test_int_noabs==1]) / len(Y_test_int_noabs) + attnviz.plot_RF(recall_keras, precision_keras, pr_keras, no_skill, fname, xlabel_add=add_lbl, ylabel_add=add_lbl) + + # Compute confusion matrix (complete) + cnf_matrix = sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int) + # Plot non-normalized confusion matrix + class_names=['Non-Response','Response', 'Abstain'] + fname = params['save_path'] + root_fname + '.confusion_without_norm.pdf' + attnviz.plot_confusion_matrix(cnf_matrix, fname, classes=class_names, title='Confusion matrix, without normalization') + print('NOTE: Confusion matrix above has zeros in the last row since the ground-truth does not include samples in the abstaining class.') + # Plot normalized confusion matrix + fname = params['save_path'] + root_fname + '.confusion_with_norm.pdf' + attnviz.plot_confusion_matrix(cnf_matrix, fname, classes=class_names, normalize=True, title='Normalized confusion matrix') + print('NOTE: Normalized confusion matrix above has NaNs in the last row since the ground-truth does not include samples in the abstaining class.') + + + print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format(total, pos, 100 * pos / total)) + total_pred = Y_pred_int_noabs.shape[0] + print('Abstention (in prediction): Label0: {} ({:.2f}% of total pred)\n 
Label1: {} ({:.2f}% of total pred)\n'.format(abs0, 100 * abs0 / total_pred, abs1, 100 * abs1 / total_pred)) + print(sklearn.metrics.roc_auc_score(Y_test_int_noabs, Y_pred_int_noabs)) + print(sklearn.metrics.balanced_accuracy_score(Y_test_int_noabs, Y_pred_int_noabs)) + print(sklearn.metrics.classification_report(Y_test_int_noabs, Y_pred_int_noabs)) + print(sklearn.metrics.confusion_matrix(Y_test_int_noabs, Y_pred_int_noabs)) + print('Score: ', score) + print('Test val_loss (not abstained samples):', score[0]) + print('Test accuracy (not abstained samples):', score[1]) + + +def save_and_test_saved_model(params, model, root_fname, X_train, X_test, Y_test): + + # serialize model to JSON + model_json = model.to_json() + with open(params['save_path'] + root_fname + '.model.json', "w") as json_file: + json_file.write(model_json) + + # serialize model to YAML + model_yaml = model.to_yaml() + with open(params['save_path'] + root_fname + '.model.yaml', "w") as yaml_file: + + yaml_file.write(model_yaml) + + # serialize weights to HDF5 + model.save_weights(params['save_path'] + root_fname + '.model.h5') + print("Saved model to disk") + + # load json and create model + json_file = open(params['save_path'] + root_fname + '.model.json', 'r') + loaded_model_json = json_file.read() + json_file.close() + loaded_model_json = model_from_json(loaded_model_json) + + # load yaml and create model + yaml_file = open(params['save_path'] + root_fname + '.model.yaml', 'r') + loaded_model_yaml = yaml_file.read() + yaml_file.close() + loaded_model_yaml = model_from_yaml(loaded_model_yaml) + #yaml.load(input, Loader=yaml.FullLoader) + + # load weights into new model + loaded_model_json.load_weights(params['save_path'] + root_fname + '.model.h5') + #input = params['save_path'] + root_fname + '.model.h5' + #loaded_model_json.load(input, Loader=yaml.FullLoader) + #print("Loaded json model from disk") + + # evaluate json loaded model on test data + 
loaded_model_json.compile(loss=candle.abstention_loss, optimizer='SGD', metrics=[candle.abs_acc]) + score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) + print('json Validation abstention loss:', score_json[0]) + print('json Validation abstention accuracy:', score_json[1]) + print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) + + # load weights into new model + loaded_model_yaml.load_weights(params['save_path'] + root_fname + '.model.h5') + print("Loaded yaml model from disk") + # evaluate yaml loaded model on test data + loaded_model_yaml.compile(loss=candle.abstention_loss, optimizer='SGD', metrics=[candle.abs_acc]) + score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) + print('yaml Validation abstention loss:', score_yaml[0]) + print('yaml Validation abstention accuracy:', score_yaml[1]) + print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1]*100)) + + # predict using loaded yaml model on test and training data + predict_yaml_train = loaded_model_yaml.predict(X_train) + predict_yaml_test = loaded_model_yaml.predict(X_test) + print('Yaml_train_shape:', predict_yaml_train.shape) + print('Yaml_test_shape:', predict_yaml_test.shape) + predict_yaml_train_classes = np.argmax(predict_yaml_train, axis=1) + predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) + np.savetxt(params['save_path'] + root_fname + '_predict_yaml_train.csv', predict_yaml_train, delimiter=",", fmt="%.3f") + np.savetxt(params['save_path'] + root_fname + '_predict_yaml_test.csv', predict_yaml_test, delimiter=",", fmt="%.3f") + np.savetxt(params['save_path'] + root_fname + '_predict_yaml_train_classes.csv', predict_yaml_train_classes, delimiter=",",fmt="%d") + np.savetxt(params['save_path'] + root_fname + '_predict_yaml_test_classes.csv', predict_yaml_test_classes, delimiter=",",fmt="%d") + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if 
K.backend() == 'tensorflow': + K.clear_session() diff --git a/Pilot1/Attn/attn_baseline_keras2.py b/Pilot1/Attn/attn_baseline_keras2.py new file mode 100644 index 00000000..82302872 --- /dev/null +++ b/Pilot1/Attn/attn_baseline_keras2.py @@ -0,0 +1,475 @@ +from __future__ import print_function + +import itertools +import pandas as pd +import numpy as np +import os +import sys +import gzip +import argparse +import sklearn + +import tensorflow as tf + +import keras as ke +from keras import backend as K + +from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization +from keras.optimizers import SGD, Adam, RMSprop, Adadelta +from keras.models import Sequential, Model, model_from_json, model_from_yaml +from keras.utils import np_utils, multi_gpu_model + +from keras.callbacks import Callback, ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping, TensorBoard + +from sklearn.utils.class_weight import compute_class_weight +from sklearn.model_selection import train_test_split +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, roc_auc_score, confusion_matrix, balanced_accuracy_score, classification_report +from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler +from sklearn.metrics import recall_score, auc, roc_curve, f1_score, precision_recall_curve + +import attn +import candle + +import attn_viz_utils as attnviz + +np.set_printoptions(precision=4) + +def r2(y_true, y_pred): + SS_res = K.sum(K.square(y_true - y_pred)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return (1 - SS_res/(SS_tot + K.epsilon())) + + + +def tf_auc(y_true, y_pred): + auc = tf.metrics.auc(y_true, y_pred)[1] + K.get_session().run(tf.local_variables_initializer()) + return auc + + +#from sklearn.metrics import roc_auc_score +#import tensorflow as tf + +def auroc( y_true, y_pred ) : + score = tf.py_func( lambda y_true, y_pred : roc_auc_score( y_true, y_pred, average='macro', sample_weight=None).astype('float32'), + 
[y_true, y_pred], + 'float32', + stateful=False, + name='sklearnAUC' ) + return score + + +def covariance(x, y): + return K.mean(x * y) - K.mean(x) * K.mean(y) + + +def corr(y_true, y_pred): + cov = covariance(y_true, y_pred) + var1 = covariance(y_true, y_true) + var2 = covariance(y_pred, y_pred) + return cov / (K.sqrt(var1 * var2) + K.epsilon()) + + +def xent(y_true, y_pred): + return binary_crossentropy(y_true, y_pred) + + +def mse(y_true, y_pred): + return mean_squared_error(y_true, y_pred) + + +class MetricHistory(Callback): + def on_epoch_begin(self, epoch, logs=None): + print("\n") + + def on_epoch_end(self, epoch, logs=None): + y_pred = self.model.predict(self.validation_data[0]) + r2 = r2_score(self.validation_data[1], y_pred) + corr, _ = pearsonr(self.validation_data[1].flatten(), y_pred.flatten()) + print("\nval_r2:", r2) + print(y_pred.shape) + print("\nval_corr:", corr, "val_r2:", r2) + print("\n") + + +class LoggingCallback(Callback): + def __init__(self, print_fcn=print): + Callback.__init__(self) + self.print_fcn = print_fcn + + def on_epoch_end(self, epoch, logs={}): + msg = "[Epoch: %i] %s" % (epoch, ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items()))) + self.print_fcn(msg) + +def build_type_classifier(x_train, y_train, x_test, y_test): + y_train = np.argmax(y_train, axis=1) + y_test = np.argmax(y_test, axis=1) + from xgboost import XGBClassifier + clf = XGBClassifier(max_depth=6, n_estimators=100) + clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)], verbose=False) + y_pred = clf.predict(x_test) + acc = accuracy_score(y_test, y_pred) + print(acc) + return clf + +def initialize_parameters(default_model = 'attn_default_model.txt'): + + # Build benchmark object + attnBmk = attn.BenchmarkAttn(attn.file_path, default_model, 'keras', + prog='attn_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') + + # Initialize parameters + gParameters = 
candle.finalize_parameters(attnBmk) + #attn.logger.info('Params: {}'.format(gParameters)) + + return gParameters + +def save_cache(cache_file, x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels): + with h5py.File(cache_file, 'w') as hf: + hf.create_dataset("x_train", data=x_train) + hf.create_dataset("y_train", data=y_train) + hf.create_dataset("x_val", data=x_val) + hf.create_dataset("y_val", data=y_val) + hf.create_dataset("x_test", data=x_test) + hf.create_dataset("y_test", data=y_test) + hf.create_dataset("x_labels", (len(x_labels), 1), 'S100', data=[x.encode("ascii", "ignore") for x in x_labels]) + hf.create_dataset("y_labels", (len(y_labels), 1), 'S100', data=[x.encode("ascii", "ignore") for x in y_labels]) + + +def load_cache(cache_file): + with h5py.File(cache_file, 'r') as hf: + x_train = hf['x_train'][:] + y_train = hf['y_train'][:] + x_val = hf['x_val'][:] + y_val = hf['y_val'][:] + x_test = hf['x_test'][:] + y_test = hf['y_test'][:] + x_labels = [x[0].decode('unicode_escape') for x in hf['x_labels'][:]] + y_labels = [x[0].decode('unicode_escape') for x in hf['y_labels'][:]] + return x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels + + +def build_attention_model(params, PS): + + assert (len(params['dense']) == len(params['activation'])) + assert (len(params['dense']) > 3) + + DR = params['dropout'] + inputs = Input(shape=(PS,)) + x = Dense(params['dense'][0], activation=params['activation'][0])(inputs) + x = BatchNormalization()(x) + a = Dense(params['dense'][1], activation=params['activation'][1])(x) + a = BatchNormalization()(a) + b = Dense(params['dense'][2], activation=params['activation'][2])(x) + x = ke.layers.multiply([a,b]) + + for i in range(3, len(params['dense'])-1): + x = Dense(params['dense'][i], activation=params['activation'][i])(x) + x = BatchNormalization()(x) + x = Dropout(DR)(x) + + outputs = Dense(params['dense'][-1], activation=params['activation'][-1])(x) + model = Model(inputs=inputs, 
outputs=outputs) + model.summary() + + return model + + +def run(params): + args = candle.ArgumentStruct(**params) + seed = args.rng_seed + candle.set_seed(seed) + + # Construct extension to save model + ext = attn.extension_from_parameters(params, 'keras') + candle.verify_path(params['save_path']) + prefix = '{}{}'.format(params['save_path'], ext) + logfile = params['logfile'] if params['logfile'] else prefix+'.log' + root_fname = 'Agg_attn_bin' + candle.set_up_logger(logfile, attn.logger, params['verbose']) + attn.logger.info('Params: {}'.format(params)) + + # Get default parameters for initialization and optimizer functions + keras_defaults = candle.keras_default_config() + + ## + X_train, _Y_train, X_val, _Y_val, X_test, _Y_test = attn.load_data(params, seed) + + # move this inside the load_data function + Y_train = _Y_train['AUC'] + Y_test = _Y_test['AUC'] + Y_val = _Y_val['AUC'] + + Y_train_neg, Y_train_pos = np.bincount(Y_train) + Y_test_neg, Y_test_pos = np.bincount(Y_test) + Y_val_neg, Y_val_pos = np.bincount(Y_val) + + Y_train_total = Y_train_neg + Y_train_pos + Y_test_total = Y_test_neg + Y_test_pos + Y_val_total = Y_val_neg + Y_val_pos + + total = Y_train_total + Y_test_total + Y_val_total + neg = Y_train_neg + Y_test_neg + Y_val_neg + pos = Y_train_pos + Y_test_pos + Y_val_pos + + print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( + total, pos, 100 * pos / total)) + + nb_classes = params['dense'][-1] + + Y_train = np_utils.to_categorical(Y_train,nb_classes) + Y_test = np_utils.to_categorical(Y_test,nb_classes) + Y_val = np_utils.to_categorical(Y_val,nb_classes) + + y_integers = np.argmax(Y_train, axis=1) + class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers) + d_class_weights = dict(enumerate(class_weights)) + + print('X_train shape:', X_train.shape) + print('X_test shape:', X_test.shape) + + print('Y_train shape:', Y_train.shape) + print('Y_test shape:', Y_test.shape) + + PS=X_train.shape[1] + 
model = build_attention_model(params, PS) + + #parallel_model = multi_gpu_model(model, gpus=4) + #parallel_model.compile(loss='mean_squared_error', + # optimizer=SGD(lr=0.0001, momentum=0.9), + # metrics=['mae',r2]) + kerasDefaults = candle.keras_default_config() + if params['momentum']: + kerasDefaults['momentum_sgd'] = params['momentum'] + + optimizer = candle.build_optimizer(params['optimizer'], params['learning_rate'], kerasDefaults) + + model.compile(loss=params['loss'], + optimizer=optimizer, + # SGD(lr=0.00001, momentum=0.9), + # optimizer=Adam(lr=0.00001), + # optimizer=RMSprop(lr=0.0001), + # optimizer=Adadelta(), + metrics=['acc',tf_auc]) + + # set up a bunch of callbacks to do work during model training.. + + checkpointer = ModelCheckpoint(filepath=params['save_path'] + root_fname + '.autosave.model.h5', verbose=1, save_weights_only=False, save_best_only=True) + csv_logger = CSVLogger('{}/{}.training.log'.format(params['save_path'], root_fname)) + reduce_lr = ReduceLROnPlateau(monitor='val_tf_auc', factor=0.20, patience=40, verbose=1, mode='auto', min_delta=0.0001, cooldown=3, min_lr=0.000000001) + early_stop = EarlyStopping(monitor='val_tf_auc', patience=200, verbose=1, mode='auto') + candle_monitor = candle.CandleRemoteMonitor(params=params) + + candle_monitor = candle.CandleRemoteMonitor(params=params) + timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + tensorboard = TensorBoard(log_dir="tb/tb{}".format(ext)) + + history_logger = LoggingCallback(attn.logger.debug) + + callbacks = [candle_monitor, timeout_monitor, csv_logger, history_logger] + + if params['reduce_lr']: + callbacks.append(reduce_lr) + + if params['use_cp']: + callbacks.append(checkpointer) + if params['use_tb']: + callbacks.append(tensorboard) + if params['early_stop']: + callbacks.append(early_stop) + + epochs = params['epochs'] + batch_size=params['batch_size'] + history = model.fit(X_train, Y_train, class_weight=d_class_weights, + batch_size=batch_size, + 
epochs=epochs, + verbose=1, + validation_data=(X_val, Y_val), + callbacks = callbacks) + + # diagnostic plots + if 'loss' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'loss') + if 'acc' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'acc') + if 'tf_auc' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'tf_auc') + + # Evaluate model + score = model.evaluate(X_test, Y_test, verbose=0) + Y_predict = model.predict(X_test) + + evaluate_model(params, root_fname, nb_classes, Y_test, _Y_test, Y_predict, pos, total, score) + + save_and_test_saved_model(params, model, root_fname, X_train, X_test, Y_test) + + attn.logger.handlers = [] + + return history + + +def evaluate_model(params, root_fname, nb_classes, Y_test, _Y_test, Y_predict, pos, total, score): + + threshold = 0.5 + + Y_pred_int = (Y_predict[:,0] < threshold).astype(np.int) + Y_test_int = (Y_test[:,0] < threshold).astype(np.int) + + print ('creating table of predictions') + f = open(params['save_path'] + root_fname + '.predictions.tsv', 'w') + for index, row in _Y_test.iterrows(): + if row['AUC'] == 1: + if Y_pred_int[index] == 1: + call='TP' + else: + call='FN' + if row['AUC'] == 0: + if Y_pred_int[index] == 0: + call = 'TN' + else: + call = 'FP' + # 1 TN 0 0.6323 NCI60.786-0 NSC.256439 NSC.102816 + print(index, "\t", call, "\t", Y_pred_int[index], "\t", row['AUC'], "\t", row['Sample'], "\t", row['Drug1'], file=f) + f.close() + + + false_pos_rate, true_pos_rate, thresholds = roc_curve(Y_test[:,0], Y_predict[:,0]) + #print(thresholds) + roc_auc = auc(false_pos_rate, true_pos_rate) + + auc_keras = roc_auc + fpr_keras = false_pos_rate + tpr_keras = true_pos_rate + + # ROC plots + fname = params['save_path'] + root_fname + '.auroc.pdf' + print ('creating figure at ', fname) + attnviz.plot_ROC(fpr_keras, tpr_keras, auc_keras, fname) + # Zoom in view of the upper left corner. 
+ fname = params['save_path'] + root_fname + '.auroc_zoom.pdf' + print ('creating figure at ', fname) + attnviz.plot_ROC(fpr_keras, tpr_keras, auc_keras, fname, zoom=True) + + f1 = f1_score(Y_test_int, Y_pred_int) + + precision, recall, thresholds = precision_recall_curve(Y_test[:,0], Y_predict[:,0]) + #print(thresholds) + pr_auc = auc(recall, precision) + + pr_keras = pr_auc + precision_keras = precision + recall_keras = recall + + print('f1=%.3f auroc=%.3f aucpr=%.3f' % (f1, auc_keras, pr_keras)) + # Plot RF + fname = params['save_path'] + root_fname + '.aurpr.pdf' + print ('creating figure at ', fname) + no_skill = len(Y_test_int[Y_test_int==1]) / len(Y_test_int) + attnviz.plot_RF(recall_keras, precision_keras, pr_keras, no_skill, fname) + + # Compute confusion matrix + cnf_matrix = sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int) + # Plot non-normalized confusion matrix + class_names=["Non-Response","Response"] + fname = params['save_path'] + root_fname + '.confusion_without_norm.pdf' + print ('creating figure at ', fname) + attnviz.plot_confusion_matrix(cnf_matrix, fname, classes=class_names, title='Confusion matrix, without normalization') + # Plot normalized confusion matrix + fname = params['save_path'] + root_fname + '.confusion_with_norm.pdf' + print ('creating figure at ', fname) + attnviz.plot_confusion_matrix(cnf_matrix, fname, classes=class_names, normalize=True, title='Normalized confusion matrix') + + print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( + total, pos, 100 * pos / total)) + + print(sklearn.metrics.roc_auc_score(Y_test_int, Y_pred_int)) + + print(sklearn.metrics.balanced_accuracy_score(Y_test_int, Y_pred_int)) + + print(sklearn.metrics.classification_report(Y_test_int, Y_pred_int)) + + print(sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int)) + + print("score") + print(score) + + print('Test val_loss:', score[0]) + print('Test accuracy:', score[1]) + + +def save_and_test_saved_model(params, model, 
root_fname, X_train, X_test, Y_test): + + # serialize model to JSON + model_json = model.to_json() + with open(params['save_path'] + root_fname + ".model.json", "w") as json_file: + json_file.write(model_json) + + # serialize model to YAML + model_yaml = model.to_yaml() + with open(params['save_path'] + root_fname + ".model.yaml", "w") as yaml_file: + yaml_file.write(model_yaml) + + # serialize weights to HDF5 + model.save_weights(params['save_path'] + root_fname + ".model.h5") + print("Saved model to disk") + + # load json and create model + json_file = open(params['save_path'] + root_fname + '.model.json', 'r') + loaded_model_json = json_file.read() + json_file.close() + loaded_model_json = model_from_json(loaded_model_json) + + # load yaml and create model + yaml_file = open(params['save_path'] + root_fname + '.model.yaml', 'r') + loaded_model_yaml = yaml_file.read() + yaml_file.close() + loaded_model_yaml = model_from_yaml(loaded_model_yaml) + + # load weights into new model + loaded_model_json.load_weights(params['save_path'] + root_fname + ".model.h5") + print("Loaded json model from disk") + + # evaluate json loaded model on test data + loaded_model_json.compile(loss='binary_crossentropy', optimizer=params['optimizer'], metrics=['accuracy']) + score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) + + print('json Validation loss:', score_json[0]) + print('json Validation accuracy:', score_json[1]) + + print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) + + # load weights into new model + loaded_model_yaml.load_weights(params['save_path'] + root_fname + ".model.h5") + print("Loaded yaml model from disk") + + # evaluate loaded model on test data + loaded_model_yaml.compile(loss='binary_crossentropy', optimizer=params['optimizer'], metrics=['accuracy']) + score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) + print('yaml Validation loss:', score_yaml[0]) + print('yaml Validation accuracy:', score_yaml[1]) 
+ print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1]*100)) + + # predict using loaded yaml model on test and training data + predict_yaml_train = loaded_model_yaml.predict(X_train) + predict_yaml_test = loaded_model_yaml.predict(X_test) + print('Yaml_train_shape:', predict_yaml_train.shape) + print('Yaml_test_shape:', predict_yaml_test.shape) + + predict_yaml_train_classes = np.argmax(predict_yaml_train, axis=1) + predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) + np.savetxt(params['save_path'] + root_fname + "_predict_yaml_train.csv", predict_yaml_train, delimiter=",", fmt="%.3f") + np.savetxt(params['save_path'] + root_fname + "_predict_yaml_test.csv", predict_yaml_test, delimiter=",", fmt="%.3f") + + np.savetxt(params['save_path'] + root_fname + "_predict_yaml_train_classes.csv", predict_yaml_train_classes, delimiter=",",fmt="%d") + np.savetxt(params['save_path'] + root_fname + "_predict_yaml_test_classes.csv", predict_yaml_test_classes, delimiter=",",fmt="%d") + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() diff --git a/Pilot1/Attn/attn_default_model.txt b/Pilot1/Attn/attn_default_model.txt new file mode 100644 index 00000000..0fc2b03d --- /dev/null +++ b/Pilot1/Attn/attn_default_model.txt @@ -0,0 +1,28 @@ +[Global_Params] +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +train_data='top_21_1fold_001.h5' +model_name='attn' +dense=[1000, 1000, 1000, 500, 250, 125, 60, 30, 2] +batch_size=32 +epochs=1 +activation=['relu', 'relu', 'softmax', 'relu', 'relu', 'relu', 'relu', 'relu', 'softmax'] +loss='categorical_crossentropy' +optimizer='sgd' +dropout=0.2 +learning_rate=0.00001 +momentum=0.9 +scaling='minmax' +val_split=0.1 +epsilon_std=1.0 +rng_seed=2017 +initialization='glorot_uniform' +latent_dim=2 +batch_normalization=False +use_cp=False +early_stop=True +reduce_lr=True 
+feature_subsample=0 +save_path='./save/001/' + +[Monitor_Params] +timeout=3600 diff --git a/Pilot1/Attn/attn_viz_utils.py b/Pilot1/Attn/attn_viz_utils.py new file mode 100644 index 00000000..650985e7 --- /dev/null +++ b/Pilot1/Attn/attn_viz_utils.py @@ -0,0 +1,83 @@ +from __future__ import print_function + +import itertools + +import numpy as np + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + + +np.set_printoptions(precision=2) + + +def plot_ROC(fpr_keras, tpr_keras, auc_keras, fname, xlabel_add='', ylabel_add='', zoom=False): + + plt.figure() + if zoom: + plt.xlim(0, 0.2) + plt.ylim(0.8, 1) + + plt.plot([0, 1], [0, 1], 'k--', label='No Skill') + plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras)) + plt.xlabel('False positive rate' + xlabel_add) + plt.ylabel('True positive rate' + ylabel_add) + plt.title('ROC curve') + plt.legend(loc='best') + plt.savefig(fname, bbox_inches='tight') + plt.close() + + +def plot_RF(recall_keras, precision_keras, pr_keras, no_skill, fname, xlabel_add='', ylabel_add=''): + + plt.figure() + plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill') + plt.plot(recall_keras, precision_keras, label='PR Keras (area = {:.3f})'.format(pr_keras)) + plt.xlabel('Recall' + xlabel_add) + plt.ylabel('Precision' + ylabel_add) + plt.title('PR curve') + plt.legend(loc='best') + plt.savefig(fname, bbox_inches='tight') + plt.close() + + +def plot_confusion_matrix(cm, fname, classes, normalize=False, title='Confusion matrix'): + """ + This function prints and plots the confusion matrix. Normalization can be applied by setting `normalize=True`. 
+ """ + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print('Normalized confusion matrix') + else: + print('Confusion matrix, without normalization') + print(cm) + + cmap=plt.cm.Blues + plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + tick_marks = np.arange(len(classes)) + plt.xticks(tick_marks, classes, rotation=45) + plt.yticks(tick_marks, classes) + + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. + for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") + + plt.ylabel('True label') + plt.xlabel('Predicted label') + plt.tight_layout() + plt.savefig(fname, bbox_inches='tight') + plt.close() + +def plot_array(nparray, xlabel, ylabel, title, fname): + + plt.figure() + plt.plot(nparray, lw=3.) + plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.title(title) + plt.savefig(fname, bbox_inches='tight') + plt.close() diff --git a/Pilot1/Combo/NCI60.py b/Pilot1/Combo/NCI60.py index 702c66ad..b061316e 100644 --- a/Pilot1/Combo/NCI60.py +++ b/Pilot1/Combo/NCI60.py @@ -7,7 +7,10 @@ import numpy as np import pandas as pd -from sklearn.preprocessing import Imputer +try: + from sklearn.impute import SimpleImputer as Imputer +except ImportError: + from sklearn.preprocessing import Imputer from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler file_path = os.path.dirname(os.path.realpath(__file__)) @@ -40,7 +43,7 @@ def impute_and_scale(df, scaling='std'): df = df.dropna(axis=1, how='all') - imputer = Imputer(strategy='mean', axis=0) + imputer = Imputer(strategy='mean') mat = imputer.fit_transform(df) if scaling is None or scaling.lower() == 'none': diff --git a/Pilot1/Combo/README.md b/Pilot1/Combo/README.md index 82cdfe56..fbd034a2 100644 --- a/Pilot1/Combo/README.md +++ b/Pilot1/Combo/README.md @@ -39,7 +39,7 @@ $ python combo_baseline_keras2.py 
python combo_baseline_keras2.py --use_landmark_genes --warmup_lr --reduce_lr -z 256 Using TensorFlow backend. -Params: {'activation': 'relu', 'batch_size': 256, 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'drop': 0, 'epochs': 10, 'learning_rate': None, 'loss': 'mse', 'optimizer': 'adam', 'residual': False, 'rng_seed': 2017, 'save': 'save/combo', 'scaling': 'std', 'feature_subsample': 0, 'validation_split': 0.2, 'solr_root': '', 'timeout': -1, 'cell_features': ['expression'], 'drug_features': ['descriptors'], 'cv': 1, 'max_val_loss': 1.0, 'base_lr': None, 'reduce_lr': True, 'warmup_lr': True, 'batch_normalization': False, 'gen': False, 'use_combo_score': False, 'config_file': '/home/fangfang/work/Benchmarks.combo/Pilot1/Combo/combo_default_model.txt', 'verbose': False, 'logfile': None, 'train_bool': True, 'shuffle': True, 'alpha_dropout': False, 'gpus': [], 'experiment_id': 'EXP.000', 'run_id': 'RUN.000', 'use_landmark_genes': True, 'cp': False, 'tb': False, 'datatype': } +Params: {'activation': 'relu', 'batch_size': 256, 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'drop': 0, 'epochs': 10, 'learning_rate': None, 'loss': 'mse', 'optimizer': 'adam', 'residual': False, 'rng_seed': 2017, 'save': 'save/combo', 'scaling': 'std', 'feature_subsample': 0, 'validation_split': 0.2, 'timeout': -1, 'cell_features': ['expression'], 'drug_features': ['descriptors'], 'cv': 1, 'max_val_loss': 1.0, 'base_lr': None, 'reduce_lr': True, 'warmup_lr': True, 'batch_normalization': False, 'gen': False, 'use_combo_score': False, 'config_file': '/home/fangfang/work/Benchmarks.combo/Pilot1/Combo/combo_default_model.txt', 'verbose': False, 'logfile': None, 'train_bool': True, 'shuffle': True, 'alpha_dropout': False, 'gpus': [], 'experiment_id': 'EXP.000', 'run_id': 'RUN.000', 'use_landmark_genes': True, 'cp': False, 'tb': False, 'datatype': } Loaded 311737 unique (CL, D1, D2) response sets. 
Filtered down to 85303 rows with matching information. Unique cell lines: 59 @@ -196,4 +196,4 @@ python uno_baseline_keras2.py --conf combo_perf_benchmark.txt | Nucleus | 0:14:13 | 72 | 3.47 | 3.8 | 9.3 | 21.9 | 63.4 | 91.9 | | Tesla (K20) | 0:44:17 | 250 | 1.00 | 3.9 | 42.3 | 12.9 | 73.8 | 53.3 | | Titan | | | | | | | | | keras version 2.0.3 does not supprot model.clone_model() which is introduced in 2.0.7 | -* Time per epoch on the machine divided by time per epoch of Titan (or Tesla) \ No newline at end of file +* Time per epoch on the machine divided by time per epoch of Titan (or Tesla) diff --git a/Pilot1/Combo/combo.py b/Pilot1/Combo/combo.py index da35f207..72e2b37f 100644 --- a/Pilot1/Combo/combo.py +++ b/Pilot1/Combo/combo.py @@ -32,6 +32,10 @@ 'type':candle.str2bool, 'default':True, #action="store_true", 'help':"use the 978 landmark genes from LINCS (L1000) as expression features"}, +{'name':'use_combo_score', + 'type':candle.str2bool, + 'default':False, + 'help':"use combination score in place of percent growth (stored in 'GROWTH' column)"}, {'name':'preprocess_rnaseq', 'default':'none', 'choices':['source_scale', 'combat', 'none'], @@ -86,10 +90,10 @@ ] -required = [ 'activation', 'batch_size', 'dense', 'dense_feature_layers', 'drop', +required = [ 'activation', 'batch_size', 'dense', 'dense_feature_layers', 'dropout', 'epochs', 'learning_rate', 'loss', 'optimizer', 'residual', 'rng_seed', - 'save_path', 'scaling', 'feature_subsample', 'validation_split', - 'solr_root', 'timeout' + 'save_path', 'scaling', 'feature_subsample', 'val_split', + 'timeout' ] class BenchmarkCombo(candle.Benchmark): diff --git a/Pilot1/Combo/combo_baseline_keras2.py b/Pilot1/Combo/combo_baseline_keras2.py index 16f2e6f9..379aaea7 100644 --- a/Pilot1/Combo/combo_baseline_keras2.py +++ b/Pilot1/Combo/combo_baseline_keras2.py @@ -94,8 +94,8 @@ def extension_from_parameters(args): ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) if args.feature_subsample 
> 0: ext += '.FS={}'.format(args.feature_subsample) - if args.drop > 0: - ext += '.DR={}'.format(args.drop) + if args.dropout > 0: + ext += '.DR={}'.format(args.dropout) if args.warmup_lr: ext += '.wu_lr' if args.reduce_lr: @@ -604,7 +604,7 @@ def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], def build_model(loader, args, verbose=False): input_models = {} - dropout_rate = args.drop + dropout_rate = args.dropout permanent_dropout = True for fea_type, shape in loader.feature_shapes.items(): box = build_feature_model(input_shape=shape, name=fea_type, @@ -644,15 +644,15 @@ def build_model(loader, args, verbose=False): return Model(inputs, output) -def initialize_parameters(): +def initialize_parameters(default_model = 'combo_default_model.txt'): # Build benchmark object - comboBmk = combo.BenchmarkCombo(combo.file_path, 'combo_default_model.txt', 'keras', + comboBmk = combo.BenchmarkCombo(combo.file_path, default_model, 'keras', prog='combo_baseline', desc = 'Build neural network based models to predict tumor response to drug pairs.') # Initialize parameters - gParameters = candle.initialize_parameters(comboBmk) + gParameters = candle.finalize_parameters(comboBmk) #combo.logger.info('Params: {}'.format(gParameters)) return gParameters @@ -673,7 +673,7 @@ def run(params): logger.info('Params: {}'.format(params)) loader = ComboDataLoader(seed=args.rng_seed, - val_split=args.validation_split, + val_split=args.val_split, cell_features=args.cell_features, drug_features=args.drug_features, use_mean_growth=args.use_mean_growth, diff --git a/Pilot1/Combo/combo_default_model.txt b/Pilot1/Combo/combo_default_model.txt index 3ab500e2..d25d2d5d 100644 --- a/Pilot1/Combo/combo_default_model.txt +++ b/Pilot1/Combo/combo_default_model.txt @@ -7,10 +7,10 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=10 batch_size=32 -validation_split=0.2 +val_split=0.2 cv=1 cv_partition='overlapping' max_val_loss=1.0 @@ -28,5 +28,4 @@ 
use_combo_score=False verbose = False [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/Combo/combo_dose.py b/Pilot1/Combo/combo_dose.py index f57ee7e5..f720c8e3 100644 --- a/Pilot1/Combo/combo_dose.py +++ b/Pilot1/Combo/combo_dose.py @@ -95,8 +95,8 @@ def extension_from_parameters(args): ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) if args.feature_subsample > 0: ext += '.FS={}'.format(args.feature_subsample) - if args.drop > 0: - ext += '.DR={}'.format(args.drop) + if args.dropout > 0: + ext += '.DR={}'.format(args.dropout) if args.warmup_lr: ext += '.wu_lr' if args.reduce_lr: @@ -609,7 +609,7 @@ def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], def build_model(loader, args, verbose=False): input_models = {} - dropout_rate = args.drop + dropout_rate = args.dropout permanent_dropout = True for fea_type, shape in loader.feature_shapes.items(): box = build_feature_model(input_shape=shape, name=fea_type, @@ -678,7 +678,7 @@ def initialize_parameters(): desc = 'Build neural network based models to predict tumor response to drug pairs.') # Initialize parameters - gParameters = candle.initialize_parameters(comboBmk) + gParameters = candle.finalize_parameters(comboBmk) #combo.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot1/Combo/combo_perf_bench_model.txt b/Pilot1/Combo/combo_perf_bench_model.txt index d581aea7..db404931 100644 --- a/Pilot1/Combo/combo_perf_bench_model.txt +++ b/Pilot1/Combo/combo_perf_bench_model.txt @@ -7,10 +7,10 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=10 batch_size=32 -validation_split=0.2 +val_split=0.2 cv=1 cv_partition='overlapping' max_val_loss=1.0 @@ -29,5 +29,4 @@ verbose=False use_landmark_genes=True [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/NT3/nt3.py b/Pilot1/NT3/nt3.py index 1863b3a3..c828b3d9 100644 --- a/Pilot1/NT3/nt3.py +++ b/Pilot1/NT3/nt3.py @@ -8,9 +8,6 @@ 
import candle additional_definitions = [ -{'name':'model_name', - 'default':'nt3', - 'type':str}, {'name':'classes', 'type':int, 'default':2} @@ -24,17 +21,17 @@ 'conv', 'dense', 'activation', - 'out_act', + 'out_activation', 'loss', 'optimizer', 'metrics', 'epochs', 'batch_size', 'learning_rate', - 'drop', + 'dropout', 'classes', 'pool', - 'save', + 'output_dir', 'timeout' ] diff --git a/Pilot1/NT3/nt3_baseline_keras2.py b/Pilot1/NT3/nt3_baseline_keras2.py index 1fc8e0dc..36aa65df 100644 --- a/Pilot1/NT3/nt3_baseline_keras2.py +++ b/Pilot1/NT3/nt3_baseline_keras2.py @@ -22,14 +22,14 @@ import nt3 as bmk import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'nt3_default_model.txt'): # Build benchmark object - nt3Bmk = bmk.BenchmarkNT3(bmk.file_path, 'nt3_default_model.txt', 'keras', - prog='nt3_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') + nt3Bmk = bmk.BenchmarkNT3(bmk.file_path, default_model, 'keras', + prog='nt3_baseline', desc='1D CNN to classify RNA sequence data in normal or tumor classes') # Initialize parameters - gParameters = candle.initialize_parameters(nt3Bmk) + gParameters = candle.finalize_parameters(nt3Bmk) #benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters @@ -133,10 +133,10 @@ def run(gParameters): if layer: model.add(Dense(layer)) model.add(Activation(gParameters['activation'])) - if gParameters['drop']: - model.add(Dropout(gParameters['drop'])) + if gParameters['dropout']: + model.add(Dropout(gParameters['dropout'])) model.add(Dense(gParameters['classes'])) - model.add(Activation(gParameters['out_act'])) + model.add(Activation(gParameters['out_activation'])) #Reference case #model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid', input_shape=(P, 1))) @@ -167,7 +167,7 @@ def run(gParameters): optimizer=optimizer, metrics=[gParameters['metrics']]) - output_dir = gParameters['save'] + output_dir = 
gParameters['output_dir'] if not os.path.exists(output_dir): os.makedirs(output_dir) diff --git a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py deleted file mode 100644 index ce03d86b..00000000 --- a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py +++ /dev/null @@ -1,414 +0,0 @@ -from __future__ import print_function -import pandas as pd -import numpy as np -import os -import sys -import gzip -import argparse -try: - import configparser -except ImportError: - import ConfigParser as configparser - -from keras import backend as K - -from keras.layers import Input, Dense, Dropout, Activation, Conv1D, MaxPooling1D, Flatten -from keras.optimizers import SGD, Adam, RMSprop -from keras.models import Sequential, Model, model_from_json, model_from_yaml -from keras.utils import np_utils -from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau - -from sklearn.metrics import accuracy_score -from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler - -TIMEOUT=3600 # in sec; set this to -1 for no timeout -file_path = os.path.dirname(os.path.realpath(__file__)) -lib_path = os.path.abspath(os.path.join(file_path, '..', 'common')) -sys.path.append(lib_path) -lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) -sys.path.append(lib_path2) - -import data_utils -import p1_common, p1_common_keras -from solr_keras import CandleRemoteMonitor, compute_trainable_params, TerminateOnTimeOut - - - -''' Import Tensorflow Modules ''' -import tensorflow as tf -from tensorflow.python.framework import graph_io -from tensorflow.python.tools import freeze_graph -from tensorflow.core.protobuf import saver_pb2 -from tensorflow.python.training import saver as saver_lib - - -#url_nt3 = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' -#file_train = 'nt_train2.csv' -#file_test = 'nt_test2.csv' - -#EPOCH = 400 -#BATCH = 20 -#CLASSES = 2 - -#PL = 60484 # 1 + 60483 these are the width of 
the RNAseq datasets -#P = 60483 # 60483 -#DR = 0.1 # Dropout rate - -def common_parser(parser): - - parser.add_argument("--config_file", dest='config_file', type=str, - default=os.path.join(file_path, 'nt3_default_model.txt'), - help="specify model configuration file") - - # Parse has been split between arguments that are common with the default neon parser - # and all the other options - parser = p1_common.get_default_neon_parse(parser) - parser = p1_common.get_p1_common_parser(parser) - - return parser - -def get_nt3_parser(): - - parser = argparse.ArgumentParser(prog='nt3_baseline', formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='Train Autoencoder - Pilot 1 Benchmark NT3') - - return common_parser(parser) - -def read_config_file(file): - config = configparser.ConfigParser() - config.read(file) - section = config.sections() - fileParams = {} - - fileParams['data_url'] = eval(config.get(section[0],'data_url')) - fileParams['train_data'] = eval(config.get(section[0],'train_data')) - fileParams['test_data'] = eval(config.get(section[0],'test_data')) - fileParams['model_name'] = eval(config.get(section[0],'model_name')) - fileParams['conv'] = eval(config.get(section[0],'conv')) - fileParams['dense'] = eval(config.get(section[0],'dense')) - fileParams['activation'] = eval(config.get(section[0],'activation')) - fileParams['out_act'] = eval(config.get(section[0],'out_act')) - fileParams['loss'] = eval(config.get(section[0],'loss')) - fileParams['optimizer'] = eval(config.get(section[0],'optimizer')) - fileParams['metrics'] = eval(config.get(section[0],'metrics')) - fileParams['epochs'] = eval(config.get(section[0],'epochs')) - fileParams['batch_size'] = eval(config.get(section[0],'batch_size')) - fileParams['learning_rate'] = eval(config.get(section[0], 'learning_rate')) - fileParams['drop'] = eval(config.get(section[0],'drop')) - fileParams['classes'] = eval(config.get(section[0],'classes')) - fileParams['pool'] = 
eval(config.get(section[0],'pool')) - fileParams['save'] = eval(config.get(section[0], 'save')) - - # parse the remaining values - for k,v in config.items(section[0]): - if not k in fileParams: - fileParams[k] = eval(v) - - return fileParams - -def initialize_parameters(): - # Get command-line parameters - parser = get_nt3_parser() - args = parser.parse_args() - #print('Args:', args) - # Get parameters from configuration file - fileParameters = read_config_file(args.config_file) - #print ('Params:', fileParameters) - # Consolidate parameter set. Command-line parameters overwrite file configuration - gParameters = p1_common.args_overwrite_config(args, fileParameters) - return gParameters - - -def load_data(train_path, test_path, gParameters): - - print('Loading data...') - df_train = (pd.read_csv(train_path,header=None).values).astype('float32') - df_test = (pd.read_csv(test_path,header=None).values).astype('float32') - print('done') - - print('df_train shape:', df_train.shape) - print('df_test shape:', df_test.shape) - - seqlen = df_train.shape[1] - - df_y_train = df_train[:,0].astype('int') - df_y_test = df_test[:,0].astype('int') - - Y_train = np_utils.to_categorical(df_y_train,gParameters['classes']) - Y_test = np_utils.to_categorical(df_y_test,gParameters['classes']) - - df_x_train = df_train[:, 1:seqlen].astype(np.float32) - df_x_test = df_test[:, 1:seqlen].astype(np.float32) - -# X_train = df_x_train.as_matrix() -# X_test = df_x_test.as_matrix() - - X_train = df_x_train - X_test = df_x_test - - scaler = MaxAbsScaler() - mat = np.concatenate((X_train, X_test), axis=0) - mat = scaler.fit_transform(mat) - - X_train = mat[:X_train.shape[0], :] - X_test = mat[X_train.shape[0]:, :] - - return X_train, Y_train, X_test, Y_test - - -def run(gParameters): - - print ('Params:', gParameters) - - file_train = gParameters['train_data'] - file_test = gParameters['test_data'] - url = gParameters['data_url'] - - train_file = data_utils.get_file(file_train, url+file_train, 
cache_subdir='Pilot1') - test_file = data_utils.get_file(file_test, url+file_test, cache_subdir='Pilot1') - - X_train, Y_train, X_test, Y_test = load_data(train_file, test_file, gParameters) - - print('X_train shape:', X_train.shape) - print('X_test shape:', X_test.shape) - - print('Y_train shape:', Y_train.shape) - print('Y_test shape:', Y_test.shape) - - x_train_len = X_train.shape[1] - - # this reshaping is critical for the Conv1D to work - - X_train = np.expand_dims(X_train, axis=2) - X_test = np.expand_dims(X_test, axis=2) - - print('X_train shape:', X_train.shape) - print('X_test shape:', X_test.shape) - - model = Sequential() - - layer_list = list(range(0, len(gParameters['conv']), 3)) - for l, i in enumerate(layer_list): - filters = gParameters['conv'][i] - filter_len = gParameters['conv'][i+1] - stride = gParameters['conv'][i+2] - print(int(i/3), filters, filter_len, stride) - if gParameters['pool']: - pool_list=gParameters['pool'] - if type(pool_list) != list: - pool_list=list(pool_list) - - if filters <= 0 or filter_len <= 0 or stride <= 0: - break - if 'locally_connected' in gParameters: - model.add(LocallyConnected1D(filters, filter_len, strides=stride, padding='valid', input_shape=(x_train_len, 1))) - else: - #input layer - if i == 0: - model.add(Conv1D(filters=filters, kernel_size=filter_len, strides=stride, padding='valid', input_shape=(x_train_len, 1))) - else: - model.add(Conv1D(filters=filters, kernel_size=filter_len, strides=stride, padding='valid')) - model.add(Activation(gParameters['activation'])) - if gParameters['pool']: - model.add(MaxPooling1D(pool_size=pool_list[int(i/3)])) - - model.add(Flatten()) - - for layer in gParameters['dense']: - if layer: - model.add(Dense(layer)) - model.add(Activation(gParameters['activation'])) - # This has to be disabled for tensorrt otherwise I am getting an error - if False and gParameters['drop']: - model.add(Dropout(gParameters['drop'])) - #model.add(Dense(gParameters['classes'])) - 
#model.add(Activation(gParameters['out_act']), name='activation_5') - model.add(Dense(gParameters['classes'], activation=gParameters['out_act'], name='activation_5')) -#Reference case -#model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid', input_shape=(P, 1))) -#model.add(Activation('relu')) -#model.add(MaxPooling1D(pool_size=1)) -#model.add(Conv1D(filters=128, kernel_size=10, strides=1, padding='valid')) -#model.add(Activation('relu')) -#model.add(MaxPooling1D(pool_size=10)) -#model.add(Flatten()) -#model.add(Dense(200)) -#model.add(Activation('relu')) -#model.add(Dropout(0.1)) -#model.add(Dense(20)) -#model.add(Activation('relu')) -#model.add(Dropout(0.1)) -#model.add(Dense(CLASSES)) -#model.add(Activation('softmax')) - - kerasDefaults = p1_common.keras_default_config() - - # Define optimizer - optimizer = p1_common_keras.build_optimizer(gParameters['optimizer'], - gParameters['learning_rate'], - kerasDefaults) - - model.summary() - for layer in model.layers: - print(layer.name) - - print([x.op.name for x in model.outputs]) - - model.compile(loss=gParameters['loss'], - optimizer=optimizer, - metrics=[gParameters['metrics']]) - - output_dir = gParameters['save'] - - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # calculate trainable and non-trainable params - gParameters.update(compute_trainable_params(model)) - - # set up a bunch of callbacks to do work during model training.. 
- model_name = gParameters['model_name'] - path = '{}/{}.autosave.model.h5'.format(output_dir, model_name) - # checkpointer = ModelCheckpoint(filepath=path, verbose=1, save_weights_only=False, save_best_only=True) - csv_logger = CSVLogger('{}/training.log'.format(output_dir)) - reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0) - candleRemoteMonitor = CandleRemoteMonitor(params=gParameters) - timeoutMonitor = TerminateOnTimeOut(TIMEOUT) - history = model.fit(X_train, Y_train, - batch_size=gParameters['batch_size'], - epochs=2, #gParameters['epochs'], - verbose=1, - validation_data=(X_test, Y_test), - callbacks = [csv_logger, reduce_lr, candleRemoteMonitor, timeoutMonitor]) - - score = model.evaluate(X_test, Y_test, verbose=0) - - #Begin tensorrt code - config = { - # Where to save models (Tensorflow + TensorRT) - "graphdef_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3.pb", - "frozen_model_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3_frozen_model.pb", - "snapshot_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/snapshot", - "engine_save_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3", - - # Needed for TensorRT - "inference_batch_size": 1, # inference batch size - "input_layer": "conv1d_1", # name of the input tensor in the TF computational graph - "out_layer": "activation_5/Softmax", # name of the output tensorf in the TF conputational graph - "output_size" : 2, # number of classes in output (5) - "precision": "fp32" # desired precision (fp32, fp16) "test_image_path" : "/home/data/val/roses" - } - - # Now, let's use the Tensorflow backend to get the TF graphdef and frozen graph - K.set_learning_phase(0) - sess = K.get_session() - saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) - - # save model weights in TF checkpoint - checkpoint_path = saver.save(sess, 
config['snapshot_dir'], global_step=0, latest_filename='checkpoint_state') - - # remove nodes not needed for inference from graph def - train_graph = sess.graph - inference_graph = tf.graph_util.remove_training_nodes(train_graph.as_graph_def()) - - #print(len([n.name for n in tf.get_default_graph().as_graph_def().node])) - - # write the graph definition to a file. - # You can view this file to see your network structure and - # to determine the names of your network's input/output layers. - graph_io.write_graph(inference_graph, '.', config['graphdef_file']) - - # specify which layer is the output layer for your graph. - # In this case, we want to specify the softmax layer after our - # last dense (fully connected) layer. - out_names = config['out_layer'] - - # freeze your inference graph and save it for later! (Tensorflow) - freeze_graph.freeze_graph( - config['graphdef_file'], - '', - False, - checkpoint_path, - out_names, - "save/restore_all", - "save/Const:0", - config['frozen_model_file'], - False, - "" - ) - - - if False: - print('Test score:', score[0]) - print('Test accuracy:', score[1]) - # serialize model to JSON - model_json = model.to_json() - with open("{}/{}.model.json".format(output_dir, model_name), "w") as json_file: - json_file.write(model_json) - - # serialize model to YAML - model_yaml = model.to_yaml() - with open("{}/{}.model.yaml".format(output_dir, model_name), "w") as yaml_file: - yaml_file.write(model_yaml) - - # serialize weights to HDF5 - model.save_weights("{}/{}.weights.h5".format(output_dir, model_name)) - print("Saved model to disk") - - # load json and create model - json_file = open('{}/{}.model.json'.format(output_dir, model_name), 'r') - loaded_model_json = json_file.read() - json_file.close() - loaded_model_json = model_from_json(loaded_model_json) - - - # load yaml and create model - yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_name), 'r') - loaded_model_yaml = yaml_file.read() - yaml_file.close() - 
loaded_model_yaml = model_from_yaml(loaded_model_yaml) - - - # load weights into new model - loaded_model_json.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) - print("Loaded json model from disk") - - # evaluate json loaded model on test data - loaded_model_json.compile(loss=gParameters['loss'], - optimizer=gParameters['optimizer'], - metrics=[gParameters['metrics']]) - score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) - - print('json Test score:', score_json[0]) - print('json Test accuracy:', score_json[1]) - - print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) - - # load weights into new model - loaded_model_yaml.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) - print("Loaded yaml model from disk") - - # evaluate loaded model on test data - loaded_model_yaml.compile(loss=gParameters['loss'], - optimizer=gParameters['optimizer'], - metrics=[gParameters['metrics']]) - score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) - - print('yaml Test score:', score_yaml[0]) - print('yaml Test accuracy:', score_yaml[1]) - - print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1]*100)) - - return history - -def main(): - - gParameters = initialize_parameters() - run(gParameters) - -if __name__ == '__main__': - main() - try: - K.clear_session() - except AttributeError: # theano does not have this function - pass diff --git a/Pilot1/NT3/nt3_default_model.txt b/Pilot1/NT3/nt3_default_model.txt index d848df78..e763f259 100644 --- a/Pilot1/NT3/nt3_default_model.txt +++ b/Pilot1/NT3/nt3_default_model.txt @@ -1,20 +1,20 @@ [Global_Params] -data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' train_data = 'nt_train2.csv' test_data = 'nt_test2.csv' model_name = 'nt3' conv = [128, 20, 1, 128, 10, 1] dense = [200,20] activation = 'relu' -out_act = 
'softmax' +out_activation = 'softmax' loss = 'categorical_crossentropy' optimizer = 'sgd' metrics = 'accuracy' epochs = 400 batch_size = 20 learning_rate = 0.001 -drop = 0.1 +dropout = 0.1 classes = 2 pool = [1, 10] -save = '.' +output_dir = '.' timeout = 3600 diff --git a/Pilot1/NT3/nt3_perf_bench_model.txt b/Pilot1/NT3/nt3_perf_bench_model.txt index 86a1873a..41af2c32 100644 --- a/Pilot1/NT3/nt3_perf_bench_model.txt +++ b/Pilot1/NT3/nt3_perf_bench_model.txt @@ -1,20 +1,20 @@ [Global_Params] -data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' train_data = 'nt_train2.csv' test_data = 'nt_test2.csv' model_name = 'nt3' conv = [128, 20, 1, 128, 10, 1] dense = [200,20] activation = 'relu' -out_act = 'softmax' +out_activation = 'softmax' loss = 'categorical_crossentropy' optimizer = 'sgd' metrics = 'accuracy' epochs = 50 batch_size = 5 learning_rate = 0.001 -drop = 0.1 +dropout = 0.1 classes = 2 pool = [1, 10] -save = '.' +output_dir = '.' 
timeout = 7200 diff --git a/Pilot1/NT3/nt3_tensorrt_convert.py b/Pilot1/NT3/nt3_tensorrt_convert.py deleted file mode 100644 index 019f87ea..00000000 --- a/Pilot1/NT3/nt3_tensorrt_convert.py +++ /dev/null @@ -1,45 +0,0 @@ -''' Import TensorRT Modules ''' -import tensorrt as trt -import uff -from tensorrt.parsers import uffparser - -config = { - # Where to save models (Tensorflow + TensorRT) - "graphdef_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3.pb", - "frozen_model_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3_frozen_model.pb", - "snapshot_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/snapshot", - "engine_save_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3", - # Needed for TensorRT - "inference_batch_size": 1, # inference batch size - "input_layer": "conv1d_1", # name of the input tensor in the TF computational graph - "out_layer": "activation_5/Softmax", # name of the output tensorf in the TF conputational graph - "output_size" : 2, # number of classes in output (5) - "precision": "fp32", # desired precision (fp32, fp16) - "test_image_path" : "/home/data/val/roses" -} - -G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO) -INPUT_LAYERS = [config['input_layer']] -OUTPUT_LAYERS = [config['out_layer']] -INFERENCE_BATCH_SIZE = config['inference_batch_size'] - -# Load your newly created Tensorflow frozen model and convert it to UFF -uff_model = uff.from_tensorflow_frozen_model(config['frozen_model_file'], OUTPUT_LAYERS) - -# Create a UFF parser to parse the UFF file created from your TF Frozen model -parser = uffparser.create_uff_parser() -parser.register_input(INPUT_LAYERS[0],(1,60464,128),0) -parser.register_output(OUTPUT_LAYERS[0]) - -# Build your TensorRT inference engine -if(config['precision'] == 'fp32'): - engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser, INFERENCE_BATCH_SIZE, 1<<20, trt.infer.DataType.FLOAT) -elif(config['precision'] == 
'fp16'): - engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser, INFERENCE_BATCH_SIZE, 1<<20, trt.infer.DataType.HALF) - - # Serialize TensorRT engine to a file for when you are ready to deploy your model. -save_path = str(config['engine_save_dir']) + "keras_vgg19_b" + str(INFERENCE_BATCH_SIZE) + "_"+ str(config['precision']) + ".engine" - -trt.utils.write_engine_to_file(save_path, engine.serialize()) - -print("Saved TRT engine to {}".format(save_path)) diff --git a/Pilot1/NT3/training.log b/Pilot1/NT3/training.log deleted file mode 100644 index c96faefb..00000000 --- a/Pilot1/NT3/training.log +++ /dev/null @@ -1,3 +0,0 @@ -epoch,acc,loss,val_acc,val_loss -0,0.5892857164144516,0.6910436621734074,0.51428571769169396,0.68834920866148808 -1,0.55446428805589676,0.6870809867978096,0.57857143453189308,0.68404603004455566 diff --git a/Pilot1/P1B1/p1b1.py b/Pilot1/P1B1/p1b1.py index 6eccb92c..d40a9865 100644 --- a/Pilot1/P1B1/p1b1.py +++ b/Pilot1/P1B1/p1b1.py @@ -68,29 +68,32 @@ {'name':'tsne', 'type': candle.str2bool, 'default': False, - 'help':'generate tsne plot of the latent representation'} + 'help':'generate tsne plot of the latent representation'}, +{'name':'alpha_dropout', + 'type': candle.str2bool, + 'default': False, + 'help':'use the AlphaDropout layer from keras instead of regular Dropout'} ] required = [ 'activation', 'batch_size', 'dense', - 'drop', + 'dropout', 'epochs', 'initialization', 'learning_rate', 'loss', - 'noise_factor', + #'noise_factor', 'optimizer', 'rng_seed', 'model', 'scaling', - 'validation_split', + 'val_split', 'latent_dim', 'feature_subsample', 'batch_normalization', 'epsilon_std', - 'solr_root', 'timeout' ] @@ -125,8 +128,8 @@ def extension_from_parameters(params, framework=''): ext += '.EPS={}'.format(params['epsilon_std']) if params['feature_subsample'] > 0: ext += '.FS={}'.format(params['feature_subsample']) - if params['drop']: - ext += '.DR={}'.format(params['drop']) + if params['dropout']: + ext += 
'.DR={}'.format(params['dropout']) if params['alpha_dropout']: ext += '.AD' if params['batch_normalization']: @@ -150,15 +153,15 @@ def load_data(params, seed): if params['use_landmark_genes']: lincs_file = 'lincs1000.tsv' - lincs_path = candle.fetch_file(params['url_p1b1'] + lincs_file, 'Pilot1') + lincs_path = candle.fetch_file(params['data_url'] + lincs_file, 'Pilot1') df_l1000 = pd.read_csv(lincs_path, sep='\t') x_cols = df_l1000['gdc'].tolist() drop_cols = None else: x_cols = None - train_path = candle.fetch_file(params['url_p1b1'] + params['file_train'], 'Pilot1') - test_path = candle.fetch_file(params['url_p1b1'] + params['file_test'], 'Pilot1') + train_path = candle.fetch_file(params['data_url'] + params['train_data'], 'Pilot1') + test_path = candle.fetch_file(params['data_url'] + params['test_data'], 'Pilot1') return candle.load_csv_data(train_path, test_path, x_cols=x_cols, @@ -168,8 +171,8 @@ def load_data(params, seed): n_cols=params['feature_subsample'], shuffle=params['shuffle'], scaling=params['scaling'], - dtype=params['datatype'], - validation_split=params['validation_split'], + dtype=params['data_type'], + validation_split=params['val_split'], return_dataframe=False, return_header=True, seed=seed) @@ -185,22 +188,22 @@ def load_data_orig(params, seed): if params['use_landmark_genes']: lincs_file = 'lincs1000.tsv' - lincs_path = candle.fetch_file(url_p1b1 + lincs_file) + lincs_path = candle.fetch_file(params['data_url'] + lincs_file) df_l1000 = pd.read_csv(lincs_path, sep='\t') usecols = df_l1000['gdc'] drop_cols = None else: usecols = None - return candle.load_X_data(params['url_p1b1'], params['file_train'], params['file_test'], + return candle.load_X_data(params['data_url'], params['train_data'], params['test_data'], drop_cols=drop_cols, onehot_cols=onehot_cols, usecols=usecols, n_cols=params['feature_subsample'], shuffle=params['shuffle'], scaling=params['scaling'], - validation_split=params['validation_split'], - dtype=params['datatype'], + 
validation_split=params['val_split'], + dtype=params['data_type'], seed=seed) diff --git a/Pilot1/P1B1/p1b1_baseline_keras2.py b/Pilot1/P1B1/p1b1_baseline_keras2.py index 1b515c5e..05177230 100644 --- a/Pilot1/P1B1/p1b1_baseline_keras2.py +++ b/Pilot1/P1B1/p1b1_baseline_keras2.py @@ -6,7 +6,7 @@ from keras import backend as K from keras import optimizers from keras.models import Model -from keras.layers import BatchNormalization, Dense, Dropout, Input, Lambda +from keras.layers import BatchNormalization, Dense, Dropout, Input, Lambda, AlphaDropout from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard from keras.metrics import binary_crossentropy, mean_squared_error from scipy.stats.stats import pearsonr @@ -104,14 +104,14 @@ def build_type_classifier(x_train, y_train, x_test, y_test): print(acc) return clf -def initialize_parameters(): +def initialize_parameters(default_model = 'p1b1_default_model.txt'): # Build benchmark object - p1b1Bmk = p1b1.BenchmarkP1B1(p1b1.file_path, 'p1b1_default_model.txt', 'keras', + p1b1Bmk = p1b1.BenchmarkP1B1(p1b1.file_path, default_model, 'keras', prog='p1b1_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') # Initialize parameters - gParameters = candle.initialize_parameters(p1b1Bmk) + gParameters = candle.finalize_parameters(p1b1Bmk) #p1b1.logger.info('Params: {}'.format(gParameters)) return gParameters @@ -189,9 +189,9 @@ def run(params): latent_dim = params['latent_dim'] activation = params['activation'] - dropout = params['drop'] + dropout = params['dropout'] dense_layers = params['dense'] - dropout_layer = keras.layers.noise.AlphaDropout if params['alpha_dropout'] else Dropout + dropout_layer = AlphaDropout if params['alpha_dropout'] else Dropout # Initialize weights and learning rule initializer_weights = candle.build_initializer(params['initialization'], keras_defaults, seed) diff --git a/Pilot1/P1B1/p1b1_default_model.txt 
b/Pilot1/P1B1/p1b1_default_model.txt index 3b319dde..a2a9051a 100644 --- a/Pilot1/P1B1/p1b1_default_model.txt +++ b/Pilot1/P1B1/p1b1_default_model.txt @@ -1,7 +1,7 @@ [Global_Params] -url_p1b1 = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' -file_train = 'P1B1.dev.train.csv' -file_test = 'P1B1.dev.test.csv' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' +train_data = 'P1B1.dev.train.csv' +test_data = 'P1B1.dev.test.csv' model_name='p1b1' dense=[2000, 600] batch_size=100 @@ -9,13 +9,12 @@ epochs=100 activation='relu' loss='mse' optimizer='adam' -drop=0 +dropout=0 learning_rate=None base_lr=None scaling='minmax' model='ae' -noise_factor=0 -validation_split=0.1 +val_split=0.1 epsilon_std=1.0 rng_seed=2017 initialization='glorot_uniform' @@ -23,8 +22,7 @@ latent_dim=2 feature_subsample=0 batch_normalization=False alpha_dropout=False -save_path='save' +save_path='save/' [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/P1B1/p1b1_perf_bench_model.txt b/Pilot1/P1B1/p1b1_perf_bench_model.txt index 877c628b..04c791b7 100644 --- a/Pilot1/P1B1/p1b1_perf_bench_model.txt +++ b/Pilot1/P1B1/p1b1_perf_bench_model.txt @@ -1,7 +1,7 @@ [Global_Params] -url_p1b1 = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' -file_train = 'P1B1.dev.train.csv' -file_test = 'P1B1.dev.test.csv' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' +train_data = 'P1B1.dev.train.csv' +test_data = 'P1B1.dev.test.csv' model_name='p1b1' dense=[2000, 1000, 500, 200, 100] batch_size=100 @@ -9,13 +9,13 @@ epochs=500 activation='relu' loss='mse' optimizer='adam' -drop=0 +dropout=0 learning_rate=None base_lr=None scaling='minmax' model='ae' noise_factor=0 -validation_split=0.1 +val_split=0.1 epsilon_std=1.0 rng_seed=2017 initialization='glorot_uniform' @@ -27,5 +27,4 @@ save='save' use_landmark_genes=True [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/P1B2/p1b2.py b/Pilot1/P1B2/p1b2.py index 206005d9..70bb9b8f 
100644 --- a/Pilot1/P1B2/p1b2.py +++ b/Pilot1/P1B2/p1b2.py @@ -29,7 +29,12 @@ logger = logging.getLogger(__name__) -additional_definitions = [] +additional_definitions = [ +{'name':'reg_l2', +'type': float, +'default': 0., +'help':'weight of regularization for l2 norm of nn weights'} +] required = [ 'data_url', @@ -38,17 +43,17 @@ 'activation', 'batch_size', 'dense', - 'drop', + 'dropout', 'epochs', 'feature_subsample', 'initialization', 'learning_rate', 'loss', 'optimizer', - 'penalty', + 'reg_l2', 'rng_seed', 'scaling', - 'validation_split', + 'val_split', 'shuffle' ] @@ -71,7 +76,7 @@ def extension_from_parameters(params, framework): ext = framework ext += '.A={}'.format(params['activation']) ext += '.B={}'.format(params['batch_size']) - ext += '.D={}'.format(params['drop']) + ext += '.D={}'.format(params['dropout']) ext += '.E={}'.format(params['epochs']) if params['feature_subsample']: ext += '.F={}'.format(params['feature_subsample']) @@ -93,8 +98,8 @@ def load_data_one_hot(params, seed): n_cols=params['feature_subsample'], shuffle=params['shuffle'], scaling=params['scaling'], - validation_split=params['validation_split'], - dtype=params['datatype'], + validation_split=params['val_split'], + dtype=params['data_type'], seed=seed) @@ -108,8 +113,8 @@ def load_data(params, seed): n_cols=params['feature_subsample'], shuffle=params['shuffle'], scaling=params['scaling'], - validation_split=params['validation_split'], - dtype=params['datatype'], + validation_split=params['val_split'], + dtype=params['data_type'], seed=seed) diff --git a/Pilot1/P1B2/p1b2_baseline_keras2.py b/Pilot1/P1B2/p1b2_baseline_keras2.py index 504cd421..a8e85dff 100644 --- a/Pilot1/P1B2/p1b2_baseline_keras2.py +++ b/Pilot1/P1B2/p1b2_baseline_keras2.py @@ -17,14 +17,14 @@ import p1b2 import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'p1b2_default_model.txt'): # Build benchmark object - p1b2Bmk = p1b2.BenchmarkP1B2(p1b2.file_path, 'p1b2_default_model.txt', 
'keras', + p1b2Bmk = p1b2.BenchmarkP1B2(p1b2.file_path, default_model, 'keras', prog='p1b2_baseline', desc='Train Classifier - Pilot 1 Benchmark 2') # Initialize parameters - gParameters = candle.initialize_parameters(p1b2Bmk) + gParameters = candle.finalize_parameters(p1b2Bmk) #p1b2.logger.info('Params: {}'.format(gParameters)) return gParameters @@ -34,7 +34,10 @@ def run(gParameters): # Construct extension to save model ext = p1b2.extension_from_parameters(gParameters, '.keras') - logfile = gParameters['logfile'] if gParameters['logfile'] else gParameters['save_path']+ext+'.log' + candle.verify_path(gParameters['save_path']) + prefix = '{}{}'.format(gParameters['save_path'], ext) + logfile = gParameters['logfile'] if gParameters['logfile'] else prefix+'.log' + candle.set_up_logger(logfile, p1b2.logger, gParameters['verbose']) p1b2.logger.info('Params: {}'.format(gParameters)) # Get default parameters for initialization and optimizer functions @@ -80,16 +83,16 @@ def run(gParameters): x = Dense(l, activation=activation, kernel_initializer=initializer_weights, bias_initializer=initializer_bias, - kernel_regularizer=l2(gParameters['penalty']), - activity_regularizer=l2(gParameters['penalty']))(input_vector) + kernel_regularizer=l2(gParameters['reg_l2']), + activity_regularizer=l2(gParameters['reg_l2']))(input_vector) else: x = Dense(l, activation=activation, kernel_initializer=initializer_weights, bias_initializer=initializer_bias, - kernel_regularizer=l2(gParameters['penalty']), - activity_regularizer=l2(gParameters['penalty']))(x) - if gParameters['drop']: - x = Dropout(gParameters['drop'])(x) + kernel_regularizer=l2(gParameters['reg_l2']), + activity_regularizer=l2(gParameters['reg_l2']))(x) + if gParameters['dropout']: + x = Dropout(gParameters['dropout'])(x) output = Dense(output_dim, activation=activation, kernel_initializer=initializer_weights, bias_initializer=initializer_bias)(x) diff --git a/Pilot1/P1B2/p1b2_baseline_mxnet.py 
b/Pilot1/P1B2/p1b2_baseline_mxnet.py index 3c423256..999fde02 100644 --- a/Pilot1/P1B2/p1b2_baseline_mxnet.py +++ b/Pilot1/P1B2/p1b2_baseline_mxnet.py @@ -88,8 +88,8 @@ def main(): for i,l in enumerate(layers): net = mx.sym.FullyConnected(data=net, num_hidden=l) net = mx.sym.Activation(data=net, act_type=activation) - if gParameters['drop']: - net = mx.sym.Dropout(data=net, p=gParameters['drop']) + if gParameters['dropout']: + net = mx.sym.Dropout(data=net, p=gParameters['dropout']) net = mx.sym.FullyConnected(data=net, num_hidden=num_classes)# 1) net = mx.symbol.SoftmaxOutput(data=net, label=out) diff --git a/Pilot1/P1B2/p1b2_baseline_neon.py b/Pilot1/P1B2/p1b2_baseline_neon.py index e36ef67c..caea982b 100644 --- a/Pilot1/P1B2/p1b2_baseline_neon.py +++ b/Pilot1/P1B2/p1b2_baseline_neon.py @@ -101,7 +101,7 @@ def main(): rng_seed=seed, device_id=args.device_id, batch_size=gParameters['batch_size'], - datatype=gParameters['datatype'], + datatype=gParameters['data_type'], max_devices=args.max_devices, compat_mode=args.compat_mode) @@ -123,8 +123,8 @@ def main(): for layer in gParameters['dense']: if layer: layers.append(Affine(nout=layer, init=initializer_weights, bias=initializer_bias, activation=activation)) - if gParameters['drop']: - layers.append(Dropout(keep=(1-gParameters['drop']))) + if gParameters['dropout']: + layers.append(Dropout(keep=(1-gParameters['dropout']))) layers.append(Affine(nout=output_dim, init=initializer_weights, bias=initializer_bias, activation=activation)) diff --git a/Pilot1/P1B2/p1b2_default_model.txt b/Pilot1/P1B2/p1b2_default_model.txt index 10802e7e..33e37f87 100644 --- a/Pilot1/P1B2/p1b2_default_model.txt +++ b/Pilot1/P1B2/p1b2_default_model.txt @@ -2,6 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/' train_data = 'P1B2.train.csv' test_data = 'P1B2.test.csv' +model_name='p1b2' dense=[1024, 512, 256] batch_size=60 epochs=1 @@ -10,10 +11,10 @@ loss='categorical_crossentropy' optimizer='rmsprop' 
learning_rate=0.001 scaling='minmax' -drop=0. +dropout=0. feature_subsample=0 -penalty=0.00001 -validation_split=0.1 +reg_l2=0.00001 +val_split=0.1 rng_seed=2017 initialization='glorot_uniform' save_path='save' diff --git a/Pilot1/P1B3/p1b3.py b/Pilot1/P1B3/p1b3.py index 4683c042..3b29ad32 100644 --- a/Pilot1/P1B3/p1b3.py +++ b/Pilot1/P1B3/p1b3.py @@ -18,7 +18,10 @@ from itertools import cycle, islice -from sklearn.preprocessing import Imputer +try: + from sklearn.impute import SimpleImputer as Imputer +except ImportError: + from sklearn.preprocessing import Imputer from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler file_path = os.path.dirname(os.path.realpath(__file__)) @@ -104,7 +107,7 @@ def set_locals(self): 'batch_normalization', 'category_cutoffs', 'cell_features', - 'drop', + 'dropout', 'drug_features', 'epochs', 'feature_subsample', @@ -119,7 +122,7 @@ def set_locals(self): 'scaling', 'subsample', 'test_cell_split', - 'validation_split', + 'val_split', 'cell_noise_sigma' ] @@ -193,7 +196,7 @@ def set_locals(self): # fileParams['batch_normalization']=eval(config.get(section[0],'batch_normalization')) # fileParams['category_cutoffs']=eval(config.get(section[0],'category_cutoffs')) # fileParams['cell_features']=eval(config.get(section[0],'cell_features')) -# fileParams['drop']=eval(config.get(section[0],'drop')) +# fileParams['dropout']=eval(config.get(section[0],'dropout')) # fileParams['drug_features']=eval(config.get(section[0],'drug_features')) # fileParams['epochs']=eval(config.get(section[0],'epochs')) # fileParams['feature_subsample']=eval(config.get(section[0],'feature_subsample')) @@ -208,7 +211,7 @@ def set_locals(self): # fileParams['scaling']=eval(config.get(section[0],'scaling')) # fileParams['subsample']=eval(config.get(section[0],'subsample')) # fileParams['test_cell_split']=eval(config.get(section[0],'test_cell_split')) -# fileParams['validation_split']=eval(config.get(section[0],'validation_split')) +# 
fileParams['val_split']=eval(config.get(section[0],'val_split')) # fileParams['cell_noise_sigma']=eval(config.get(section[0],'cell_noise_sigma')) # # # parse the remaining values @@ -241,7 +244,7 @@ def extension_from_parameters(params, framework): ext = framework ext += '.A={}'.format(params['activation']) ext += '.B={}'.format(params['batch_size']) - ext += '.D={}'.format(params['drop']) + ext += '.D={}'.format(params['dropout']) ext += '.E={}'.format(params['epochs']) if params['feature_subsample']: ext += '.F={}'.format(params['feature_subsample']) @@ -318,7 +321,8 @@ def impute_and_scale(df, scaling='std'): df = df.dropna(axis=1, how='all') - imputer = Imputer(strategy='mean', axis=0) + #imputer = Imputer(strategy='mean', axis=0) + imputer = Imputer(strategy='mean') mat = imputer.fit_transform(df) if scaling is None or scaling.lower() == 'none': diff --git a/Pilot1/P1B3/p1b3_baseline_keras2.py b/Pilot1/P1B3/p1b3_baseline_keras2.py index 8458684d..c9c858a4 100644 --- a/Pilot1/P1B3/p1b3_baseline_keras2.py +++ b/Pilot1/P1B3/p1b3_baseline_keras2.py @@ -28,14 +28,14 @@ #np.set_printoptions(threshold=np.nan) -def initialize_parameters(): +def initialize_parameters(default_model = 'p1b3_default_model.txt'): # Build benchmark object - p1b3Bmk = benchmark.BenchmarkP1B3(benchmark.file_path, 'p1b3_default_model.txt', 'keras', + p1b3Bmk = benchmark.BenchmarkP1B3(benchmark.file_path, default_model, 'keras', prog='p1b3_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') # Initialize parameters - gParameters = candle.initialize_parameters(p1b3Bmk) + gParameters = candle.finalize_parameters(p1b3Bmk) #benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters @@ -259,8 +259,8 @@ def run(gParameters): seed = gParameters['rng_seed'] # Build dataset loader object - loader = benchmark.DataLoader(seed=seed, dtype=gParameters['datatype'], - val_split=gParameters['validation_split'], + loader = 
benchmark.DataLoader(seed=seed, dtype=gParameters['data_type'], + val_split=gParameters['val_split'], test_cell_split=gParameters['test_cell_split'], cell_features=gParameters['cell_features'], drug_features=gParameters['drug_features'], @@ -292,8 +292,8 @@ def run(gParameters): if gParameters['batch_normalization']: model.add(BatchNormalization()) model.add(Activation(gParameters['activation'])) - if gParameters['drop']: - model.add(Dropout(gParameters['drop'])) + if gParameters['dropout']: + model.add(Dropout(gParameters['dropout'])) else: # Build convolutional layers gen_shape = 'add_1d' layer_list = list(range(0, len(gParameters['conv']))) @@ -359,8 +359,7 @@ def run(gParameters): validation_steps=val_steps, verbose=0, callbacks=[checkpointer, loss_history, progbar, candleRemoteMonitor], - pickle_safe=True, - workers=gParameters['workers']) + ) benchmark.logger.removeHandler(fh) benchmark.logger.removeHandler(sh) diff --git a/Pilot1/P1B3/p1b3_conv_model.txt b/Pilot1/P1B3/p1b3_conv_model.txt index 67af6d18..7f38c676 100644 --- a/Pilot1/P1B3/p1b3_conv_model.txt +++ b/Pilot1/P1B3/p1b3_conv_model.txt @@ -8,9 +8,9 @@ loss = 'mse' optimizer = 'sgd' learning_rate = 0.001 scaling = 'std' -drop = 0.1 +dropout = 0.1 feature_subsample = 0 -validation_split = 0.1 +val_split = 0.1 rng_seed = 2017 initialization = 'normal' min_logconc = -5. diff --git a/Pilot1/P1B3/p1b3_default_model.txt b/Pilot1/P1B3/p1b3_default_model.txt index 9d4645de..70beb20c 100644 --- a/Pilot1/P1B3/p1b3_default_model.txt +++ b/Pilot1/P1B3/p1b3_default_model.txt @@ -7,9 +7,9 @@ loss='mse' optimizer='sgd' learning_rate=0.001 scaling='std' -drop=0.1 +dropout=0.1 feature_subsample=0 -validation_split=0.1 +val_split=0.1 rng_seed=2017 initialization='normal' min_logconc=-5. 
diff --git a/Pilot1/P1B3/p1b3_perf_bench_model.txt b/Pilot1/P1B3/p1b3_perf_bench_model.txt index fc5aec7e..a288e817 100644 --- a/Pilot1/P1B3/p1b3_perf_bench_model.txt +++ b/Pilot1/P1B3/p1b3_perf_bench_model.txt @@ -7,9 +7,9 @@ loss='mse' optimizer='sgd' learning_rate=0.001 scaling='std' -drop=0.1 +dropout=0.1 feature_subsample=500 -validation_split=0.1 +val_split=0.1 rng_seed=2017 initialization='normal' min_logconc=-5. diff --git a/Pilot1/T29/README.candle b/Pilot1/T29/README.candle deleted file mode 100644 index 17f4a743..00000000 --- a/Pilot1/T29/README.candle +++ /dev/null @@ -1,43 +0,0 @@ -curl -o rip.it.test.csv.gz ftp://ftp.mcs.anl.gov/pub/candle/public/tutorials/t29res/rip.it.test.csv.gz -curl -o rip.it.train.csv.gz ftp://ftp.mcs.anl.gov/pub/candle/public/tutorials/t29res/rip.it.train.csv.gz -gunzip rip.it.test.csv.gz -gunzip rip.it.train.csv.gz - - -git checkout release_01 - -def initialize_parameters(): - t29_common = candle_keras.Benchmark(file_path, 't29_default_model.txt','keras', - prog='t29res.py',desc='resnet') - gParameters = candle.keras.initialize_parameters(t29_common) - return gParameters - - -# In the run method, get default settings for keras objects, -# such as those for the the optimizer. - -kerasDefaults = candle_keras.xkeras_default_config() -kerasDefaults['momentum_sgd'] = gParameters['momentum'] - -# In the run method, create the optimizer using user supplied -# parameters as well as those in the keras defaults. - -OPTIMIZER = keras_utils.build_optimizer(gParameters['optimizer'], - gParameters['learning_rate'], - kerasDefaults) - -# Add additional arguements that are not represented in the default -# arguments (Need a reference to the list of default arguements). 
- -additional_definitions = [ - {'name':'connections', - 'default':1, - 'type':int, - 'help':'The number of residual connections.'}, - {'name':'distance', - 'default':1, - 'type':int, - 'help':'Residual connection distance between dense layers.'} -] - -# To configure the width of the dense layers diff --git a/Pilot1/T29/infer.py b/Pilot1/T29/infer.py deleted file mode 100644 index 471bcde7..00000000 --- a/Pilot1/T29/infer.py +++ /dev/null @@ -1,136 +0,0 @@ -import pandas as pd -import numpy as np -import os -import sys -import keras as ke -from keras.models import Sequential, Model, model_from_json, model_from_yaml -from keras.utils import np_utils -from keras import backend as K -from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, LearningRateScheduler -from sklearn.metrics import accuracy_score -from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler - - -file_path = os.path.dirname(os.path.realpath(__file__)) - -# candle -sys.path.append('/raid/brettin/Benchmarks/common') -import candle - -# This needs to be fixed -# candle -def initialize_parameters(): - t29_common = candle.Benchmark(file_path, 't29_default_model.txt','keras', - prog='t29res.py',desc='resnet') - - # Need a pointer to the docs showing what is provided - # by default - additional_definitions = [ - {'name':'connections', - 'default':1, - 'type':int, - 'help':'The number of residual connections.'}, - {'name':'distance', - 'default':1, - 'type':int, - 'help':'Residual connection distance between dense layers.'}, - {'name':'model', - 'default':'model.json', - 'type':str, - 'help':'Name of json model description file.'}, - {'name':'weights', - 'default':'model.h5', - 'type':str, - 'help':'Name of h5 weights file.'}, - {'name':'n_pred', - 'default':1, - 'type':int, - 'help':'Number of predictions to do on each sample.'} - ] - t29_common.additional_definitions = additional_definitions - gParameters = candle.initialize_parameters(t29_common) - return 
gParameters - - -def load_data(gParameters): - train_path=gParameters['train_path'] - test_path=gParameters['test_path'] - df_train = (pd.read_csv(train_path,header=None).values).astype('float32') - df_test = (pd.read_csv(test_path,header=None).values).astype('float32') - - print('df_train shape:', df_train.shape) - print('df_test shape:', df_test.shape) - - df_y_train = df_train[:,0].astype('int') - df_y_test = df_test[:,0].astype('int') - - Y_train = np_utils.to_categorical(df_y_train,gParameters['classes']) - train_classes = np.argmax(Y_train, axis=1) - - Y_test = np_utils.to_categorical(df_y_test,gParameters['classes']) - test_classes = np.argmax(Y_test, axis=1) - - df_x_train = df_train[:, 1:df_train.shape[1]].astype(np.float32) - df_x_test = df_test[:, 1:df_train.shape[1]].astype(np.float32) - - # not sure the extra variable is needed, and is this a copy or reference - X_train = df_x_train - X_test = df_x_test - - scaler = MaxAbsScaler() - mat = np.concatenate((X_train, X_test), axis=0) - mat = scaler.fit_transform(mat) - - X_train = mat[:X_train.shape[0], :] - X_test = mat[X_train.shape[0]:, :] - - return X_train, Y_train, X_test, Y_test - -# This is required for candle compliance. 
-# It essentially wraps what was in the implicit main funcion -def run(gParameters): - print ('gParameters: ', gParameters) - - # load the data - X_train, Y_train, X_test, Y_test = load_data(gParameters) - - # load json and create model - json_file = open(gParameters['model'], 'r') - loaded_model_json = json_file.read() - json_file.close() - loaded_model_json = model_from_json(loaded_model_json) - - # load weights into new model - loaded_model_json.load_weights(gParameters['weights']) - print("Loaded json model from disk") - - # predict using loaded yaml model on test and training data - pred_test_df = pd.DataFrame() - pred_test_classes_df = pd.DataFrame() - - for x in range(gParameters['n_pred']): - predict_test = loaded_model_json.predict(X_test) - pred_test_df[str(x)] = np.amax(predict_test, axis=1) - pred_test_classes_df[str(x)] = np.argmax(predict_test, axis=1) - - pred_test_df['mean'] = pred_test_df.mean(axis=1) - pred_test_df['std'] = pred_test_df.std(axis=1) - - pred_test_df.to_csv("predict_test.csv") - pred_test_classes_df.to_csv("predict_test_classes.csv") - return - -# This is also added for candle compliance so that the program can -# still be executed independently from the command line. 
-def main(): - - gParameters = initialize_parameters() - run(gParameters) - -if __name__ == '__main__': - main() - try: - ke.clear_session() - except AttributeError: # theano does not have this function - pass - diff --git a/Pilot1/T29/t29_default_model.txt b/Pilot1/T29/t29_default_model.txt deleted file mode 100644 index 8d143315..00000000 --- a/Pilot1/T29/t29_default_model.txt +++ /dev/null @@ -1,13 +0,0 @@ -[Global_Params] -train_path='./rip.it.train.csv' -test_path='./rip.it.test.csv' -batch_size=64 -epochs=100 -drop=0.2 -classes=2 -optimizer='sgd' -learning_rate=0.002 -momentum=0.42 -loss='categorical_crossentropy' -activation='relu' -CHECK=1 diff --git a/Pilot1/T29/t29res.py b/Pilot1/T29/t29res.py deleted file mode 100644 index b66e41be..00000000 --- a/Pilot1/T29/t29res.py +++ /dev/null @@ -1,292 +0,0 @@ -import pandas as pd -import numpy as np -import os -import sys -import gzip -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt -import keras as ke -from keras.layers import Input, Dense, Dropout, Activation -from keras.optimizers import SGD, Adam, RMSprop -from keras.models import Sequential, Model, model_from_json, model_from_yaml -from keras.utils import np_utils -from keras import backend as K -from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, LearningRateScheduler -from sklearn.metrics import accuracy_score -from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler - - -file_path = os.path.dirname(os.path.realpath(__file__)) - -# candle -sys.path.append('/raid/brettin/Benchmarks/common') -import candle - -# candle -def initialize_parameters(): - t29_common = candle.Benchmark(file_path, 't29_default_model.txt','keras', - prog='t29res.py',desc='resnet') - - # Need a pointer to the docs showing what is provided - # by default - additional_definitions = [ - {'name':'connections', - 'default':1, - 'type':int, - 'help':'The number of residual connections.'}, - {'name':'distance', - 
'default':1, - 'type':int, - 'help':'Residual connection distance between dense layers.'} - ] - t29_common.additional_definitions = additional_definitions - gParameters = candle.initialize_parameters(t29_common) - return gParameters - - -def load_data(nb_classes, PL, gParameters): - train_path=gParameters['train_path'] - test_path=gParameters['test_path'] - df_train = (pd.read_csv(train_path,header=None).values).astype('float32') - df_test = (pd.read_csv(test_path,header=None).values).astype('float32') - - print('df_train shape:', df_train.shape) - print('df_test shape:', df_test.shape) - - df_y_train = df_train[:,0].astype('int') - df_y_test = df_test[:,0].astype('int') - - Y_train = np_utils.to_categorical(df_y_train,nb_classes) - train_classes = np.argmax(Y_train, axis=1) - np.savetxt("train_classes.csv", train_classes, delimiter=",", fmt="%d") - - Y_test = np_utils.to_categorical(df_y_test,nb_classes) - test_classes = np.argmax(Y_test, axis=1) - np.savetxt("test_classes.csv", test_classes, delimiter=",", fmt="%d") - - df_x_train = df_train[:, 1:PL].astype(np.float32) - df_x_test = df_test[:, 1:PL].astype(np.float32) - - # not sure the extra variable is needed, and is this a copy or reference - X_train = df_x_train - X_test = df_x_test - - scaler = MaxAbsScaler() - mat = np.concatenate((X_train, X_test), axis=0) - mat = scaler.fit_transform(mat) - - X_train = mat[:X_train.shape[0], :] - X_test = mat[X_train.shape[0]:, :] - - return X_train, Y_train, X_test, Y_test - -# Create residual connections -# x is input -# distance is distance to residual connection - -# this is a function I added so that we could include -# the distance between residually connected layers -# and the number of residual connections needed -def f(x, gParameters, distance=1): - input = x - for i in range(distance): - if 'drop' in gParameters: - x = Dropout(gParameters['drop'])(x) - x = Dense(1000, activation=gParameters['activation'])(x) - y = ke.layers.add([input,x]) - return y - -# This is 
required for candle compliance. -# It essentially wraps what was in the implicit main funcion -def run(gParameters): - print ('gParameters: ', gParameters) - - EPOCH = gParameters['epochs'] - BATCH = gParameters['batch_size'] - nb_classes = gParameters['classes'] - DR = gParameters['drop'] - ACTIVATION = gParameters['activation'] - kerasDefaults = candle.keras_default_config() - kerasDefaults['momentum_sgd'] = gParameters['momentum'] - OPTIMIZER = candle.build_optimizer(gParameters['optimizer'], - gParameters['learning_rate'], - kerasDefaults) - PL = 6213 # 38 + 60483 - PS = 6212 # 60483 - - X_train, Y_train, X_test, Y_test = load_data(nb_classes, PL, gParameters) - - print('X_train shape:', X_train.shape) - print('X_test shape:', X_test.shape) - - print('Y_train shape:', Y_train.shape) - print('Y_test shape:', Y_test.shape) - - - inputs = Input(shape=(PS,)) - - x = Dense(2000, activation=ACTIVATION)(inputs) - x = Dense(1000, activation=ACTIVATION)(x) - - for i in range(gParameters['connections']): - x = f(x, gParameters, distance=gParameters['distance'] ) - - x = Dropout(DR)(x) - - x = Dense(500, activation=ACTIVATION)(x) - x = Dropout(DR)(x) - x = Dense(250, activation=ACTIVATION)(x) - x = Dropout(DR)(x) - x = Dense(125, activation=ACTIVATION)(x) - x = Dropout(DR)(x) - x = Dense(62, activation=ACTIVATION)(x) - x = Dropout(DR)(x) - x = Dense(30, activation=ACTIVATION)(x) - x = Dropout(DR)(x) - outputs = Dense(2, activation='softmax')(x) - - model = Model(inputs=inputs, outputs=outputs) - model.summary() - model.compile(loss='categorical_crossentropy', - optimizer=OPTIMIZER, - metrics=['accuracy']) - - # set up a bunch of callbacks to do work during model training. 
- checkpointer = ModelCheckpoint(filepath='t29res.autosave.model.h5', verbose=0, save_weights_only=False, save_best_only=True) - csv_logger = CSVLogger('t29res.training.log') - reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.4, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=3, min_lr=0.000000001) - callbacks = [checkpointer, csv_logger, reduce_lr] - - def warmup_scheduler(epoch): - lr=gParameters['learning_rate'] - if epoch <= 4: - K.set_value(model.optimizer.lr, (lr * (epoch+1) / 5)) - print ('Epoch {}: lr={}'.format(epoch, K.get_value(model.optimizer.lr))) - return K.get_value(model.optimizer.lr) - - if 'warmup_lr' in gParameters: - - warmup_lr = LearningRateScheduler(warmup_scheduler) - print("adding LearningRateScheduler") - callbacks.append(warmup_lr) - - - history = model.fit(X_train, Y_train, - batch_size=BATCH, - epochs=EPOCH, - verbose=1, - validation_data=(X_test, Y_test), - callbacks = callbacks) - - score = model.evaluate(X_test, Y_test, verbose=0) - - # summarize history for accuracy - plt.plot(history.history['acc']) - plt.plot(history.history['val_acc']) - plt.title('Model Accuracy') - plt.ylabel('accuracy') - plt.xlabel('epoch') - plt.legend(['train', 'test'], loc='upper left') - - plt.savefig('t29res.accuracy.png', bbox_inches='tight') - plt.savefig('t29res.accuracy.pdf', bbox_inches='tight') - - plt.close() - - # summarize history for loss - plt.plot(history.history['loss']) - plt.plot(history.history['val_loss']) - plt.title('Model Loss') - plt.ylabel('loss') - plt.xlabel('epoch') - plt.legend(['train', 'test'], loc='upper left') - - plt.savefig('t29res.loss.png', bbox_inches='tight') - plt.savefig('t29res.loss.pdf', bbox_inches='tight') - - print('Test val_loss:', score[0]) - print('Test accuracy:', score[1]) - - # serialize model to JSON - model_json = model.to_json() - with open("t29res.model.json", "w") as json_file: - json_file.write(model_json) - - # serialize model to YAML - model_yaml = model.to_yaml() - with 
open("t29res.model.yaml", "w") as yaml_file: - yaml_file.write(model_yaml) - - # serialize weights to HDF5 - model.save_weights("t29res.model.h5") - print("Saved model to disk") - - # load json and create model - json_file = open('t29res.model.json', 'r') - loaded_model_json = json_file.read() - json_file.close() - loaded_model_json = model_from_json(loaded_model_json) - - # load yaml and create model - yaml_file = open('t29res.model.yaml', 'r') - loaded_model_yaml = yaml_file.read() - yaml_file.close() - loaded_model_yaml = model_from_yaml(loaded_model_yaml) - - # load weights into new model - loaded_model_json.load_weights("t29res.model.h5") - print("Loaded json model from disk") - - # evaluate json loaded model on test data - loaded_model_json.compile(loss='binary_crossentropy', optimizer=gParameters['optimizer'], metrics=['accuracy']) - score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) - - print('json Validation loss:', score_json[0]) - print('json Validation accuracy:', score_json[1]) - print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) - - # load weights into new model - loaded_model_yaml.load_weights("t29res.model.h5") - print("Loaded yaml model from disk") - - # evaluate loaded model on test data - loaded_model_yaml.compile(loss='binary_crossentropy', optimizer=gParameters['optimizer'], metrics=['accuracy']) - score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) - - print('yaml Validation loss:', score_yaml[0]) - print('yaml Validation accuracy:', score_yaml[1]) - print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1]*100)) - - # predict using loaded yaml model on test and training data - predict_yaml_train = loaded_model_yaml.predict(X_train) - predict_yaml_test = loaded_model_yaml.predict(X_test) - - print('Yaml_train_shape:', predict_yaml_train.shape) - print('Yaml_test_shape:', predict_yaml_test.shape) - - predict_yaml_train_classes = np.argmax(predict_yaml_train, 
axis=1) - predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) - - np.savetxt("predict_yaml_train.csv", predict_yaml_train, delimiter=",", fmt="%.3f") - np.savetxt("predict_yaml_test.csv", predict_yaml_test, delimiter=",", fmt="%.3f") - - np.savetxt("predict_yaml_train_classes.csv", predict_yaml_train_classes, delimiter=",",fmt="%d") - np.savetxt("predict_yaml_test_classes.csv", predict_yaml_test_classes, delimiter=",",fmt="%d") - - return history - -# This is also added for candle compliance so that the program can -# still be executed independently from the command line. -def main(): - - gParameters = initialize_parameters() - run(gParameters) - -if __name__ == '__main__': - main() - try: - ke.clear_session() - except AttributeError: # theano does not have this function - pass - diff --git a/Pilot1/TC1/tc1.py b/Pilot1/TC1/tc1.py index b36c9663..5be856d1 100644 --- a/Pilot1/TC1/tc1.py +++ b/Pilot1/TC1/tc1.py @@ -18,6 +18,9 @@ 'nargs':'+', 'type': int, 'help':'network structure of shared layer'}, + {'name':'classes', + 'type':int, + 'default':36} ] required = [ @@ -28,17 +31,17 @@ 'conv', 'dense', 'activation', - 'out_act', + 'out_activation', 'loss', 'optimizer', 'feature_subsample', 'metrics', 'epochs', 'batch_size', - 'drop', + 'dropout', 'classes', 'pool', - 'save' + 'output_dir' ] @@ -69,4 +72,4 @@ def load_data(params): return candle.load_Xy_data_noheader(train_path, test_path, params['classes'], usecols, - scaling='maxabs',dtype=params['datatype']) + scaling='maxabs',dtype=params['data_type']) diff --git a/Pilot1/TC1/tc1_baseline_keras2.py b/Pilot1/TC1/tc1_baseline_keras2.py index bbb90057..bb916b09 100644 --- a/Pilot1/TC1/tc1_baseline_keras2.py +++ b/Pilot1/TC1/tc1_baseline_keras2.py @@ -30,14 +30,14 @@ import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'tc1_default_model.txt'): # Build benchmark object - tc1Bmk = bmk.BenchmarkTC1(file_path, 'tc1_default_model.txt', 'keras', + tc1Bmk = 
bmk.BenchmarkTC1(file_path, default_model, 'keras', prog='tc1_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') # Initialize parameters - gParameters = candle.initialize_parameters(tc1Bmk) + gParameters = candle.finalize_parameters(tc1Bmk) #benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters @@ -102,14 +102,14 @@ def run(gParameters): else: model.add(Dense(layer)) model.add(Activation(gParameters['activation'])) - if gParameters['drop']: - model.add(Dropout(gParameters['drop'])) + if gParameters['dropout']: + model.add(Dropout(gParameters['dropout'])) if dense_first: model.add(Flatten()) model.add(Dense(gParameters['classes'])) - model.add(Activation(gParameters['out_act'])) + model.add(Activation(gParameters['out_activation'])) model.summary() @@ -117,7 +117,7 @@ def run(gParameters): optimizer=gParameters['optimizer'], metrics=[gParameters['metrics']]) - output_dir = gParameters['save'] + output_dir = gParameters['output_dir'] if not os.path.exists(output_dir): os.makedirs(output_dir) diff --git a/Pilot1/TC1/tc1_default_model.txt b/Pilot1/TC1/tc1_default_model.txt index b70487e5..a22aed28 100644 --- a/Pilot1/TC1/tc1_default_model.txt +++ b/Pilot1/TC1/tc1_default_model.txt @@ -1,19 +1,19 @@ [Global_Params] -data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' train_data = 'type_18_300_train.csv' test_data = 'type_18_300_test.csv' model_name = 'tc1' conv=[128, 20, 1, 128, 10, 1] dense=[200,20] activation='relu' -out_act='softmax' +out_activation='softmax' loss='categorical_crossentropy' optimizer='sgd' metrics='accuracy' epochs=400 batch_size=20 -drop=0.1 +dropout=0.1 classes=36 feature_subsample=0 pool=[1, 10] -save='.' +output_dir='.' 
diff --git a/Pilot1/TC1/tc1_perf_bench_model.txt b/Pilot1/TC1/tc1_perf_bench_model.txt index 6c8d8168..dbf7fb04 100644 --- a/Pilot1/TC1/tc1_perf_bench_model.txt +++ b/Pilot1/TC1/tc1_perf_bench_model.txt @@ -1,19 +1,19 @@ [Global_Params] -data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' train_data = 'type_18_300_train.csv' test_data = 'type_18_300_test.csv' model_name = 'tc1' conv=[128, 20, 1, 128, 10, 1] dense=[200,20] activation='relu' -out_act='softmax' +out_activation='softmax' loss='categorical_crossentropy' optimizer='sgd' metrics='accuracy' epochs=20 batch_size=5 -drop=0.1 +dropout=0.1 classes=36 feature_subsample=0 pool=[1, 10] -save='.' +output_dir='.' diff --git a/Pilot1/Uno/README.AUC.md b/Pilot1/Uno/README.AUC.md new file mode 100644 index 00000000..902adb93 --- /dev/null +++ b/Pilot1/Uno/README.AUC.md @@ -0,0 +1,148 @@ +# Predicting AUC values for Top21 cancer types + +## Data prep +A static dataset is prebuilt and available at `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5`. + +``` +$ wget http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5 +``` + + +## Training +``` +python uno_baseline_keras2.py --config_file uno_auc_model.txt \ + --use_exported_data top_21_auc_1fold.uno.h5 --es True + +... 
+Params: +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'by_cell': None, + 'by_drug': None, + 'cache': None, + 'cell_feature_subset_path': '', + 'cell_features': ['rnaseq'], + 'cell_subset_path': '', + 'cell_types': None, + 'config_file': 'uno_auc_model.txt', + 'cp': True, + 'cv': 1, + 'datatype': , + 'dense': [1000, 1000, 1000, 1000, 1000], + 'dense_cell_feature_layers': None, + 'dense_drug_feature_layers': None, + 'dense_feature_layers': [1000, 1000, 1000], + 'drop': 0.1, + 'drug_feature_subset_path': '', + 'drug_features': ['descriptors'], + 'drug_median_response_max': 1, + 'drug_median_response_min': -1, + 'drug_subset_path': '', + 'epochs': 50, + 'es': True, + 'experiment_id': 'EXP000', + 'export_csv': None, + 'export_data': None, + 'feature_subsample': 0, + 'feature_subset_path': '', + 'gpus': [], + 'growth_bins': 0, + 'initial_weights': None, + 'learning_rate': 0.0001, + 'logfile': None, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'adamax', + 'output_dir': '/ssd1/homes/hsyoo/projects/CANDLE/Benchmarks/Pilot1/Uno/Output/EXP000/RUN000', + 'partition_by': None, + 'preprocess_rnaseq': 'source_scale', + 'profiling': False, + 'reduce_lr': True, + 'residual': False, + 'rng_seed': 2018, + 'run_id': 'RUN000', + 'save_path': 'save/uno', + 'save_weights': None, + 'scaling': 'std', + 'shuffle': False, + 'single': True, + 'tb': False, + 'tb_prefix': 'tb', + 'test_sources': ['train'], + 'timeout': -1, + 'train_bool': True, + 'train_sources': ['CCLE'], + 'use_exported_data': 'top_21_auc_1fold.uno.h5', + 'use_filtered_genes': False, + 'use_landmark_genes': True, + 'validation_split': 0.2, + 'verbose': None, + 'warmup_lr': True} + + ... +Total params: 16,224,001 +Trainable params: 16,224,001 +Non-trainable params: 0 +... 
+Between random pairs in y_val: + mse: 0.0474 + mae: 0.1619 + r2: -1.0103 + corr: -0.0051 +Data points per epoch: train = 423952, val = 52994, test = 52994 +Steps per epoch: train = 13248, val = 1656, test = 1656 +Epoch 1/50 +13248/13248 [==============================] - 102s 8ms/step - loss: 0.0268 - mae: 0.0794 - r2: -0.2754 - val_loss: 0.0092 - val_mae: 0.0725 - val_r2: 0.5657 +Current time ....101.892 +... +13248/13248 [==============================] - 102s 8ms/step - loss: 0.004572, lr: 0.000010, mae: 0.046159, r2: 0.782253, val_loss: 0.005335, val_mae: 0.049082, val_r2: 0.748585 +Comparing y_true and y_pred: + mse: 0.0053 + mae: 0.0490 + r2: 0.7742 + corr: 0.8800 +``` + + +## Inference +The script `uno_infer.py` takes several parameters for inference. You are required to specify a data file (the same dataset used for training, `top_21_auc_1fold.uno.h5` in this case), a model file, and trained weights. You can choose a partition as the inference input (training, validation, or all) and the number of predictions for each data point (-n). +``` +$ python uno_infer.py --data top_21_auc_1fold.uno.h5 \ + --model_file top21_ref/model.json \ + --weights_file top21_ref/weights.h5 \ + --partition val \ + -n 30 \ + --single True \ + --agg_dose AUC +... + mse: 0.0058 + mae: 0.0505 + r2: 0.7543 + corr: 0.8688 + mean std min max +mse: 0.0058, 0.0000, 0.0058, 0.0058 +mae: 0.0505, 0.0001, 0.0504, 0.0506 +r2: 0.7543, 0.0007, 0.7527, 0.7557 +corr: 0.8688, 0.0004, 0.8679, 0.8696 +``` + +After the inference script completes, you should be able to find the `uno_pred.all.tsv` and `uno_pred.tsv` files, which contain all predicted values and errors, and aggregated statistics for each data point, respectively.
See below for an example: +``` +$ head -n 4 uno_pred.all.tsv +AUC Sample Drug1 PredictedAUC AUCError +0.7153 CCLE.22RV1 CCLE.1 0.726853 0.011553 +0.7153 CCLE.22RV1 CCLE.1 0.745033 0.0297334 +0.7153 CCLE.22RV1 CCLE.1 0.752899 0.0375985 + +$ head -n 4 uno_pred.tsv +AUC Sample Drug1 PredAUCMean PredAUCStd PredAUCMin PredAUCMax +0.918 CTRP.HCC-1438 CTRP.302 0.954987 0.0109111 0.938283 0.983576 +0.6474 NCI60.IGR-OV1 NSC.757440 0.680934 0.0279046 0.644829 0.755912 +0.5675 NCI60.CCRF-CEM NSC.381866 0.591151 0.0228838 0.553855 0.645553 +``` diff --git a/Pilot1/Uno/README.md b/Pilot1/Uno/README.md index 96f46c45..c2c86f4e 100644 --- a/Pilot1/Uno/README.md +++ b/Pilot1/Uno/README.md @@ -7,7 +7,7 @@ Uno can be trained with a subset of dose response data sources. Here is an comma uno_baseline_keras2.py --train_sources all --cache cache/all --use_landmark_genes True --preprocess_rnaseq source_scale --no_feature_source True --no_response_source True Using TensorFlow backend. Params: {'activation': 'relu', 'batch_size': 32, 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'drop': 0, 'epochs': 10, 'learning_rate': None, 'loss': -'mse', 'optimizer': 'adam', 'residual': False, 'rng_seed': 2018, 'save': 'save/uno', 'scaling': 'std', 'feature_subsample': 0, 'validation_split': 0.2, 'solr_root': '', 'timeout' +'mse', 'optimizer': 'adam', 'residual': False, 'rng_seed': 2018, 'save': 'save/uno', 'scaling': 'std', 'feature_subsample': 0, 'validation_split': 0.2, 'timeout' : -1, 'train_sources': ['all'], 'test_sources': ['train'], 'cell_types': None, 'cell_features': ['rnaseq'], 'drug_features': ['descriptors', 'fingerprints'], 'cv': 1, 'max_val_lo ss': 1.0, 'base_lr': None, 'reduce_lr': False, 'warmup_lr': False, 'batch_normalization': False, 'no_gen': False, 'config_file': '/raid/fangfang/Benchmarks/Pilot1/Uno/uno_default _model.txt', 'verbose': False, 'logfile': None, 'train_bool': True, 'shuffle': True, 'alpha_dropout': False, 'gpus': [], 'experiment_id': 
'EXP.000', 'run_id': 'RUN.000', 'by_cell diff --git a/Pilot1/Uno/uno.py b/Pilot1/Uno/uno.py index d4731e50..bebea29c 100644 --- a/Pilot1/Uno/uno.py +++ b/Pilot1/Uno/uno.py @@ -48,7 +48,7 @@ def set_locals(self): 'help': 'use rnaseq cell line feature set or none at all'}, {'name': 'drug_features', 'nargs': '+', - 'choices': ['descriptors', 'fingerprints', 'none'], + 'choices': ['descriptors', 'fingerprints', 'none', 'mordred'], 'help': 'use dragon7 descriptors or fingerprint descriptors for drug features or none at all'}, {'name': 'by_cell', 'type': str, @@ -100,6 +100,16 @@ def set_locals(self): 'nargs': '+', 'type': int, 'help': 'number of neurons in intermediate dense layers in the feature encoding submodels'}, + {'name': 'dense_cell_feature_layers', + 'nargs': '+', + 'type': int, + 'default': None, + 'help': 'number of neurons in intermediate dense layers in the cell feature encoding submodels'}, + {'name': 'dense_drug_feature_layers', + 'nargs': '+', + 'type': int, + 'default': None, + 'help': 'number of neurons in intermediate dense layers in the drug feature encoding submodels'}, {'name': 'use_landmark_genes', 'type': candle.str2bool, 'default': False, @@ -140,6 +150,10 @@ def set_locals(self): 'type': float, 'default': None, 'help': 'base learning rate'}, + {'name': 'es', + 'type': candle.str2bool, + 'default': False, + 'help': 'early stopping on val_loss'}, {'name': 'cp', 'type': candle.str2bool, 'default': False, @@ -192,14 +206,14 @@ def set_locals(self): 'type': int, 'default': 0, 'help': 'number of bins to use when discretizing growth response'}, - {'name' : 'initial_weights', - 'type' : str, + {'name': 'initial_weights', + 'type': str, 'default': None, - 'help' : 'file name of initial weights'}, - {'name' : 'save_weights', + 'help': 'file name of initial weights'}, + {'name': 'save_weights', 'type': str, - 'default' : None, - 'help': 'name of file to save weights to' } + 'default': None, + 'help': 'name of file to save weights to'} ] required = [ @@ 
-207,7 +221,7 @@ def set_locals(self): 'batch_size', 'dense', 'dense_feature_layers', - 'drop', + 'dropout', 'epochs', 'feature_subsample', 'learning_rate', @@ -217,7 +231,6 @@ def set_locals(self): 'rng_seed', 'save_path', 'scaling', - 'validation_split', - 'solr_root', + 'val_split', 'timeout' ] diff --git a/Pilot1/Uno/uno_auc_clr_model.txt b/Pilot1/Uno/uno_auc_clr_model.txt new file mode 100644 index 00000000..437b19e9 --- /dev/null +++ b/Pilot1/Uno/uno_auc_clr_model.txt @@ -0,0 +1,50 @@ +[Global_Params] +train_sources=['CCLE'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors'] +dense=[1000, 1000, 1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adamax' +scaling='std' +dropout=.1 +epochs=50 +batch_size=32 +val_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=0.0001 +base_lr=None +agg_dose='AUC' +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +no_gen=False +verbose=False + + +preprocess_rnaseq='source_scale' +gpus=[0] +use_landmark_genes=True +no_feature_source=True +no_response_source=True +cp=True +save_path='save/uno' + +single=True + +[Monitor_Params] +timeout=-1 + +[CLR_Params] +clr_flag = True +clr_mode = 'trng1' +clr_base_lr = 0.001 +clr_max_lr = 0.01 +clr_gamma = 0.999 diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt new file mode 100644 index 00000000..13168906 --- /dev/null +++ b/Pilot1/Uno/uno_auc_model.txt @@ -0,0 +1,43 @@ +[Global_Params] +train_sources=['CCLE'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors'] +dense=[1000, 1000, 1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adamax' +scaling='std' +dropout=.1 +epochs=50 +batch_size=32 +val_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=0.0001 +base_lr=None +agg_dose='AUC' +residual=False +reduce_lr=True 
+warmup_lr=True +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +no_gen=False +verbose=False + + +preprocess_rnaseq='source_scale' +gpus=[0] +use_landmark_genes=True +no_feature_source=True +no_response_source=True +cp=True +save_path='save/uno' + +single=True + +[Monitor_Params] +timeout=-1 diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 722f9482..b7926e12 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -18,10 +18,6 @@ from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error from scipy.stats.stats import pearsonr -# For non-interactive plotting -import matplotlib as mpl -import matplotlib.pyplot as plt - import uno as benchmark import candle @@ -29,7 +25,6 @@ from uno_data import CombinedDataLoader, CombinedDataGenerator, DataFeeder -mpl.use('Agg') logger = logging.getLogger(__name__) os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' @@ -81,8 +76,8 @@ def extension_from_parameters(args): ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) if args.feature_subsample > 0: ext += '.FS={}'.format(args.feature_subsample) - if args.drop > 0: - ext += '.DR={}'.format(args.drop) + if args.dropout > 0: + ext += '.DR={}'.format(args.dropout) if args.warmup_lr: ext += '.wu_lr' if args.reduce_lr: @@ -135,20 +130,6 @@ def log_evaluation(metric_outputs, description='Comparing y_true and y_pred:'): logger.info(' {}: {:.4f}'.format(metric, value)) -#def plot_history(out, history, metric='loss', title=None): -# title = title or 'model {}'.format(metric) -# val_metric = 'val_{}'.format(metric) -# plt.figure(figsize=(8, 6)) -# plt.plot(history.history[metric], marker='o') -# plt.plot(history.history[val_metric], marker='d') -# plt.title(title) -# plt.ylabel(metric) -# plt.xlabel('epoch') -# plt.legend(['train_{}'.format(metric), 'val_{}'.format(metric)], loc='upper center') -# png = '{}.plot.{}.png'.format(out, metric) -# plt.savefig(png, 
bbox_inches='tight') -# - class LoggingCallback(Callback): def __init__(self, print_fcn=print): Callback.__init__(self) @@ -201,7 +182,8 @@ def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], model = Model(x_input, h, name=name) return model -class SimpleWeightSaver(Callback): + +class SimpleWeightSaver(Callback): def __init__(self, fname): self.fname = fname @@ -214,17 +196,23 @@ def set_model(self, model): def on_train_end(self, logs={}): self.model.save_weights(self.fname) - def build_model(loader, args, permanent_dropout=True, silent=False): input_models = {} - dropout_rate = args.drop + dropout_rate = args.dropout for fea_type, shape in loader.feature_shapes.items(): base_type = fea_type.split('.')[0] if base_type in ['cell', 'drug']: + if args.dense_cell_feature_layers is not None and base_type == 'cell': + dense_feature_layers = args.dense_cell_feature_layers + elif args.dense_drug_feature_layers is not None and base_type == 'drug': + dense_feature_layers = args.dense_drug_feature_layers + else: + dense_feature_layers = args.dense_feature_layers + box = build_feature_model(input_shape=shape, name=fea_type, - dense_layers=args.dense_feature_layers, + dense_layers=dense_feature_layers, dropout_rate=dropout_rate, permanent_dropout=permanent_dropout) if not silent: logger.debug('Feature encoding submodel for %s:', fea_type) @@ -265,14 +253,14 @@ def build_model(loader, args, permanent_dropout=True, silent=False): return Model(inputs, output) -def initialize_parameters(): +def initialize_parameters(default_model='uno_default_model.txt'): # Build benchmark object - unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', + unoBmk = benchmark.BenchmarkUno(benchmark.file_path, default_model, 'keras', prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') # Initialize parameters - gParameters = candle.initialize_parameters(unoBmk) + gParameters = 
candle.finalize_parameters(unoBmk) # benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters @@ -289,7 +277,7 @@ def run(params): ext = extension_from_parameters(args) verify_path(args.save_path) prefix = args.save_path + ext - logfile = args.logfile if args.logfile else prefix+'.log' + logfile = args.logfile if args.logfile else prefix + '.log' set_up_logger(logfile, args.verbose) logger.info('Params: {}'.format(params)) @@ -318,10 +306,11 @@ def run(params): test_sources=args.test_sources, embed_feature_source=not args.no_feature_source, encode_response_source=not args.no_response_source, + use_exported_data=args.use_exported_data, ) target = args.agg_dose or 'Growth' - val_split = args.validation_split + val_split = args.val_split train_split = 1 - val_split if args.export_csv: @@ -366,13 +355,20 @@ def run(params): store.append('y_{}'.format(partition), y.astype({target: 'float32'}), format='table', data_column=True, min_itemsize=config_min_itemsize) logger.info('Generating {} dataset. 
{} / {}'.format(partition, i, gen.steps)) + + # save input_features and feature_shapes from loader + store.put('model', pd.DataFrame()) + store.get_storer('model').attrs.input_features = loader.input_features + store.get_storer('model').attrs.feature_shapes = loader.feature_shapes + store.close() logger.info('Completed generating {}'.format(fname)) return - loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, - cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, - cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) + if args.use_exported_data is None: + loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, + cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) model = build_model(loader, args) logger.info('Combined model:') @@ -403,7 +399,7 @@ def warmup_scheduler(epoch): template_model = build_model(loader, args, silent=True) if args.initial_weights: - logger.info("Loading weights from {}".format(args.initial_weights)) + logger.info("Loading initial weights from {}".format(args.initial_weights)) template_model.load_weights(args.initial_weights) if len(args.gpus) > 1: @@ -419,7 +415,6 @@ def warmup_scheduler(epoch): if args.learning_rate: K.set_value(optimizer.lr, args.learning_rate) - model.compile(loss=args.loss, optimizer=optimizer, metrics=[mae, r2]) # calculate trainable and non-trainable params @@ -427,14 +422,17 @@ def warmup_scheduler(epoch): candle_monitor = candle.CandleRemoteMonitor(params=params) timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + es_monitor = keras.callbacks.EarlyStopping(patience=10, verbose=1) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) warmup_lr = LearningRateScheduler(warmup_scheduler) checkpointer = MultiGPUCheckpoint(prefix + cv_ext + '.model.h5', 
save_best_only=True) tensorboard = TensorBoard(log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext)) history_logger = LoggingCallback(logger.debug) - + callbacks = [candle_monitor, timeout_monitor, history_logger] + if args.es: + callbacks.append(es_monitor) if args.reduce_lr: callbacks.append(reduce_lr) if args.warmup_lr: @@ -444,14 +442,17 @@ def warmup_scheduler(epoch): if args.tb: callbacks.append(tensorboard) if args.save_weights: - callbacks.append(SimpleWeightSaver(args.save_path + '/' + args.save_weights)) + logger.info("Will save weights to: " + args.save_weights) + callbacks.append(MultiGPUCheckpoint(args.save_weights)) if args.use_exported_data is not None: train_gen = DataFeeder(filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) val_gen = DataFeeder(partition='val', filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) + test_gen = DataFeeder(partition='test', filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) else: train_gen = CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) val_gen = CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) + test_gen = CombinedDataGenerator(loader, partition='test', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) df_val = val_gen.get_response(copy=True) y_val = df_val[target].values @@ -468,20 +469,27 @@ def warmup_scheduler(epoch): callbacks=callbacks, validation_data=(x_val_list, y_val)) else: - logger.info('Data points per epoch: train = %d, val = %d', train_gen.size, val_gen.size) - logger.info('Steps per epoch: train = %d, val = %d', train_gen.steps, val_gen.steps) + logger.info('Data points per epoch: train = %d, val = %d, 
test = %d', train_gen.size, val_gen.size, test_gen.size) + logger.info('Steps per epoch: train = %d, val = %d, test = %d', train_gen.steps, val_gen.steps, test_gen.steps) history = model.fit_generator(train_gen, train_gen.steps, epochs=args.epochs, callbacks=callbacks, validation_data=val_gen, validation_steps=val_gen.steps) - if args.no_gen: - y_val_pred = model.predict(x_val_list, batch_size=args.batch_size) + # prediction on holdout(test) when exists or use validation set + if test_gen.size > 0: + df_val = test_gen.get_response(copy=True) + y_val = df_val[target].values + y_val_pred = model.predict_generator(test_gen, test_gen.steps + 1) + y_val_pred = y_val_pred[:test_gen.size] else: - val_gen.reset() - y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1) - y_val_pred = y_val_pred[:val_gen.size] + if args.no_gen: + y_val_pred = model.predict(x_val_list, batch_size=args.batch_size) + else: + val_gen.reset() + y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1) + y_val_pred = y_val_pred[:val_gen.size] y_val_pred = y_val_pred.flatten() @@ -493,10 +501,7 @@ def warmup_scheduler(epoch): df_val[target + 'Error'] = y_val_pred - y_val df_pred_list.append(df_val) - if hasattr(history, 'loss'): - plot_history(prefix, history, 'loss') - if hasattr(history, 'r2'): - plot_history(prefix, history, 'r2') + candle.plot_metrics(history, title=None, skip_ep=0, outdir='./save/', add_lr=True) pred_fname = prefix + '.predicted.tsv' df_pred = pd.concat(df_pred_list) diff --git a/Pilot1/Uno/uno_by_drug_example.txt b/Pilot1/Uno/uno_by_drug_example.txt index daa028a1..6fb41364 100644 --- a/Pilot1/Uno/uno_by_drug_example.txt +++ b/Pilot1/Uno/uno_by_drug_example.txt @@ -10,10 +10,10 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=10 batch_size=128 -validation_split=0.2 +val_split=0.2 cv=1 max_val_loss=1.0 learning_rate=None @@ -34,7 +34,7 @@ use_landmark_genes=True partition_by='cell' by_drug='paclitaxel' cache='cache.pac' 
+gpus = [0] [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/Uno/uno_clr_keras2.py b/Pilot1/Uno/uno_clr_keras2.py new file mode 100644 index 00000000..aa14f04c --- /dev/null +++ b/Pilot1/Uno/uno_clr_keras2.py @@ -0,0 +1,567 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import logging +import os +import random + +import numpy as np +import pandas as pd + +import keras +from keras import backend as K +from keras import optimizers +from keras.models import Model +from keras.layers import Input, Dense, Dropout +from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error +from scipy.stats.stats import pearsonr + +import uno as benchmark +import candle + +import uno_data +from uno_data import CombinedDataLoader, CombinedDataGenerator, DataFeeder + + +logger = logging.getLogger(__name__) +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +def set_seed(seed): + os.environ['PYTHONHASHSEED'] = '0' + np.random.seed(seed) + + random.seed(seed) + + if K.backend() == 'tensorflow': + import tensorflow as tf + tf.set_random_seed(seed) + candle.set_parallelism_threads() + + +def verify_path(path): + folder = os.path.dirname(path) + if folder and not os.path.exists(folder): + os.makedirs(folder) + + +def set_up_logger(logfile, verbose): + verify_path(logfile) + fh = logging.FileHandler(logfile) + fh.setFormatter(logging.Formatter("[%(asctime)s %(process)d] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")) + fh.setLevel(logging.DEBUG) + + sh = logging.StreamHandler() + sh.setFormatter(logging.Formatter('')) + sh.setLevel(logging.DEBUG if verbose else logging.INFO) + + for log in [logger, uno_data.logger]: + log.setLevel(logging.DEBUG) + log.addHandler(fh) + log.addHandler(sh) + + +def extension_from_parameters(args): + """Construct string for saving model with annotation of parameters""" + ext = '' + ext += 
'.A={}'.format(args.activation) + ext += '.B={}'.format(args.batch_size) + ext += '.E={}'.format(args.epochs) + ext += '.O={}'.format(args.optimizer) + # ext += '.LEN={}'.format(args.maxlen) + ext += '.LR={}'.format(args.learning_rate) + ext += '.CF={}'.format(''.join([x[0] for x in sorted(args.cell_features)])) + ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) + if args.feature_subsample > 0: + ext += '.FS={}'.format(args.feature_subsample) + if args.dropout > 0: + ext += '.DR={}'.format(args.dropout) + if args.warmup_lr: + ext += '.wu_lr' + if args.reduce_lr: + ext += '.re_lr' + if args.residual: + ext += '.res' + if args.use_landmark_genes: + ext += '.L1000' + if args.no_gen: + ext += '.ng' + for i, n in enumerate(args.dense): + if n > 0: + ext += '.D{}={}'.format(i + 1, n) + if args.dense_feature_layers != args.dense: + for i, n in enumerate(args.dense): + if n > 0: + ext += '.FD{}={}'.format(i + 1, n) + + return ext + + +def discretize(y, bins=5): + percentiles = [100 / bins * (i + 1) for i in range(bins - 1)] + thresholds = [np.percentile(y, x) for x in percentiles] + classes = np.digitize(y, thresholds) + return classes + + +def r2(y_true, y_pred): + SS_res = K.sum(K.square(y_true - y_pred)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return (1 - SS_res / (SS_tot + K.epsilon())) + + +def mae(y_true, y_pred): + return keras.metrics.mean_absolute_error(y_true, y_pred) + + +def evaluate_prediction(y_true, y_pred): + mse = mean_squared_error(y_true, y_pred) + mae = mean_absolute_error(y_true, y_pred) + r2 = r2_score(y_true, y_pred) + corr, _ = pearsonr(y_true, y_pred) + return {'mse': mse, 'mae': mae, 'r2': r2, 'corr': corr} + + +def log_evaluation(metric_outputs, description='Comparing y_true and y_pred:'): + logger.info(description) + for metric, value in metric_outputs.items(): + logger.info(' {}: {:.4f}'.format(metric, value)) + + +class LoggingCallback(Callback): + def __init__(self, print_fcn=print): + 
Callback.__init__(self) + self.print_fcn = print_fcn + + def on_epoch_end(self, epoch, logs={}): + msg = "[Epoch: %i] %s" % (epoch, ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items()))) + self.print_fcn(msg) + + +class PermanentDropout(Dropout): + def __init__(self, rate, **kwargs): + super(PermanentDropout, self).__init__(rate, **kwargs) + self.uses_learning_phase = False + + def call(self, x, mask=None): + if 0. < self.rate < 1.: + noise_shape = self._get_noise_shape(x) + x = K.dropout(x, self.rate, noise_shape) + return x + + +class MultiGPUCheckpoint(ModelCheckpoint): + + def set_model(self, model): + if isinstance(model.layers[-2], Model): + self.model = model.layers[-2] + else: + self.model = model + + +def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], + activation='relu', residual=False, + dropout_rate=0, permanent_dropout=True): + x_input = Input(shape=input_shape) + h = x_input + for i, layer in enumerate(dense_layers): + x = h + h = Dense(layer, activation=activation)(h) + if dropout_rate > 0: + if permanent_dropout: + h = PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + model = Model(x_input, h, name=name) + return model + + +class SimpleWeightSaver(Callback): + + def __init__(self, fname): + self.fname = fname + + def set_model(self, model): + if isinstance(model.layers[-2], Model): + self.model = model.layers[-2] + else: + self.model = model + + def on_train_end(self, logs={}): + self.model.save_weights(self.fname) + + +def build_model(loader, args, permanent_dropout=True, silent=False): + input_models = {} + dropout_rate = args.dropout + for fea_type, shape in loader.feature_shapes.items(): + base_type = fea_type.split('.')[0] + if base_type in ['cell', 'drug']: + if args.dense_cell_feature_layers is not None and base_type == 'cell': + dense_feature_layers = args.dense_cell_feature_layers + elif 
args.dense_drug_feature_layers is not None and base_type == 'drug': + dense_feature_layers = args.dense_drug_feature_layers + else: + dense_feature_layers = args.dense_feature_layers + + box = build_feature_model(input_shape=shape, name=fea_type, + dense_layers=dense_feature_layers, + dropout_rate=dropout_rate, permanent_dropout=permanent_dropout) + if not silent: + logger.debug('Feature encoding submodel for %s:', fea_type) + box.summary(print_fn=logger.debug) + input_models[fea_type] = box + + inputs = [] + encoded_inputs = [] + for fea_name, fea_type in loader.input_features.items(): + shape = loader.feature_shapes[fea_type] + fea_input = Input(shape, name='input.' + fea_name) + inputs.append(fea_input) + if fea_type in input_models: + input_model = input_models[fea_type] + encoded = input_model(fea_input) + else: + encoded = fea_input + encoded_inputs.append(encoded) + + merged = keras.layers.concatenate(encoded_inputs) + + h = merged + for i, layer in enumerate(args.dense): + x = h + h = Dense(layer, activation=args.activation)(h) + if dropout_rate > 0: + if permanent_dropout: + h = PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if args.residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + output = Dense(1)(h) + + return Model(inputs, output) + + +def initialize_parameters(default_model='uno_clr_model.txt'): + + # Build benchmark object + unoBmk = benchmark.BenchmarkUno(benchmark.file_path, default_model, 'keras', + prog='uno_clr', desc='Build neural network based models to predict tumor response to single and paired drugs.') + + # Initialize parameters + gParameters = candle.finalize_parameters(unoBmk) + # benchmark.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +class Struct: + def __init__(self, **entries): + self.__dict__.update(entries) + + +def run(params): + + candle.check_flag_conflicts(params) + args = Struct(**params) + set_seed(args.rng_seed) + ext = 
extension_from_parameters(args) + verify_path(args.save_path) + prefix = args.save_path + ext + logfile = args.logfile if args.logfile else prefix + '.log' + set_up_logger(logfile, args.verbose) + logger.info('Params: {}'.format(params)) + + if (len(args.gpus) > 0): + import tensorflow as tf + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = ",".join(map(str, args.gpus)) + K.set_session(tf.Session(config=config)) + + loader = CombinedDataLoader(seed=args.rng_seed) + loader.load(cache=args.cache, + ncols=args.feature_subsample, + agg_dose=args.agg_dose, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path, + drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + use_exported_data=args.use_exported_data, + ) + + target = args.agg_dose or 'Growth' + val_split = args.val_split + train_split = 1 - val_split + + if args.export_csv: + fname = args.export_csv + loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, + cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) + train_gen = CombinedDataGenerator(loader, batch_size=args.batch_size, shuffle=args.shuffle) + val_gen = CombinedDataGenerator(loader, partition='val', batch_size=args.batch_size, shuffle=args.shuffle) + + x_train_list, y_train = 
train_gen.get_slice(size=train_gen.size, dataframe=True, single=args.single) + x_val_list, y_val = val_gen.get_slice(size=val_gen.size, dataframe=True, single=args.single) + df_train = pd.concat([y_train] + x_train_list, axis=1) + df_val = pd.concat([y_val] + x_val_list, axis=1) + df = pd.concat([df_train, df_val]).reset_index(drop=True) + if args.growth_bins > 1: + df = uno_data.discretize(df, 'Growth', bins=args.growth_bins) + df.to_csv(fname, sep='\t', index=False, float_format="%.3g") + return + + if args.export_data: + fname = args.export_data + loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, + cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) + train_gen = CombinedDataGenerator(loader, batch_size=args.batch_size, shuffle=args.shuffle) + val_gen = CombinedDataGenerator(loader, partition='val', batch_size=args.batch_size, shuffle=args.shuffle) + store = pd.HDFStore(fname, complevel=9, complib='blosc:snappy') + + config_min_itemsize = {'Sample': 30, 'Drug1': 10} + if not args.single: + config_min_itemsize['Drug2'] = 10 + + for partition in ['train', 'val']: + gen = train_gen if partition == 'train' else val_gen + for i in range(gen.steps): + x_list, y = gen.get_slice(size=args.batch_size, dataframe=True, single=args.single) + + for j, input_feature in enumerate(x_list): + input_feature.columns = [''] * len(input_feature.columns) + store.append('x_{}_{}'.format(partition, j), input_feature.astype('float32'), format='table', data_column=True) + store.append('y_{}'.format(partition), y.astype({target: 'float32'}), format='table', data_column=True, + min_itemsize=config_min_itemsize) + logger.info('Generating {} dataset. 
{} / {}'.format(partition, i, gen.steps)) + + # save input_features and feature_shapes from loader + store.put('model', pd.DataFrame()) + store.get_storer('model').attrs.input_features = loader.input_features + store.get_storer('model').attrs.feature_shapes = loader.feature_shapes + + store.close() + logger.info('Completed generating {}'.format(fname)) + return + + if args.use_exported_data is None: + loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, + cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) + + model = build_model(loader, args) + logger.info('Combined model:') + model.summary(print_fn=logger.info) + # plot_model(model, to_file=prefix+'.model.png', show_shapes=True) + + if args.cp: + model_json = model.to_json() + with open(prefix + '.model.json', 'w') as f: + print(model_json, file=f) + + def warmup_scheduler(epoch): + lr = args.learning_rate or base_lr * args.batch_size / 100 + if epoch <= 5: + K.set_value(model.optimizer.lr, (base_lr * (5 - epoch) + lr * epoch) / 5) + logger.debug('Epoch {}: lr={:.5g}'.format(epoch, K.get_value(model.optimizer.lr))) + return K.get_value(model.optimizer.lr) + + df_pred_list = [] + + cv_ext = '' + cv = args.cv if args.cv > 1 else 1 + + for fold in range(cv): + if args.cv > 1: + logger.info('Cross validation fold {}/{}:'.format(fold + 1, cv)) + cv_ext = '.cv{}'.format(fold + 1) + + template_model = build_model(loader, args, silent=True) + if args.initial_weights: + logger.info("Loading initial weights from {}".format(args.initial_weights)) + template_model.load_weights(args.initial_weights) + + if len(args.gpus) > 1: + from keras.utils import multi_gpu_model + gpu_count = len(args.gpus) + logger.info("Multi GPU with {} gpus".format(gpu_count)) + model = multi_gpu_model(template_model, cpu_merge=False, gpus=gpu_count) + else: + model = template_model + + optimizer = 
optimizers.deserialize({'class_name': args.optimizer, 'config': {}}) + base_lr = args.base_lr or K.get_value(optimizer.lr) + if args.learning_rate: + K.set_value(optimizer.lr, args.learning_rate) + + model.compile(loss=args.loss, optimizer=optimizer, metrics=[mae, r2]) + + # calculate trainable and non-trainable params + params.update(candle.compute_trainable_params(model)) + + # Here is where we set a bunch of callback + # Set the CLR first so it will invalidate the warmup_lr, reduce_lr flags if needed + clr_args = candle.clr_set_args(params) + if clr_args['mode'] is not None: + clrCallback = candle.clr_callback(**clr_args) + + candle_monitor = candle.CandleRemoteMonitor(params=params) + timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + es_monitor = keras.callbacks.EarlyStopping(patience=10, verbose=1) + + reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) + warmup_lr = LearningRateScheduler(warmup_scheduler) + checkpointer = MultiGPUCheckpoint(prefix + cv_ext + '.model.h5', save_best_only=True) + tensorboard = TensorBoard(log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext)) + history_logger = LoggingCallback(logger.debug) + + callbacks = [candle_monitor, timeout_monitor, history_logger] + if args.es: + callbacks.append(es_monitor) + if args.reduce_lr: + callbacks.append(reduce_lr) + if args.warmup_lr: + callbacks.append(warmup_lr) + if args.cp: + callbacks.append(checkpointer) + if args.tb: + callbacks.append(tensorboard) + if args.save_weights: + logger.info("Will save weights to: " + args.save_weights) + callbacks.append(MultiGPUCheckpoint(args.save_weights)) + if clr_args['mode'] is not None: + callbacks.append(clrCallback) + + if args.use_exported_data is not None: + train_gen = DataFeeder(filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) + val_gen = DataFeeder(partition='val', filename=args.use_exported_data, 
batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) + test_gen = DataFeeder(partition='test', filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) + else: + train_gen = CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) + val_gen = CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) + test_gen = CombinedDataGenerator(loader, partition='test', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) + + df_val = val_gen.get_response(copy=True) + y_val = df_val[target].values + y_shuf = np.random.permutation(y_val) + log_evaluation(evaluate_prediction(y_val, y_shuf), + description='Between random pairs in y_val:') + + if args.no_gen: + x_train_list, y_train = train_gen.get_slice(size=train_gen.size, single=args.single) + x_val_list, y_val = val_gen.get_slice(size=val_gen.size, single=args.single) + history = model.fit(x_train_list, y_train, + batch_size=args.batch_size, + epochs=args.epochs, + callbacks=callbacks, + validation_data=(x_val_list, y_val)) + else: + logger.info('Data points per epoch: train = %d, val = %d, test = %d', train_gen.size, val_gen.size, test_gen.size) + logger.info('Steps per epoch: train = %d, val = %d, test = %d', train_gen.steps, val_gen.steps, test_gen.steps) + history = model.fit_generator(train_gen, train_gen.steps, + epochs=args.epochs, + callbacks=callbacks, + validation_data=val_gen, + validation_steps=val_gen.steps) + + # prediction on holdout(test) when exists or use validation set + if test_gen.size > 0: + df_val = test_gen.get_response(copy=True) + y_val = df_val[target].values + y_val_pred = model.predict_generator(test_gen, test_gen.steps + 1) + y_val_pred = y_val_pred[:test_gen.size] + else: + if args.no_gen: + y_val_pred = model.predict(x_val_list, 
batch_size=args.batch_size) + else: + val_gen.reset() + y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1) + y_val_pred = y_val_pred[:val_gen.size] + + y_val_pred = y_val_pred.flatten() + + scores = evaluate_prediction(y_val, y_val_pred) + log_evaluation(scores) + + # df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred - y_val) + df_val['Predicted' + target] = y_val_pred + df_val[target + 'Error'] = y_val_pred - y_val + df_pred_list.append(df_val) + + candle.plot_metrics(history, title=None, skip_ep=0, outdir='./save/', add_lr=True) + + pred_fname = prefix + '.predicted.tsv' + df_pred = pd.concat(df_pred_list) + if args.agg_dose: + if args.single: + df_pred.sort_values(['Sample', 'Drug1', target], inplace=True) + else: + df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', target], inplace=True) + else: + if args.single: + df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + else: + df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g') + + if args.cv > 1: + scores = evaluate_prediction(df_pred[target], df_pred['Predicted' + target]) + log_evaluation(scores, description='Combining cross validation folds:') + + for test_source in loader.test_sep_sources: + test_gen = CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size, source=test_source) + df_test = test_gen.get_response(copy=True) + y_test = df_test[target].values + n_test = len(y_test) + if n_test == 0: + continue + if args.no_gen: + x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single) + y_test_pred = model.predict(x_test_list, batch_size=args.batch_size) + else: + y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps) + y_test_pred = y_test_pred[:test_gen.size] + y_test_pred = y_test_pred.flatten() + scores = evaluate_prediction(y_test, y_test_pred) + 
log_evaluation(scores, description='Testing on data from {} ({})'.format(test_source, n_test)) + + if K.backend() == 'tensorflow': + K.clear_session() + + logger.handlers = [] + + return history + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() diff --git a/Pilot1/Uno/uno_clr_model.txt b/Pilot1/Uno/uno_clr_model.txt new file mode 100644 index 00000000..b077cbb7 --- /dev/null +++ b/Pilot1/Uno/uno_clr_model.txt @@ -0,0 +1,40 @@ +[Global_Params] +train_sources=['GDSC', 'CTRP', 'ALMANAC'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors', 'fingerprints'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adam' +scaling='std' +dropout=0 +epochs=10 +batch_size=32 +val_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=None +base_lr=None +residual=False +reduce_lr=True +warmup_lr=True +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save/uno' +no_gen=False +verbose = False +gpus = [0] + +[Monitor_Params] +timeout=3600 + +[CLR_Params] +clr_flag = False +clr_mode = 'trng1' +clr_base_lr = 0.00001 +clr_max_lr = 0.001 +clr_gamma = 0.999 diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 5ede815e..274f5ece 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -13,7 +13,11 @@ from itertools import cycle, islice -from sklearn.preprocessing import Imputer +try: + from sklearn.impute import SimpleImputer as Imputer +except ImportError: + from sklearn.preprocessing import Imputer + from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler from sklearn.model_selection import ShuffleSplit, KFold @@ -76,7 +80,7 @@ def impute_and_scale(df, scaling='std', imputing='mean', dropna='all'): if imputing is None or imputing.lower() == 'none': mat = df.values else: - imputer = Imputer(strategy=imputing, 
axis=0) + imputer = Imputer(strategy=imputing) mat = imputer.fit_transform(df) if scaling is None or scaling.lower() == 'none': @@ -281,6 +285,38 @@ def load_drug_data(ncols=None, scaling='std', imputing='mean', dropna=None, add_ return df_desc, df_fp +def load_mordred_descriptors(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): + path = get_file(DATA_URL + 'extended_combined_mordred_descriptors') + + df = pd.read_csv(path, engine='c', sep='\t', na_values=['na', '-', '']) + df.iloc[:, 1:] = df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce') + df.iloc[:, 1:] = df.iloc[:, 1:].astype(np.float32) + + df1 = pd.DataFrame(df.loc[:, 'DRUG']) + df1.rename(columns={'DRUG': 'Drug'}, inplace=True) + + df2 = df.drop('DRUG', 1) + if add_prefix: + df2 = df2.add_prefix('mordred.') + + df2 = impute_and_scale(df2, scaling, imputing) + + df_desc = pd.concat([df1, df2], axis=1) + + df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) + df2 = df_desc.drop('Drug', 1) + if add_prefix: + df2 = df2.add_prefix('mordred.') + if feature_subset: + df2 = df2[[x for x in df2.columns if x in feature_subset]] + df2 = impute_and_scale(df2, scaling=scaling, imputing=imputing, dropna=dropna) + df_desc = pd.concat([df1, df2], axis=1) + + logger.info('Loaded Mordred drug descriptors: %s', df_desc.shape) + + return df_desc + + def load_drug_descriptors(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): df_info = load_drug_info() df_info['Drug'] = df_info['PUBCHEM'] @@ -622,6 +658,7 @@ def __init__(self, seed=SEED): self.seed = seed def load_from_cache(self, cache, params): + """ NOTE: How does this function return an error? (False?) 
-Wozniak """ param_fname = '{}.params.json'.format(cache) if not os.path.isfile(param_fname): logger.warning('Cache parameter file does not exist: %s', param_fname) @@ -632,10 +669,10 @@ def load_from_cache(self, cache, params): except json.JSONDecodeError as e: logger.warning('Could not decode parameter file %s: %s', param_fname, e) return False - ignore_keys = ['cache', 'partition_by', 'single'] + ignore_keys = ['cache', 'partition_by', 'single', 'use_exported_data'] equal, diffs = dict_compare(params, cached_params, ignore_keys) if not equal: - logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttemptd to load: %s', diffs, cached_params, params) + logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttempted to load: %s', diffs, cached_params, params) logger.warning('\nRemove %s to rebuild data cache.\n', param_fname) raise ValueError('Could not load from a cache with incompatible keys:', diffs) else: @@ -648,12 +685,17 @@ def load_from_cache(self, cache, params): self.__dict__.update(obj.__dict__) logger.info('Loaded data from cache: %s', fname) return True + # NOTE: This is unreachable -Wozniak return False def save_to_cache(self, cache, params): for k in ['self', 'cache', 'single']: if k in params: del params[k] + dirname = os.path.dirname(cache) + if not os.path.exists(dirname): + logger.debug('Creating directory for cache: %s', dirname) + os.mkdir(dirname) param_fname = '{}.params.json'.format(cache) with open(param_fname, 'w') as param_file: json.dump(params, param_file, sort_keys=True) @@ -799,7 +841,7 @@ def load(self, cache=None, ncols=None, scaling='std', dropna=None, cell_feature_subset_path=None, drug_feature_subset_path=None, drug_lower_response=1, drug_upper_response=-1, drug_response_span=0, drug_median_response_min=-1, drug_median_response_max=1, - use_landmark_genes=False, use_filtered_genes=False, + use_landmark_genes=False, use_filtered_genes=False, use_exported_data=None, preprocess_rnaseq=None, single=False, # 
train_sources=['GDSC', 'CTRP', 'ALMANAC', 'NCI60'], train_sources=['GDSC', 'CTRP', 'ALMANAC'], @@ -821,6 +863,19 @@ def load(self, cache=None, ncols=None, scaling='std', dropna=None, self.build_feature_list(single=single) return + # rebuild cache equivalent from the exported dataset + if use_exported_data is not None: + with pd.HDFStore(use_exported_data, 'r') as store: + if '/model' in store.keys(): + self.input_features = store.get_storer('model').attrs.input_features + self.feature_shapes = store.get_storer('model').attrs.feature_shapes + self.input_dim = sum([np.prod(self.feature_shapes[x]) for x in self.input_features.values()]) + self.test_sep_sources = [] + return + else: + logger.warning('\nExported dataset does not have model info. Please rebuild the dataset.\n') + raise ValueError('Could not load model info from the dataset:', use_exported_data) + logger.info('Loading data from scratch ...') if agg_dose: @@ -872,13 +927,16 @@ def load(self, cache=None, ncols=None, scaling='std', dropna=None, df_drug_desc = load_drug_descriptors(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) elif fea == 'fingerprints': df_drug_fp = load_drug_fingerprints(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) + elif fea == 'mordred': + df_drug_mordred = load_mordred_descriptors(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) # df_drug_desc, df_drug_fp = load_drug_data(ncols=ncols, scaling=scaling, dropna=dropna) cell_df_dict = {'rnaseq': 'df_cell_rnaseq'} drug_df_dict = {'descriptors': 'df_drug_desc', - 'fingerprints': 'df_drug_fp'} + 'fingerprints': 'df_drug_fp', + 'mordred': 'df_drug_mordred'} # df_cell_ids = df_cell_rnaseq[['Sample']].drop_duplicates() # df_drug_ids = pd.concat([df_drug_desc[['Drug']], df_drug_fp[['Drug']]]).drop_duplicates() @@ -952,16 +1010,21 @@ def __init__(self, partition='train', filename=None, batch_size=32, shuffle=Fals self.single = single self.agg_dose 
= agg_dose self.target = agg_dose if agg_dose is not None else 'Growth' - # 4 inputs for single drug model (cell, dose1, descriptor, fingerprint) - # 7 inputs for drug pair model (cell, dose1, dose1, dr1.descriptor, dr1.fingerprint, dr2.descriptor, dr2.fingerprint) - self.input_size = 4 if self.single else 7 - self.input_size = 3 if agg_dose else self.input_size self.store = pd.HDFStore(filename, mode='r') - y = self.store.select('y_{}'.format(self.partition)) - self.index = y.index + self.input_size = len(list(filter(lambda x: x.startswith('/x_train'), self.store.keys()))) + try: + y = self.store.select('y_{}'.format(self.partition)) + self.index = y.index + except KeyError: + self.index = [] + self.size = len(self.index) - self.steps = self.size // self.batch_size + if self.size >= self.batch_size: + self.steps = self.size // self.batch_size + else: + self.steps = 1 + self.batch_size = self.size self.index_map = np.arange(self.steps) if self.shuffle: np.random.shuffle(self.index_map) @@ -973,7 +1036,7 @@ def __getitem__(self, idx): start = self.index_map[idx] * self.batch_size stop = (self.index_map[idx] + 1) * self.batch_size x = [self.store.select('x_{0}_{1}'.format(self.partition, i), start=start, stop=stop) for i in range(self.input_size)] - y = self.store.select('y_{}'.format(self.partition), start=start, stop=stop, columns=[self.target]) + y = self.store.select('y_{}'.format(self.partition), start=start, stop=stop)[self.target] return x, y def reset(self): @@ -982,12 +1045,16 @@ def reset(self): pass def get_response(self, copy=False): - self.index = [item for step in range(self.steps) for item in range(self.index_map[step] * self.batch_size, (self.index_map[step] + 1) * self.batch_size)] - df = self.store.get('y_{}'.format(self.partition)).iloc[self.index,:] + if self.shuffle: + self.index = [item for step in range(self.steps) for item in range(self.index_map[step] * self.batch_size, (self.index_map[step] + 1) * self.batch_size)] + df = 
self.store.get('y_{}'.format(self.partition)).iloc[self.index, :] + else: + df = self.store.get('y_{}'.format(self.partition)) + if self.agg_dose is None: - df['Dose1'] = self.store.get('x_{}_0'.format(self.partition)).iloc[self.index,:] + df['Dose1'] = self.store.get('x_{}_0'.format(self.partition)).iloc[self.index, :] if not self.single: - df['Dose2'] = self.store.get('x_{}_1'.format(self.partition)).iloc[self.index,:] + df['Dose2'] = self.store.get('x_{}_1'.format(self.partition)).iloc[self.index, :] return df.copy() if copy else df def close(self): @@ -1008,7 +1075,7 @@ def __init__(self, data, partition='train', fold=0, source=None, batch_size=32, elif partition == 'val': index = data.val_indexes[fold] else: - index = data.test_indexes[fold] + index = data.test_indexes[fold] if hasattr(data, 'test_indexes') else [] if source: df = data.df_response[['Source']].iloc[index, :] diff --git a/Pilot1/Uno/uno_default_model.txt b/Pilot1/Uno/uno_default_model.txt index 64a88df6..72ddeb45 100644 --- a/Pilot1/Uno/uno_default_model.txt +++ b/Pilot1/Uno/uno_default_model.txt @@ -10,10 +10,10 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=10 batch_size=32 -validation_split=0.2 +val_split=0.2 cv=1 max_val_loss=1.0 learning_rate=None @@ -27,7 +27,7 @@ rng_seed=2018 save_path='save/uno' no_gen=False verbose = False +gpus = [0] [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/Uno/uno_fom_model.txt b/Pilot1/Uno/uno_fom_model.txt new file mode 100644 index 00000000..99cfd839 --- /dev/null +++ b/Pilot1/Uno/uno_fom_model.txt @@ -0,0 +1,38 @@ +[Global_Params] +train_sources=['GDSC'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors', 'fingerprints'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adam' +scaling='std' +dropout=0 +epochs=50 +batch_size=512 +val_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=None 
+base_lr=None +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save/uno' +no_gen=False +verbose = False +use_landmark_genes=True +preprocess_rnaseq='source_scale' +no_feature_source=True +no_response_source=True +single=True +gpus = [0] + +[Monitor_Params] +timeout=-1 diff --git a/Pilot1/Uno/uno_infer.py b/Pilot1/Uno/uno_infer.py index 2bf43d1e..f4dcd62d 100644 --- a/Pilot1/Uno/uno_infer.py +++ b/Pilot1/Uno/uno_infer.py @@ -20,13 +20,18 @@ def log_evaluation(metric_outputs, description='Comparing y_true and y_pred:'): def get_parser(): parser = argparse.ArgumentParser(description='Uno infer script') parser.add_argument("--data", + required=True, help="data file to infer on. expect exported file from uno_baseline_keras2.py") - parser.add_argument("--model_file", help="json model description file") + parser.add_argument("--model_file", required=True, help="json model description file") parser.add_argument("--weights_file", help="model weights file") parser.add_argument("--partition", default='all', choices=['train', 'val', 'all'], help="partition of test dataset") parser.add_argument("-n", "--n_pred", type=int, default=1, help="the number of predictions to make") + parser.add_argument("--single", default=False, help="do not use drug pair representation") + parser.add_argument("--agg_dose", default=None, + choices=['AUC', 'IC50', 'HS', 'AAC1', 'AUC1', 'DSS1'], + help="use dose-independent response data with the specified aggregation metric") return parser @@ -49,18 +54,21 @@ def main(): cv_y_list = [] df_pred_list = [] cv_stats = {'mae': [], 'mse': [], 'r2': [], 'corr': []} + target = args.agg_dose or 'Growth' + for cv in range(args.n_pred): cv_pred = [] dataset = ['train', 'val'] if args.partition == 'all' else [args.partition] for partition in dataset: - test_gen = DataFeeder(filename=args.data, partition=partition, batch_size=1024) - y_test_pred = model.predict_generator(test_gen, 
test_gen.steps) + test_gen = DataFeeder(filename=args.data, partition=partition, batch_size=1024, single=args.single, agg_dose=args.agg_dose) + y_test_pred = model.predict_generator(test_gen, test_gen.steps + 1) + y_test_pred = y_test_pred[:test_gen.size] y_test_pred = y_test_pred.flatten() df_y = test_gen.get_response(copy=True) - y_test = df_y['Growth'].values + y_test = df_y[target].values - df_pred = df_y.assign(PredictedGrowth=y_test_pred, GrowthError=y_test_pred - y_test) + df_pred = df_y.assign(**{f'Predicted{target}': y_test_pred, f'{target}Error': y_test_pred - y_test}) df_pred_list.append(df_pred) test_gen.close() @@ -70,7 +78,7 @@ def main(): cv_pred_list.append(np.concatenate(cv_pred)) # calcuate stats for mse, mae, r2, corr - scores = evaluate_prediction(df_pred['Growth'], df_pred['PredictedGrowth']) + scores = evaluate_prediction(df_pred[target], df_pred[f'Predicted{target}']) # log_evaluation(scores, description=cv) [cv_stats[key].append(scores[key]) for key in scores.keys()] @@ -78,21 +86,27 @@ def main(): cv_y = pd.concat(cv_y_list) # save to tsv - df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + headers = ['Sample', 'Drug1'] + if not args.single: headers.append('Drug2') + if not args.agg_dose: headers.append('Dose1') + if not args.single and not args.agg_dose: headers.append('Dose2') + headers.append(target) + + df_pred.sort_values(headers, inplace=True) df_pred.to_csv('uno_pred.all.tsv', sep='\t', index=False, float_format='%.6g') df_sum = cv_y.assign() - df_sum['PredGrowthMean'] = np.mean(cv_pred_list, axis=0) - df_sum['PredGrowthStd'] = np.std(cv_pred_list, axis=0) - df_sum['PredGrowthMin'] = np.min(cv_pred_list, axis=0) - df_sum['PredGrowthMax'] = np.max(cv_pred_list, axis=0) + df_sum[f'Pred{target}Mean'] = np.mean(cv_pred_list, axis=0) + df_sum[f'Pred{target}Std'] = np.std(cv_pred_list, axis=0) + df_sum[f'Pred{target}Min'] = np.min(cv_pred_list, axis=0) + df_sum[f'Pred{target}Max'] = 
np.max(cv_pred_list, axis=0) df_sum.to_csv('uno_pred.tsv', index=False, sep='\t', float_format='%.6g') - # scores = evaluate_prediction(df_sum['Growth'], df_sum['PredGrowthMean']) - scores = evaluate_prediction(df_pred['Growth'], df_pred['PredictedGrowth']) + scores = evaluate_prediction(df_pred[f'{target}'], df_pred[f'Predicted{target}']) log_evaluation(scores, description='Testing on data from {} on partition {} ({} rows)'.format(args.data, args.partition, len(cv_y))) + print(' mean std min max') for key in ['mse', 'mae', 'r2', 'corr']: print('{}: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(key, np.around(np.mean(cv_stats[key], axis=0), decimals=4), diff --git a/Pilot1/Uno/uno_perf_bench_model.txt b/Pilot1/Uno/uno_perf_bench_model.txt index 234334f9..b8f7b213 100644 --- a/Pilot1/Uno/uno_perf_bench_model.txt +++ b/Pilot1/Uno/uno_perf_bench_model.txt @@ -10,10 +10,10 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=3 batch_size=32 -validation_split=0.2 +val_split=0.2 cv=1 max_val_loss=1.0 learning_rate=None @@ -28,7 +28,7 @@ save_path='save/uno' no_gen=False verbose = False use_landmark_genes=True +gpus = [0] [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/UnoMT/README.md b/Pilot1/UnoMT/README.md index e9f76b90..491e2fdb 100644 --- a/Pilot1/UnoMT/README.md +++ b/Pilot1/UnoMT/README.md @@ -78,7 +78,6 @@ Configuration file: ./unoMT_default_model.txt 'rnaseq_scaling': 'std', 'rng_seed': 0, 'save_path': 'save/unoMT', - 'solr_root': '', 'timeout': 3600, 'train_sources': 'NCI60', 'trn_batch_size': 32, @@ -142,7 +141,6 @@ Params: 'run_id': 'RUN000', 'save_path': 'save/unoMT', 'shuffle': False, - 'solr_root': '', 'timeout': 3600, 'train_bool': True, 'train_sources': 'NCI60', diff --git a/Pilot1/UnoMT/unoMT.py b/Pilot1/UnoMT/unoMT.py index 7ffa2497..b18a4bdb 100644 --- a/Pilot1/UnoMT/unoMT.py +++ b/Pilot1/UnoMT/unoMT.py @@ -247,7 +247,7 @@ 'resp_num_layers_per_block', 'resp_num_blocks', 'resp_num_layers', - 'drop', + 
'dropout', 'resp_activation', 'cl_clf_layer_dim', 'cl_clf_num_layers', @@ -271,7 +271,6 @@ 'epochs', 'rng_seed', 'val_split', - 'solr_root', 'timeout', ] diff --git a/Pilot1/UnoMT/unoMT_baseline_pytorch.py b/Pilot1/UnoMT/unoMT_baseline_pytorch.py index f9e698b0..95842f41 100644 --- a/Pilot1/UnoMT/unoMT_baseline_pytorch.py +++ b/Pilot1/UnoMT/unoMT_baseline_pytorch.py @@ -21,16 +21,16 @@ np.set_printoptions(precision=4) -def initialize_parameters(): +def initialize_parameters(default_model = 'unoMT_default_model.txt'): # Build benchmark object - unoMTb = unoMT.unoMTBk(unoMT.file_path, 'unoMT_default_model.txt', 'pytorch', + unoMTb = unoMT.unoMTBk(unoMT.file_path, default_model, 'pytorch', prog='unoMT_baseline', desc='Multi-task combined single and combo drug prediction for cross-study data - Pilot 1') print("Created unoMT benchmark") # Initialize parameters - gParameters = candle.initialize_parameters(unoMTb) + gParameters = candle.finalize_parameters(unoMTb) print("Parameters initialized") @@ -45,6 +45,10 @@ def run(params): # Setting up random seed for reproducible and deterministic results seed_random_state(args.rng_seed) + # check for sufficient number of epochs to start validation + if params['epochs'] < params['resp_val_start_epoch']: + raise Exception('Number of epochs is less than validation threshold (resp_val_start_epoch)') + # Construct extension to save validation results now = datetime.datetime.now() ext = '%02d%02d_%02d%02d_pytorch' \ diff --git a/Pilot1/UnoMT/unoMT_default_model.txt b/Pilot1/UnoMT/unoMT_default_model.txt index 8743f53a..df64d076 100644 --- a/Pilot1/UnoMT/unoMT_default_model.txt +++ b/Pilot1/UnoMT/unoMT_default_model.txt @@ -35,7 +35,7 @@ resp_layer_dim=2048 resp_num_layers_per_block=2 resp_num_blocks=4 resp_num_layers=2 -drop=0.1 +dropout=0.1 resp_activation='none' # Cell line classification network(s) @@ -91,6 +91,5 @@ rng_seed=0 save_path='save/unoMT' [Monitor_Params] -solr_root='' timeout=3600 diff --git 
a/Pilot1/UnoMT/unoMT_pytorch_model.py b/Pilot1/UnoMT/unoMT_pytorch_model.py index 2d94a0f0..59513161 100644 --- a/Pilot1/UnoMT/unoMT_pytorch_model.py +++ b/Pilot1/UnoMT/unoMT_pytorch_model.py @@ -261,7 +261,7 @@ def build_nn(self): resp_num_layers_per_block=args.resp_num_layers_per_block, resp_num_blocks=args.resp_num_blocks, resp_num_layers=args.resp_num_layers, - resp_dropout=args.drop, + resp_dropout=args.dropout, resp_activation=args.resp_activation).to(device) @@ -372,7 +372,7 @@ def update_l2regularizer(self, reg): def update_dropout(self, dropout_rate): - self.args.drop = dropout_rate + self.args.dropout = dropout_rate # Regressor for drug response self.resp_net = RespNet( @@ -386,7 +386,7 @@ def update_dropout(self, dropout_rate): resp_num_layers_per_block=self.args.resp_num_layers_per_block, resp_num_blocks=self.args.resp_num_blocks, resp_num_layers=self.args.resp_num_layers, - resp_dropout=self.args.drop, + resp_dropout=self.args.dropout, resp_activation=self.args.resp_activation).to(self.device) @@ -407,7 +407,7 @@ def pre_train_config(self): def train(self): - + args = self.args device = self.device @@ -429,7 +429,7 @@ def train(self): print('=' * 80 + '\nTraining Epoch %3i:' % (epoch + 1)) epoch_start_time = time.time() - + self.resp_lr_decay.step(epoch) self.cl_clf_lr_decay.step(epoch) self.drug_target_lr_decay.step(epoch) @@ -469,9 +469,9 @@ def train(self): if epoch >= args.resp_val_start_epoch: - + resp_r2 = self.validation(epoch) - + #print('\nValidation Results:') # Record the best R2 score (same data source) @@ -492,7 +492,7 @@ def train(self): def validation(self, epoch): - + args = self.args device = self.device @@ -504,7 +504,7 @@ def validation(self, epoch): site_clf_net=self.site_clf_net, type_clf_net=self.type_clf_net, data_loader=self.cl_clf_val_loader, ) - + self.val_cl_clf_acc.append([cl_category_acc, cl_site_acc, cl_type_acc]) # Validating drug target classifier @@ -519,7 +519,7 @@ def validation(self, epoch): 
valid_drug_qed(device=device, drug_qed_net=self.drug_qed_net, data_loader=self.drug_qed_val_loader) - + self.val_drug_qed_mse.append(drug_qed_mse) self.val_drug_qed_mae.append(drug_qed_mae) self.val_drug_qed_r2.append(drug_qed_r2) @@ -536,10 +536,10 @@ def validation(self, epoch): self.val_resp_r2.append(resp_r2) return resp_r2 - + def print_final_stats(self): - + args = self.args val_cl_clf_acc = np.array(self.val_cl_clf_acc).reshape(-1, 3) diff --git a/Pilot1/Uno_UQ/README.md b/Pilot1/Uno_UQ/README.md new file mode 100644 index 00000000..05bdeb74 --- /dev/null +++ b/Pilot1/Uno_UQ/README.md @@ -0,0 +1,537 @@ +## Uno_UQ: Predicting Tumor Dose Response across Multiple Data Sources with added UQ functionality. + + + +## Functionality + +Uno_UQ adds uncertainty quantification (UQ) functionality to the Uno model. For information about the underlying model, please refer to the Uno benchmark. + + + +This page overviews the added UQ functionality provided, which includes: + +- Generation of holdout set. + +- Training excluding the holdout set. + +- Inference for the specified data. + +- Training for homoscedastic and heteroscedastic models. + +- Empirical calibration of UQ for the trained models. + + + +## Holdout + +The holdout script generates a set of identifiers to hold out during training, depending on the --partition_by argument. + +If --partition_by is 'drug_pair' it generates a set of drug IDs. + +If --partition_by is 'cell' it generates a set of cell IDs. + +In any other case it generates a set of indices. + + + +The fraction to reserve in the holdout set is given by the --val_split argument. + + + +#### Example output + +``` +python uno_holdoutUQ_data.py +Using TensorFlow backend. 
+Importing candle utils for keras +Params: +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'by_cell': None, + 'by_drug': None, + 'cache': None, + 'cell_feature_subset_path': '', + 'cell_features': ['rnaseq'], + 'cell_subset_path': '', + 'cell_types': None, + 'cp': False, + 'cv': 1, + 'data_type': , + 'dense': [1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'dropout': 0.1, + 'drug_feature_subset_path': '', + 'drug_features': ['descriptors', 'fingerprints'], + 'drug_median_response_max': 1, + 'drug_median_response_min': -1, + 'drug_subset_path': '', + 'epochs': 10, + 'exclude_cells': [], + 'exclude_drugs': [], + 'experiment_id': 'EXP000', + 'export_csv': None, + 'export_data': None, + 'feature_subsample': 0, + 'feature_subset_path': '', + 'gpus': [], + 'growth_bins': 0, + 'initial_weights': None, + 'learning_rate': 0.01, + 'logfile': None, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'sgd', + 'output_dir': './Output/EXP000/RUN000', + 'partition_by': 'cell', + 'preprocess_rnaseq': 'none', + 'profiling': False, + 'reduce_lr': False, + 'residual': False, + 'rng_seed': 2018, + 'run_id': 'RUN000', + 'sample_repetition': False, + 'save_path': 'save_default/', + 'save_weights': 'default.weights.h5', + 'scaling': 'std', + 'shuffle': False, + 'single': True, + 'tb': False, + 'tb_prefix': 'tb', + 'test_sources': ['train'], + 'timeout': 3600, + 'train_bool': True, + 'train_sources': ['gCSI'], + 'use_exported_data': None, + 'use_filtered_genes': False, + 'use_landmark_genes': True, + 'val_split': 0.2, + 'verbose': None, + 'warmup_lr': False} +partition_by: cell +Cell IDs in holdout set written in file: save_default/infer_cell_ids + +``` + + + +## Train + +The train script trains the model, as in the underlying Uno benchmark, but excluding the IDs in the holdout file. 
The file with the holdout set should be provided via one of the following arguments + +- --uq_exclude_drugs_file='file' if the file contains a set of drug IDs. + +- --uq_exclude_cells_file='file' if the file contains a set of cell IDs. + +- --uq_exclude_indices_file='file' if the file contains a set of indices. + + + +An additional --loss heteroscedastic option is available. This will learn the input-dependent noise level as well as the main regression variable specified (i.e. growth or AUC). + + + +#### Example output + +``` + +python uno_trainUQ_keras2.py --cp True --uq_exclude_cells_file 'save_default/infer_cell_ids' + +Using TensorFlow backend. +Importing candle utils for keras +Params: +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'by_cell': None, + 'by_drug': None, + 'cache': None, + 'cell_feature_subset_path': '', + 'cell_features': ['rnaseq'], + 'cell_subset_path': '', + 'cell_types': None, + 'cp': True, + 'cv': 1, + 'data_type': , + 'dense': [1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'dropout': 0.1, + 'drug_feature_subset_path': '', + 'drug_features': ['descriptors', 'fingerprints'], + 'drug_median_response_max': 1, + 'drug_median_response_min': -1, + 'drug_subset_path': '', + 'epochs': 10, + 'exclude_cells': [], + 'exclude_drugs': [], + 'exclude_indices': [], + 'experiment_id': 'EXP000', + 'export_csv': None, + 'export_data': None, + 'feature_subsample': 0, + 'feature_subset_path': '', + 'gpus': [], + 'growth_bins': 0, + 'initial_weights': None, + 'learning_rate': 0.01, + 'logfile': None, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'sgd', + 'output_dir': './Output/EXP000/RUN000', + 'partition_by': 'cell', + 'preprocess_rnaseq': 'none', + 'reduce_lr': False, + 'reg_l2': 0.0, + 'residual': False, + 'rng_seed': 2018, + 'run_id': 'RUN000', + 'sample_repetition': False, + 
'save_path': 'save_default/', + 'save_weights': 'saved.weights.h5', + 'scaling': 'std', + 'shuffle': False, + 'single': True, + 'tb': False, + 'tb_prefix': 'tb', + 'test_sources': ['train'], + 'timeout': 3600, + 'train_bool': True, + 'train_sources': ['gCSI'], + 'uq_exclude_cells_file': 'save_default/infer_cell_ids', + 'use_exported_data': None, + 'use_filtered_genes': False, + 'use_landmark_genes': True, + 'val_split': 0.2, + 'verbose': None, + 'warmup_lr': False} +Read file: save_default/infer_cell_ids +Number of elements read: 72 +Cells to exclude: ['gCSI.NCI-H889', 'gCSI.MEWO', 'gCSI.PA-TU-8902', 'gCSI.BCPAP', 'gCSI.CAL-12T', 'gCSI.NCI-H727', 'gCSI.HUH-1', 'gCSI.NUGC-4', 'gCSI.MKN74', 'gCSI.PK-1', 'gCSI.A2058', 'gCSI.RAJI', 'gCSI.JHH-7', 'gCSI.SUIT-2', 'gCSI.OE21', 'gCSI.HCC1806', 'gCSI.PANC-10-05', 'gCSI.RMG-I', 'gCSI.NCI-H1703', 'gCSI.KMS-34', 'gCSI.G-361', 'gCSI.EPLC-272H', 'gCSI.HEP-G2', 'gCSI.RERF-LC-MS', 'gCSI.COLO-800', 'gCSI.KM12', 'gCSI.DOHH-2', 'gCSI.EFM-19', 'gCSI.MDA-MB-468', 'gCSI.MHH-ES-1', 'gCSI.IPC-298', 'gCSI.GRANTA-519', 'gCSI.8305C', 'gCSI.KYSE-140', 'gCSI.MALME-3M', 'gCSI.MIA-PACA-2', 'gCSI.NCI-H1666', 'gCSI.PC-3', 'gCSI.RT4', 'gCSI.HUP-T4', 'gCSI.NCI-H1869', 'gCSI.WM-266-4', 'gCSI.KMM-1', 'gCSI.OE33', 'gCSI.SU-DHL-6', 'gCSI.QGP-1', 'gCSI.IGR-37', 'gCSI.VMRC-RCW', 'gCSI.NCI-H1838', 'gCSI.SW948', 'gCSI.COLO-679', 'gCSI.CAL-51', 'gCSI.HUCCT1', 'gCSI.LP-1', 'gCSI.RPMI-7951', 'gCSI.HPAF-II', 'gCSI.OCUM-1', 'gCSI.HOP-92', 'gCSI.NCI-H661', 'gCSI.TOV-112D', 'gCSI.PANC-03-27', 'gCSI.AGS', 'gCSI.HEC-59', 'gCSI.LN-18', 'gCSI.U-87-MG', 'gCSI.U-2-OS', 'gCSI.ABC-1', 'gCSI.IGR-1', 'gCSI.SK-MEL-3', 'gCSI.A549', 'gCSI.HCC4006', 'gCSI.NCI-H1355'] +Combined model: +__________________________________________________________________________________________________ +Layer (type) Output Shape Param # Connected to +================================================================================================== +input.cell.rnaseq (InputLayer) (None, 942) 0 
+__________________________________________________________________________________________________ +input.drug1.descriptors (InputL (None, 5270) 0 +__________________________________________________________________________________________________ +input.drug1.fingerprints (Input (None, 2048) 0 +__________________________________________________________________________________________________ +cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] +__________________________________________________________________________________________________ +drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] +__________________________________________________________________________________________________ +drug.fingerprints (Model) (None, 1000) 4051000 input.drug1.fingerprints[0][0] +__________________________________________________________________________________________________ +concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] + drug.descriptors[1][0] + drug.fingerprints[1][0] +__________________________________________________________________________________________________ +dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] +__________________________________________________________________________________________________ +permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] +__________________________________________________________________________________________________ +dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] +__________________________________________________________________________________________________ +permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] +__________________________________________________________________________________________________ +dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] +__________________________________________________________________________________________________ +permanent_dropout_12 (Permanent (None, 1000) 0 
dense_12[0][0] +__________________________________________________________________________________________________ +dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] +================================================================================================== +Total params: 19,273,001 +Trainable params: 19,273,001 +Non-trainable params: 0 +__________________________________________________________________________________________________ +Training homoscedastic model: +partition:train, rank:0, sharded index size:2784, batch_size:32, steps:87 +partition:val, rank:0, sharded index size:704, batch_size:32, steps:22 +Between random pairs in y_val: + mse: 0.0604 + mae: 0.1978 + r2: -0.9105 + corr: 0.0447 +Data points per epoch: train = 2784, val = 704 +Steps per epoch: train = 87, val = 22 +Epoch 1/10 +87/87 [==============================] - 15s 174ms/step - loss: 0.2165 - mae: 0.2144 - r2: -6.4761 - val_loss: 0.0247 - val_mae: 0.1244 - val_r2: 0.1916 +Current time ....15.176 +Epoch 2/10 +87/87 [==============================] - 12s 142ms/step - loss: 0.0247 - mae: 0.1240 - r2: 0.1302 - val_loss: 0.0208 - val_mae: 0.1147 - val_r2: 0.3058 +Current time ....28.323 +Epoch 3/10 +87/87 [==============================] - 12s 143ms/step - loss: 0.0219 - mae: 0.1157 - r2: 0.2278 - val_loss: 0.0197 - val_mae: 0.1112 - val_r2: 0.3565 +Current time ....41.321 +Epoch 4/10 +87/87 [==============================] - 12s 143ms/step - loss: 0.0203 - mae: 0.1111 - r2: 0.2897 - val_loss: 0.0182 - val_mae: 0.1072 - val_r2: 0.3980 +Current time ....54.330 +Epoch 5/10 +87/87 [==============================] - 13s 153ms/step - loss: 0.0187 - mae: 0.1066 - r2: 0.3388 - val_loss: 0.0189 - val_mae: 0.1090 - val_r2: 0.3804 +Current time ....68.120 +Epoch 6/10 +87/87 [==============================] - 13s 148ms/step - loss: 0.0185 - mae: 0.1075 - r2: 0.3412 - val_loss: 0.0186 - val_mae: 0.1088 - val_r2: 0.3921 +Current time ....80.967 +Epoch 7/10 +87/87 
[==============================] - 13s 147ms/step - loss: 0.0185 - mae: 0.1069 - r2: 0.3468 - val_loss: 0.0177 - val_mae: 0.1043 - val_r2: 0.4259 +Current time ....93.769 +Epoch 8/10 +87/87 [==============================] - 13s 150ms/step - loss: 0.0176 - mae: 0.1031 - r2: 0.3791 - val_loss: 0.0159 - val_mae: 0.0994 - val_r2: 0.4793 +Current time ....107.421 +Epoch 9/10 +87/87 [==============================] - 13s 150ms/step - loss: 0.0177 - mae: 0.1034 - r2: 0.3745 - val_loss: 0.0161 - val_mae: 0.1000 - val_r2: 0.4696 +Current time ....120.945 +Epoch 10/10 +87/87 [==============================] - 14s 159ms/step - loss: 0.0169 - mae: 0.1022 - r2: 0.4086 - val_loss: 0.0173 - val_mae: 0.1029 - val_r2: 0.4337 +Current time ....134.744 +Comparing y_true and y_pred: + mse: 0.0165 + mae: 0.1016 + r2: 0.4782 + corr: 0.7072 +Testing predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted.tsv +Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.json +Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.h5 +Model weights stored in file: save_default//default.weights.h5 +partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 + +``` + + + +## Infer + +The infer script does inference on a trained model, as in the underlying Uno benchmark. This script is able to use a pre-generated file or it can construct the data to do inference if a set of identifiers are provided. + + + +The argument --uq_infer_file must be used to specify the name of the file with the data (or the identifiers) to do inference. + + + +Additionally, if the data needs to be constructed, then one of the following arguments should be used to specify what type of identifiers are provided + +- --uq_infer_given_drugs=True if the file contains a set of drug IDs. 
+ +- --uq_infer_given_cells=True if the file contains a set of cell IDs. + +- --uq_infer_given_indices=True if the file contains a set of indices. + + + +Note that the latter works if all the arguments for the data construction are set as well (usually those are taken from the model configuration file). Of course this specification and the trained model should be consistent for the script to work. + + + +Likewise, in the case that a pre-generated file is provided, the features included and the trained model should be consistent for the script to work. + + + +Note also that the --loss heteroscedastic option should be specified if the model was trained to predict the heterogeneous noise as well. + + + +#### Example output + +This assumes that a trained model (files default.model.json and default.weights.h5) is available at save_default folder. A sample json file compatible with the default model used in the training demo script is provided. After running the training script a default.weights.h5 file should be generated. Both, in combination, can be used for testing the inference demo script and would produce a similar output to the one shown next. + +``` + +python uno_inferUQ_keras2.py --uq_infer_file save_default/infer_cell_ids --uq_infer_given_cells True --model_file save_default/uno.A\=relu.B\=32.E\=10.O\=sgd.LS\=mse.LR\=0.01.CF\=r.DF\=df.DR\=0.1.L1000.D1\=1000.D2\=1000.D3\=1000.model.h5 --weights_file save_default/saved.weights.h5 --n_pred 10 +Using TensorFlow backend. 
+Importing candle utils for keras +Params: +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'by_cell': None, + 'by_drug': None, + 'cache': None, + 'cell_feature_subset_path': '', + 'cell_features': ['rnaseq'], + 'cell_subset_path': '', + 'cell_types': None, + 'cp': False, + 'cv': 1, + 'data_type': , + 'dense': [1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'dropout': 0.1, + 'drug_feature_subset_path': '', + 'drug_features': ['descriptors', 'fingerprints'], + 'drug_median_response_max': 1, + 'drug_median_response_min': -1, + 'drug_subset_path': '', + 'epochs': 10, + 'exclude_cells': [], + 'exclude_drugs': [], + 'experiment_id': 'EXP000', + 'export_csv': None, + 'export_data': None, + 'feature_subsample': 0, + 'feature_subset_path': '', + 'gpus': [], + 'growth_bins': 0, + 'initial_weights': None, + 'learning_rate': 0.01, + 'logfile': None, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'model_file': 'save_default/default.model.json', + 'n_pred': 10, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'sgd', + 'output_dir': './Output/EXP000/RUN000', + 'partition_by': 'cell', + 'preprocess_rnaseq': 'none', + 'profiling': False, + 'reduce_lr': False, + 'residual': False, + 'rng_seed': 2018, + 'run_id': 'RUN000', + 'sample_repetition': False, + 'save_path': 'save_default/', + 'save_weights': None, + 'scaling': 'std', + 'shuffle': False, + 'single': True, + 'tb': False, + 'tb_prefix': 'tb', + 'test_sources': ['train'], + 'timeout': 3600, + 'train_bool': True, + 'train_sources': ['gCSI'], + 'uq_infer_file': 'save_default/infer_cell_ids', + 'uq_infer_given_cells': True, + 'uq_infer_given_drugs': False, + 'uq_infer_given_indices': False, + 'use_exported_data': None, + 'use_filtered_genes': False, + 'use_landmark_genes': True, + 'val_split': 0.2, + 'verbose': None, + 'warmup_lr': False, + 'weights_file': 'save_default/saved.weights.h5'}
+__________________________________________________________________________________________________ +Layer (type) Output Shape Param # Connected to +================================================================================================== +input.cell.rnaseq (InputLayer) (None, 942) 0 +__________________________________________________________________________________________________ +input.drug1.descriptors (InputL (None, 5270) 0 +__________________________________________________________________________________________________ +input.drug1.fingerprints (Input (None, 2048) 0 +__________________________________________________________________________________________________ +cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] +__________________________________________________________________________________________________ +drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] +__________________________________________________________________________________________________ +drug.fingerprints (Model) (None, 1000) 4051000 input.drug1.fingerprints[0][0] +__________________________________________________________________________________________________ +concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] + drug.descriptors[1][0] + drug.fingerprints[1][0] +__________________________________________________________________________________________________ +dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] +__________________________________________________________________________________________________ +permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] +__________________________________________________________________________________________________ +dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] +__________________________________________________________________________________________________ +permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] 
+__________________________________________________________________________________________________ +dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] +__________________________________________________________________________________________________ +permanent_dropout_12 (Permanent (None, 1000) 0 dense_12[0][0] +__________________________________________________________________________________________________ +dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] +================================================================================================== +Total params: 19,273,001 +Trainable params: 19,273,001 +Non-trainable params: 0 +__________________________________________________________________________________________________ +partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 +Read file: save_default/infer_cell_ids +Number of elements read: 72 +Comparing y_true and y_pred: + mse: 0.0173 + mae: 0.1012 + r2: 0.4687 + corr: 0.7001 +Comparing y_true and y_pred: + mse: 0.0172 + mae: 0.1005 + r2: 0.4720 + corr: 0.7010 +Comparing y_true and y_pred: + mse: 0.0171 + mae: 0.1033 + r2: 0.4751 + corr: 0.7064 +Comparing y_true and y_pred: + mse: 0.0175 + mae: 0.1045 + r2: 0.4627 + corr: 0.6945 +Comparing y_true and y_pred: + mse: 0.0162 + mae: 0.1007 + r2: 0.5017 + corr: 0.7277 +Comparing y_true and y_pred: + mse: 0.0166 + mae: 0.1008 + r2: 0.4921 + corr: 0.7141 +Comparing y_true and y_pred: + mse: 0.0181 + mae: 0.1059 + r2: 0.4443 + corr: 0.6878 +Comparing y_true and y_pred: + mse: 0.0167 + mae: 0.1015 + r2: 0.4875 + corr: 0.7087 +Comparing y_true and y_pred: + mse: 0.0169 + mae: 0.1032 + r2: 0.4805 + corr: 0.7106 +Comparing y_true and y_pred: + mse: 0.0169 + mae: 0.0999 + r2: 0.4817 + corr: 0.7075 +Predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=None.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted_INFER.tsv +``` + + + +## Empirical Calibration + +Scripts included in the calibration 
subfolder compute empirical calibration for the inference results. The scripts with suffix HOM compute empirical calibration for inference with a homoscedastic model, while the script with suffix HET computes empirical calibration for inference with a heteroscedastic model. + + + +To run the scripts it is necessary to provide the path to the file and the name of the file with the inference results. Note that it is assumed that the file with the inference results includes each realization of the inference (implicit in the 'all' suffix), but for the homoscedastic case a script is provided to process an inference file with only the consolidated statistics (generally the average over all the realizations). Also, note that a specific format of the file with the inference results is assumed. Thus, a set of default values, reflecting the format of current CANDLE infer scripts, is used. More arbitrary formats may be usable if they have similar column offsets, but this would require passing the right parameters to the function reading the inference file. + + + +The script generates a series of plots and pickle (dill) files, displaying and encoding the empirical calibration computed. + diff --git a/Pilot1/Uno_UQ/calibration/calibration_HET.py b/Pilot1/Uno_UQ/calibration/calibration_HET.py new file mode 100644 index 00000000..ab354d76 --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_HET.py @@ -0,0 +1,115 @@ +#!
/usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_HET.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ( "FILENAME: usually .predicted_INFER_HET.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + index_dp = filename.find('DR=') + if index_dp == -1: # DR is not in filename + print('Enter dropout rate ') + dp_perc = input() + else: + if filename[index_dp + 6] == '.': + dp = float(filename[index_dp+3:index_dp+3+3]) + else: + dp = float(filename[index_dp+3:index_dp+3+4]) + + print('Droput rate: ', dp) + dp_perc = dp * 100. 
+ method = 'Dropout ' + str(dp_perc) + '%' + prefix = folder_out + 'heteroscedastic_DR=' + str(dp_perc) + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name = candle.compute_statistics_heteroscedastic(df_data) + + # storing sigma + fname = prefix + '_sigma.pkl' + with open(fname, 'wb') as f: + pickle.dump(sigma, f, protocol=4) + print('Sigma stored in file: ', fname) + + #plots + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 31 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, 
eabs_red) + + # storing calibration + fname = prefix + '_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + +if __name__ == '__main__': + main() + + diff --git a/Pilot1/Uno_UQ/calibration/calibration_HOM.py b/Pilot1/Uno_UQ/calibration/calibration_HOM.py new file mode 100644 index 00000000..a9440fcb --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_HOM.py @@ -0,0 +1,98 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_HOM.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ("FILENAME: usually _pred.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + method = 'Dropout' + prefix = folder_out + 'homoscedastic_DR' + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name = candle.compute_statistics_homoscedastic(df_data) + + #plots + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + 
candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 60 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, eabs_red) + + # storing calibration + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + fname = prefix + '_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + + +if __name__ == '__main__': + main() + + diff --git a/Pilot1/Uno_UQ/calibration/calibration_HOM_all.py 
b/Pilot1/Uno_UQ/calibration/calibration_HOM_all.py new file mode 100644 index 00000000..df7e064b --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_HOM_all.py @@ -0,0 +1,98 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_HOM_all.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ("FILENAME: usually .predicted_INFER.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + method = 'Dropout' + prefix = folder_out + 'homoscedastic_DR' + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name = candle.compute_statistics_homoscedastic_all(df_data) + + #plots + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 60 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, 
coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, eabs_red) + + # storing calibration + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + fname = prefix + '_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + + +if __name__ == '__main__': + main() + + diff --git a/Pilot1/Uno_UQ/calibration/calibration_QTL.py b/Pilot1/Uno_UQ/calibration/calibration_QTL.py new file mode 100644 index 00000000..65f12710 --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_QTL.py @@ -0,0 +1,117 @@ +#! 
/usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_QTL.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ( "FILENAME: usually .predicted_INFER_QTL.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + index_dp = filename.find('DR=') + if index_dp == -1: # DR is not in filename + print('Enter dropout rate ') + dp_perc = input() + else: + if filename[index_dp + 6] == '.': + dp = float(filename[index_dp+3:index_dp+3+3]) + else: + dp = float(filename[index_dp+3:index_dp+3+4]) + + print('Droput rate: ', dp) + dp_perc = dp * 100. 
+ method = 'Dropout ' + str(dp_perc) + '%' + prefix = folder_out + 'quantile_DR=' + str(dp_perc) + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name, Ypred_10p_mean, Ypred_90p_mean = candle.compute_statistics_quantile(df_data) + + # storing sigma + fname = prefix + '_sigma.pkl' + with open(fname, 'wb') as f: + pickle.dump(sigma, f, protocol=4) + print('Sigma stored in file: ', fname) + + #plots + percentile_list = ['50p', '10p', '90p'] + candle.plot_percentile_predictions(Ypred_mean, Ypred_10p_mean, Ypred_90p_mean, percentile_list, pred_name, prefix) + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 31 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = 
pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, eabs_red) + + # storing calibration + fname = prefix + '_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + +if __name__ == '__main__': + main() + + diff --git a/Pilot1/Uno_UQ/data_utils_/__init__.py b/Pilot1/Uno_UQ/data_utils_/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/__init__.py @@ -0,0 +1 @@ + diff --git a/Pilot1/Uno_UQ/data_utils_/cellline_data.py b/Pilot1/Uno_UQ/data_utils_/cellline_data.py new file mode 100644 index 00000000..af7e369a --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/cellline_data.py @@ -0,0 +1,97 @@ + +import pandas as pd +import numpy as np + +import candle_keras as candle + +from uno import get_file_p1 as get_file +from uno import loggerUno as logger +from uno import DATA_URL + + +def load_cell_metadata(): + path = get_file(DATA_URL + 'cl_metadata') + df = pd.read_csv(path, sep='\t') + return df + + +def cell_name_to_ids(name, source=None): + path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.txt') + df1 = pd.read_csv(path, sep='\t') + hits1 = candle.lookup(df1, name, 'NCI60.ID', ['NCI60.ID', 'CELLNAME', 'Name'], match='contains') + path = get_file(DATA_URL + 'cl_mapping') + df2 = pd.read_csv(path, sep='\t', header=None) + hits2 = candle.lookup(df2, name, [0, 1], [0, 1], match='contains') + hits = hits1 + hits2 + if source: + hits = [x for x in hits if x.startswith(source.upper()+'.')] + return hits + + +def load_cell_rnaseq(ncols=None, 
scaling='std', imputing='mean', add_prefix=True, + use_landmark_genes=False, use_filtered_genes=False, + feature_subset=None, preprocess_rnaseq=None, + embed_feature_source=False, sample_set=None, index_by_sample=False): + + if use_landmark_genes: + filename = 'combined_rnaseq_data_lincs1000' + elif use_filtered_genes: + filename = 'combined_rnaseq_data_filtered' + else: + filename = 'combined_rnaseq_data' + + if preprocess_rnaseq and preprocess_rnaseq != 'none': + scaling = None + filename += ('_' + preprocess_rnaseq) # 'source_scale' or 'combat' + + path = get_file(DATA_URL + filename) + df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0) + total = df_cols.shape[1] - 1 # remove Sample column + if 'Cancer_type_id' in df_cols.columns: + total -= 1 + usecols = None + if ncols and ncols < total: + usecols = np.random.choice(total, size=ncols, replace=False) + usecols = np.append([0], np.add(sorted(usecols), 2)) + df_cols = df_cols.iloc[:, usecols] + if feature_subset: + with_prefix = lambda x: 'rnaseq.'+x if add_prefix else x + usecols = [0] + [i for i, c in enumerate(df_cols.columns) if with_prefix(c) in feature_subset] + df_cols = df_cols.iloc[:, usecols] + + dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) + df = pd.read_csv(path, engine='c', sep='\t', usecols=usecols, dtype=dtype_dict) + if 'Cancer_type_id' in df.columns: + df.drop('Cancer_type_id', axis=1, inplace=True) + + prefixes = df['Sample'].str.extract('^([^.]*)', expand=False).rename('Source') + sources = prefixes.drop_duplicates().reset_index(drop=True) + df_source = pd.get_dummies(sources, prefix='rnaseq.source', prefix_sep='.') + df_source = pd.concat([sources, df_source], axis=1) + + df1 = df['Sample'] + if embed_feature_source: + df_sample_source = pd.concat([df1, prefixes], axis=1) + df1 = df_sample_source.merge(df_source, on='Source', how='left').drop('Source', axis=1) + logger.info('Embedding RNAseq data source into features: %d additional columns', df1.shape[1]-1) + + df2 = 
df.drop('Sample', 1) + if add_prefix: + df2 = df2.add_prefix('rnaseq.') + + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing) + + df = pd.concat([df1, df2], axis=1) + + # scaling needs to be done before subsampling + if sample_set: + chosen = df['Sample'].str.startswith(sample_set) + df = df[chosen].reset_index(drop=True) + + if index_by_sample: + df = df.set_index('Sample') + + logger.info('Loaded combined RNAseq data: %s', df.shape) + + return df + diff --git a/Pilot1/Uno_UQ/data_utils_/drug_data.py b/Pilot1/Uno_UQ/data_utils_/drug_data.py new file mode 100644 index 00000000..cad8e326 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/drug_data.py @@ -0,0 +1,188 @@ + +import pandas as pd +import numpy as np + +import candle_keras as candle + +from uno import get_file_p1 as get_file +from uno import loggerUno as logger +from uno import DATA_URL + + +def load_drug_data(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True): + df_info = load_drug_info() + df_info['Drug'] = df_info['PUBCHEM'] + + df_desc = load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=ncols) + df_fp = load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=ncols) + + df_desc = pd.merge(df_info[['ID', 'Drug']], df_desc, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + df_fp = pd.merge(df_info[['ID', 'Drug']], df_fp, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + + df_desc2 = load_drug_set_descriptors(drug_set='NCI60', usecols=df_desc.columns.tolist() if ncols else None) + df_fp2 = load_drug_set_fingerprints(drug_set='NCI60', usecols=df_fp.columns.tolist() if ncols else None) + + df_desc = pd.concat([df_desc, df_desc2]).reset_index(drop=True) + df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) + df2 = df_desc.drop('Drug', 1) + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=scaling, imputing=imputing, dropna=dropna) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + df_desc = pd.concat([df1, df2], axis=1) + + df_fp = 
pd.concat([df_fp, df_fp2]).reset_index(drop=True) + df1 = pd.DataFrame(df_fp.loc[:, 'Drug']) + df2 = df_fp.drop('Drug', 1) + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=None, imputing=imputing, dropna=dropna) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + df_fp = pd.concat([df1, df2], axis=1) + + logger.info('Loaded combined dragon7 drug descriptors: %s', df_desc.shape) + logger.info('Loaded combined dragon7 drug fingerprints: %s', df_fp.shape) + + return df_desc, df_fp + + +def load_drug_descriptors(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): + df_info = load_drug_info() + df_info['Drug'] = df_info['PUBCHEM'] + + df_desc = load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=ncols) + df_desc = pd.merge(df_info[['ID', 'Drug']], df_desc, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + + df_desc2 = load_drug_set_descriptors(drug_set='NCI60', usecols=df_desc.columns.tolist() if ncols else None) + + df_desc = pd.concat([df_desc, df_desc2]).reset_index(drop=True) + df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) + df2 = df_desc.drop('Drug', 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + if feature_subset: + df2 = df2[[x for x in df2.columns if x in feature_subset]] + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=scaling, imputing=imputing, dropna=dropna) + df_desc = pd.concat([df1, df2], axis=1) + + logger.info('Loaded combined dragon7 drug descriptors: %s', df_desc.shape) + + return df_desc + + +def load_drug_fingerprints(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): + df_info = load_drug_info() + df_info['Drug'] = df_info['PUBCHEM'] + + df_fp = load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=ncols) + df_fp = pd.merge(df_info[['ID', 'Drug']], df_fp, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + + df_fp2 = load_drug_set_fingerprints(drug_set='NCI60', usecols=df_fp.columns.tolist() if 
ncols else None) + + df_fp = pd.concat([df_fp, df_fp2]).reset_index(drop=True) + df1 = pd.DataFrame(df_fp.loc[:, 'Drug']) + df2 = df_fp.drop('Drug', 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + if feature_subset: + df2 = df2[[x for x in df2.columns if x in feature_subset]] + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=None, imputing=imputing, dropna=dropna) + df_fp = pd.concat([df1, df2], axis=1) + + logger.info('Loaded combined dragon7 drug fingerprints: %s', df_fp.shape) + + return df_fp + + +def load_drug_info(): + path = get_file(DATA_URL + 'drug_info') + df = pd.read_csv(path, sep='\t', dtype=object) + df['PUBCHEM'] = 'PubChem.CID.' + df['PUBCHEM'] + return df + + +def drug_name_to_ids(name, source=None): + df1 = load_drug_info() + path = get_file(DATA_URL + 'NCI_IOA_AOA_drugs') + df2 = pd.read_csv(path, sep='\t', dtype=str) + df2['NSC'] = 'NSC.' + df2['NSC'] + hits1 = candle.lookup(df1, name, 'ID', ['ID', 'NAME', 'CLEAN_NAME', 'PUBCHEM']) + hits2 = candle.lookup(df2, name, 'NSC', ['NSC', 'Generic Name', 'Preffered Name']) + hits = hits1 + hits2 + if source: + hits = [x for x in hits if x.startswith(source.upper()+'.')] + return hits + + +def load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=None, usecols=None, + scaling=None, imputing=None, add_prefix=False): + path = get_file(DATA_URL + '{}_dragon7_descriptors.tsv'.format(drug_set)) + + df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0) + total = df_cols.shape[1] - 1 + if usecols is not None: + usecols = [x for x in usecols if x in df_cols.columns] + if usecols[0] != 'NAME': + usecols = ['NAME'] + usecols + df_cols = df_cols.loc[:, usecols] + elif ncols and ncols < total: + usecols = np.random.choice(total, size=ncols, replace=False) + usecols = np.append([0], np.add(sorted(usecols), 1)) + df_cols = df_cols.iloc[:, usecols] + + dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) + df = pd.read_csv(path, engine='c', sep='\t', usecols=usecols, 
dtype=dtype_dict, + na_values=['na', '-', '']) + + df1 = pd.DataFrame(df.loc[:, 'NAME']) + df1.rename(columns={'NAME': 'Drug'}, inplace=True) + + df2 = df.drop('NAME', 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing, dropna=None) + + df = pd.concat([df1, df2], axis=1) + return df + + +def load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=None, usecols=None, + scaling=None, imputing=None, add_prefix=False): + fps = ['PFP', 'ECFP'] + usecols_all = usecols + df_merged = None + for fp in fps: + path = get_file(DATA_URL + '{}_dragon7_{}.tsv'.format(drug_set, fp)) + df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0, skiprows=1, header=None) + total = df_cols.shape[1] - 1 + if usecols_all is not None: + usecols = [x.replace(fp+'.', '') for x in usecols_all] + usecols = [int(x) for x in usecols if x.isdigit()] + usecols = [x for x in usecols if x in df_cols.columns] + if usecols[0] != 0: + usecols = [0] + usecols + df_cols = df_cols.loc[:, usecols] + elif ncols and ncols < total: + usecols = np.random.choice(total, size=ncols, replace=False) + usecols = np.append([0], np.add(sorted(usecols), 1)) + df_cols = df_cols.iloc[:, usecols] + + dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) + df = pd.read_csv(path, engine='c', sep='\t', skiprows=1, header=None, + usecols=usecols, dtype=dtype_dict) + df.columns = ['{}.{}'.format(fp, x) for x in df.columns] + + col1 = '{}.0'.format(fp) + df1 = pd.DataFrame(df.loc[:, col1]) + df1.rename(columns={col1: 'Drug'}, inplace=True) + + df2 = df.drop(col1, 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing, dropna=None) + + df = pd.concat([df1, df2], axis=1) + + df_merged = df if df_merged is None else df_merged.merge(df) + + return df_merged diff --git a/Pilot1/Uno_UQ/data_utils_/response_data.py b/Pilot1/Uno_UQ/data_utils_/response_data.py new file mode 
100644 index 00000000..d4080da8 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/response_data.py @@ -0,0 +1,175 @@ + +import pandas as pd +import numpy as np + +from uno import get_file_p1 as get_file +from uno import loggerUno as logger +from uno import DATA_URL + +global_cache = {} + +def save_combined_dose_response(): + df1 = load_single_dose_response(combo_format=True, fraction=False) + df2 = load_combo_dose_response(fraction=False) + df = pd.concat([df1, df2]) + df.to_csv('combined_drug_growth', index=False, sep='\t') + + +def load_combined_dose_response(rename=True): + df1 = load_single_dose_response(combo_format=True) + logger.info('Loaded {} single drug dose response measurements'.format(df1.shape[0])) + + df2 = load_combo_dose_response() + logger.info('Loaded {} drug pair dose response measurements'.format(df2.shape[0])) + + df = pd.concat([df1, df2]) + logger.info('Combined dose response data contains sources: {}'.format(df['SOURCE'].unique())) + + if rename: + df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', + 'DRUG1': 'Drug1', 'DRUG2': 'Drug2', + 'DOSE1': 'Dose1', 'DOSE2': 'Dose2', + 'GROWTH': 'Growth', 'STUDY': 'Study'}) + return df + + +def load_single_dose_response(combo_format=False, fraction=True): + # path = get_file(DATA_URL + 'combined_single_drug_growth') + path = get_file(DATA_URL + 'rescaled_combined_single_drug_growth') + + df = global_cache.get(path) + if df is None: + df = pd.read_csv(path, sep='\t', engine='c', + na_values=['na', '-', ''], + # nrows=10, + dtype={'SOURCE': str, 'DRUG_ID': str, + 'CELLNAME': str, 'CONCUNIT': str, + 'LOG_CONCENTRATION': np.float32, + 'EXPID': str, 'GROWTH': np.float32}) + global_cache[path] = df + + df['DOSE'] = -df['LOG_CONCENTRATION'] + + df = df.rename(columns={'CELLNAME': 'CELL', 'DRUG_ID': 'DRUG', 'EXPID': 'STUDY'}) + df = df[['SOURCE', 'CELL', 'DRUG', 'DOSE', 'GROWTH', 'STUDY']] + + if fraction: + df['GROWTH'] /= 100 + + if combo_format: + df = df.rename(columns={'DRUG': 'DRUG1', 'DOSE': 
'DOSE1'}) + df['DRUG2'] = np.nan + df['DOSE2'] = np.nan + df['DRUG2'] = df['DRUG2'].astype(object) + df['DOSE2'] = df['DOSE2'].astype(np.float32) + df = df[['SOURCE', 'CELL', 'DRUG1', 'DOSE1', 'DRUG2', 'DOSE2', 'GROWTH', 'STUDY']] + + return df + + +def load_combo_dose_response(fraction=True): + path = get_file(DATA_URL + 'ComboDrugGrowth_Nov2017.csv') + df = global_cache.get(path) + if df is None: + df = pd.read_csv(path, sep=',', engine='c', + na_values=['na','-',''], + usecols=['CELLNAME', 'NSC1', 'CONC1', 'NSC2', 'CONC2', + 'PERCENTGROWTH', 'VALID', 'SCREENER', 'STUDY'], + # nrows=10000, + dtype={'CELLNAME': str, 'NSC1': str, 'NSC2': str, + 'CONC1': np.float32, 'CONC2': np.float32, + 'PERCENTGROWTH':np.float32, 'VALID': str, + 'SCREENER': str, 'STUDY': str}, + error_bad_lines=False, warn_bad_lines=True) + global_cache[path] = df + + df = df[df['VALID'] == 'Y'] + + df['SOURCE'] = 'ALMANAC.' + df['SCREENER'] + + cellmap_path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.txt') + df_cellmap = pd.read_csv(cellmap_path, sep='\t') + df_cellmap.set_index('Name', inplace=True) + cellmap = df_cellmap[['NCI60.ID']].to_dict()['NCI60.ID'] + + df['CELL'] = df['CELLNAME'].map(lambda x: cellmap[x]) + + df['DOSE1'] = -np.log10(df['CONC1']) + df['DOSE2'] = -np.log10(df['CONC2']) + + df['DRUG1'] = 'NSC.' + df['NSC1'] + df['DRUG2'] = 'NSC.' 
+ df['NSC2'] + + if fraction: + df['GROWTH'] = df['PERCENTGROWTH'] / 100 + else: + df['GROWTH'] = df['PERCENTGROWTH'] + + df = df[['SOURCE', 'CELL', 'DRUG1', 'DOSE1', 'DRUG2', 'DOSE2', 'GROWTH', 'STUDY']] + + return df + + +def load_aggregated_single_response(target='AUC', min_r2_fit=0.3, max_ec50_se=3, combo_format=False, rename=True): + path = get_file(DATA_URL + 'combined_single_response_agg') + + df = global_cache.get(path) + if df is None: + df = pd.read_csv(path, engine='c', sep='\t', + dtype={'SOURCE': str, 'CELL': str, 'DRUG': str, 'STUDY': str, + 'AUC': np.float32, 'IC50': np.float32, + 'EC50': np.float32, 'EC50se': np.float32, + 'R2fit': np.float32, 'Einf': np.float32, + 'HS': np.float32, 'AAC1': np.float32, + 'AUC1': np.float32, 'DSS1': np.float32}) + global_cache[path] = df + + total = len(df) + + df = df[(df['R2fit'] >= min_r2_fit) & (df['EC50se'] <= max_ec50_se)] + df = df[['SOURCE', 'CELL', 'DRUG', target, 'STUDY']] + df = df[~df[target].isnull()] + + logger.info('Loaded %d dose independent response samples (filtered by EC50se <= %f & R2fit >=%f from a total of %d).', len(df), max_ec50_se, min_r2_fit, total) + + if combo_format: + df = df.rename(columns={'DRUG': 'DRUG1'}) + df['DRUG2'] = np.nan + df['DRUG2'] = df['DRUG2'].astype(object) + df = df[['SOURCE', 'CELL', 'DRUG1', 'DRUG2', target, 'STUDY']] + if rename: + df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', + 'DRUG1': 'Drug1', 'DRUG2': 'Drug2', 'STUDY': 'Study'}) + else: + if rename: + df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', + 'DRUG': 'Drug', 'STUDY': 'Study'}) + + return df + + + +def select_drugs_with_response_range(df_response, lower=0, upper=0, span=0, lower_median=None, upper_median=None): + df = df_response.groupby(['Drug1', 'Sample'])['Growth'].agg(['min', 'max', 'median']) + df['span'] = df['max'].clip(lower=-1, upper=1) - df['min'].clip(lower=-1, upper=1) + df = df.groupby('Drug1').mean().reset_index().rename(columns={'Drug1': 'Drug'}) + mask = 
(df['min'] <= lower) & (df['max'] >= upper) & (df['span'] >= span) + if lower_median: + mask &= (df['median'] >= lower_median) + if upper_median: + mask &= (df['median'] <= upper_median) + df_sub = df[mask] + return df_sub + + +def summarize_response_data(df, target=None): + target = target or 'Growth' + df_sum = df.groupby('Source').agg({target: 'count', 'Sample': 'nunique', + 'Drug1': 'nunique', 'Drug2': 'nunique'}) + if 'Dose1' in df_sum: + df_sum['MedianDose'] = df.groupby('Source').agg({'Dose1': 'median'}) + return df_sum + + + + diff --git a/Pilot1/Uno_UQ/data_utils_/uno.py b/Pilot1/Uno_UQ/data_utils_/uno.py new file mode 100644 index 00000000..26e80d4e --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/uno.py @@ -0,0 +1,352 @@ +from __future__ import print_function + +import os +import sys +import logging +import argparse +try: + import configparser +except ImportError: + import ConfigParser as configparser + +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error +from scipy.stats.stats import pearsonr + +#file_path = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.dirname(os.path.realpath(os.path.join(__file__, '..'))) +lib_path = os.path.abspath(os.path.join(file_path, '..')) +sys.path.append(lib_path) +lib_path = os.path.abspath(os.path.join(file_path, 'data_utils_')) +sys.path.append(lib_path) +lib_path = os.path.abspath(os.path.join(file_path, 'model_utils_')) +sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path2) + + +import candle + +P1B3_URL = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B3/' +DATA_URL = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/combo/' + +loggerUno = logging.getLogger(__name__) + + +def set_up_logger(logfile, logger1, logger2, verbose): + candle.verify_path(logfile) + fh = logging.FileHandler(logfile) + fh.setFormatter(logging.Formatter("[%(asctime)s %(process)d] %(message)s", datefmt="%Y-%m-%d 
%H:%M:%S")) + fh.setLevel(logging.DEBUG) + + sh = logging.StreamHandler() + sh.setFormatter(logging.Formatter('')) + sh.setLevel(logging.DEBUG if verbose else logging.INFO) + + for log in [logger1, logger2]: + log.setLevel(logging.DEBUG) + log.addHandler(fh) + log.addHandler(sh) + + +def extension_from_parameters(args): + """Construct string for saving model with annotation of parameters""" + ext = '' + ext += '.A={}'.format(args.activation) + ext += '.B={}'.format(args.batch_size) + ext += '.E={}'.format(args.epochs) + ext += '.O={}'.format(args.optimizer) + ext += '.LS={}'.format(args.loss) + # ext += '.LEN={}'.format(args.maxlen) + ext += '.LR={}'.format(args.learning_rate) + ext += '.CF={}'.format(''.join([x[0] for x in sorted(args.cell_features)])) + ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) + if args.feature_subsample > 0: + ext += '.FS={}'.format(args.feature_subsample) + if args.dropout > 0: + ext += '.DR={}'.format(args.dropout) + if args.warmup_lr: + ext += '.wu_lr' + if args.reduce_lr: + ext += '.re_lr' + if args.residual: + ext += '.res' + if args.use_landmark_genes: + ext += '.L1000' + if args.no_gen: + ext += '.ng' + for i, n in enumerate(args.dense): + if n > 0: + ext += '.D{}={}'.format(i+1, n) + if args.dense_feature_layers != args.dense: + for i, n in enumerate(args.dense): + if n > 0: + ext += '.FD{}={}'.format(i+1, n) + + return ext + +def set_up_logger_data(verbose=False): + sh = logging.StreamHandler() + sh.setFormatter(logging.Formatter('')) + sh.setLevel(logging.DEBUG if verbose else logging.INFO) + + logger.setLevel(logging.DEBUG) + logger.addHandler(sh) + + +def log_evaluation(metric_outputs, logger, description='Comparing y_true and y_pred:'): + logger.info(description) + for metric, value in metric_outputs.items(): + logger.info(' {}: {:.4f}'.format(metric, value)) + + +def get_file_p1(url): + fname = os.path.basename(url) + return candle.get_file(fname, origin=url, cache_subdir='Pilot1') + + +def 
dict_compare(d1, d2, ignore=[], expand=False): + d1_keys = set(d1.keys()) - set(ignore) + d2_keys = set(d2.keys()) - set(ignore) + intersect_keys = d1_keys.intersection(d2_keys) + added = d1_keys - d2_keys + removed = d2_keys - d1_keys + modified = set({x : (d1[x], d2[x]) for x in intersect_keys if d1[x] != d2[x]}) + common = set(x for x in intersect_keys if d1[x] == d2[x]) + equal = not (added or removed or modified) + if expand: + return equal, added, removed, modified, common + else: + return equal, added | removed | modified + + +def evaluate_prediction(y_true, y_pred): + mse = mean_squared_error(y_true, y_pred) + mae = mean_absolute_error(y_true, y_pred) + r2 = r2_score(y_true, y_pred) + corr, _ = pearsonr(y_true, y_pred) + return {'mse': mse, 'mae': mae, 'r2': r2, 'corr': corr} + + +def read_IDs_file(fname): + + with open(fname, 'r') as f: + read_ids = f.read().splitlines() + + loggerUno.info('Read file: {}'.format(fname)) + loggerUno.info('Number of elements read: {}'.format(len(read_ids))) + + return read_ids + + +class BenchmarkUno(candle.Benchmark): + + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. 
+ """ + + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + + +additional_definitions = [ +# Feature selection + {'name':'agg_dose', + 'type': str, + 'default': None, + 'choices':['AUC', 'IC50', 'EC50', 'HS', 'AAC1', 'AUC1', 'DSS1'], + 'help':'use dose-independent response data with the specified aggregation metric'}, + {'name':'cell_features', + 'nargs':'+', + 'choices':['rnaseq', 'none'], + 'help':'use rnaseq cell line feature set or none at all'}, + {'name':'drug_features', + 'nargs':'+', + 'choices':['descriptors', 'fingerprints', 'none'], + 'help':'use dragon7 descriptors or fingerprint descriptors for drug features or none at all'}, + {'name': 'by_cell', + 'type':str, + 'default':None, + 'help':'sample ID for building a by-cell model'}, + {'name': 'by_drug', + 'type':str, + 'default':None, + 'help':'drug ID or name for building a by-drug model'}, +# Data set selection + {'name':'train_sources', + 'nargs':'+', + 'choices':['all', 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60', 'SCL', 'SCLC', 'ALMANAC'], + 'help':'use one or more sources of drug response data for training'}, + {'name':'test_sources', + 'nargs':'+', + 'choices':['train', 'all', 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60', 'SCL', 'SCLC', 'ALMANAC'], + 'help':'use one or more sources of drug response data for testing'}, +# Sample selection + {'name':'cell_types', + 'nargs':'+', + 'help':'limit training and test data to one or more tissue types'}, + {'name':'cell_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited molecular sample IDs to keep'}, + {'name':'drug_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited drug IDs to keep'}, + {'name':'drug_median_response_min', + 'type':float, + 'default':-1, + 'help':'keep drugs whose median response is greater than the threshold'}, + {'name':'drug_median_response_max', + 'type':float, + 
'default':1, + 'help':'keep drugs whose median response is less than the threshold'}, +# Training + {'name':'no_feature_source', + 'type': candle.str2bool, + 'default': False, + 'help':'do not embed cell or drug feature source as part of input'}, + {'name':'no_response_source', + 'type': candle.str2bool, + 'default': False, + 'help':'do not encode response data source as an input feature'}, + {'name':'dense_feature_layers', + 'nargs':'+', + 'type':int, + 'help':'number of neurons in intermediate dense layers in the feature encoding submodels'}, + {'name':'use_landmark_genes', + 'type': candle.str2bool, + 'default': False, + 'help':'use the 978 landmark genes from LINCS (L1000) as expression features'}, + {'name':'use_filtered_genes', + 'type': candle.str2bool, + 'default': False, + 'help':'use the variance filtered genes as expression features'}, + {'name':'feature_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited features to keep'}, + {'name':'cell_feature_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited molecular features to keep'}, + {'name':'drug_feature_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited drug features to keep'}, + {'name':'preprocess_rnaseq', + 'choices':['source_scale', 'combat', 'none'], + 'default':'none', + 'help':'preprocessing method for RNAseq data; none for global normalization'}, + {'name':'residual', + 'type': candle.str2bool, + 'default': False, + 'help':'add skip connections to the layers'}, + {'name':'reduce_lr', + 'type': candle.str2bool, + 'default': False, + 'help':'reduce learning rate on plateau'}, + {'name':'warmup_lr', + 'type': candle.str2bool, + 'default': False, + 'help':'gradually increase learning rate on start'}, + {'name':'base_lr', + 'type':float, + 'default':None, + 'help':'base learning rate'}, + {'name':'cp', + 'type': candle.str2bool, + 'default': False, + 'help':'checkpoint models with best 
val_loss'}, + {'name':'tb', + 'type': candle.str2bool, + 'default': False, + 'help':'use tensorboard'}, + {'name': 'tb_prefix', + 'type': str, + 'default': 'tb', + 'help': 'prefix name for tb log'}, + {'name':'max_val_loss', + 'type':float, + 'default':argparse.SUPPRESS, + 'help':'retrain if val_loss is greater than the threshold'}, + {'name':'partition_by', + 'choices':['index', 'drug_pair', 'cell'], + 'default':None, + 'help':'cross validation paritioning scheme'}, + {'name':'cv', + 'type':int, + 'default':argparse.SUPPRESS, + 'help':'cross validation folds'}, + {'name':'no_gen', + 'type': candle.str2bool, + 'default': False, + 'help':'do not use generator for training and validation data'}, + {'name':'cache', + 'type': str, + 'default': None, + 'help':'prefix of data cache files to use'}, + {'name':'single', + 'type': candle.str2bool, + 'default': False, + 'help':'do not use drug pair representation'}, + {'name': 'export_csv', + 'type': str, + 'default': None, + 'help': 'output csv file name'}, + {'name':'export_data', + 'type': str, + 'default': None, + 'help':'output dataframe file name'}, + {'name': 'use_exported_data', + 'type': str, + 'default': None, + 'help': 'exported file name'}, + {'name':'growth_bins', + 'type': int, + 'default': 0, + 'help':'number of bins to use when discretizing growth response'}, + {'name' : 'initial_weights', + 'type' : str, + 'default': None, + 'help' : 'file name of initial weights'}, + {'name' : 'save_weights', + 'type': str, + 'default' : None, + 'help': 'name of file to save weights to' }, + {'name':'exclude_cells', 'nargs':'+', + 'default': [], + 'help':'cell line IDs to exclude'}, + {'name':'exclude_drugs', 'nargs':'+', + 'default': [], + 'help':'drug line IDs to exclude'}, + {'name':'sample_repetition', + 'type': candle.str2bool, + 'default': False, + 'help':'allow repetition of training data'} +] + + + +required = [ + 'activation', + 'batch_size', + 'dense', + 'dense_feature_layers', + 'dropout', + 'epochs', + 
'feature_subsample', + 'learning_rate', + 'loss', + 'optimizer', + 'residual', + 'rng_seed', + 'save_path', + 'scaling', + 'val_split', + 'timeout' + ] diff --git a/Pilot1/Uno_UQ/data_utils_/uno_combined_data_generator.py b/Pilot1/Uno_UQ/data_utils_/uno_combined_data_generator.py new file mode 100644 index 00000000..649780c2 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/uno_combined_data_generator.py @@ -0,0 +1,257 @@ + +from itertools import cycle, islice + +import numpy as np +import pandas as pd + +from keras.utils import Sequence + +def values_or_dataframe(df, contiguous=False, dataframe=False): + if dataframe: + return df + mat = df.values + if contiguous: + mat = np.ascontiguousarray(mat) + return mat + + +class CombinedDataGenerator(Sequence):#object): + """Generate training, validation or testing batches from loaded data + """ +# def __init__(self, data, partition='train', fold=0, source=None, batch_size=32, shuffle=True): + def __init__(self, data, partition='train', fold=0, source=None, batch_size=32, shuffle=True, single=False, rank=0, total_ranks=1): + + self.data = data + self.partition = partition + self.batch_size = batch_size + self.single = single + + if partition == 'train': + index = data.train_indexes[fold] + elif partition == 'val': + index = data.val_indexes[fold] + else: + index = data.test_indexes[fold] + + if source: + df = data.df_response[['Source']].iloc[index, :] + index = df.index[df['Source'] == source] + + if shuffle: + index = np.random.permutation(index) + # index = index[:len(index)//10] + + # sharing by rank + samples_per_rank = len(index) // total_ranks + samples_per_rank = self.batch_size * (samples_per_rank // self.batch_size) + + self.index = index[rank * samples_per_rank:(rank + 1) * samples_per_rank] + self.index_cycle = cycle(self.index) + self.size = len(self.index) + self.steps = self.size // self.batch_size + print("partition:{0}, rank:{1}, sharded index size:{2}, batch_size:{3}, steps:{4}".format(partition, rank, 
self.size, self.batch_size, self.steps)) + + +# self.index = index +# self.index_cycle = cycle(index) +# self.size = len(index) +# self.steps = np.ceil(self.size / batch_size) +# # self.steps = np.ceil(self.size / batch_size / 100) + + def __len__(self): + return self.steps + + def __getitem__(self, idx): + shard = self.index[idx * self.batch_size:(idx + 1) * self.batch_size] + x_list, y = self.get_slice(self.batch_size, single=self.single, partial_index=shard) + return x_list, y + + def reset(self): + self.index_cycle = cycle(self.index) + + def get_response(self, copy=False): + df = self.data.df_response.iloc[self.index, :].drop(['Group'], axis=1) + return df.copy() if copy else df + +# def get_slice(self, size=None, contiguous=True, single=False, dataframe=False): + def get_slice(self, size=None, contiguous=True, single=False, dataframe=False, partial_index=None): + size = size or self.size + single = single or self.data.agg_dose + target = self.data.agg_dose or 'Growth' + +# index = list(islice(self.index_cycle, size)) + if partial_index is not None: + index = partial_index + else: + index = list(islice(self.index_cycle, size)) + df_orig = self.data.df_response.iloc[index, :] + df = df_orig.copy() + + if not single: + df['Swap'] = np.random.choice([True, False], df.shape[0]) + swap = df_orig['Drug2'].notnull() & df['Swap'] + df.loc[swap, 'Drug1'] = df_orig.loc[swap, 'Drug2'] + df.loc[swap, 'Drug2'] = df_orig.loc[swap, 'Drug1'] + if not self.data.agg_dose: + df['DoseSplit'] = np.random.uniform(0.001, 0.999, df.shape[0]) + df.loc[swap, 'Dose1'] = df_orig.loc[swap, 'Dose2'] + df.loc[swap, 'Dose2'] = df_orig.loc[swap, 'Dose1'] + + split = df_orig['Drug2'].isnull() + if not single: + df.loc[split, 'Drug2'] = df_orig.loc[split, 'Drug1'] + if not self.data.agg_dose: + df.loc[split, 'Dose1'] = df_orig.loc[split, 'Dose1'] - np.log10(df.loc[split, 'DoseSplit']) + df.loc[split, 'Dose2'] = df_orig.loc[split, 'Dose1'] - np.log10(1 - df.loc[split, 'DoseSplit']) + + if 
dataframe: + cols = [target, 'Sample', 'Drug1', 'Drug2'] if not single else [target, 'Sample', 'Drug1'] + y = df[cols].reset_index(drop=True) + else: + y = values_or_dataframe(df[target], contiguous, dataframe) + + x_list = [] + + if not self.data.agg_dose: + doses = ['Dose1', 'Dose2'] if not single else ['Dose1'] + for dose in doses: + x = values_or_dataframe(df[[dose]].reset_index(drop=True), contiguous, dataframe) + x_list.append(x) + + if self.data.encode_response_source: + df_x = pd.merge(df[['Source']], self.data.df_source, on='Source', how='left') + df_x.drop(['Source'], axis=1, inplace=True) + x = values_or_dataframe(df_x, contiguous, dataframe) + x_list.append(x) + + for fea in self.data.cell_features: + df_cell = getattr(self.data, self.data.cell_df_dict[fea]) + df_x = pd.merge(df[['Sample']], df_cell, on='Sample', how='left') + df_x.drop(['Sample'], axis=1, inplace=True) + x = values_or_dataframe(df_x, contiguous, dataframe) + x_list.append(x) + + drugs = ['Drug1', 'Drug2'] if not single else ['Drug1'] + for drug in drugs: + for fea in self.data.drug_features: + df_drug = getattr(self.data, self.data.drug_df_dict[fea]) + df_x = pd.merge(df[[drug]], df_drug, left_on=drug, right_on='Drug', how='left') + df_x.drop([drug, 'Drug'], axis=1, inplace=True) + if dataframe and not single: + df_x = df_x.add_prefix(drug + '.') + x = values_or_dataframe(df_x, contiguous, dataframe) + x_list.append(x) + + # print(x_list, y) + return x_list, y + + def flow(self, single=False): + while 1: + x_list, y = self.get_slice(self.batch_size, single=single) + yield x_list, y + + +def test_generator(loader): + gen = CombinedDataGenerator(loader).flow() + x_list, y = next(gen) + print('x shapes:') + for x in x_list: + print(x.shape) + print('y shape:') + print(y.shape) + + +def find_columns_with_str(df, substr): + col_indices = [df.columns.get_loc(col) for col in df.columns if substr in col] + + return col_indices + +class FromFileDataGenerator(object): + """Generate testing 
batches from loaded data + """ + def __init__(self, df_data, indices, target_str, feature_names_list, num_features_list, batch_size=32, shuffle=True): + + self.batch_size = batch_size + + index = indices + + if shuffle: + index = np.random.permutation(index) + + self.index = index + self.index_cycle = cycle(index) + self.size = len(index) + self.steps = np.ceil(self.size / batch_size) + + self.num_features_list = num_features_list + + try : # Try to get the 'target_str' column + target = df_data.columns.get_loc(target_str) + except KeyError: # The 'target_str' column is not available in data file + # No ground truth available + y_fake = np.zeros(df_data.shape[0]) + df_data['fake_target'] = y_fake + self.target = df_data.columns.get_loc('fake_target') + else: # 'target_str' column is available --> use this column + self.target = target + + self.df_data = df_data + self.offset = self.compute_offset(feature_names_list) + + def compute_offset(self, feature_names): + offset = self.df_data.shape[1] + for name in feature_names: + col_indices = find_columns_with_str(self.df_data, name) + if len(col_indices) > 0: + first_col = np.min(col_indices) + if first_col < offset: + offset = first_col + + if offset == self.df_data.shape[1]: + raise Exception('ERROR ! Feature names from model are not in file. ' \ + 'These are features in model: ' + str(sorted(feature_names)) + \ + '... 
def encode_sources(sources):
    """One-hot encode response sources at two levels of granularity.

    sources: 1-D array-like of source names.

    Returns a DataFrame whose first column is 'Source' (the original
    names), followed by 'source.L1.<prefix>' indicator columns built from
    the part of each name before its last dot, followed by
    'source.<name>' indicator columns for the full names. Names without a
    dot get no level-1 indicator.
    """
    df = pd.get_dummies(sources, prefix='source', prefix_sep='.')
    df['Source'] = sources
    # Raw string: '\S' and '\.' are regex escapes, not string escapes.
    source_l1 = df['Source'].str.extract(r'^(\S+)\.', expand=False)
    df1 = pd.get_dummies(source_l1, prefix='source.L1', prefix_sep='.')
    df = pd.concat([df1, df], axis=1)
    # Round-trip through the index to move 'Source' to the first column.
    df = df.set_index('Source').reset_index()
    return df
def assign_partition_groups(df, partition_by='drug_pair'):
    """Return a Series labelling every response row with its partition group.

    df: response frame with 'Sample', 'Drug1' and 'Drug2' columns.
    partition_by:
        'cell'      -- group by the 'Sample' column;
        'drug_pair' -- group by the order-independent 'Drug1,Drug2' pair
                       (or 'Drug1' alone when 'Drug2' is null), replaced by
                       the PUBCHEM id from the drug info table when the
                       group key is a known drug ID;
        'index'     -- every row is its own group (its index label).

    The returned Series is indexed like ``df`` so it can be attached with
    ``df.assign(Group=...)`` even after rows have been filtered out.

    Raises ValueError for any other ``partition_by`` value.
    """
    if partition_by == 'cell':
        group = df['Sample']
    elif partition_by == 'drug_pair':
        df_info = drug_data.load_drug_info()
        id_dict = df_info[['ID', 'PUBCHEM']].drop_duplicates(['ID']).set_index('ID').iloc[:, 0]
        group = df['Drug1'].copy()
        # Canonical pair key: the two drug IDs in sorted order.
        has_pair = df['Drug2'].notnull()
        group[has_pair & (df['Drug1'] <= df['Drug2'])] = df['Drug1'] + ',' + df['Drug2']
        group[has_pair & (df['Drug1'] > df['Drug2'])] = df['Drug2'] + ',' + df['Drug1']
        group2 = group.map(id_dict)
        mapped = group2.notnull()
        group[mapped] = group2[mapped]
    elif partition_by == 'index':
        # Keep df's own index on the result: a fresh 0..n-1 index would be
        # misaligned by label when df has had rows removed.
        group = pd.Series(df.index, index=df.index, name='index')
    else:
        raise ValueError("Unknown partition_by value: {!r} "
                         "(expected 'cell', 'drug_pair' or 'index')".format(partition_by))
    logger.info('Grouped response data by %s: %d groups', partition_by, group.nunique())
    return group
open(fname, 'rb') as f: + obj = pickle.load(f) + self.__dict__.update(obj.__dict__) + logger.info('Loaded data from cache: %s', fname) + return True + return False + + def save_to_cache(self, cache, params): + for k in ['self', 'cache', 'single']: + if k in params: + del params[k] + param_fname = '{}.params.json'.format(cache) + with open(param_fname, 'w') as param_file: + json.dump(params, param_file, sort_keys=True) + fname = '{}.pkl'.format(cache) + with open(fname, 'wb') as f: + pickle.dump(self, f, pickle.HIGHEST_PROTOCOL) + logger.info('Saved data to cache: %s', fname) + + def partition_data(self, partition_by=None, cv_folds=1, train_split=0.7, val_split=0.2, + cell_types=None, by_cell=None, by_drug=None, + cell_subset_path=None, drug_subset_path=None, + exclude_cells=[], exclude_drugs=[], exclude_indices=[]): + + seed = self.seed + train_sep_sources = self.train_sep_sources + test_sep_sources = self.test_sep_sources + df_response = self.df_response + + + if not partition_by: + if by_drug and by_cell: + partition_by = 'index' + elif by_drug: + partition_by = 'cell' + else: + partition_by = 'drug_pair' + + + # Exclude specified cells / drugs / indices + if exclude_cells != []: + df_response = df_response[~df_response['Sample'].isin(exclude_cells)] + if exclude_drugs != []: + if np.isin('Drug', df_response.columns.values): + df_response = df_response[~df_response['Drug1'].isin(exclude_drugs)] + else: + df_response = df_response[~df_response['Drug1'].isin(exclude_drugs) & ~df_response['Drug2'].isin(exclude_drugs)] + if exclude_indices != []: + df_response = df_response.drop(exclude_indices, axis=0) + logger.info('Excluding indices specified') + + if partition_by != self.partition_by: + df_response = df_response.assign(Group = assign_partition_groups(df_response, partition_by)) + + mask = df_response['Source'].isin(train_sep_sources) + test_mask = df_response['Source'].isin(test_sep_sources) + + if by_drug: + drug_ids = drug_data.drug_name_to_ids(by_drug) + 
logger.info('Mapped drug IDs for %s: %s', by_drug, drug_ids) + mask &= (df_response['Drug1'].isin(drug_ids)) & (df_response['Drug2'].isnull()) + test_mask &= (df_response['Drug1'].isin(drug_ids)) & (df_response['Drug2'].isnull()) + + if by_cell: + cell_ids = cellline_data.cell_name_to_ids(by_cell) + logger.info('Mapped sample IDs for %s: %s', by_cell, cell_ids) + mask &= (df_response['Sample'].isin(cell_ids)) + test_mask &= (df_response['Sample'].isin(cell_ids)) + + if cell_subset_path: + cell_subset = read_set_from_file(cell_subset_path) + mask &= (df_response['Sample'].isin(cell_subset)) + test_mask &= (df_response['Sample'].isin(cell_subset)) + + if drug_subset_path: + drug_subset = read_set_from_file(drug_subset_path) + mask &= (df_response['Drug1'].isin(drug_subset)) & ((df_response['Drug2'].isnull()) | (df_response['Drug2'].isin(drug_subset))) + test_mask &= (df_response['Drug1'].isin(drug_subset)) & ((df_response['Drug2'].isnull()) | (df_response['Drug2'].isin(drug_subset))) + + if cell_types: + df_type = cellline_data.load_cell_metadata() + cell_ids = set() + for cell_type in cell_types: + cells = df_type[~df_type['TUMOR_TYPE'].isnull() & df_type['TUMOR_TYPE'].str.contains(cell_type, case=False)] + cell_ids |= set(cells['ANL_ID'].tolist()) + logger.info('Mapped sample tissue types for %s: %s', cell_type, set(cells['TUMOR_TYPE'].tolist())) + mask &= (df_response['Sample'].isin(cell_ids)) + test_mask &= (df_response['Sample'].isin(cell_ids)) + + + df_group = df_response[mask]['Group'].drop_duplicates().reset_index(drop=True) + + if cv_folds > 1: + selector = KFold(n_splits=cv_folds, shuffle=True, random_state=seed) + else: + selector = ShuffleSplit(n_splits=1, train_size=train_split, test_size=val_split, random_state=seed) + + splits = selector.split(df_group) + + train_indexes = [] + val_indexes = [] + test_indexes = [] + + for index, (train_group_index, val_group_index) in enumerate(splits): + train_groups = set(df_group.values[train_group_index]) + 
val_groups = set(df_group.values[val_group_index]) + train_index = df_response.index[df_response['Group'].isin(train_groups) & mask] + val_index = df_response.index[df_response['Group'].isin(val_groups) & mask] + test_index = df_response.index[~df_response['Group'].isin(train_groups) & ~df_response['Group'].isin(val_groups) & test_mask] + + train_indexes.append(train_index) + val_indexes.append(val_index) + test_indexes.append(test_index) + if logger.isEnabledFor(logging.DEBUG): + logger.debug('CV fold %d: train data = %s, val data = %s, test data = %s', index, train_index.shape[0], val_index.shape[0], test_index.shape[0]) + logger.debug(' train groups (%d): %s', df_response.loc[train_index]['Group'].nunique(), df_response.loc[train_index]['Group'].unique()) + logger.debug(' val groups ({%d}): %s', df_response.loc[val_index]['Group'].nunique(), df_response.loc[val_index]['Group'].unique()) + logger.debug(' test groups ({%d}): %s', df_response.loc[test_index]['Group'].nunique(), df_response.loc[test_index]['Group'].unique()) + + + self.partition_by = partition_by + self.cv_folds = cv_folds + self.train_indexes = train_indexes + self.val_indexes = val_indexes + self.test_indexes = test_indexes + + def build_feature_list(self, single=False): + input_features = collections.OrderedDict() + feature_shapes = collections.OrderedDict() + + if not self.agg_dose: + doses = ['dose1', 'dose2'] if not single else ['dose1'] + for dose in doses: + input_features[dose] = 'dose' + feature_shapes['dose'] = (1,) + + if self.encode_response_source: + input_features['response.source'] = 'response.source' + feature_shapes['response.source'] = (self.df_source.shape[1] - 1,) + + for fea in self.cell_features: + feature_type = 'cell.' + fea + feature_name = 'cell.' 
    def load(self, cache=None, ncols=None, scaling='std', dropna=None,
             agg_dose=None, embed_feature_source=True, encode_response_source=True,
             cell_features=['rnaseq'], drug_features=['descriptors', 'fingerprints'],
             cell_feature_subset_path=None, drug_feature_subset_path=None,
             drug_lower_response=1, drug_upper_response=-1, drug_response_span=0,
             drug_median_response_min=-1, drug_median_response_max=1,
             use_landmark_genes=False, use_filtered_genes=False,
             preprocess_rnaseq=None, single=False,
             # train_sources=['GDSC', 'CTRP', 'ALMANAC', 'NCI60'],
             train_sources=['GDSC', 'CTRP', 'ALMANAC'],
             # val_sources='train',
             # test_sources=['CCLE', 'gCSI'],
             test_sources=['train'],
             partition_by='drug_pair'):
        """Load response, cell and drug data and store them on the instance.

        When ``cache`` is given and a compatible cache exists, the instance
        state is restored from it and only the feature list is rebuilt.
        Otherwise the response data is loaded, the requested cell / drug
        feature frames are loaded, the response rows are filtered down to
        samples and drugs that have feature data and belong to the selected
        sources, a 'Group' column is attached, and (if ``cache`` is set) the
        result is written to the cache.

        agg_dose: when truthy, aggregated single-drug responses are loaded
            with this value as target; otherwise combined dose responses.
        train_sources / test_sources: source-name prefixes; 'all' selects
            every source and 'train' in test_sources reuses train_sources.
        partition_by: grouping scheme passed to assign_partition_groups.

        NOTE: this method relies on ``locals()`` both to capture its
        arguments and to look up loaded frames by variable name, so local
        variable names and statement order are part of its behavior.
        """

        # Snapshot of the call arguments (taken before any local is created);
        # used as the cache compatibility key.
        params = locals().copy()
        del params['self']

        if not cell_features or 'none' in [x.lower() for x in cell_features]:
            cell_features = []

        if not drug_features or 'none' in [x.lower() for x in drug_features]:
            drug_features = []

        if cache and self.load_from_cache(cache, params):
            self.build_feature_list(single=single)
            return

        logger.info('Loading data from scratch ...')

        if agg_dose:
            df_response = response_data.load_aggregated_single_response(target=agg_dose, combo_format=True)
        else:
            df_response = response_data.load_combined_dose_response()

        if logger.isEnabledFor(logging.INFO):
            logger.info('Summary of combined dose response by source:')
            logger.info(response_data.summarize_response_data(df_response, target=agg_dose))

        all_sources = df_response['Source'].unique()
        df_source = encode_sources(all_sources)

        if 'all' in train_sources:
            train_sources = all_sources
        if 'all' in test_sources:
            test_sources = all_sources
        elif 'train' in test_sources:
            test_sources = train_sources

        # Expand the requested prefixes to the concrete source names present.
        train_sep_sources = [x for x in all_sources for y in train_sources if x.startswith(y)]
        test_sep_sources = [x for x in all_sources for y in test_sources if x.startswith(y)]

        ids1 = df_response[['Drug1']].drop_duplicates().rename(columns={'Drug1':'Drug'})
        ids2 = df_response[['Drug2']].drop_duplicates().rename(columns={'Drug2':'Drug'})
        df_drugs_with_response = pd.concat([ids1, ids2]).drop_duplicates().dropna().reset_index(drop=True)
        df_cells_with_response = df_response[['Sample']].drop_duplicates().reset_index(drop=True)
        logger.info('Combined raw dose response data has %d unique samples and %d unique drugs', df_cells_with_response.shape[0], df_drugs_with_response.shape[0])

        if agg_dose:
            # No response-range based drug selection for aggregated targets.
            df_selected_drugs = None
        else:
            logger.info('Limiting drugs to those with response min <= %g, max >= %g, span >= %g, median_min <= %g, median_max >= %g ...', drug_lower_response, drug_upper_response, drug_response_span, drug_median_response_min, drug_median_response_max)
            df_selected_drugs = response_data.select_drugs_with_response_range(df_response, span=drug_response_span, lower=drug_lower_response, upper=drug_upper_response, lower_median=drug_median_response_min, upper_median=drug_median_response_max)
            logger.info('Selected %d drugs from %d', df_selected_drugs.shape[0], df_response['Drug1'].nunique())

        cell_feature_subset = read_set_from_file(cell_feature_subset_path)
        drug_feature_subset = read_set_from_file(drug_feature_subset_path)

        # The frames below must keep these exact local names: they are looked
        # up through locals() via cell_df_dict / drug_df_dict further down.
        for fea in cell_features:
            fea = fea.lower()
            if fea == 'rnaseq' or fea == 'expression':
                df_cell_rnaseq = cellline_data.load_cell_rnaseq(ncols=ncols, scaling=scaling, use_landmark_genes=use_landmark_genes, use_filtered_genes=use_filtered_genes, feature_subset=cell_feature_subset, preprocess_rnaseq=preprocess_rnaseq, embed_feature_source=embed_feature_source)

        for fea in drug_features:
            fea = fea.lower()
            if fea == 'descriptors':
                df_drug_desc = drug_data.load_drug_descriptors(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset)
            elif fea == 'fingerprints':
                df_drug_fp = drug_data.load_drug_fingerprints(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset)

        # df_drug_desc, df_drug_fp = drug_data.load_drug_data(ncols=ncols, scaling=scaling, dropna=dropna)

        # Feature name -> name of the local variable / attribute holding it.
        # NOTE(review): the lookups below use the feature names as given, so
        # 'expression' or a differently-cased name is loaded above but has no
        # key here -- looks like it would raise KeyError; confirm.
        cell_df_dict = {'rnaseq': 'df_cell_rnaseq'}

        drug_df_dict = {'descriptors': 'df_drug_desc',
                        'fingerprints': 'df_drug_fp'}

        # df_cell_ids = df_cell_rnaseq[['Sample']].drop_duplicates()
        # df_drug_ids = pd.concat([df_drug_desc[['Drug']], df_drug_fp[['Drug']]]).drop_duplicates()

        logger.info('Filtering drug response data...')

        # Keep only samples that have every requested cell feature.
        df_cell_ids = df_cells_with_response
        for fea in cell_features:
            df_cell = locals()[cell_df_dict[fea]]
            df_cell_ids = df_cell_ids.merge(df_cell[['Sample']]).drop_duplicates()
        logger.info(' %d molecular samples with feature and response data', df_cell_ids.shape[0])

        # Keep only drugs that have every requested drug feature.
        df_drug_ids = df_drugs_with_response
        for fea in drug_features:
            df_drug = locals()[drug_df_dict[fea]]
            df_drug_ids = df_drug_ids.merge(df_drug[['Drug']]).drop_duplicates()

        if df_selected_drugs is not None:
            df_drug_ids = df_drug_ids.merge(df_selected_drugs).drop_duplicates()
        logger.info(' %d selected drugs with feature and response data', df_drug_ids.shape[0])

        # A null Drug2 (single-drug row) is accepted as is.
        df_response = df_response[df_response['Sample'].isin(df_cell_ids['Sample']) &
                                  df_response['Drug1'].isin(df_drug_ids['Drug']) &
                                  (df_response['Drug2'].isin(df_drug_ids['Drug']) | df_response['Drug2'].isnull())]

        df_response = df_response[df_response['Source'].isin(train_sep_sources + test_sep_sources)]

        df_response.reset_index(drop=True, inplace=True)

        if logger.isEnabledFor(logging.INFO):
            logger.info('Summary of filtered dose response by source:')
            logger.info(response_data.summarize_response_data(df_response, target=agg_dose))

        df_response = df_response.assign(Group = assign_partition_groups(df_response, partition_by))

        self.agg_dose = agg_dose
        self.cell_features = cell_features
        self.drug_features = drug_features
        self.cell_df_dict = cell_df_dict
        self.drug_df_dict = drug_df_dict
        self.df_source = df_source
        self.df_response = df_response
        self.embed_feature_source = embed_feature_source
        self.encode_response_source = encode_response_source
        self.all_sources = all_sources
        self.train_sources = train_sources
        self.test_sources = test_sources
        self.train_sep_sources = train_sep_sources
        self.test_sep_sources = test_sep_sources
        self.partition_by = partition_by

        # Publish whichever feature frames were actually loaded as attributes
        # of the same name; frames that were not requested are skipped.
        for var in (list(drug_df_dict.values()) + list(cell_df_dict.values())):
            value = locals().get(var)
            if value is not None:
                setattr(self, var, value)

        self.build_feature_list(single=single)

        if cache:
            self.save_to_cache(cache, params)
def quantile50(y_true, y_pred):
    """Quantile loss at q = 0.5 for column 0 of ``y_pred``.

    Column 0 is reshaped to the shape of ``y_true`` and passed to
    ``quantile_loss`` with a quantile of 0.5.
    """
    y_out0 = K.reshape(y_pred[:,0], K.shape(y_true))
    # The residual is computed inside quantile_loss; no need to build it here.
    return quantile_loss(0.5, y_true, y_out0)
class ModelRecorder(Callback):
    """Keras callback that records validation losses and keeps the best model.

    After every epoch the reported 'val_loss' is appended to
    ``val_losses``; whenever it improves on ``best_val_loss`` a copy of the
    current model (architecture and weights) is stored in ``best_model``.
    """

    def __init__(self, save_all_models=False):
        Callback.__init__(self)
        self.save_all_models = save_all_models
        # presumably makes the permanent-dropout layer known to Keras so the
        # model can be cloned -- confirm against candle
        candle.register_permanent_dropout()

    def on_train_begin(self, logs=None):
        """Reset the recorded history at the start of training."""
        self.val_losses = []
        self.best_val_loss = np.inf
        self.best_model = None

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's validation loss and snapshot the model if it is the best so far."""
        logs = logs or {}
        val_loss = logs.get('val_loss')
        self.val_losses.append(val_loss)
        # 'val_loss' is absent when training runs without validation data.
        if val_loss is not None and val_loss < self.best_val_loss:
            # clone_model builds new layers with freshly initialised weights,
            # so the trained weights have to be copied over explicitly.
            best_model = keras.models.clone_model(self.model)
            best_model.set_weights(self.model.get_weights())
            self.best_model = best_model
            self.best_val_loss = val_loss
def build_homoscedastic_model(loader, args, logger=None, permanent_dropout=True, silent=False):
    """Build the Keras model with a single output unit.

    loader: object exposing ``feature_shapes`` (feature type -> shape) and
        ``input_features`` (input name -> feature type).
    args: hyperparameters; reads ``dropout``, ``reg_l2``,
        ``dense_feature_layers``, ``dense``, ``activation`` and ``residual``.
    logger: optional logger used to print the feature submodel summaries.
    permanent_dropout: use candle.PermanentDropout instead of Dropout.
    silent: suppress the submodel summaries.

    One encoding submodel is built per 'cell.*' / 'drug.*' feature type and
    shared by every input of that type; other inputs are passed through
    unchanged. The encoded inputs are concatenated and fed through the
    ``args.dense`` stack, ending in ``Dense(1)``.
    """
    input_models = {}
    dropout_rate = args.dropout
    reg_l2 = args.reg_l2
    for fea_type, shape in loader.feature_shapes.items():
        base_type = fea_type.split('.')[0]
        if base_type in ['cell', 'drug']:
            box = build_feature_model(input_shape=shape, name=fea_type,
                                      dense_layers=args.dense_feature_layers,
                                      dropout_rate=dropout_rate, permanent_dropout=permanent_dropout,
                                      reg_l2=reg_l2)
            # logger defaults to None, so only report when one was supplied.
            if not silent and logger is not None:
                logger.debug('Feature encoding submodel for %s:', fea_type)
                box.summary(print_fn=logger.debug)
            input_models[fea_type] = box

    inputs = []
    encoded_inputs = []
    for fea_name, fea_type in loader.input_features.items():
        shape = loader.feature_shapes[fea_type]
        fea_input = Input(shape, name='input.'+fea_name)
        inputs.append(fea_input)
        if fea_type in input_models:
            input_model = input_models[fea_type]
            encoded = input_model(fea_input)
        else:
            encoded = fea_input
        encoded_inputs.append(encoded)

    merged = keras.layers.concatenate(encoded_inputs)

    h = merged
    for layer in args.dense:
        x = h
        if reg_l2 > 0:
            h = Dense(layer, activation=args.activation, kernel_regularizer=regularizers.l2(reg_l2))(h)
        else:
            h = Dense(layer, activation=args.activation)(h)
        if dropout_rate > 0:
            if permanent_dropout:
                h = candle.PermanentDropout(dropout_rate)(h)
            else:
                h = Dropout(dropout_rate)(h)
        if args.residual:
            # Best effort: the skip connection is dropped when Keras rejects
            # the add with ValueError (presumably mismatched shapes).
            try:
                h = keras.layers.add([h, x])
            except ValueError:
                pass
    output = Dense(1)(h)

    return Model(inputs, output)
def build_quantile_model(loader, args, logger=None, permanent_dropout=True, silent=False):
    """Build the Keras model with three output units (one per quantile).

    loader: object exposing ``feature_shapes`` (feature type -> shape) and
        ``input_features`` (input name -> feature type).
    args: hyperparameters; reads ``dropout``, ``reg_l2``,
        ``dense_feature_layers``, ``dense``, ``activation`` and ``residual``.
    logger: optional logger used to print the feature submodel summaries.
    permanent_dropout: use candle.PermanentDropout instead of Dropout.
    silent: suppress the submodel summaries.

    The architecture mirrors the other model builders in this module; only
    the head differs: ``Dense(3, bias_initializer='ones')``.
    """
    input_models = {}
    dropout_rate = args.dropout
    reg_l2 = args.reg_l2
    for fea_type, shape in loader.feature_shapes.items():
        base_type = fea_type.split('.')[0]
        if base_type in ['cell', 'drug']:
            box = build_feature_model(input_shape=shape, name=fea_type,
                                      dense_layers=args.dense_feature_layers,
                                      dropout_rate=dropout_rate,
                                      permanent_dropout=permanent_dropout,
                                      reg_l2=reg_l2)
            # logger defaults to None, so only report when one was supplied.
            if not silent and logger is not None:
                logger.debug('Feature encoding submodel for %s:', fea_type)
                box.summary(print_fn=logger.debug)
            input_models[fea_type] = box

    inputs = []
    encoded_inputs = []
    for fea_name, fea_type in loader.input_features.items():
        shape = loader.feature_shapes[fea_type]
        fea_input = Input(shape, name='input.'+fea_name)
        inputs.append(fea_input)
        if fea_type in input_models:
            input_model = input_models[fea_type]
            encoded = input_model(fea_input)
        else:
            encoded = fea_input
        encoded_inputs.append(encoded)

    merged = keras.layers.concatenate(encoded_inputs)

    h = merged
    for layer in args.dense:
        x = h
        # Same rule as the sibling builders: attach the L2 regulariser only
        # when it is actually enabled.
        if reg_l2 > 0:
            h = Dense(layer, activation=args.activation, kernel_regularizer=regularizers.l2(reg_l2))(h)
        else:
            h = Dense(layer, activation=args.activation)(h)
        if dropout_rate > 0:
            if permanent_dropout:
                h = candle.PermanentDropout(dropout_rate)(h)
            else:
                h = Dropout(dropout_rate)(h)
        if args.residual:
            # Best effort: the skip connection is dropped when Keras rejects
            # the add with ValueError (presumably mismatched shapes).
            try:
                h = keras.layers.add([h, x])
            except ValueError:
                pass
    output = Dense(3, bias_initializer='ones')(h)

    return Model(inputs, output)
* tilted_loss(0.5, y_true, y_out0) diff --git a/Pilot1/Uno_UQ/uno_defaultUQ_model.txt b/Pilot1/Uno_UQ/uno_defaultUQ_model.txt new file mode 100644 index 00000000..1230114b --- /dev/null +++ b/Pilot1/Uno_UQ/uno_defaultUQ_model.txt @@ -0,0 +1,39 @@ +[Global_Params] +train_sources=['gCSI'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors', 'fingerprints'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='sgd' +scaling='std' +dropout=0.1 +epochs=10 +batch_size=32 +val_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=0.01 +base_lr=None +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save_default/' +save_weights='saved.weights.h5' +no_gen=False +verbose = False +single=True +agg_dose='AUC' +no_feature_source=True +no_response_source=True +use_landmark_genes=True +partition_by='cell' + +[Monitor_Params] +timeout=3600 diff --git a/Pilot1/Uno_UQ/uno_holdoutUQ_data.py b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py new file mode 100644 index 00000000..feb6ee49 --- /dev/null +++ b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py @@ -0,0 +1,109 @@ +#! 
def run(params):
    """Load the data, partition it and write the holdout (validation) IDs to a file.

    params: dict of benchmark parameters; wrapped in candle.ArgumentStruct.

    Depending on ``partition_by`` the file written under ``save_path`` is
    'infer_drug_ids' ('drug_pair'), 'infer_cell_ids' ('cell') or
    'infer_index_ids' (anything else), one ID per line.
    """
    args = candle.ArgumentStruct(**params)
    candle.set_seed(args.rng_seed)
    ext = uno.extension_from_parameters(args)
    candle.verify_path(args.save_path)
    prefix = args.save_path + ext
    logfile = args.logfile if args.logfile else prefix+'.log'
    uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose)
    logger.info('Params: {}'.format(params))

    loader = uno_combined_data_loader.CombinedDataLoader(args.rng_seed)
    loader.load(cache=args.cache,
                ncols=args.feature_subsample,
                agg_dose=args.agg_dose,
                cell_features=args.cell_features,
                drug_features=args.drug_features,
                drug_median_response_min=args.drug_median_response_min,
                drug_median_response_max=args.drug_median_response_max,
                use_landmark_genes=args.use_landmark_genes,
                use_filtered_genes=args.use_filtered_genes,
                cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path,
                drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path,
                preprocess_rnaseq=args.preprocess_rnaseq,
                single=args.single,
                train_sources=args.train_sources,
                test_sources=args.test_sources,
                embed_feature_source=not args.no_feature_source,
                encode_response_source=not args.no_response_source,
                partition_by=args.partition_by
                )

    val_split = args.val_split
    train_split = 1 - val_split

    loader.partition_data(partition_by=args.partition_by,
                          cv_folds=args.cv, train_split=train_split,
                          val_split=val_split, cell_types=args.cell_types,
                          by_cell=args.by_cell, by_drug=args.by_drug,
                          cell_subset_path=args.cell_subset_path,
                          drug_subset_path=args.drug_subset_path
                          )

    print('partition_by: ', args.partition_by)
    # Pick what is held out (and where it goes) from the partition scheme.
    if args.partition_by == 'drug_pair':
        label = 'Drug IDs'
        fname = args.save_path + 'infer_drug_ids'
        holdout_ids = loader.get_drugs_in_val()
    elif args.partition_by == 'cell':
        label = 'Cell IDs'
        fname = args.save_path + 'infer_cell_ids'
        holdout_ids = loader.get_cells_in_val()
    else:
        label = 'Indices'
        fname = args.save_path + 'infer_index_ids'
        holdout_ids = loader.get_index_in_val()

    with open(fname, 'w') as f:
        f.writelines('%s\n' % item for item in holdout_ids)
    logger.info('{} in holdout set written in file: {}'.format(label, fname))
/usr/bin/env python + +from __future__ import division, print_function + +import argparse +import logging +import os + +import numpy as np +import pandas as pd + +from itertools import cycle + +from keras import backend as K + +import keras +from keras.utils import get_custom_objects + +import data_utils_.uno as uno +import candle + +import data_utils_.uno_combined_data_loader as uno_combined_data_loader +import data_utils_.uno_combined_data_generator as uno_combined_data_generator +import model_utils_.uno_model_utils as uno_model_utils + +logger = logging.getLogger(__name__) + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +additional_definitions_local = [ +{'name':'uq_infer_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File to do inference'}, +{'name':'uq_infer_given_drugs', + 'type': candle.str2bool, + 'default': False, + 'help':'Use given inference file to obtain drug ids to do inference'}, +{'name':'uq_infer_given_cells', + 'type': candle.str2bool, + 'default': False, + 'help':'Use given inference file to obtain cell ids to do inference'}, +{'name':'uq_infer_given_indices', + 'type': candle.str2bool, + 'default': False, + 'help':'Use given inference file to obtain indices to do inference'}, +{'name':'model_file', + 'type':str, + 'default':'saved.model.h5', + 'help':'trained model file'}, +{'name':'weights_file', + 'type':str, + 'default':'saved.weights.h5', + 'help':'trained weights file (loading model file alone sometimes does not work in keras)'}, +{'name':'n_pred', + 'type':int, + 'default':1, + 'help':'the number of predictions to make for each sample-drug combination for uncertainty quantification'} +] + +required_local = ( 'model_file', 'weights_file', 'uq_infer_file', + 'agg_dose', 'batch_size') + + +def initialize_parameters(default_model='uno_defaultUQ_model.txt'): + + # Build benchmark object + unoBmk = uno.BenchmarkUno(uno.file_path, default_model, 'keras', + prog='uno_inferUQ', desc='Read models to predict tumor response to 
single and paired drugs.') + + unoBmk.additional_definitions += additional_definitions_local + unoBmk.required = unoBmk.required.union(required_local) + + # Initialize parameters + gParameters = candle.finalize_parameters(unoBmk) + #benchmark.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def from_file(args, model): + + df_data = pd.read_csv(args.uq_infer_file, sep='\t') + logger.info('data shape: {}'.format(df_data.shape)) + logger.info('Size of data to infer: {}'.format(df_data.shape)) + + test_indices = range(df_data.shape[0]) + target_str = args.agg_dose or 'Growth' + + # Extract size of input layers to get number of features + num_features_list = [] + feature_names_list = [] + for layer in model.layers: # All layers in model + dict = layer.get_config() # getting layer config info + name = dict['name'] # getting layer name + if name.find('input') > -1: # if layer is an input layer + feature_names_list.append(name.split('.')[-1]) + size_ = dict['batch_input_shape'] # get layer size + num_features_list.append(size_[1]) + + feature_names_list.append('dragon7') + + test_gen = uno_combined_data_generator.FromFileDataGenerator(df_data, test_indices, + target_str, feature_names_list, num_features_list, + batch_size=args.batch_size, shuffle=False) + + return test_gen + + +def given_drugs(args, loader): + + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size) + + # Include specified drugs + include_drugs = uno.read_IDs_file(args.uq_infer_file) + df_response = test_gen.data.df_response + if np.isin('Drug', df_response.columns.values): + df = df_response[['Drug']] + index = df.index[df['Drug'].isin(include_drugs)] + else: + df = df_response[['Drug1', 'Drug2']] + index = df.index[df['Drug1'].isin(include_drugs) | + df['Drug2'].isin(include_drugs)] + + # Update object + test_gen.index = index + test_gen.index_cycle = cycle(index) + test_gen.size = len(index) + test_gen.steps = 
np.ceil(test_gen.size / args.batch_size) + + return test_gen + + +def given_cells(args, loader): + + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size) + + # Include specified cells + include_cells = uno.read_IDs_file(args.uq_infer_file) + df = test_gen.data.df_response[['Sample']] + index = df.index[df['Sample'].isin(include_cells)] + + # Update object + test_gen.index = index + test_gen.index_cycle = cycle(index) + test_gen.size = len(index) + test_gen.steps = np.ceil(test_gen.size / args.batch_size) + + return test_gen + + +def given_indices(args, loader): + + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size) + + # Include specified indices + index = uno.read_IDs_file(args.uq_infer_file) + + # Update object + test_gen.index = index + test_gen.index_cycle = cycle(index) + test_gen.size = len(index) + test_gen.steps = np.ceil(test_gen.size / args.batch_size) + + return test_gen + + +def run(params): + args = candle.ArgumentStruct(**params) + candle.set_seed(args.rng_seed) + logfile_def = 'uno_infer_from_' + args.uq_infer_file + '.log' + logfile = args.logfile if args.logfile else logfile_def + uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose) + logger.info('Params: {}'.format(params)) + + ext = uno.extension_from_parameters(args) + candle.verify_path(args.save_path) + prefix = args.save_path + 'uno' + ext + + # Load trained model + candle.register_permanent_dropout() + model = keras.models.load_model(args.model_file, compile=False) + model.load_weights(args.weights_file) + logger.info('Loaded model:') + model.summary(print_fn=logger.info) + + # Determine output to infer + target = args.agg_dose or 'Growth' + + if (args.uq_infer_given_drugs or args.uq_infer_given_cells or args.uq_infer_given_indices): + loader = uno_combined_data_loader.CombinedDataLoader(args.rng_seed) + loader.load(cache=args.cache, + 
ncols=args.feature_subsample, + agg_dose=args.agg_dose, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path, + drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + ) + + if args.uq_infer_given_drugs: + test_gen = given_drugs(args, loader) + elif args.uq_infer_given_cells: + test_gen = given_cells(args, loader) + else: + test_gen = given_indices(args, loader) + + else: + test_gen = from_file(args, model) + + + df_test = test_gen.get_response(copy=True) + y_test = df_test[target].values + + for i in range(args.n_pred): + + if args.no_gen: + x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single) + y_test_pred = model.predict(x_test_list, batch_size=args.batch_size) + else: + test_gen.reset() + y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps) + y_test_pred = y_test_pred[:test_gen.size] + + if args.loss == 'heteroscedastic': + y_test_pred_ = y_test_pred[:,0] + s_test_pred = y_test_pred[:,1] + + y_test_pred = y_test_pred_.flatten() + + df_test['Predicted_'+target+'_'+str(i+1)] = y_test_pred + df_test['Pred_S_'+target+'_'+str(i+1)] = s_test_pred + + pred_fname = prefix + '.predicted_INFER_HET.tsv' + + elif args.loss == 'quantile': + + y_test_pred_50q = y_test_pred[:,0] + y_test_pred_10q = y_test_pred[:,1] + y_test_pred_90q = y_test_pred[:,2] + + y_test_pred = y_test_pred_50q.flatten() # 50th quantile prediction + + 
df_test['Predicted_50q_'+target+'_'+str(i+1)] = y_test_pred + df_test['Predicted_10q_'+target+'_'+str(i+1)] = y_test_pred_10q.flatten() + df_test['Predicted_90q_'+target+'_'+str(i+1)] = y_test_pred_90q.flatten() + + pred_fname = prefix + '.predicted_INFER_QTL.tsv' + + else: + y_test_pred = y_test_pred.flatten() + df_test['Predicted_'+target+'_'+str(i+1)] = y_test_pred + pred_fname = prefix + '.predicted_INFER.tsv' + + if args.n_pred < 21: + scores = uno.evaluate_prediction(y_test, y_test_pred) + uno.log_evaluation(scores, logger) + + df_pred = df_test + if args.agg_dose: + if args.single: + df_pred.sort_values(['Sample', 'Drug1', target], inplace=True) + else: + df_pred.sort_values(['Sample', 'Drug1', 'Drug2', target], inplace=True) + else: + if args.single: + df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + else: + df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + + df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g') + logger.info('Predictions stored in file: {}'.format(pred_fname)) + + + if K.backend() == 'tensorflow': + K.clear_session() + + logger.handlers = [] + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() + diff --git a/Pilot1/Uno_UQ/uno_trainUQ_keras2.py b/Pilot1/Uno_UQ/uno_trainUQ_keras2.py new file mode 100644 index 00000000..49f5cd2f --- /dev/null +++ b/Pilot1/Uno_UQ/uno_trainUQ_keras2.py @@ -0,0 +1,404 @@ +#! 
/usr/bin/env python + +from __future__ import division, print_function + +import argparse +import logging +import os + +import numpy as np +import pandas as pd + + +from keras import backend as K +from keras import optimizers +from keras.models import Model +from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard +from keras.utils.vis_utils import plot_model + +import data_utils_.uno as uno +import candle + +import data_utils_.uno_combined_data_loader as uno_combined_data_loader +import data_utils_.uno_combined_data_generator as uno_combined_data_generator +import model_utils_.uno_model_utils as uno_model_utils + +from model_utils_.uno_model_utils import heteroscedastic_loss, triple_quantile_loss + +logger = logging.getLogger(__name__) + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +additional_definitions = [ +{'name':'uq_exclude_drugs_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File with drug ids to exclude from training'}, +{'name':'uq_exclude_cells_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File with cell ids to exclude from training'}, +{'name':'uq_exclude_indices_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File with indices to exclude from training'}, +{'name':'exclude_indices', 'nargs':'+', + 'default': [], + 'help':'indices to exclude'}, +{'name':'reg_l2', + 'type': float, + 'default': 0., + 'help':'weight of regularization for l2 norm of nn weights'} +] + +required = ['exclude_drugs', 'exclude_cells', 'exclude_indices'] + +class UQUno(candle.Benchmark): + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. 
+ """ + + if required is not None: + self.required = set(uno.required) + self.required.update(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + uno.additional_definitions + + + +def initialize_parameters(default_model='uno_defaultUQ_model.txt'): + + # Build benchmark object + unoUQBmk = UQUno(uno.file_path, default_model, 'keras', + prog='uno_trainUQ', desc='Build neural network based models to predict tumor response to single and paired drugs, including UQ analysis.') + + # Initialize parameters + gParameters = candle.finalize_parameters(unoUQBmk) + #benchmark.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def run(params): + args = candle.ArgumentStruct(**params) + candle.set_seed(args.rng_seed) + ext = uno.extension_from_parameters(args) + candle.verify_path(args.save_path) + prefix = args.save_path + 'uno' + ext + logfile = args.logfile if args.logfile else prefix+'.log' + uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose) + logger.info('Params: {}'.format(params)) + + # Exclude drugs / cells for UQ + if 'uq_exclude_drugs_file' in params.keys(): + args.exclude_drugs = uno.read_IDs_file(args.uq_exclude_drugs_file) + logger.info('Drugs to exclude: {}'.format(args.exclude_drugs)) + else: + args.exclude_drugs = [] + if 'uq_exclude_cells_file' in params.keys(): + args.exclude_cells = uno.read_IDs_file(args.uq_exclude_cells_file) + logger.info('Cells to exclude: {}'.format(args.exclude_cells)) + else: + args.exclude_cells = [] + + if 'uq_exclude_indices_file' in params.keys(): + exclude_indices_ = uno.read_IDs_file(args.uq_exclude_indices_file) + args.exclude_indices = [int(x) for x in exclude_indices_] + logger.info('Indices to exclude: {}'.format(args.exclude_indices)) + else: + args.exclude_indices = [] + + + if (len(args.gpus) > 0): + import tensorflow as tf + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = 
",".join(map(str, args.gpus)) + K.set_session(tf.Session(config=config)) + + loader = uno_combined_data_loader.CombinedDataLoader(seed=args.rng_seed) + loader.load(cache=args.cache, + ncols=args.feature_subsample, + agg_dose=args.agg_dose, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path, + drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + ) + + target = args.agg_dose or 'Growth' + val_split = args.val_split + train_split = 1 - val_split + + loader.partition_data(partition_by=args.partition_by, + cv_folds=args.cv, train_split=train_split, val_split=val_split, + cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, + drug_subset_path=args.drug_subset_path, + exclude_cells=args.exclude_cells, + exclude_drugs=args.exclude_drugs, + exclude_indices=args.exclude_indices + ) + + model = uno_model_utils.build_model(loader, args, logger) + logger.info('Combined model:') + model.summary(print_fn=logger.info) + # plot_model(model, to_file=prefix+'.model.png', show_shapes=True) + + if args.cp: + model_json = model.to_json() + with open(prefix+'.model.json', 'w') as f: + print(model_json, file=f) + + def warmup_scheduler(epoch): + lr = args.learning_rate or base_lr * args.batch_size/100 + if epoch <= 5: + K.set_value(model.optimizer.lr, (base_lr * (5-epoch) + lr * epoch) / 5) + logger.debug('Epoch {}: lr={:.5g}'.format(epoch, 
K.get_value(model.optimizer.lr))) + return K.get_value(model.optimizer.lr) + + df_pred_list = [] + + cv_ext = '' + cv = args.cv if args.cv > 1 else 1 + + for fold in range(cv): + if args.cv > 1: + logger.info('Cross validation fold {}/{}:'.format(fold+1, cv)) + cv_ext = '.cv{}'.format(fold+1) + +# model = uno_model_utils.build_model(loader, args, logger, silent=True) + + template_model = uno_model_utils.build_model(loader, args, logger, silent=True) + if args.initial_weights: + logger.info("Loading weights from {}".format(args.initial_weights)) + template_model.load_weights(args.initial_weights) + + if len(args.gpus) > 1: + from keras.utils import multi_gpu_model + gpu_count = len(args.gpus) + logger.info("Multi GPU with {} gpus".format(gpu_count)) + model = multi_gpu_model(template_model, cpu_merge=False, gpus=gpu_count) + else: + model = template_model + + + optimizer = optimizers.deserialize({'class_name': args.optimizer, 'config': {}}) + base_lr = args.base_lr or K.get_value(optimizer.lr) + if args.learning_rate: + K.set_value(optimizer.lr, args.learning_rate) + + if args.loss == 'heteroscedastic': + logger.info('Training heteroscedastic model:') + model.compile(loss=heteroscedastic_loss, optimizer=optimizer, metrics=[uno_model_utils.mae_heteroscedastic, uno_model_utils.r2_heteroscedastic, uno_model_utils.meanS_heteroscesdastic]) + elif args.loss == 'quantile': + logger.info('Training quantile model:') + model.compile(loss=triple_quantile_loss, optimizer=optimizer, metrics=[uno_model_utils.quantile50, uno_model_utils.quantile10, uno_model_utils.quantile90]) + else: + logger.info('Training homoscedastic model:') + model.compile(loss=args.loss, optimizer=optimizer, metrics=[candle.mae, candle.r2]) + + # calculate trainable and non-trainable params + params.update(candle.compute_trainable_params(model)) + + candle_monitor = candle.CandleRemoteMonitor(params=params) + timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + + reduce_lr = 
ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) + warmup_lr = LearningRateScheduler(warmup_scheduler) + #checkpointer = ModelCheckpoint(prefix+cv_ext+'.weights.h5', save_best_only=True, save_weights_only=True) + checkpointer = candle.MultiGPUCheckpoint(prefix + cv_ext + '.model.h5', save_best_only=True) + tensorboard = TensorBoard(log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext)) + history_logger = candle.LoggingCallback(logger.debug) +# model_recorder = uno_model_utils.ModelRecorder() + + # callbacks = [history_logger, model_recorder] + callbacks = [candle_monitor, timeout_monitor, history_logger]#, model_recorder] + if args.reduce_lr: + callbacks.append(reduce_lr) + if args.warmup_lr: + callbacks.append(warmup_lr) + if args.cp: + callbacks.append(checkpointer) + if args.tb: + callbacks.append(tensorboard) + if args.save_weights: + callbacks.append(uno_model_utils.SimpleWeightSaver(args.save_path + '/' + args.save_weights)) + + + train_gen = uno_combined_data_generator.CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle) + val_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle) + + df_val = val_gen.get_response(copy=True) + y_val = df_val[target].values + y_shuf = np.random.permutation(y_val) + uno.log_evaluation(uno.evaluate_prediction(y_val, y_shuf), logger, + description='Between random pairs in y_val:') + + if args.no_gen: + x_train_list, y_train = train_gen.get_slice(size=train_gen.size, single=args.single) + x_val_list, y_val = val_gen.get_slice(size=val_gen.size, single=args.single) + history = model.fit(x_train_list, y_train, + batch_size=args.batch_size, + epochs=args.epochs, + callbacks=callbacks, + validation_data=(x_val_list, y_val)) + else: + logger.info('Data points per epoch: train = %d, val = %d',train_gen.size, val_gen.size) + logger.info('Steps per epoch: train = %d, val = 
%d',train_gen.steps, val_gen.steps) + history = model.fit_generator(train_gen, train_gen.steps, + epochs=args.epochs, + callbacks=callbacks, + validation_data=val_gen, + validation_steps=val_gen.steps) + +# if args.cp: +# model.load_weights(prefix+cv_ext+'.weights.h5') + # model = model_recorder.best_model + + if args.no_gen: + y_val_pred = model.predict(x_val_list, batch_size=args.batch_size) + else: + val_gen.reset() + y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1) + y_val_pred = y_val_pred[:val_gen.size] + + if args.loss == 'heteroscedastic': + y_val_pred_ = y_val_pred[:,0] + s_val_pred = y_val_pred[:,1] + + y_val_pred = y_val_pred_.flatten() + + df_val['Predicted_'+target] = y_val_pred + df_val[target+'_Error'] = y_val_pred-y_val + df_val['Pred_S_'+target] = s_val_pred + + elif args.loss == 'quantile': + y_val_pred_50q = y_val_pred[:,0] + y_val_pred_10q = y_val_pred[:,1] + y_val_pred_90q = y_val_pred[:,2] + + y_val_pred = y_val_pred_50q.flatten() # 50th quantile prediction + + df_val['Predicted_50q_'+target] = y_val_pred + df_val[target+'_Error_50q'] = y_val_pred-y_val + df_val['Predicted_10q_'+target] = y_val_pred_10q.flatten() + df_val['Predicted_90q_'+target] = y_val_pred_90q.flatten() + + else: + y_val_pred = y_val_pred.flatten() + + # df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred-y_val) + df_val['Predicted'+target] = y_val_pred + df_val[target+'Error'] = y_val_pred-y_val + + scores = uno.evaluate_prediction(y_val, y_val_pred) + uno.log_evaluation(scores, logger) + + df_pred_list.append(df_val) + +# if args.cp: +# model_recorder.best_model.save(prefix+'.model.h5') + + if hasattr(history, 'loss'): + candle.plot_history(prefix, history, 'loss') + if args.loss == 'heteroscedastic': + if hasattr(history, 'r2_heteroscedastic'): + candle.plot_history(prefix, history, 'r2_heteroscedastic') + if hasattr(history, 'meanS_heteroscedastic'): + candle.plot_history(prefix, history, 'meanS_heteroscesdastic') + elif args.loss 
== 'quantile': + if hasattr(history, 'quantile50'): + candle.plot_history(prefix, history, 'quantile50') + if hasattr(history, 'quantile10'): + candle.plot_history(prefix, history, 'quantile10') + if hasattr(history, 'quantile90'): + candle.plot_history(prefix, history, 'quantile90') + else: + if hasattr(history, 'r2'): + candle.plot_history(prefix, history, 'r2') + + pred_fname = prefix + '.predicted.tsv' + df_pred = pd.concat(df_pred_list) + if args.agg_dose: + if args.single: +# df_pred.sort_values(['Source', 'Sample', 'Drug1', target], inplace=True) + df_pred.sort_values(['Sample', 'Drug1', target], inplace=True) + else: + df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', target], inplace=True) + else: + if args.single: +# df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + else: +# df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g') + logger.info('Testing predictions stored in file: {}'.format(pred_fname)) + + if args.cp: + logger.info('Model stored in file: {}'.format(prefix+'.model.h5')) +# logger.info('Model weights stored in file: {}'.format(prefix+cv_ext+'.weights.h5')) + logger.info('Model weights stored in file: {}'.format(args.save_path + '/' + args.save_weights)) + + if args.cv > 1: + scores = uno.evaluate_prediction(df_pred[target], df_pred['Predicted'+target]) + uno.log_evaluation(scores, logger, description='Combining cross validation folds:') + + for test_source in loader.test_sep_sources: + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size, source=test_source) + df_test = test_gen.get_response(copy=True) + y_test = df_test[target].values + n_test = 
len(y_test) + if n_test == 0: + continue + if args.no_gen: + x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single) + y_test_pred = model.predict(x_test_list, batch_size=args.batch_size) + if args.loss == 'heteroscedastic': + y_test_pred = y_test_pred[:,0] + elif args.loss == 'quantile': + y_test_pred = y_test_pred[:,0] # 50th quantile prediction + else: + y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps) + if args.loss == 'heteroscedastic': + y_test_pred = y_test_pred[:test_gen.size,0] + elif args.loss == 'quantile': + y_test_pred = y_test_pred[:test_gen.size,0] # 50th quantile prediction + else: + y_test_pred = y_test_pred[:test_gen.size] + + y_test_pred = y_test_pred.flatten() + scores = uno.evaluate_prediction(y_test, y_test_pred) + uno.log_evaluation(scores, logger, description='Testing on data from {} ({})'.format(test_source, n_test)) + + if K.backend() == 'tensorflow': + K.clear_session() + + logger.handlers = [] + + return history + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() diff --git a/Pilot2/P2B1/p2b1.py b/Pilot2/P2B1/p2b1.py index ad11088b..faa0c860 100644 --- a/Pilot2/P2B1/p2b1.py +++ b/Pilot2/P2B1/p2b1.py @@ -31,10 +31,10 @@ {'name':'train_bool', 'type':candle.str2bool,'default':True,'help':'Invoke training'}, {'name':'eval_bool', 'type':candle.str2bool,'default':False,'help':'Use model for inference'}, {'name':'home_dir','help':'Home Directory','type':str,'default':'.'}, -{'name':'config_file','help':'Config File','type':str,'default':os.path.join(file_path, 'p2b1_default_model.txt')}, +#{'name':'config_file','help':'Config File','type':str,'default':os.path.join(file_path, 'p2b1_default_model.txt')}, {'name':'weight_path','help':'Trained Model Pickle File','type':str,'default':None}, {'name':'base_memo','help':'Memo','type':str,'default':None}, -{'name':'seed', 
'type':candle.str2bool,'default':False,'help':'Random Seed'}, +#{'name':'seed_bool', 'type':candle.str2bool,'default':False,'help':'Random Seed'}, {'name':'case','help':'[Full, Center, CenterZ]','type':str,'default':'Full'}, {'name':'fig_bool', 'type':candle.str2bool,'default':False,'help':'Generate Prediction Figure'}, {'name':'set_sel','help':'[3k_Disordered, 3k_Ordered, 3k_Ordered_and_gel, 6k_Disordered, 6k_Ordered, 6k_Ordered_and_gel]','type':str,'default':'3k_Disordered'}, @@ -42,7 +42,16 @@ {'name':'full_conv_bool', 'type':candle.str2bool, 'default':False, 'help':'Invoke training using fully convolutional NN for inner AE'}, {'name':'type_bool', 'type':candle.str2bool, 'default':True, 'help':'Include molecule type information in desining AE'}, {'name':'nbr_type', 'type':str, 'default':'relative', 'help':'Defines the type of neighborhood data to use. [relative, invariant]'}, -{'name':'backend', 'help':'Keras Backend', 'type':str, 'default':'tensorflow'} +{'name':'backend', 'help':'Keras Backend', 'type':str, 'default':'tensorflow'}, +{'name':'cool', 'help':'Boolean: cool learning rate', 'type':candle.str2bool, 'default':False}, +{'name':'data_set', 'help':'Data set for training', 'type':str, 'default':None}, +{'name':'l2_reg', 'help':'Regularization parameter', 'type':float, 'default':None}, +{'name':'molecular_nbrs', 'help':'Data dimension for molecular autoencoder', 'type':int, 'default':None}, +{'name':'molecular_nonlinearity', 'help':'Activation for molecular netowrk', 'type':str, 'default':None}, +{'name':'molecular_num_hidden', 'nargs':'+', 'help':'Layer sizes for molecular network', 'type':int, 'default':None}, +{'name':'noise_factor', 'help':'Noise factor', 'type':float, 'default':None}, +{'name':'num_hidden', 'nargs':'+', 'help':'Dense layer specification', 'type':int, 'default':None}, +{'name':'sampling_density', 'help':'Sampling density', 'type':float, 'default':None} ] required = [ @@ -61,7 +70,7 @@ 'molecular_num_hidden', 'molecular_nonlinearity', 
'molecular_nbrs', - 'drop_prob', + 'dropout', 'l2_reg', 'sampling_density', 'save_path' diff --git a/Pilot2/P2B1/p2b1_baseline_keras2.py b/Pilot2/P2B1/p2b1_baseline_keras2.py index a897d6aa..f7f9eeaf 100644 --- a/Pilot2/P2B1/p2b1_baseline_keras2.py +++ b/Pilot2/P2B1/p2b1_baseline_keras2.py @@ -12,7 +12,7 @@ from importlib import reload # Python 3.4+ except ImportError: from imp import reload # Python 3.0 - 3.3 - + TIMEOUT=3600 # in sec; set this to -1 for no timeout file_path = os.path.dirname(os.path.realpath(__file__)) #lib_path = os.path.abspath(os.path.join(file_path, '..', 'common')) @@ -22,7 +22,7 @@ from keras import backend as K -import p2b1 +import p2b1 import candle import p2b1_AE_models as AE_models @@ -40,20 +40,20 @@ def str2bool(v): return v.lower() in ("yes", "true", "t", "1") -def initialize_parameters(): +def initialize_parameters(default_model = 'p2b1_default_model.txt'): # Build benchmark object - p2b1Bmk = p2b1.BenchmarkP2B1(p2b1.file_path, 'p2b1_default_model.txt', 'keras', + p2b1Bmk = p2b1.BenchmarkP2B1(p2b1.file_path, default_model, 'keras', prog='p2b1_baseline', desc='Train Molecular Frame Autoencoder - Pilot 2 Benchmark 1') # Initialize parameters - GP = candle.initialize_parameters(p2b1Bmk) + GP = candle.finalize_parameters(p2b1Bmk) #p2b1.logger.info('Params: {}'.format(gParameters)) print ('\nTraining parameters:') for key in sorted(GP): print ("\t%s: %s" % (key, GP[key])) - + # print json.dumps(GP, indent=4, skipkeys=True, sort_keys=True) if GP['backend'] != 'theano' and GP['backend'] != 'tensorflow': @@ -79,8 +79,8 @@ def initialize_parameters(): def run(GP): # set the seed - if GP['seed']: - np.random.seed(GP['seed']) + if GP['rng_seed']: + np.random.seed(GP['rng_seed']) else: np.random.seed(np.random.randint(10000)) @@ -211,7 +211,7 @@ def run(GP): nonlinearity=molecular_nonlinearity, hidden_layers=molecular_hidden_layers, l2_reg=GP['l2_reg'], - drop=float(GP['drop_prob'])) + drop=float(GP['dropout'])) elif full_conv_bool: 
molecular_model, molecular_encoder = AE_models.full_conv_mol_auto(bead_k_size=bead_kernel_size, mol_k_size=mol_kernel_size, @@ -220,14 +220,14 @@ def run(GP): nonlinearity=molecular_nonlinearity, hidden_layers=molecular_hidden_layers, l2_reg=GP['l2_reg'], - drop=float(GP['drop_prob'])) + drop=float(GP['dropout'])) else: molecular_model, molecular_encoder = AE_models.dense_auto(weights_path=None, input_shape=(molecular_input_dim,), nonlinearity=molecular_nonlinearity, hidden_layers=molecular_hidden_layers, l2_reg=GP['l2_reg'], - drop=float(GP['drop_prob'])) + drop=float(GP['dropout'])) if GP['loss'] == 'mse': loss_func = 'mse' @@ -238,7 +238,7 @@ def run(GP): print ('\nModel Summary: \n') molecular_model.summary() ##### set up callbacks and cooling for the molecular_model ########## - drop = 0.5 + drop = GP['dropout'] mb_epochs = GP['epochs'] initial_lrate = GP['learning_rate'] epochs_drop = 1+int(np.floor(mb_epochs/3)) diff --git a/Pilot2/P2B1/p2b1_default_model.txt b/Pilot2/P2B1/p2b1_default_model.txt index 62e4e30c..52c876ca 100644 --- a/Pilot2/P2B1/p2b1_default_model.txt +++ b/Pilot2/P2B1/p2b1_default_model.txt @@ -13,7 +13,7 @@ molecular_nonlinearity='elu' molecular_num_hidden=[256, 128, 64, 32, 16, 8] molecular_nbrs = 200 base_memo='p2b1' -drop_prob = 0.5 +dropout = 0.5 data_set='3k_Ordered' sampling_density = 0.15 save_path='.' diff --git a/Pilot2/P2B1/p2b1_medium_model.txt b/Pilot2/P2B1/p2b1_medium_model.txt index bb178ef8..3912d0ee 100644 --- a/Pilot2/P2B1/p2b1_medium_model.txt +++ b/Pilot2/P2B1/p2b1_medium_model.txt @@ -4,6 +4,15 @@ batch_size=32 learning_rate=0.01 epochs=10 cool=True -weight_decay=0.0005 +optimizer='adam' +loss='custom' +activation='relu' +molecular_nonlinearity='elu' +molecular_num_hidden=[256, 128, 64, 32, 16, 8] +molecular_nbrs = 200 noise_factor=0 base_memo='p2b1' +dropout = 0.5 +l2_reg=0.01 +sampling_density = 0.15 +save_path='.' 
diff --git a/Pilot2/P2B1/p2b1_small_model.txt b/Pilot2/P2B1/p2b1_small_model.txt index 9f53dd02..196fcb15 100644 --- a/Pilot2/P2B1/p2b1_small_model.txt +++ b/Pilot2/P2B1/p2b1_small_model.txt @@ -4,13 +4,15 @@ batch_size=32 learning_rate=0.01 epochs=10 cool=False -weight_decay=0.0005 noise_factor=0.0 optimizer='adam' loss='mse' activation='relu' -molecular_nonlinearity=elu +molecular_nonlinearity='elu' molecular_num_hidden=[54,12] -molecular_epochs=1 molecular_nbrs=10 base_memo='p2b1' +dropout = 0.5 +l2_reg=0.01 +sampling_density = 0.15 +save_path='.' diff --git a/Pilot3/P3B1/p3b1.py b/Pilot3/P3B1/p3b1.py index e082497d..c67b5a3d 100644 --- a/Pilot3/P3B1/p3b1.py +++ b/Pilot3/P3B1/p3b1.py @@ -59,7 +59,7 @@ ] -required = ['learning_rate', 'batch_size', 'epochs', 'drop', \ +required = ['learning_rate', 'batch_size', 'epochs', 'dropout', \ 'activation', 'out_activation', 'loss', 'optimizer', 'metrics', \ 'n_fold', 'scaling', 'initialization', 'shared_nnet_spec', \ 'ind_nnet_spec', 'feature_names'] diff --git a/Pilot3/P3B1/p3b1_baseline_keras2.py b/Pilot3/P3B1/p3b1_baseline_keras2.py index 1cae3c19..23dc4905 100644 --- a/Pilot3/P3B1/p3b1_baseline_keras2.py +++ b/Pilot3/P3B1/p3b1_baseline_keras2.py @@ -14,14 +14,14 @@ import p3b1 as bmk import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'p3b1_default_model.txt'): # Build benchmark object - p3b1Bmk = bmk.BenchmarkP3B1(bmk.file_path, 'p3b1_default_model.txt', 'keras', + p3b1Bmk = bmk.BenchmarkP3B1(bmk.file_path, default_model, 'keras', prog='p3b1_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') - + # Initialize parameters - gParameters = candle.initialize_parameters(p3b1Bmk) + gParameters = candle.finalize_parameters(p3b1Bmk) #bmk.logger.info('Params: {}'.format(gParameters)) return gParameters @@ -35,7 +35,7 @@ def fetch_data(gParameters): path = gParameters['data_url'] fpath = candle.fetch_file(path + gParameters['train_data'], 
'Pilot3', untar=True) - + return fpath @@ -43,7 +43,7 @@ def build_model(gParameters, kerasDefaults, shared_nnet_spec, individual_nnet_spec, input_dim, Y_train, Y_test, verbose=False): - + labels_train = [] labels_test = [] @@ -52,13 +52,13 @@ def build_model(gParameters, kerasDefaults, for l in range( len( Y_train ) ): truth_train = np.array( Y_train[l], dtype='int32' ) truth_test = np.array( Y_test[l], dtype='int32' ) - + mv = int( np.max( truth_train ) ) - + label_train = np.zeros( ( len( truth_train ), mv + 1 ) ) for i in range( len( truth_train ) ): label_train[ i, truth_train[ i ] ] = 1 - + label_test = np.zeros( ( len( truth_test ), mv + 1 ) ) for i in range( len(truth_test) ): label_test[ i, truth_test[ i ] ] = 1 @@ -81,8 +81,9 @@ def build_model(gParameters, kerasDefaults, for k in range( len( shared_nnet_spec ) ): layer = Dense( shared_nnet_spec[ k ], activation=gParameters['activation'], name= 'shared_layer_' + str( k ) )( shared_layers[ -1 ] ) - if gParameters['drop'] > 0: - layer = Dropout( gParameters['drop'] )( shared_layers[ -1 ] ) + shared_layers.append( layer ) + if gParameters['dropout'] > 0: + layer = Dropout( gParameters['dropout'] )( shared_layers[ -1 ] ) shared_layers.append( layer ) @@ -100,8 +101,8 @@ def build_model(gParameters, kerasDefaults, layer = Dense( individual_nnet_spec[l][k], activation=gParameters['activation'], name= 'indiv_layer_' + str( l ) + '_' + str( k ) )( indiv_layers[-1] ) indiv_layers.append( layer ) - if gParameters['drop'] > 0: - layer = Dropout( gParameters['drop'] )( indiv_layers[-1] ) + if gParameters['dropout'] > 0: + layer = Dropout( gParameters['dropout'] )( indiv_layers[-1] ) indiv_layers.append( layer ) else: layer = Dense( n_out_nodes[l], activation=gParameters['out_activation'], diff --git a/Pilot3/P3B1/p3b1_default_model.txt b/Pilot3/P3B1/p3b1_default_model.txt index 9d93f422..41cb0c60 100644 --- a/Pilot3/P3B1/p3b1_default_model.txt +++ b/Pilot3/P3B1/p3b1_default_model.txt @@ -5,7 +5,7 @@ model_name = 
'p3b1' learning_rate = 0.01 batch_size = 10 epochs = 10 -drop = 0.0 +dropout = 0.0 activation = 'relu' out_activation = 'softmax' loss = 'categorical_crossentropy' diff --git a/Pilot3/P3B2/p3b2.py b/Pilot3/P3B2/p3b2.py index 55484632..169b81b3 100644 --- a/Pilot3/P3B2/p3b2.py +++ b/Pilot3/P3B2/p3b2.py @@ -35,7 +35,7 @@ ] required = ['train_data', 'rnn_size', 'epochs', 'n_layers', \ - 'learning_rate', 'drop', 'recurrent_dropout', \ + 'learning_rate', 'dropout', 'recurrent_dropout', \ 'temperature','primetext', 'length'] class BenchmarkP3B2(candle.Benchmark): diff --git a/Pilot3/P3B2/p3b2_baseline_keras2.py b/Pilot3/P3B2/p3b2_baseline_keras2.py index 824b3be1..67e90e1e 100644 --- a/Pilot3/P3B2/p3b2_baseline_keras2.py +++ b/Pilot3/P3B2/p3b2_baseline_keras2.py @@ -15,14 +15,14 @@ import p3b2 as bmk import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'p3b2_default_model.txt'): # Build benchmark object - p3b2Bmk = bmk.BenchmarkP3B2(bmk.file_path, 'p3b2_default_model.txt', 'keras', + p3b2Bmk = bmk.BenchmarkP3B2(bmk.file_path, default_model, 'keras', prog='p3b2_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') # Initialize parameters - gParameters = candle.initialize_parameters(p3b2Bmk) + gParameters = candle.finalize_parameters(p3b2Bmk) #bmk.logger.info('Params: {}'.format(gParameters)) return gParameters @@ -62,7 +62,7 @@ def run(gParameters): rnn_size = gParameters['rnn_size'] n_layers = gParameters['n_layers'] learning_rate = gParameters['learning_rate'] - dropout = gParameters['drop'] + dropout = gParameters['dropout'] recurrent_dropout = gParameters['recurrent_dropout'] n_epochs = gParameters['epochs'] data_train = data_path+'/data.pkl' diff --git a/Pilot3/P3B2/p3b2_default_model.txt b/Pilot3/P3B2/p3b2_default_model.txt index 73f77487..51089215 100644 --- a/Pilot3/P3B2/p3b2_default_model.txt +++ b/Pilot3/P3B2/p3b2_default_model.txt @@ -6,7 +6,7 @@ rnn_size = 64 epochs = 2 n_layers = 
1 learning_rate = 0.01 -drop = 0.0 +dropout = 0.0 recurrent_dropout = 0.0 loss = 'categorical_crossentropy' activation = 'softmax' diff --git a/Pilot3/P3B3/p3b3.py b/Pilot3/P3B3/p3b3.py index 497cf633..6a7589fd 100644 --- a/Pilot3/P3B3/p3b3.py +++ b/Pilot3/P3B3/p3b3.py @@ -14,7 +14,6 @@ import candle -''' additional_definitions = [ {'name':'train_features', 'action':'store', @@ -56,9 +55,26 @@ 'type': str}, {'name':'n_fold', 'action':'store', - 'type':int} + 'type':int}, +{'name':'emb_l2', + 'action':'store', + 'type':float}, +{'name':'w_l2', + 'action':'store', + 'type':float}, +{'name':'wv_len', + 'action':'store', + 'type':int}, +{'name':'filter_sets', + 'nargs':'+', + 'type': int}, +{'name':'filter_sizes', + 'nargs':'+', + 'type': int}, +{'name':'num_filters', + 'nargs':'+', + 'type': int} ] -''' required = [ @@ -79,7 +95,7 @@ def set_locals(self): if required is not None: self.required = set(required) - # if additional_definitions is not None: - # self.additional_definitions = additional_definitions + if additional_definitions is not None: + self.additional_definitions = additional_definitions diff --git a/Pilot3/P3B3/p3b3_baseline_keras2.py b/Pilot3/P3B3/p3b3_baseline_keras2.py index b8bd9e3d..89407399 100644 --- a/Pilot3/P3B3/p3b3_baseline_keras2.py +++ b/Pilot3/P3B3/p3b3_baseline_keras2.py @@ -13,10 +13,6 @@ from sklearn.metrics import f1_score ''' -import p3b3 as bmk -import candle - - import os, sys, gzip import keras @@ -32,21 +28,23 @@ import argparse +import p3b3 as bmk +import candle + -def initialize_parameters(): +def initialize_parameters(default_model = 'p3b3_default_model.txt'): # Build benchmark object - p3b3Bmk = bmk.BenchmarkP3B3(bmk.file_path, 'p3b3_default_model.txt', 'keras', + p3b3Bmk = bmk.BenchmarkP3B3(bmk.file_path, default_model, 'keras', prog='p3b3_baseline', desc='Multi-task CNN for data extraction from clinical reports - Pilot 3 Benchmark 3') - + # Initialize parameters - gParameters = candle.initialize_parameters(p3b3Bmk) + 
gParameters = candle.finalize_parameters(p3b3Bmk) #bmk.logger.info('Params: {}'.format(gParameters)) return gParameters - def fetch_data(gParameters): """ Downloads and decompresses the data if not locally available. Since the training data depends on the model definition it is not loaded, @@ -55,7 +53,7 @@ def fetch_data(gParameters): path = gParameters['data_url'] fpath = candle.fetch_file(path + gParameters['train_data'], 'Pilot3', untar=True) - + return fpath @@ -157,7 +155,6 @@ def run(gParameters): test_x = np.load( fpath + '/test_X.npy' ) test_y = np.load( fpath + '/test_Y.npy' ) - for task in range( len( train_y[ 0, : ] ) ): cat = np.unique( train_y[ :, task ] ) train_y[ :, task ] = [ np.where( cat == x )[ 0 ][ 0 ] for x in train_y[ :, task ] ] diff --git a/Pilot3/P3B4/p3b4.py b/Pilot3/P3B4/p3b4.py index 430f9fd9..ba502f77 100644 --- a/Pilot3/P3B4/p3b4.py +++ b/Pilot3/P3B4/p3b4.py @@ -14,12 +14,22 @@ import candle - +additional_definitions=[ +{'name':'attention_size', + 'action':'store', + 'type':int}, +{'name':'embed_train', + 'action':'store', + 'type':candle.str2bool}, +{'name':'wv_len', + 'action':'store', + 'type':int} +] required = [ 'learning_rate', 'batch_size', 'epochs', 'dropout', \ 'optimizer', 'wv_len', \ - 'attention_size'] + 'attention_size', 'embed_train'] @@ -34,8 +44,8 @@ def set_locals(self): if required is not None: self.required = set(required) - # if additional_definitions is not None: - # self.additional_definitions = additional_definitions + if additional_definitions is not None: + self.additional_definitions = additional_definitions diff --git a/Pilot3/P3B4/p3b4_baseline_keras2.py b/Pilot3/P3B4/p3b4_baseline_keras2.py index 5caacdf0..90e0900d 100644 --- a/Pilot3/P3B4/p3b4_baseline_keras2.py +++ b/Pilot3/P3B4/p3b4_baseline_keras2.py @@ -12,14 +12,14 @@ import p3b4 as bmk import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'p3b4_default_model.txt' ): # Build benchmark object - p3b3Bmk = 
bmk.BenchmarkP3B3(bmk.file_path, 'p3b4_default_model.txt', 'keras', + p3b3Bmk = bmk.BenchmarkP3B3(bmk.file_path, default_model, 'keras', prog='p3b4_baseline', desc='Hierarchical Convolutional Attention Networks for data extraction from clinical reports - Pilot 3 Benchmark 4') # Initialize parameters - gParameters = candle.initialize_parameters(p3b3Bmk) + gParameters = candle.finalize_parameters(p3b3Bmk) #bmk.logger.info('Params: {}'.format(gParameters)) return gParameters @@ -52,7 +52,7 @@ def run(gParameters): batch_size = gParameters[ 'batch_size' ] epochs = gParameters[ 'epochs' ] dropout = gParameters[ 'dropout' ] - + embed_train = gParameters[ 'embed_train' ] optimizer = gParameters[ 'optimizer' ] if optimizer == 0: @@ -74,8 +74,6 @@ def run(gParameters): test_x = np.load( fpath + '/test_X.npy' ) test_y = np.load( fpath + '/test_Y.npy' ) - - num_classes = [] for task in range( len( train_y[ 0, : ] ) ): cat = np.unique( train_y[ :, task ] ) @@ -123,7 +121,8 @@ def run(gParameters): attention_size= attention_size, dropout_rate = dropout, lr = learning_rate, - optimizer= optimizer + optimizer= optimizer, + embed_train = embed_train ) ret = model.train( diff --git a/Pilot3/P3B4/p3b4_default_model.txt b/Pilot3/P3B4/p3b4_default_model.txt index 02044931..76f66fed 100644 --- a/Pilot3/P3B4/p3b4_default_model.txt +++ b/Pilot3/P3B4/p3b4_default_model.txt @@ -3,10 +3,10 @@ data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot3/' train_data = 'P3B3_data.tar.gz' model_name = 'p3b4' batch_size = 64 -epochs = 1 +epochs = 10 optimizer = 0 learning_rate = 0.0001 wv_len = 50 attention_size = 500 dropout = 0.1 - +embed_train = False diff --git a/Pilot3/P3B4/tf_mthcan.py b/Pilot3/P3B4/tf_mthcan.py index ec3f8af6..c0506717 100644 --- a/Pilot3/P3B4/tf_mthcan.py +++ b/Pilot3/P3B4/tf_mthcan.py @@ -13,76 +13,39 @@ def __init__(self): class hcan(object): def __init__(self,embedding_matrix,num_classes,max_sents,max_words, - 
attention_size=512,dropout_rate=0.9,activation=tf.nn.elu,lr=0.0001, optimizer= 'adam'): + attention_size=512,dropout_rate=0.9,activation=tf.nn.elu,lr=0.0001, + optimizer= 'adam', embed_train = True): - tf.reset_default_graph() + tf.compat.v1.reset_default_graph() dropout_keep = dropout_rate self.dropout_keep = dropout_keep - self.dropout = tf.placeholder(tf.float32) + self.dropout = tf.compat.v1.placeholder(tf.float32) self.ms = max_sents self.mw = max_words self.embedding_matrix = embedding_matrix.astype(np.float32) self.attention_size = attention_size self.activation = activation self.num_tasks = len(num_classes) + self.embed_train = embed_train #doc input - self.doc_input = tf.placeholder(tf.int32, shape=[None,max_sents,max_words]) - doc_embeds = tf.map_fn(self._attention_step,self.doc_input,dtype=tf.float32) + self.doc_input = tf.compat.v1.placeholder(tf.int32, shape=[None,max_sents,max_words]) # batch x sents x words + batch_size = tf.shape(self.doc_input)[0] - #classification functions - logits = [] - self.predictions = [] - for i in range(self.num_tasks): - logit = tf.layers.dense(doc_embeds,num_classes[i], - kernel_initializer=tf.contrib.layers.xavier_initializer()) - logits.append(logit) - self.predictions.append(tf.nn.softmax(logit)) + words_per_sent = tf.reduce_sum(tf.sign(self.doc_input),2) # batch X sents + max_words_ = tf.reduce_max(words_per_sent) + sents_per_doc = tf.reduce_sum(tf.sign(words_per_sent),1) # batch + max_sents_ = tf.reduce_max(sents_per_doc) + doc_input_reduced = self.doc_input[:,:max_sents_,:max_words_] #clip - #loss, accuracy, and training functions - self.labels = [] - self.loss = 0 - for i in range(self.num_tasks): - label = tf.placeholder(tf.int32,shape=[None]) - self.labels.append(label) - loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i],labels=label)) - self.loss += loss/self.num_tasks - # self.optimizer = tf.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) - if optimizer == 'adam': - 
self.optimizer = tf.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) - elif optimizer == 'sgd': - self.optimizer = tf.train.GradientDescentOptimizer( lr ).minimize( self.loss ) - elif optimizer == 'adadelta': - self.optimizer = tf.train.AdadeltaOptimizer( learning_rate= lr ).minimize( self.loss ) - else: - self.optimizer = tf.train.RMSPropOptimizer( lr ).minimize( self.loss ) - - #init op - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - self.saver = tf.train.Saver() - self.sess = tf.Session(config=config) - self.sess.run(tf.global_variables_initializer()) - - def _attention_step(self,doc): - - words_per_line = tf.reduce_sum(tf.sign(doc),1) - num_lines = tf.reduce_sum(tf.sign(words_per_line)) - max_words_ = tf.reduce_max(words_per_line) - doc_input_reduced = doc[:num_lines,:max_words_] - num_words = words_per_line[:num_lines] + doc_input_reshape = tf.reshape(doc_input_reduced,(-1,max_words_)) # batch*sents x words #word embeddings - word_embeds = tf.gather(tf.get_variable('embeddings',initializer=self.embedding_matrix, - dtype=tf.float32),doc_input_reduced) - word_embeds = tf.nn.dropout(word_embeds,self.dropout) - - #masking - mask_base = tf.cast(tf.sequence_mask(num_words,max_words_),tf.float32) - mask = tf.tile(tf.expand_dims(mask_base,2),[1,1,self.attention_size]) - mask2 = tf.tile(tf.expand_dims(mask_base,2),[1,1,max_words_]) + word_embeds = tf.gather(tf.compat.v1.get_variable('embeddings',initializer=self.embedding_matrix, + dtype=tf.float32, trainable=self.embed_train),doc_input_reshape) + word_embeds = tf.nn.dropout(word_embeds,self.dropout) # batch*sents x words x attention_size #word self attention Q = tf.layers.conv1d(word_embeds,self.attention_size,1,padding='same', @@ -92,31 +55,26 @@ def _attention_step(self,doc): V = tf.layers.conv1d(word_embeds,self.attention_size,1,padding='same', activation=self.activation,kernel_initializer=tf.contrib.layers.xavier_initializer()) - Q = tf.where(tf.equal(mask,0),tf.zeros_like(Q),Q) - K = 
tf.where(tf.equal(mask,0),tf.zeros_like(K),K) - V = tf.where(tf.equal(mask,0),tf.zeros_like(V),V) - outputs = tf.matmul(Q,tf.transpose(K,[0, 2, 1])) outputs = outputs/(K.get_shape().as_list()[-1]**0.5) outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs) outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout) - outputs = tf.where(tf.equal(mask2,0),tf.zeros_like(outputs),outputs) - outputs = tf.matmul(outputs,V) - outputs = tf.where(tf.equal(mask,0),tf.zeros_like(outputs),outputs) + outputs = tf.matmul(outputs,V) # batch*sents x words x attention_size #word target attention - Q = tf.get_variable('word_Q',(1,1,self.attention_size), + Q = tf.compat.v1.get_variable('word_Q',(1,1,self.attention_size), tf.float32,tf.orthogonal_initializer()) - Q = tf.tile(Q,[num_lines,1,1]) + Q = tf.tile(Q,[batch_size*max_sents_,1,1]) V = outputs outputs = tf.matmul(Q,tf.transpose(outputs,[0, 2, 1])) outputs = outputs/(K.get_shape().as_list()[-1]**0.5) outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs) outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout) - outputs = tf.matmul(outputs,V) - sent_embeds = tf.transpose(outputs,[1,0,2]) - sent_embeds = tf.nn.dropout(sent_embeds,self.dropout) + outputs = tf.matmul(outputs,V) # batch*sents x 1 x attention_size + + sent_embeds = tf.reshape(outputs,(-1,max_sents_,self.attention_size)) + sent_embeds = tf.nn.dropout(sent_embeds,self.dropout) # batch x sents x attention_size #sent self attention Q = tf.layers.conv1d(sent_embeds,self.attention_size,1,padding='same', @@ -128,22 +86,57 @@ def _attention_step(self,doc): outputs = tf.matmul(Q,tf.transpose(K,[0, 2, 1])) outputs = outputs/(K.get_shape().as_list()[-1]**0.5) + outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs) outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout) - outputs = tf.matmul(outputs,V) + outputs = tf.matmul(outputs,V) # batch x sents x attention_size #sent target attention - Q = 
tf.get_variable('sent_Q',(1,1,self.attention_size), + Q = tf.compat.v1.get_variable('sent_Q',(1,1,self.attention_size), tf.float32,tf.orthogonal_initializer()) + Q = tf.tile(Q,[batch_size,1,1]) V = outputs outputs = tf.matmul(Q,tf.transpose(outputs,[0, 2, 1])) outputs = outputs/(K.get_shape().as_list()[-1]**0.5) + outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs) outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout) - outputs = tf.matmul(outputs,V) - doc_embed = tf.nn.dropout(tf.squeeze(outputs,[0]),self.dropout) + outputs = tf.matmul(outputs,V) # batch x 1 x attention_size + doc_embeds = tf.nn.dropout(tf.squeeze(outputs,[1]),self.dropout) # batch x attention_size + + #classification functions + logits = [] + self.predictions = [] + for i in range(self.num_tasks): + logit = tf.layers.dense(doc_embeds,num_classes[i], + kernel_initializer=tf.contrib.layers.xavier_initializer()) + logits.append(logit) + self.predictions.append(tf.nn.softmax(logit)) + + #loss, accuracy, and training functions + self.labels = [] + self.loss = 0 + for i in range(self.num_tasks): + label = tf.compat.v1.placeholder(tf.int32,shape=[None]) + self.labels.append(label) + loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i],labels=label)) + self.loss += loss/self.num_tasks + # self.optimizer = tf.compat.v1.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) + if optimizer == 'adam': + self.optimizer = tf.compat.v1.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) + elif optimizer == 'sgd': + self.optimizer = tf.compat.v1.train.GradientDescentOptimizer( lr ).minimize( self.loss ) + elif optimizer == 'adadelta': + self.optimizer = tf.compat.v1.train.AdadeltaOptimizer( learning_rate= lr ).minimize( self.loss ) + else: + self.optimizer = tf.compat.v1.train.RMSPropOptimizer( lr ).minimize( self.loss ) + + #init op + config = tf.compat.v1.ConfigProto() + config.gpu_options.allow_growth = True + self.saver = 
tf.compat.v1.train.Saver() + self.sess = tf.compat.v1.Session(config=config) + self.sess.run(tf.global_variables_initializer()) - return tf.squeeze(doc_embed,[0]) - def train(self,data,labels,batch_size=100,epochs=50,validation_data=None): if validation_data: @@ -196,7 +189,7 @@ def train(self,data,labels,batch_size=100,epochs=50,validation_data=None): #checkpoint after every epoch print("\ntraining time: %.2f" % (time.time()-start_time)) - + for i in range(self.num_tasks): micro = f1_score(y_trues[i],y_preds[i],average='micro') macro = f1_score(y_trues[i],y_preds[i],average='macro') @@ -209,11 +202,11 @@ def train(self,data,labels,batch_size=100,epochs=50,validation_data=None): #reset timer start_time = time.time() - + return history def predict(self,data,batch_size=100): - + y_preds = [[] for i in range(self.num_tasks)] for start in range(0,len(data),batch_size): @@ -237,7 +230,7 @@ def predict(self,data,batch_size=100): return y_preds def score(self,data,labels,batch_size=16): - + loss = [] y_preds = [[] for i in range(self.num_tasks)] for start in range(0,len(data),batch_size): @@ -253,7 +246,7 @@ def score(self,data,labels,batch_size=16): feed_dict[self.labels[i]] = labels[i][start:stop] retvals = self.sess.run(self.predictions+[self.loss],feed_dict=feed_dict) loss.append(retvals[-1]) - + for i in range(self.num_tasks): y_preds[i].append(np.argmax(retvals[i],1)) @@ -298,6 +291,7 @@ def load(self,filename): #create data vocab = np.random.rand(vocab_size,embedding_size) + vocab[0,:] = 0 X = np.random.randint(0,vocab_size,(train_samples+test_samples,max_lines,max_words)) #optional masking @@ -328,3 +322,4 @@ def load(self,filename): print(history.history) + diff --git a/Pilot3/P3B5/README.rst b/Pilot3/P3B5/README.rst new file mode 100644 index 00000000..aa09ec54 --- /dev/null +++ b/Pilot3/P3B5/README.rst @@ -0,0 +1,13 @@ +======================================= +P3B5 Differentiable Architecture Search +======================================= + +Differentiable 
architecture search (DARTS) benchmark using clinical pathology reports. + + +To test your environment, use the UPF method of running the benchmark. A UPF test script +is available at `Supervisor/workflows/upf/test/upf-1.sh`. + +.. code-block:: console + + bash upf1-test.sh p3b5 summit-world diff --git a/Pilot3/P3B5/p3b5.py b/Pilot3/P3B5/p3b5.py new file mode 100644 index 00000000..d7c41377 --- /dev/null +++ b/Pilot3/P3B5/p3b5.py @@ -0,0 +1,55 @@ +import os +import sys + +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path2) + +import candle + +additional_definitions = [ +{'name':'learning_rate_min', + 'action':'store', + 'type':float}, +{'name':'log_interval', + 'action':'store', + 'type':int}, +{'name':'weight_decay', + 'action':'store', + 'type':float}, +{'name':'grad_clip', + 'action':'store', + 'type':int}, +{'name':'unrolled', + 'action':'store', + 'type':candle.str2bool}, +] + +required = [ + 'learning_rate', + 'learning_rate_min', + 'momentum', + 'weight_decay', + 'grad_clip', + 'rng_seed', + 'unrolled', + 'batch_size', + 'epochs', +] + + +class BenchmarkP3B5(candle.Benchmark): + """ Benchmark for P3B5 """ + + def set_locals(self): + """ Set parameters for the benchmark. + + Args: + required: set of required parameters for the benchmark. + additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. 
+ """ + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py new file mode 100644 index 00000000..75829b63 --- /dev/null +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -0,0 +1,136 @@ +import torch +import torch.nn as nn +from torch import optim +import torch.nn.functional as F +from torch.utils.data import DataLoader + +import p3b5 as bmk +import candle +import darts + +from p3b5_darts import train, infer + + +def initialize_parameters(): + """ Initialize the parameters for the P3B5 benchmark """ + + p3b5_bench = bmk.BenchmarkP3B5( + bmk.file_path, + 'p3b5_default_model.txt', + 'pytorch', + prog='p3b5_baseline', + desc='Differentiable Architecture Search - Pilot 3 Benchmark 5', + ) + + # Initialize parameters + gParameters = candle.finalize_parameters(p3b5_bench) + #bmk.logger.info('Params: {}'.format(gParameters)) + return gParameters + + +def fetch_data(gParameters): + """ Download and untar data + + Args: + gParameters: parameters from candle + + Returns: + path to where the data is located + """ + path = gParameters['data_url'] + fpath = candle.fetch_file(path + gParameters['train_data'], 'Pilot3', untar=True) + return fpath + + +def run(params): + args = candle.ArgumentStruct(**params) + args.cuda = torch.cuda.is_available() + + device = torch.device(f'cuda' if args.cuda else "cpu") + darts.banner(device=device) + + datapath = fetch_data(params) + train_data = darts.P3B3(datapath, 'train') + valid_data = darts.P3B3(datapath, 'test') + + trainloader = DataLoader(train_data, batch_size=args.batch_size) + validloader = DataLoader(valid_data, batch_size=args.batch_size) + + criterion = nn.CrossEntropyLoss().to(device) + + tasks = { + 'subsite': 15, + 'laterality': 3, + 'behavior': 3, + 'grade': 3, + } + + train_meter = darts.EpochMeter(tasks, 'train') + valid_meter = 
darts.EpochMeter(tasks, 'valid') + + model = darts.ConvNetwork(tasks=tasks, criterion=criterion, device=device).to(device) + architecture = darts.Architecture(model, args, device=device) + + optimizer = optim.SGD( + model.parameters(), + args.learning_rate, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) + + scheduler = optim.lr_scheduler.CosineAnnealingLR( + optimizer, + float(args.epochs), + eta_min=args.learning_rate_min, + ) + + genotype_store = darts.GenotypeStorage(root=args.save_path) + + min_loss = 9999 + for epoch in range(args.epochs): + + scheduler.step() + lr = scheduler.get_lr()[0] + print(f'\nEpoch: {epoch} lr: {lr}') + + genotype = model.genotype() + print(f'Genotype: {genotype}') + + # training + train( + trainloader, + validloader, + model, + architecture, + criterion, + optimizer, + lr, + args, + tasks, + device, + train_meter + ) + + # validation + valid_loss = infer( + validloader, + model, + criterion, + args, + tasks, + device, + valid_meter + ) + + if valid_loss < min_loss: + genotype_store.save_genotype(genotype) + min_loss = valid_loss + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__=='__main__': + main() diff --git a/Pilot3/P3B5/p3b5_darts.py b/Pilot3/P3B5/p3b5_darts.py new file mode 100644 index 00000000..c5d77fb4 --- /dev/null +++ b/Pilot3/P3B5/p3b5_darts.py @@ -0,0 +1,104 @@ +import os +import sys +import argparse +import candle +import p3b5 as bmk + +import torch +import torch.nn as nn +from torch import optim +import torch.nn.functional as F +from torch.utils.data import DataLoader + + +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path = os.path.abspath(os.path.join(file_path, '..')) +sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path2) + + +import darts + + +def train(trainloader, validloader, model, architecture, criterion, optimizer, lr, args, tasks, device, meter): + + valid_iter = 
iter(trainloader) + + for step, (data, target) in enumerate(trainloader): + + batch_size = data.size(0) + model.train() + + data = data.to(device) + + for task, label in target.items(): + target[task] = target[task].to(device) + + x_search, target_search = next(valid_iter) + x_search = x_search.to(device) + + for task, label in target_search.items(): + target_search[task] = target_search[task].to(device) + + # 1. update alpha + architecture.step( + data, + target, + x_search, + target_search, + lr, + optimizer, + unrolled=args.unrolled + ) + + logits = model(data) + loss = darts.multitask_loss(target, logits, criterion, reduce='mean') + + # 2. update weight + optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) + optimizer.step() + + prec1 = darts.multitask_accuracy_topk(logits, target) + meter.update_batch_loss(loss.item(), batch_size) + meter.update_batch_accuracy(prec1, batch_size) + + if step % args.log_interval == 0: + print(f'Step: {step} loss: {meter.loss_meter.avg:.4}') + + meter.update_epoch() + meter.save(args.save_path) + + +def infer(validloader, model, criterion, args, tasks, device, meter): + model.eval() + + with torch.no_grad(): + for step, (data, target) in enumerate(validloader): + + data = data.to(device) + for task, label in target.items(): + target[task] = target[task].to(device) + + batch_size = data.size(0) + + logits = model(data) + loss = darts.multitask_loss(target, logits, criterion, reduce='mean') + + prec1 = darts.multitask_accuracy_topk(logits, target) + meter.update_batch_loss(loss.item(), batch_size) + meter.update_batch_accuracy(prec1, batch_size) + + if step % args.log_interval == 0: + print(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') + + meter.update_epoch() + meter.save(args.save_path) + + return meter.loss_meter.avg + + +if __name__=='__main__': + main() diff --git a/Pilot3/P3B5/p3b5_default_model.txt b/Pilot3/P3B5/p3b5_default_model.txt new file mode 100644 index 
00000000..f2813163 --- /dev/null +++ b/Pilot3/P3B5/p3b5_default_model.txt @@ -0,0 +1,15 @@ +[Global_Params] +model_name = 'p3b5' +unrolled = True +data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot3/' +save_path = '.' +log_interval = 10 +train_data = 'P3B3_data.tar.gz' +learning_rate = 0.01 +learning_rate_min = 0.001 +momentum = 0.9 +weight_decay = 3e-4 +grad_clip = 5 +batch_size = 100 +epochs = 10 +rng_seed = 13 diff --git a/Pilot3/P3B5/test.py b/Pilot3/P3B5/test.py new file mode 100644 index 00000000..49934f7b --- /dev/null +++ b/Pilot3/P3B5/test.py @@ -0,0 +1,16 @@ +import os +import sys + + +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path = os.path.abspath(os.path.join(file_path, '..')) +sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common',)) +sys.path.append(lib_path2) + + +import darts + + +print(darts.Architecture) +print(darts.ConvNetwork) diff --git a/README.md b/README.md index 0009e403..4d49afd1 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ ECP-CANDLE Benchmarks - This repository contains the CANDLE benchmark codes. These codes implement deep learning architectures that are relevant to problems in cancer. These architectures address problems at different biological scales, specifically problems at the molecular, cellular and population scales. The naming conventions adopted reflect the different biological scales. diff --git a/README.setup.linux b/README.setup.linux index b17ceb11..daaf82a5 100644 --- a/README.setup.linux +++ b/README.setup.linux @@ -5,10 +5,10 @@ curl -o Anaconda3-2020.02-Linux-x86_64.sh https://repo.anaconda.com/archive/Anaconda3-2020.02-Linux-x86_64.sh # Make the installer executable -chmod u+x ./Anaconda3-5.1.0-Linux-x86_64.sh +chmod u+x ./Anaconda3-2020.02-Linux-x86_64.sh # Run the installer, accepting the defaults. 
-./Anaconda3-5.1.0-Linux-x86_64.sh +./Anaconda3-2020.02-Linux-x86_64.sh # Add anaconda2/bin to your path (assumes default install location) export PATH=$HOME/anaconda3/bin:$PATH @@ -21,6 +21,15 @@ conda install -y -c anaconda pandas conda install -y -c anaconda scikit-learn conda install -y -c anaconda matplotlib conda install -y -c conda-forge pygpu +conda install -y -c anaconda pytorch +conda install numba +conda install astropy +conda install patsy +conda install statsmodels +conda install requests +conda install torch +conda install pytorch +conda install -c conda-forge tqdm conda update -c conda-forge numpy # Download the source files for the benchmarks diff --git a/common/P1_utils.py b/common/P1_utils.py new file mode 100644 index 00000000..211489f6 --- /dev/null +++ b/common/P1_utils.py @@ -0,0 +1,567 @@ +import sys +import numpy as np +import numpy.linalg as la +import pandas as pd +import patsy +from sklearn.feature_selection import mutual_info_regression +from sklearn.preprocessing import StandardScaler +import statsmodels.api as sm + +from feature_selection_utils import select_features_by_variation + +################### Auxiliary functions of COXEN start here #################### + + + +def calculate_concordance_correlation_coefficient(u, v): + ''' + This function calculates the concordance correlation coefficient between two input 1-D numpy arrays. + + Parameters: + ----------- + u: 1-D numpy array of a variable + v: 1-D numpy array of a variable + + Returns: + -------- + ccc: a numeric value of concordance correlation coefficient between the two input variables. 
+ ''' + a = 2 * np.mean((u - np.mean(u)) * (v - np.mean(v))) + b = np.mean(np.square(u - np.mean(u))) + np.mean(np.square(v - np.mean(v))) + np.square(np.mean(u) - np.mean(v)) + ccc = a/b + return ccc + + + +def generalization_feature_selection(data1, data2, measure, cutoff): + ''' + This function uses the Pearson correlation coefficient to select the features that are generalizable + between data1 and data2. + + Parameters: + ----------- + data1: 2D numpy array of the first dataset with a size of (n_samples_1, n_features) + data2: 2D numpy array of the second dataset with a size of (n_samples_2, n_features) + measure: string. 'pearson' indicates the Pearson correlation coefficient; + 'ccc' indicates the concordance correlation coefficient. Default is 'pearson'. + cutoff: a positive number for selecting generalizable features. If cutoff < 1, this function selects + the features with a correlation coefficient >= cutoff. If cutoff >= 1, it must be an + integer indicating the number of features to be selected based on correlation coefficient. + + Returns: + -------- + fid: 1-D numpy array containing the indices of selected features. 
+ ''' + cor1 = np.corrcoef(np.transpose(data1)) + cor2 = np.corrcoef(np.transpose(data2)) + num = data1.shape[1] + cor = [] + if measure == 'pearson': + for i in range(num): + cor.append(np.corrcoef(np.vstack((list(cor1[:i, i]) + list(cor1[(i + 1):, i]), + list(cor2[:i, i]) + list(cor2[(i + 1):, i]))))[0, 1]) + elif measure == 'ccc': + for i in range(num): + cor.append(calculate_concordance_correlation_coefficient(np.array(list(cor1[:i, i]) + list(cor1[(i + 1):, i])), + np.array(list(cor2[:i, i]) + list(cor2[(i + 1):, i])))) + cor = np.array(cor) + fid = np.argsort(-cor)[:int(cutoff)] + return fid + + + +################### Auxiliary functions of COXEN end here #################### + +def coxen_single_drug_gene_selection(source_data, target_data, drug_response_data, drug_response_col, tumor_col, + prediction_power_measure='pearson', num_predictive_gene=100, generalization_power_measure='ccc', + num_generalizable_gene=50, multi_drug_mode=False): + ''' + This function selects genes for drug response prediction using the COXEN approach. The COXEN approach is + designed for selecting genes to predict the response of tumor cells to a specific drug. This function + assumes no missing data exist. + + Parameters: + ----------- + source_data: pandas data frame of gene expressions of tumors, for which drug response is known. Its size is + [n_source_samples, n_features]. + target_data: pandas data frame of gene expressions of tumors, for which drug response needs to be predicted. + Its size is [n_target_samples, n_features]. source_data and target_data have the same set + of features and the orders of features must match. + drug_response_data: pandas data frame of drug response values for a drug. It must include a column of drug + response values and a column of tumor IDs. + drug_response_col: non-negative integer or string. If integer, it is the column index of drug response in + drug_response_data. If string, it is the column name of drug response. 
+ tumor_col: non-negative integer or string. If integer, it is the column index of tumor IDs in drug_response_data. + If string, it is the column name of tumor IDs. + prediction_power_measure: string. 'pearson' uses the absolute value of Pearson correlation coefficient to + measure prediction power of a gene; 'mutual_info' uses the mutual information to measure prediction power + of a gene. Default is 'pearson'. + num_predictive_gene: positive integer indicating the number of predictive genes to be selected. + generalization_power_measure: string. 'pearson' indicates the Pearson correlation coefficient; + 'ccc' indicates the concordance correlation coefficient. Default is 'ccc'. + num_generalizable_gene: positive integer indicating the number of generalizable genes to be selected. + multi_drug_mode: boolean, indicating whether the function runs as an auxiliary function of COXEN + gene selection for multiple drugs. Default is False. + + Returns: + -------- + indices: 1-D numpy array containing the indices of selected genes, if multi_drug_mode is False; + 1-D numpy array of the indices of all genes sorted by their prediction power, if multi_drug_mode is True. 
+ ''' + + if isinstance(drug_response_col, str): + drug_response_col = np.where(drug_response_data.columns == drug_response_col)[0][0] + + if isinstance(tumor_col, str): + tumor_col = np.where(drug_response_data.columns == tumor_col)[0][0] + + drug_response_data = drug_response_data.copy() + drug_response_data = drug_response_data.iloc[np.where(np.isin(drug_response_data.iloc[:, tumor_col], + source_data.index))[0], :] + + source_data = source_data.copy() + source_data = source_data.iloc[np.where(np.isin(source_data.index, drug_response_data.iloc[:, tumor_col]))[0], :] + + source_std_id = select_features_by_variation(source_data, variation_measure='std', threshold=0.00000001) + target_std_id = select_features_by_variation(target_data, variation_measure='std', threshold=0.00000001) + std_id = np.sort(np.intersect1d(source_std_id, target_std_id)) + source_data = source_data.iloc[:, std_id] + target_data = target_data.copy() + target_data = target_data.iloc[:, std_id] + + # Perform the first step of COXEN approach to select predictive genes. To avoid exceeding the memory limit, + # the prediction power of genes is calculated in batches. 
+ batchSize = 1000 + numBatch = int(np.ceil(source_data.shape[1]/batchSize)) + prediction_power = np.empty((source_data.shape[1], 1)) + prediction_power.fill(np.nan) + for i in range(numBatch): + startIndex = i*batchSize + endIndex = min((i+1)*batchSize, source_data.shape[1]) + + if prediction_power_measure == 'pearson': + cor_i = np.corrcoef(np.vstack((np.transpose(source_data.iloc[:, startIndex:endIndex].loc[drug_response_data.iloc[:, tumor_col], + :].values), np.reshape(drug_response_data.iloc[:, drug_response_col].values, (1, drug_response_data.shape[0]))))) + prediction_power[startIndex:endIndex, 0] = abs(cor_i[:-1, -1]) + + if prediction_power_measure == 'mutual_info': + mi = mutual_info_regression(X=source_data.iloc[:, startIndex:endIndex].loc[drug_response_data.iloc[:, tumor_col], :].values, + y=drug_response_data.iloc[:, drug_response_col].values) + prediction_power[startIndex:endIndex, 0] = mi + + if multi_drug_mode: + indices = np.argsort(-prediction_power[:, 0]) + return std_id[indices] + + num_predictive_gene = int(min(num_predictive_gene, source_data.shape[1])) + gid1 = np.argsort(-prediction_power[:, 0])[:num_predictive_gene] + + # keep only predictive genes for source and target data + source_data = source_data.iloc[:, gid1] + target_data = target_data.iloc[:, gid1] + num_generalizable_gene = int(min(num_generalizable_gene, len(gid1))) + # perform the second step of COXEN approach to select generalizable genes among the predictive genes + gid2 = generalization_feature_selection(source_data.values, target_data.values, generalization_power_measure, + num_generalizable_gene) + + indices = std_id[gid1[gid2]] + + return np.sort(indices) + +def coxen_multi_drug_gene_selection(source_data, target_data, drug_response_data, drug_response_col, tumor_col, drug_col, + prediction_power_measure='lm', num_predictive_gene=100, generalization_power_measure='ccc', + num_generalizable_gene=50, union_of_single_drug_selection=False): + ''' + This function uses the COXEN 
approach to select genes for predicting the response of multiple drugs. + It assumes no missing data exist. It works in three modes. + (1) If union_of_single_drug_selection is True, prediction_power_measure must be either 'pearson' or 'mutual_info'. + This function runs coxen_single_drug_gene_selection for every drug with the given parameter settings and takes the + union of the selected genes of every drug as the output. The size of the selected gene set may be larger than + num_generalizable_gene. + (2) If union_of_single_drug_selection is False and prediction_power_measure is 'lm', this function uses a + linear model to fit the response of multiple drugs using the expression of a gene, while the drugs are + one-hot encoded. The p-value associated with the coefficient of gene expression is used as the prediction + power measure, according to which num_predictive_gene genes will be selected. Then, among the predictive + genes, num_generalizable_gene generalizable genes will be selected. + (3) If union_of_single_drug_selection is False and prediction_power_measure is 'pearson' or 'mutual_info', + for each drug this function ranks the genes according to their power of predicting the + response of the drug. The union of an equal number of predictive genes for every drug will be generated, + and its size must be at least num_predictive_gene. Then, num_generalizable_gene generalizable genes + will be selected. + + Parameters: + ----------- + source_data: pandas data frame of gene expressions of tumors, for which drug response is known. Its size is + [n_source_samples, n_features]. + target_data: pandas data frame of gene expressions of tumors, for which drug response needs to be predicted. + Its size is [n_target_samples, n_features]. source_data and target_data have the same set + of features and the orders of features must match. 
+ drug_response_data: pandas data frame of drug response that must include a column of drug response values, + a column of tumor IDs, and a column of drug IDs. + drug_response_col: non-negative integer or string. If integer, it is the column index of drug response in + drug_response_data. If string, it is the column name of drug response. + tumor_col: non-negative integer or string. If integer, it is the column index of tumor IDs in drug_response_data. + If string, it is the column name of tumor IDs. + drug_col: non-negative integer or string. If integer, it is the column index of drugs in drug_response_data. + If string, it is the column name of drugs. + prediction_power_measure: string. 'pearson' uses the absolute value of Pearson correlation coefficient to + measure prediction power of a gene; 'mutual_info' uses the mutual information to measure prediction power + of a gene; 'lm' uses the linear regression model to select predictive genes for multiple drugs. Default is 'lm'. + num_predictive_gene: positive integer indicating the number of predictive genes to be selected. + generalization_power_measure: string. 'pearson' indicates the Pearson correlation coefficient; + 'ccc' indicates the concordance correlation coefficient. Default is 'ccc'. + num_generalizable_gene: positive integer indicating the number of generalizable genes to be selected. + union_of_single_drug_selection: boolean, indicating whether the final gene set should be the union of genes + selected for every drug. + + Returns: + -------- + indices: 1-D numpy array containing the indices of selected genes. 
+ ''' + + if isinstance(drug_response_col, str): + drug_response_col = np.where(drug_response_data.columns == drug_response_col)[0][0] + + if isinstance(tumor_col, str): + tumor_col = np.where(drug_response_data.columns == tumor_col)[0][0] + + if isinstance(drug_col, str): + drug_col = np.where(drug_response_data.columns == drug_col)[0][0] + + drug_response_data = drug_response_data.copy() + drug_response_data = drug_response_data.iloc[np.where(np.isin(drug_response_data.iloc[:, tumor_col], + source_data.index))[0], :] + drugs = np.unique(drug_response_data.iloc[:, drug_col]) + + source_data = source_data.copy() + source_data = source_data.iloc[np.where(np.isin(source_data.index, drug_response_data.iloc[:, tumor_col]))[0], :] + + source_std_id = select_features_by_variation(source_data, variation_measure='std', threshold=0.00000001) + target_std_id = select_features_by_variation(target_data, variation_measure='std', threshold=0.00000001) + std_id = np.sort(np.intersect1d(source_std_id, target_std_id)) + source_data = source_data.iloc[:, std_id] + target_data = target_data.copy() + target_data = target_data.iloc[:, std_id] + + num_predictive_gene = int(min(num_predictive_gene, source_data.shape[1])) + + if union_of_single_drug_selection: + if prediction_power_measure != 'pearson' and prediction_power_measure != 'mutual_info': + print('pearson or mutual_info must be used as prediction_power_measure for taking the union of selected genes of every drugs') + sys.exit(1) + gid1 = np.array([]).astype(np.int64) + for d in drugs: + idd = np.where(drug_response_data.iloc[:, drug_col] == d)[0] + response_d = drug_response_data.iloc[idd, :] + gid2 = coxen_single_drug_gene_selection(source_data, target_data, response_d, drug_response_col, tumor_col, + prediction_power_measure, num_predictive_gene, generalization_power_measure, num_generalizable_gene) + gid1 = np.union1d(gid1, gid2) + return np.sort(std_id[gid1]) + + if prediction_power_measure == 'lm': + pvalue = 
np.empty((source_data.shape[1], 1)) + pvalue.fill(np.nan) + drug_m = np.identity(len(drugs)) + drug_m = pd.DataFrame(drug_m, index=drugs) + drug_sample = drug_m.loc[drug_response_data.iloc[:, drug_col], :].values + for i in range(source_data.shape[1]): + ge_sample = source_data.iloc[:, i].loc[drug_response_data.iloc[:, tumor_col]].values + sample = np.hstack((np.reshape(ge_sample, (len(ge_sample), 1)), drug_sample)) + sample = sm.add_constant(sample) + mod = sm.OLS(drug_response_data.iloc[:, drug_response_col].values, sample) + try: + res = mod.fit() + pvalue[i, 0] = res.pvalues[1] + except: + pvalue[i, 0] = 1 + + gid1 = np.argsort(pvalue[:, 0])[:num_predictive_gene] + + elif prediction_power_measure == 'pearson' or prediction_power_measure == 'mutual_info': + gene_rank = np.empty((len(drugs), source_data.shape[1])) + gene_rank.fill(np.nan) + gene_rank = pd.DataFrame(gene_rank, index=drugs) + for d in range(len(drugs)): + idd = np.where(drug_response_data.iloc[:, drug_col] == drugs[d])[0] + response_d = drug_response_data.iloc[idd, :] + temp_rank = coxen_single_drug_gene_selection(source_data, target_data, response_d, + drug_response_col, tumor_col, prediction_power_measure, num_predictive_gene=None, + generalization_power_measure=None, num_generalizable_gene=None, multi_drug_mode=True) + gene_rank.iloc[d, :len(temp_rank)] = temp_rank + for i in range(int(np.ceil(num_predictive_gene/len(drugs))), source_data.shape[1]+1): + gid1 = np.unique(np.reshape(gene_rank.iloc[:, :i].values, (1, gene_rank.shape[0]*i))[0, :]) + gid1 = gid1[np.where(np.invert(np.isnan(gid1)))[0]] + if len(gid1) >= num_predictive_gene: + break + gid1 = gid1.astype(np.int64) + + # keep only predictive genes for source and target data + source_data = source_data.iloc[:, gid1] + target_data = target_data.iloc[:, gid1] + num_generalizable_gene = int(min(num_generalizable_gene, len(gid1))) + + # perform the second step of COXEN approach to select generalizable genes among the predictive genes + gid2 = 
generalization_feature_selection(source_data.values, target_data.values, generalization_power_measure, + num_generalizable_gene) + + indices = std_id[gid1[gid2]] + + return np.sort(indices) + +def generate_gene_set_data(data, genes, gene_name_type='entrez', gene_set_category='c6.all', metric='mean', + standardize=False, data_dir='../../Data/examples/Gene_Sets/MSigDB.v7.0/'): + ''' + This function generates genomic data summarized at the gene set level. + + Parameters: + ----------- + data: numpy array or pandas data frame of numeric values, with a shape of [n_samples, n_features]. + genes: 1-D array or list of gene names with a length of n_features. It indicates which gene a genomic + feature belongs to. + gene_name_type: string, indicating the type of gene name used in genes. 'entrez' indicates Entrez gene ID and + 'symbols' indicates HGNC gene symbol. Default is 'entrez'. + gene_set_category: string, indicating the gene sets for which data will be calculated. 'c2.cgp' indicates gene sets + affected by chemical and genetic perturbations; 'c2.cp.biocarta' indicates BioCarta gene sets; 'c2.cp.kegg' + indicates KEGG gene sets; 'c2.cp.pid' indicates PID gene sets; 'c2.cp.reactome' indicates Reactome gene sets; + 'c5.bp' indicates GO biological processes; 'c5.cc' indicates GO cellular components; 'c5.mf' indicates + GO molecular functions; 'c6.all' indicates oncogenic signatures. Default is 'c6.all'. + metric: string, indicating the way to calculate gene-set-level data. 'mean' calculates the mean of gene + features belonging to the same gene set. 'sum' calculates the summation of gene features belonging + to the same gene set. 'max' calculates the maximum of gene features. 'min' calculates the minimum + of gene features. 'abs_mean' calculates the mean of absolute values. 'abs_maximum' calculates + the maximum of absolute values. Default is 'mean'. + standardize: boolean, indicating whether to standardize features before calculation. 
Standardization transforms + each feature to have a zero mean and a unit standard deviation. + + Returns: + -------- + gene_set_data: a data frame of calculated gene-set-level data. Column names are the gene set names. + ''' + + sample_name = None + if isinstance(data, pd.DataFrame): + sample_name = data.index + data = data.values + elif not isinstance(data, np.ndarray): + print('Input data must be a numpy array or pandas data frame') + sys.exit(1) + + if standardize: + scaler = StandardScaler() + data = scaler.fit_transform(data) + + genes = [str(i) for i in genes] + + if gene_name_type == 'entrez': + gene_set_category = gene_set_category + '.v7.0.entrez.gmt' + if gene_name_type == 'symbols': + gene_set_category = gene_set_category + '.v7.0.symbols.gmt' + f = open(data_dir + gene_set_category, 'r') + x = f.readlines() + gene_sets = {} + for i in range(len(x)): + temp = x[i].split('\n')[0].split('\t') + gene_sets[temp[0]] = temp[2:] + + gene_set_data = np.empty((data.shape[0], len(gene_sets))) + gene_set_data.fill(np.nan) + gene_set_names = np.array(list(gene_sets.keys())) + for i in range(len(gene_set_names)): + idi = np.where(np.isin(genes, gene_sets[gene_set_names[i]]))[0] + if len(idi) > 0: + if metric == 'sum': + gene_set_data[:, i] = np.nansum(data[:, idi], axis=1) + elif metric == 'max': + gene_set_data[:, i] = np.nanmax(data[:, idi], axis=1) + elif metric == 'min': + gene_set_data[:, i] = np.nanmin(data[:, idi], axis=1) + elif metric == 'abs_mean': + gene_set_data[:, i] = np.nanmean(np.absolute(data[:, idi]), axis=1) + elif metric == 'abs_maximum': + gene_set_data[:, i] = np.nanmax(np.absolute(data[:, idi]), axis=1) + else: #'mean' + gene_set_data[:, i] = np.nanmean(data[:, idi], axis=1) + + if sample_name is None: + gene_set_data = pd.DataFrame(gene_set_data, columns=gene_set_names) + else: + gene_set_data = pd.DataFrame(gene_set_data, columns=gene_set_names, index=sample_name) + keep_id = np.where(np.sum(np.invert(pd.isna(gene_set_data)), axis=0) > 0)[0] 
+ gene_set_data = gene_set_data.iloc[:, keep_id] + + return gene_set_data + +################### Auxiliary functions of ComBat start here #################### + + + +def design_mat(mod, numerical_covariates, batch_levels): + # require levels to make sure they are in the same order as we use in the + # rest of the script. + design = patsy.dmatrix("~ 0 + C(batch, levels=%s)" % str(batch_levels), + mod, return_type="dataframe") + + mod = mod.drop(["batch"], axis=1) + numerical_covariates = list(numerical_covariates) + sys.stderr.write("found %i batches\n" % design.shape[1]) + other_cols = [c for i, c in enumerate(mod.columns) + if not i in numerical_covariates] + factor_matrix = mod[other_cols] + design = pd.concat((design, factor_matrix), axis=1) + if numerical_covariates is not None: + sys.stderr.write("found %i numerical covariates...\n" + % len(numerical_covariates)) + for i, nC in enumerate(numerical_covariates): + cname = mod.columns[nC] + sys.stderr.write("\t{0}\n".format(cname)) + design[cname] = mod[mod.columns[nC]] + sys.stderr.write("found %i categorical variables:" % len(other_cols)) + sys.stderr.write("\t" + ", ".join(other_cols) + '\n') + return design + + +def it_sol(sdat, g_hat, d_hat, g_bar, t2, a, b, conv=0.0001): + n = (1 - np.isnan(sdat)).sum(axis=1) + g_old = g_hat.copy() + d_old = d_hat.copy() + + change = 1 + count = 0 + while change > conv: + # print g_hat.shape, g_bar.shape, t2.shape + g_new = postmean(g_hat, g_bar, n, d_old, t2) + sum2 = ((sdat - np.dot(g_new.values.reshape((g_new.shape[0], 1)), np.ones((1, sdat.shape[1])))) ** 2).sum( + axis=1) + d_new = postvar(sum2, n, a, b) + + change = max((abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()) + g_old = g_new # .copy() + d_old = d_new # .copy() + count = count + 1 + adjust = (g_new, d_new) + return adjust + + +def aprior(gamma_hat): + m = gamma_hat.mean() + s2 = gamma_hat.var() + return (2 * s2 + m ** 2) / s2 + + +def bprior(gamma_hat): + m = gamma_hat.mean() + s2 = 
gamma_hat.var() + return (m * s2 + m ** 3) / s2 + + +def postmean(g_hat, g_bar, n, d_star, t2): + return (t2 * n * g_hat + d_star * g_bar) / (t2 * n + d_star) + + +def postvar(sum2, n, a, b): + return (0.5 * sum2 + b) / (n / 2.0 + a - 1.0) + + + +################### Auxiliary functions of ComBat end here #################### + +def combat_batch_effect_removal(data, batch_labels, model=None, numerical_covariates=None): + ''' + This function corrects for batch effect in data. + + Parameters: + ----------- + data: pandas data frame of numeric values, with a size of (n_features, n_samples) + batch_labels: pandas series, with a length of n_samples. It should provide the batch labels of samples. + Its indices are the same as the column names (sample names) in "data". + model: an object of patsy.design_info.DesignMatrix. It is a design matrix describing the covariate + information on the samples that could cause batch effects. If not provided, this function + will attempt to coarsely correct just based on the information provided in "batch". + numerical_covariates: a list of the names of covariates in "model" that are numerical rather than + categorical. + + Returns: + -------- + corrected : pandas data frame of numeric values, with a size of (n_features, n_samples). It is + the data with batch effects corrected. 
+ ''' + + if isinstance(numerical_covariates, str): + numerical_covariates = [numerical_covariates] + if numerical_covariates is None: + numerical_covariates = [] + + if model is not None and isinstance(model, pd.DataFrame): + model["batch"] = list(batch_labels) + else: + model = pd.DataFrame({'batch': batch_labels}) + + batch_items = model.groupby("batch").groups.items() + batch_levels = [k for k, v in batch_items] + batch_info = [v for k, v in batch_items] + n_batch = len(batch_info) + n_batches = np.array([len(v) for v in batch_info]) + n_array = float(sum(n_batches)) + + # drop intercept + drop_cols = [cname for cname, inter in ((model == 1).all()).iteritems() if inter == True] + drop_idxs = [list(model.columns).index(cdrop) for cdrop in drop_cols] + model = model[[c for c in model.columns if not c in drop_cols]] + numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c + for c in numerical_covariates if not c in drop_cols] + + design = design_mat(model, numerical_covariates, batch_levels) + + sys.stderr.write("Standardizing Data across genes.\n") + B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), data.T) + grand_mean = np.dot((n_batches / n_array).T, B_hat[:n_batch, :]) + var_pooled = np.dot(((data - np.dot(design, B_hat).T) ** 2), np.ones((int(n_array), 1)) / int(n_array)) + + stand_mean = np.dot(grand_mean.T.reshape((len(grand_mean), 1)), np.ones((1, int(n_array)))) + tmp = np.array(design.copy()) + tmp[:, :n_batch] = 0 + stand_mean += np.dot(tmp, B_hat).T + + s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, int(n_array))))) + + sys.stderr.write("Fitting L/S model and finding priors\n") + batch_design = design[design.columns[:n_batch]] + gamma_hat = np.dot(np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T), s_data.T) + + delta_hat = [] + + for i, batch_idxs in enumerate(batch_info): + delta_hat.append(s_data[batch_idxs].var(axis=1)) + + gamma_bar = gamma_hat.mean(axis=1) + 
t2 = gamma_hat.var(axis=1) + + a_prior = list(map(aprior, delta_hat)) + b_prior = list(map(bprior, delta_hat)) + + sys.stderr.write("Finding parametric adjustments\n") + gamma_star, delta_star = [], [] + for i, batch_idxs in enumerate(batch_info): + temp = it_sol(s_data[batch_idxs], gamma_hat[i], + delta_hat[i], gamma_bar[i], t2[i], a_prior[i], b_prior[i]) + + gamma_star.append(temp[0]) + delta_star.append(temp[1]) + + sys.stdout.write("Adjusting data\n") + bayesdata = s_data + gamma_star = np.array(gamma_star) + delta_star = np.array(delta_star) + + for j, batch_idxs in enumerate(batch_info): + dsq = np.sqrt(delta_star[j, :]) + dsq = dsq.reshape((len(dsq), 1)) + denom = np.dot(dsq, np.ones((1, n_batches[j]))) + numer = np.array(bayesdata[batch_idxs] - np.dot(batch_design.loc[batch_idxs], gamma_star).T) + + bayesdata[batch_idxs] = numer / denom + + vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1)) + bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean + + return bayesdata diff --git a/common/candle/__init__.py b/common/candle/__init__.py index b8bf19c9..fc90942a 100644 --- a/common/candle/__init__.py +++ b/common/candle/__init__.py @@ -6,6 +6,10 @@ from data_utils import load_csv_data from data_utils import load_Xy_one_hot_data2 from data_utils import load_Xy_data_noheader +from data_utils import drop_impute_and_scale_dataframe +from data_utils import discretize_dataframe +from data_utils import discretize_array +from data_utils import lookup #import from file_utils from file_utils import get_file @@ -14,17 +18,56 @@ from default_utils import ArgumentStruct from default_utils import Benchmark from default_utils import str2bool -from default_utils import initialize_parameters +from default_utils import finalize_parameters from default_utils import fetch_file from default_utils import verify_path from default_utils import keras_default_config from default_utils import set_up_logger +from default_utils import check_flag_conflicts from 
generic_utils import Progbar # import from viz_utils from viz_utils import plot_history from viz_utils import plot_scatter +from viz_utils import plot_density_observed_vs_predicted +from viz_utils import plot_2d_density_sigma_vs_error +from viz_utils import plot_histogram_error_per_sigma +from viz_utils import plot_calibration_and_errors +from viz_utils import plot_percentile_predictions + + +# import from uq_utils +from uq_utils import compute_statistics_homoscedastic +from uq_utils import compute_statistics_homoscedastic_all +from uq_utils import compute_statistics_heteroscedastic +from uq_utils import compute_statistics_quantile +from uq_utils import split_data_for_empirical_calibration +from uq_utils import compute_empirical_calibration +from uq_utils import bining_for_calibration +from uq_utils import computation_of_valid_calibration_interval +from uq_utils import applying_calibration +from uq_utils import overprediction_check +from uq_utils import generate_index_distribution + +# import from profiling_utils +from profiling_utils import start_profiling +from profiling_utils import stop_profiling + +# import from data_preprocessing_utils +from data_preprocessing_utils import quantile_normalization +from data_preprocessing_utils import generate_cross_validation_partition + +# feature selection +from feature_selection_utils import select_features_by_missing_values +from feature_selection_utils import select_features_by_variation +from feature_selection_utils import select_decorrelated_features + +# P1-specific +from P1_utils import coxen_single_drug_gene_selection +from P1_utils import coxen_multi_drug_gene_selection +from P1_utils import generate_gene_set_data +from P1_utils import combat_batch_effect_removal # import benchmark-dependent utils import sys @@ -41,11 +84,30 @@ from keras_utils import PermanentDropout from keras_utils import register_permanent_dropout from keras_utils import LoggingCallback + from keras_utils import MultiGPUCheckpoint + from 
keras_utils import r2 + from keras_utils import mae + from keras_utils import mse + + from viz_utils import plot_metrics from solr_keras import CandleRemoteMonitor from solr_keras import compute_trainable_params from solr_keras import TerminateOnTimeOut + from uq_keras_utils import abstention_variable_initialization + from uq_keras_utils import abstention_loss + from uq_keras_utils import abs_acc + from uq_keras_utils import acc_class1 + from uq_keras_utils import abs_acc_class1 + from uq_keras_utils import modify_labels + from uq_keras_utils import add_model_output + from uq_keras_utils import AbstentionAdapt_Callback + + from clr_keras_utils import CyclicLR + from clr_keras_utils import clr_set_args + from clr_keras_utils import clr_callback + elif 'torch' in sys.modules: print ('Importing candle utils for pytorch') from pytorch_utils import set_seed diff --git a/common/candle_keras/__init__.py b/common/candle_keras/__init__.py index c5eccf06..9c685f20 100644 --- a/common/candle_keras/__init__.py +++ b/common/candle_keras/__init__.py @@ -6,6 +6,10 @@ from data_utils import load_csv_data from data_utils import load_Xy_one_hot_data2 from data_utils import load_Xy_data_noheader +from data_utils import drop_impute_and_scale_dataframe +from data_utils import discretize_dataframe +from data_utils import discretize_array +from data_utils import lookup #import from file_utils from file_utils import get_file @@ -14,12 +18,36 @@ from default_utils import ArgumentStruct from default_utils import Benchmark from default_utils import str2bool -from default_utils import initialize_parameters +from default_utils import finalize_parameters from default_utils import fetch_file from default_utils import verify_path from default_utils import keras_default_config from default_utils import set_up_logger +from generic_utils import Progbar + +# import from viz_utils +from viz_utils import plot_history +from viz_utils import plot_scatter +from viz_utils import 
plot_density_observed_vs_predicted +from viz_utils import plot_2d_density_sigma_vs_error +from viz_utils import plot_histogram_error_per_sigma +from viz_utils import plot_calibration_and_errors +from viz_utils import plot_percentile_predictions + +# import from uq_utils +from uq_utils import compute_statistics_homoscedastic +from uq_utils import compute_statistics_homoscedastic_all +from uq_utils import compute_statistics_heteroscedastic +from uq_utils import compute_statistics_quantile +from uq_utils import split_data_for_empirical_calibration +from uq_utils import compute_empirical_calibration +from uq_utils import bining_for_calibration +from uq_utils import computation_of_valid_calibration_interval +from uq_utils import applying_calibration +from uq_utils import overprediction_check + + #import from keras_utils #from keras_utils import dense #from keras_utils import add_dense @@ -30,10 +58,21 @@ from keras_utils import PermanentDropout from keras_utils import register_permanent_dropout from keras_utils import LoggingCallback +from keras_utils import r2 +from keras_utils import mae +from keras_utils import mse -from generic_utils import Progbar from solr_keras import CandleRemoteMonitor from solr_keras import compute_trainable_params from solr_keras import TerminateOnTimeOut +#import from uq_keras_utils +from uq_keras_utils import abstention_variable_initialization +from uq_keras_utils import abstention_loss +from uq_keras_utils import abs_acc +from uq_keras_utils import acc_class1 +from uq_keras_utils import abs_acc_class1 +from uq_keras_utils import modify_labels +from uq_keras_utils import add_model_output +from uq_keras_utils import AbstentionAdapt_Callback diff --git a/common/clr_keras_utils.py b/common/clr_keras_utils.py new file mode 100644 index 00000000..9d542505 --- /dev/null +++ b/common/clr_keras_utils.py @@ -0,0 +1,200 @@ +from keras.callbacks import Callback +from keras import backend as K +import numpy as np + +def clr_check_args(args): + req_keys 
= ['clr_mode','clr_base_lr','clr_max_lr','clr_gamma'] + keys_present = True + for key in req_keys: + if key not in args.keys(): + keys_present = False + return keys_present + +def clr_set_args(args): + req_keys = ['clr_mode','clr_base_lr','clr_max_lr','clr_gamma'] + exclusive_keys = ['warmup_lr', 'reduce_lr'] + keys_present = True + for key in req_keys: + if key not in args.keys(): + keys_present = False + if keys_present and args['clr_mode'] is not None: + clr_keras_kwargs = {'mode': args['clr_mode'], 'base_lr': args['clr_base_lr'], + 'max_lr': args['clr_max_lr'], 'gamma': args['clr_gamma']} + for ex_key in exclusive_keys: + if ex_key in args.keys(): + if args[ex_key] == True: + print("Key ", ex_key, " conflicts, setting to False") + args[ex_key] = False + else: + print("Incomplete CLR specification: will run without") + clr_keras_kwargs = {'mode': None, 'base_lr': 0.1, + 'max_lr': 0.1, 'gamma': 0.1} + return clr_keras_kwargs + +def clr_callback(mode=None, base_lr=1e-4, max_lr=1e-3, gamma=0.999994): + """ Creates keras callback for cyclical learning rate. """ + # keras_contrib = './keras_contrib/callbacks' + # sys.path.append(keras_contrib) + + if mode == 'trng1': + clr = CyclicLR(base_lr=base_lr, max_lr=max_lr, mode='triangular') + elif mode == 'trng2': + clr = CyclicLR(base_lr=base_lr, max_lr=max_lr, mode='triangular2') + elif mode == 'exp': + clr = CyclicLR(base_lr=base_lr, max_lr=max_lr, mode='exp_range', gamma=gamma) # 0.99994; 0.99999994; 0.999994 + return clr + +class CyclicLR(Callback): + """This callback implements a cyclical learning rate policy (CLR). + The method cycles the learning rate between two boundaries with + some constant frequency. + # Arguments + base_lr: initial learning rate which is the + lower boundary in the cycle. + max_lr: upper boundary in the cycle. Functionally, + it defines the cycle amplitude (max_lr - base_lr). 
+ The lr at any cycle is the sum of base_lr + and some scaling of the amplitude; therefore + max_lr may not actually be reached depending on + scaling function. + step_size: number of training iterations per + half cycle. Authors suggest setting step_size + 2-8 x training iterations in epoch. + mode: one of {triangular, triangular2, exp_range}. + Default 'triangular'. + Values correspond to policies detailed below. + If scale_fn is not None, this argument is ignored. + gamma: constant in 'exp_range' scaling function: + gamma**(cycle iterations) + scale_fn: Custom scaling policy defined by a single + argument lambda function, where + 0 <= scale_fn(x) <= 1 for all x >= 0. + mode parameter is ignored + scale_mode: {'cycle', 'iterations'}. + Defines whether scale_fn is evaluated on + cycle number or cycle iterations (training + iterations since start of cycle). Default is 'cycle'. + + The amplitude of the cycle can be scaled on a per-iteration or + per-cycle basis. + This class has three built-in policies, as put forth in the paper. + "triangular": + A basic triangular cycle w/ no amplitude scaling. + "triangular2": + A basic triangular cycle that scales initial amplitude by half each cycle. + "exp_range": + A cycle that scales initial amplitude by gamma**(cycle iterations) at each + cycle iteration. + For more detail, please see paper. 
+ + # Example for CIFAR-10 w/ batch size 100: + ```python + clr = CyclicLR(base_lr=0.001, max_lr=0.006, + step_size=2000., mode='triangular') + model.fit(X_train, Y_train, callbacks=[clr]) + ``` + + Class also supports custom scaling functions: + ```python + clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.)) + clr = CyclicLR(base_lr=0.001, max_lr=0.006, + step_size=2000., scale_fn=clr_fn, + scale_mode='cycle') + model.fit(X_train, Y_train, callbacks=[clr]) + ``` + + # References + + - [Cyclical Learning Rates for Training Neural Networks]( + https://arxiv.org/abs/1506.01186) + """ + + def __init__( + self, + base_lr=0.001, + max_lr=0.006, + step_size=2000., + mode='triangular', + gamma=1., + scale_fn=None, + scale_mode='cycle'): + super(CyclicLR, self).__init__() + + if mode not in ['triangular', 'triangular2', + 'exp_range']: + raise KeyError("mode must be one of 'triangular', " + "'triangular2', or 'exp_range'") + self.base_lr = base_lr + self.max_lr = max_lr + self.step_size = step_size + self.mode = mode + self.gamma = gamma + if scale_fn is None: + if self.mode == 'triangular': + self.scale_fn = lambda x: 1. + self.scale_mode = 'cycle' + elif self.mode == 'triangular2': + self.scale_fn = lambda x: 1 / (2.**(x - 1)) + self.scale_mode = 'cycle' + elif self.mode == 'exp_range': + self.scale_fn = lambda x: gamma ** x + self.scale_mode = 'iterations' + else: + self.scale_fn = scale_fn + self.scale_mode = scale_mode + self.clr_iterations = 0. + self.trn_iterations = 0. + self.history = {} + + self._reset() + + def _reset(self, new_base_lr=None, new_max_lr=None, + new_step_size=None): + """Resets cycle iterations. + Optional boundary/step size adjustment. + """ + if new_base_lr is not None: + self.base_lr = new_base_lr + if new_max_lr is not None: + self.max_lr = new_max_lr + if new_step_size is not None: + self.step_size = new_step_size + self.clr_iterations = 0. 
+ + def clr(self): + cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size)) + x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1) + if self.scale_mode == 'cycle': + return self.base_lr + (self.max_lr - self.base_lr) * \ + np.maximum(0, (1 - x)) * self.scale_fn(cycle) + else: + return self.base_lr + (self.max_lr - self.base_lr) * \ + np.maximum(0, (1 - x)) * self.scale_fn(self.clr_iterations) + + def on_train_begin(self, logs={}): + logs = logs or {} + + if self.clr_iterations == 0: + K.set_value(self.model.optimizer.lr, self.base_lr) + else: + K.set_value(self.model.optimizer.lr, self.clr()) + + def on_batch_end(self, epoch, logs=None): + + logs = logs or {} + self.trn_iterations += 1 + self.clr_iterations += 1 + K.set_value(self.model.optimizer.lr, self.clr()) + + self.history.setdefault( + 'lr', []).append( + K.get_value( + self.model.optimizer.lr)) + self.history.setdefault('iterations', []).append(self.trn_iterations) + + for k, v in logs.items(): + self.history.setdefault(k, []).append(v) + + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + logs['lr'] = K.get_value(self.model.optimizer.lr) diff --git a/common/darts/README.rst b/common/darts/README.rst new file mode 100644 index 00000000..f6bc557d --- /dev/null +++ b/common/darts/README.rst @@ -0,0 +1,62 @@ +===== +DARTS +===== + + +Differentiable architecture search + +This is an adaptation of Hanxiao Liu et al's DARTS algorithm, extending +the work to handle convolutional neural networks for NLP problems and more. +Details of the original authors' approach can be found in their 2019 ICLR paper_. + +DARTS works by composing various neural net primitives, defined as Pytorch *nn.Modules*, +to create a larger directed acyclic graph (DAG) that is to be your model. This +composition is differentiable as we take the softmax of the choice of primitive types +at each layer of the network. To make this more clear, let's first define a few abstractions +in the algorithm: + +1. 
**Primitive**: this is the fundamental block of computation, defined as an *nn.Module*. + At each layer of your network, one of these primitives will be chosen by taking the + softmax of all possible primitives at that layer. Examples could be a convolution block, + a linear layer, a skip connect, or anything that you can come up with (subject to a few + constraints). + +2. **Cell**: this is an abstraction that holds each of the primitive types for a level of your + network. This is where we perform the softmax over the possible primitive types. + +3. **Nodes**: this is the level of abstraction that would normally be considered a layer in + your network. It can contain one or more *Cells*. + +4. **Architecture**: The abstraction that contains all nodes in the graph. This computes a + Hessian product with respect to the *alpha* parameters as defined in the paper. + +5. **Genotype**: genotypes are instances of a particular configuration of the graph. As the + optimization runs, and each cell computes the softmax over their primitive types, the final + configuration of all nodes with their resulting primitive is a genotype. + +In the DARTS algorithm, we define a number of primitives that we would like to compose together +to form our neural network. The original paper started with 8 primitive types. These types +were originally designed for a vision task, and largely consist of convolution type operations. +We have since adapted these types for the *P3B5* benchmark, creating 1D convolution types for +our NLP tasks. If you would like to see how these primitives are defined, along with their +necessary constructors used by DARTS, you can find them in +`darts.modules.operations.conv.py`_. + +These primitives are then contained within a cell, and one or more cells are contained within a +node in the graph. DARTS then works by composing these nodes together and taking the softmax over +their primitives in each cell.
Finally, the *Architecture* abstraction contains all nodes, and is +responsible for differentiating the composition of the nodes with respect to two *alpha* parameters +as defined in the paper. The end result is that we have a differentiable model that composes its +components as the model is training. + +As the optimization runs, the model will print the resulting loss with respect to a given *Genotype*. +The final model will be the *Genotype* corresponding to the lowest loss. + + +.. References +.. ---------- +.. _paper: https://openreview.net/forum?id=S1eYHoC5FX +.. _darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py +.. _darts.modules.operations.linear.py: ../../../common/darts/modules.operations.linear.py +.. _operations.py: ./operations.py +.. _Uno example: ../uno diff --git a/common/darts/__init__.py b/common/darts/__init__.py new file mode 100644 index 00000000..02bbbc53 --- /dev/null +++ b/common/darts/__init__.py @@ -0,0 +1,36 @@ +from __future__ import absolute_import + +__author__ = 'Todd Young' +__email__ = 'youngmt1@ornl.gov' +__version__ = '0.1.0' + +# Essential pieces +from .architecture import Architecture +from .modules.network import Network +from .modules.conv.network import ConvNetwork +from .modules.linear.network import LinearNetwork +from .storage.genotype import GenotypeStorage + +# Utilities that are not necessary +from .datasets.p3b3 import P3B3 +from .datasets.uno import Uno +from .datasets.random import RandomData +from .datasets.sample import sample +from .api.config import banner +from .meters.average import AverageMeter +from .meters.accuracy import MultitaskAccuracyMeter +from .meters.epoch import EpochMeter +from .utils.tensor import to_device +from .utils.random import SeedControl + +from .functional import ( + multitask_loss, multitask_accuracy, multitask_accuracy_topk +) + +__all__ = [ + "Architecture", + "Network", + "ConvNetwork", + "LinearNetwork", +] + diff --git
a/common/darts/api/__init__.py b/common/darts/api/__init__.py new file mode 100644 index 00000000..e9cb61d9 --- /dev/null +++ b/common/darts/api/__init__.py @@ -0,0 +1,2 @@ +from .model import Model +from .dataset import InMemoryDataset diff --git a/common/darts/api/config.py b/common/darts/api/config.py new file mode 100644 index 00000000..8d60579c --- /dev/null +++ b/common/darts/api/config.py @@ -0,0 +1,88 @@ +import os +import datetime as dtm +from collections import namedtuple + +import torch + + +def banner(device): + """ Print a banner of the system config + + Parameters + ---------- + device : torch.device + """ + print("=" * 80) + info = get_torch_info() + torch_msg = ( + f"Pytorch version: {info.torch_version} ", + f"cuda version {info.cuda_version} ", + f"cudnn version {info.cudnn_version}" + ) + print(''.join(torch_msg)) + + if device.type == 'cuda': + device_idx = get_device_idx(device) + usage = memory_usage(device) + print(f"CUDA Device name {torch.cuda.get_device_name(device_idx)}") + print(f"CUDA memory - total: {usage.total} current usage: {usage.used}") + else: + print(f'Using CPU') + + print(dtm.datetime.now().strftime("%Y/%m/%d - %H:%M:%S")) + print("=" * 80) + + +def get_torch_info(): + """ Get Pytorch system info """ + VersionInfo = namedtuple( + "PytorchVersionInfo", + "torch_version cuda_version cudnn_version" + ) + return VersionInfo(torch.__version__, torch.version.cuda, torch.backends.cudnn.version()) + + +def get_device_idx(device): + """ Get the CUDA device from torch + + Parameters + ---------- + device : torch.device + + Returns + ------- + index of the CUDA device + """ + return 0 if device.index is None else device.index + + +def memory_usage(device): + """ Get GPU memory total and usage + + Parameters + ---------- + device : torch.device + + Returns + ------- + usage : namedtuple(torch.device, int, int) + Total memory of the GPU and its current usage + """ + if device.type == "cpu": + raise ValueError(f'Can only query GPU memory 
usage, but device is {device}') + + Usage = namedtuple("MemoryUsage", "device total used") + + if device.type == "cuda": + device_idx = get_device_idx(device) + + try: + total, used = os.popen( + 'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader' + ).read().split('\n')[device_idx].split(',') + except: + raise ValueError( + f'Attempted to query CUDA device {device_idx}, does this system have that many GPUs?' + ) + + return Usage(device, int(total), int(used)) \ No newline at end of file diff --git a/common/darts/api/dataset.py b/common/darts/api/dataset.py new file mode 100644 index 00000000..488b7ce2 --- /dev/null +++ b/common/darts/api/dataset.py @@ -0,0 +1,95 @@ +from abc import abstractmethod +import pandas as pd + + +class Dataset: + """ Abstract dataset - Used for both Keras and Pytorch""" + + @abstractmethod + def __getitem__(self, idx): + """Gets batch at position `index`. + Parameters + ---------- + idx: index position of the batch in the data. + Returns + ------- + A batch + """ + raise NotImplementedError + + @abstractmethod + def __len__(self): + """Length of the dataset. + Returns + ------- + The number of samples in the data. + """ + raise NotImplementedError + + def on_epoch_end(self): + """ Keras method called at the end of every epoch. 
""" + pass + + def __iter__(self): + """Create a generator that iterates over the data.""" + for item in (self[i] for i in range(len(self))): + yield item + + +class InMemoryDataset(Dataset): + """ Abstract class for in memory data """ + + def load_data(self): + """ Load data and labels """ + raise NotImplementedError + + def dataframe(self): + """ Load the data as a pd.DataFrame """ + data, labels = self.load_data() + + if isinstance(labels, dict): + # We are in the multitask case + data_dict = {'data': data} + for key, value in labels.items(): + data_dict[key] = value + else: + data_dict = {'data': data, 'labels': labels} + + return pd.DataFrame(data_dict) + + def to_csv(self, path): + """ Save the data to disk """ + self.dataframe().to_csv(path, index=False) + + def load_cached(self, path): + """ Load the data from disk """ + frame = pd.read_csv(path) + + self.data = frame.pop('data') + + if len(frame.columns) > 1: + self.labels = frame.to_dict() + else: + self.labels = frame['labels'] + + +class Subset(InMemoryDataset): + """Subset of a dataset at specified indices. 
+ + Args: + dataset (Dataset): The dataset to be subsetted + indices (sequence): Indices in the whole set selected for subset + """ + def __init__(self, dataset, indices): + self.dataset = dataset + self.indices = indices + + def __getitem__(self, idx): + return self.dataset[self.indices[idx]] + + def __len__(self): + return len(self.indices) + + def load_data(self): + return self.dataset[self.indices] + diff --git a/common/darts/api/info.py b/common/darts/api/info.py new file mode 100644 index 00000000..2a394c1f --- /dev/null +++ b/common/darts/api/info.py @@ -0,0 +1,72 @@ +import typing +from collections.abc import abc + +import torch +import numpy as np +import pandas as pd + + +class TrainingHistory: + + def __init__(self): + self.data = [] + + def add(self, epoch_result): + """ Add a datapoint to the history """ + self.data.append(epoch_result) + + def frame(self): + return pd.DataFrame(self.data).set_index('epoch_index') + + +class TrainingInfo(abc.MutableMapping): + """ Information that needs to persist through training """ + + def __init__(self, start_epoch_index=0, run_name: typing.Optional[str]=None, metrics=None, callbacks=None): + self.data_dict = {} # optional information + + self.run_name = run_name + self.history = TrainingHistory() + self.start_epoch_index = start_epoch_index + self.metrics = metrics if metrics is not None else [] + self.callbacks = callbacks if callbacks is not None else [] + + def initialize(self): + for callback in self.callbacks: + callback.on_initialization(self) + + def on_train_begin(self): + """ Start the training process - always used, even in restarts """ + for callback in self.callbacks: + callback.on_train_begin(self) + + def on_train_end(self): + """ Finalize training process """ + for callback in self.callbacks: + callback.on_train_end(self) + + def __getitem__(self, key): + return self.data[key] + + def __setitem__(self, key, value): + self.data[key] = value + + def __delitem__(self, key): + del self.data[key] + + def 
__iter__(self): + return iter(self.data) + + def __len__(self): + return len(self.data) + + def __contains__(self, key): + return key in self.data + + +class EpochResultAccumulator(abc.MutableMapping): + """ Result of a single epoch of training """ + + def __init__(self, global_epoch_idx, metrics): + self.metrics = metrics + self.global_epoch_idx = global_epoch_idx \ No newline at end of file diff --git a/common/darts/api/model.py b/common/darts/api/model.py new file mode 100644 index 00000000..4663f75d --- /dev/null +++ b/common/darts/api/model.py @@ -0,0 +1,48 @@ +import hashlib + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Model(nn.Module): + """ Class representing sampleable neural network model """ + + def num_params(self): + """ Get the number of model parameters. """ + return sum(p.numel() for p in self.parameters()) + + def summary(self, hashsummary=False): + print(self) + print("-" * 80) + n_params = self.num_params() + print(f"Number of model parameters: {n_params}") + print("-" * 80) + + if hashsummary: + print('Hash Summary:') + for idx, hashvalue in enumerate(self.hashsummary()): + print(f"{idx}: {hashvalue}") + + def hashsummary(self): + """ Print a model summary - checksums of each layer parameters """ + children = list(self.children()) + + result = [] + for child in children: + result.extend(hashlib.sha256(x.detach().cpu().numpy().tobytes()).hexdigest() for x in child.parameters()) + + return result + + def loss(self, x_data, y_true, reduce='mean'): + """ Forward propagate network and return a value of loss function """ + # TODO: This may need to be moved to the model. 
+ if reduce not in (None, 'sum', 'mean'): + raise ValueError("`reduce` must be either None, `sum`, or `mean`!") + + y_pred = self(x_data) + return y_pred, self.loss_value(x_data, y_true, y_pred, reduce=reduce) + + def loss_value(self, x_data, y_true, y_pred, reduce=None): + """ Calculate a value of loss function """ + raise NotImplementedError diff --git a/common/darts/architecture.py b/common/darts/architecture.py new file mode 100644 index 00000000..4bc829de --- /dev/null +++ b/common/darts/architecture.py @@ -0,0 +1,196 @@ +import torch +from torch import optim, autograd + +import darts.functional as F + + +class Hyperparameters: + alpha_lr = 3e-4 + alpha_wd = 1e-3 + + +class Architecture: + + def __init__(self, model, args, hyperparams=Hyperparameters(), device='cpu'): + self.momentum = args.momentum # momentum for optimizer of theta + self.wd = args.weight_decay # weight decay for optimizer of model's theta + self.model = model # main model with respect to theta and alpha + self.device = device + + # this is the optimizer to optimize alpha parameter + self.optimizer = optim.Adam( + self.model.arch_parameters(), + lr=hyperparams.alpha_lr, + betas=(0.5, 0.999), + weight_decay=hyperparams.alpha_wd + ) + + def comp_unrolled_model(self, data, target, eta, optimizer): + """ Loss on train set and then update w_pi, not-in-place + + Parameters + ---------- + data : torch.tensor + + target : torch.tensor + eta : float + optimizer : torch.optim.optimizer + optimizer of theta, not optimizer of alpha + + Returns + ------- + model_unrolled + """ + # forward to get loss + loss = self.model.loss(data, target) + # flatten current weights + theta = F.flatten(self.model.parameters()).detach() + # theta: torch.Size([1930618]) + # print('theta:', theta.shape) + try: + # fetch momentum data from theta optimizer + moment = F.flatten(optimizer.state[v]['momentum_buffer'] for v in self.model.parameters()) + moment.mul_(self.momentum) + except: + moment = torch.zeros_like(theta) + + # 
flatten all gradients + dtheta = F.flatten(autograd.grad(loss, self.model.parameters())).data + # indeed, here we implement a simple SGD with momentum and weight decay + # theta = theta - eta * (moment + weight decay + dtheta) + theta = theta.sub(eta, moment + dtheta + self.wd * theta) + # construct a new model + unrolled_model = self.construct_model_from_theta(theta) + + return unrolled_model.to(self.device) + + def step(self, x_train, target_train, x_valid, target_valid, eta, optimizer, unrolled): + """ + update alpha parameter by manually computing the gradients + :param x_train: + :param target_train: + :param x_valid: + :param target_valid: + :param eta: + :param optimizer: theta optimizer + :param unrolled: + :return: + """ + # alpha optimizer + self.optimizer.zero_grad() + + # compute the gradient and write it into tensor.grad + # instead of generated by loss.backward() + if unrolled: + self.backward_step_unrolled(x_train, target_train, x_valid, target_valid, eta, optimizer) + else: + # directly optimize alpha on w, instead of w_pi + self.backward_step(x_valid, target_valid) + + self.optimizer.step() + + def backward_step(self, x_valid, target_valid): + """ + simply train on validate set and backward + :param x_valid: + :param target_valid: + :return: + """ + _, loss = self.model.loss(x_valid, target_valid, reduce='mean') + # both alpha and theta require grad but only alpha optimizer will + # step in current phase. 
+ loss.backward() + + def backward_step_unrolled(self, x_train, target_train, x_valid, target_valid, eta, optimizer): + """ + train on validate set based on update w_pi + :param x_train: + :param target_train: + :param x_valid: + :param target_valid: + :param eta: 0.01, according to author's comments + :param optimizer: theta optimizer + :return: + """ + # theta_pi = theta - lr * grad + unrolled_model = self.comp_unrolled_model(x_train, target_train, eta, optimizer) + # calculate loss on theta_pi + unrolled_loss = unrolled_model.loss(x_valid, target_valid) + + # this will update theta_pi model, but NOT theta model + unrolled_loss.backward() + # grad(L(w', a), a), part of Eq. 6 + dalpha = [v.grad for v in unrolled_model.arch_parameters()] + vector = [v.grad.data for v in unrolled_model.parameters()] + implicit_grads = self.hessian_vector_product(vector, x_train, target_train) + + for g, ig in zip(dalpha, implicit_grads): + # g = g - eta * ig, from Eq. 6 + g.data.sub_(eta, ig.data) + + # write updated alpha into original model + for v, g in zip(self.model.arch_parameters(), dalpha): + if v.grad is None: + v.grad = g.data + else: + v.grad.data.copy_(g.data) + + def construct_model_from_theta(self, theta): + """ + construct a new model with initialized weight from theta + it use .state_dict() and load_state_dict() instead of + .parameters() + fill_() + :param theta: flatten weights, need to reshape to original shape + :return: + """ + model = self.model.new() + state_dict = self.model.state_dict() + + params, offset = {}, 0 + for k, v in self.model.named_parameters(): + v_length = v.numel() + # restore theta[] value to original shape + params[k] = theta[offset: offset + v_length].view(v.size()) + offset += v_length + + assert offset == len(theta) + state_dict.update(params) + model.load_state_dict(state_dict) + model.to(self.device) + return model + + def hessian_vector_product(self, vector, data, target, r=1e-2): + """ + slightly touch vector value to estimate the 
gradient with respect to alpha + refer to Eq. 7 for more details. + :param vector: gradient.data of parameters theta + :param x: + :param target: + :param r: + :return: + """ + R = r / F.flatten(vector).norm() + + for p, v in zip(self.model.parameters(), vector): + # w+ = w + R * v + p.data.add_(R, v) + + loss = self.model.loss(data, target) + # gradient with respect to alpha + grads_p = autograd.grad(loss, self.model.arch_parameters()) + + for p, v in zip(self.model.parameters(), vector): + # w- = (w+R*v) - 2R*v + p.data.sub_(2 * R, v) + + loss = self.model.loss(data, target) + grads_n = autograd.grad(loss, self.model.arch_parameters()) + + for p, v in zip(self.model.parameters(), vector): + # w = (w+R*v) - 2R*v + R*v + p.data.add_(R, v) + + h= [(x - y).div_(2 * R) for x, y in zip(grads_p, grads_n)] + # h len: 2 h0 torch.Size([14, 8]) + # print('h len:', len(h), 'h0', h[0].shape) + return h diff --git a/common/darts/datasets/__init__.py b/common/darts/datasets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/common/darts/datasets/p3b3.py b/common/darts/datasets/p3b3.py new file mode 100644 index 00000000..1285a158 --- /dev/null +++ b/common/darts/datasets/p3b3.py @@ -0,0 +1,102 @@ +import os +import numpy as np +from torch.utils.data import Dataset + + +class P3B3(Dataset): + """P3B3 Synthetic Dataset. + + Args: + root: str + Root directory of dataset where CANDLE loads P3B3 data. + + partition: str + dataset partition to be loaded. + Must be either 'train' or 'test'. 
+ """ + training_data_file = 'train_X.npy' + training_label_file = 'train_Y.npy' + test_data_file = 'test_X.npy' + test_label_file = 'test_Y.npy' + + def __init__(self, root, partition, subsite=True, + laterality=True, behavior=True, grade=True, + transform=None, target_transform=None): + self.root = root + self.partition = partition + self.transform = transform + self.target_transform = target_transform + self.subsite = subsite + self.laterality = laterality + self.behavior = behavior + self.grade = grade + + if self.partition == 'train': + data_file = self.training_data_file + label_file = self.training_label_file + elif self.partition == 'test': + data_file = self.test_data_file + label_file = self.test_label_file + else: + raise ValueError("Partition must either be 'train' or 'test'.") + + self.data = np.load(os.path.join(self.root, data_file)) + self.targets = self.get_targets(label_file) + + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + tmp = self.partition + fmt_str += ' Split: {}\n'.format(tmp) + fmt_str += ' Root Location: {}\n'.format(self.root) + return fmt_str + + def __len__(self): + return len(self.data) + + def load_data(self): + return self.data, self.targets + + def get_targets(self, label_file): + """Get dictionary of targets specified by user.""" + targets = np.load(os.path.join(self.root, label_file)) + + tasks = {} + if self.subsite: + tasks['subsite'] = targets[:, 0] + if self.laterality: + tasks['laterality'] = targets[:, 1] + if self.behavior: + tasks['behavior'] = targets[:, 2] + if self.grade: + tasks['grade'] = targets[:, 3] + + return tasks + + def __getitem__(self, idx): + """ + Parameters + ---------- + index : int + Index of the data to be loaded. + + Returns + ------- + (document, target) : tuple + where target is index of the target class. 
+ """ + document = self.data[idx] + + if self.transform is not None: + document = self.transform(document) + + targets = {} + for key, value in self.targets.items(): + subset = value[idx] + + if self.target_transform is not None: + subset = self.target_transform(subset) + + targets[key] = subset + + return document, targets \ No newline at end of file diff --git a/common/darts/datasets/random.py b/common/darts/datasets/random.py new file mode 100644 index 00000000..e153a525 --- /dev/null +++ b/common/darts/datasets/random.py @@ -0,0 +1,39 @@ +import numpy as np +from typing import Dict +from torch.utils.data import Dataset + + +class RandomData(Dataset): + """ Random dataset - Useful for quick iterating """ + + def __init__(self, x_dim: int, num_samples: int, tasks: Dict[str, int], seed: int=13): + np.random.seed(seed) + self.data = self.create_data(x_dim, num_samples) + self.labels = self.create_labels(tasks, num_samples) + + def create_data(self, x_dim, num_samples): + data = [np.random.randn(x_dim).astype('f') for _ in range(num_samples)] + return np.stack(data) + + def create_labels(self, tasks, num_samples): + labels = {} + for task, num_classes in tasks.items(): + labels[task] = np.random.randint(num_classes, size=num_samples) + + return labels + + def index_labels(self, idx): + """ Index into the labels """ + return {key: value[idx] for key, value in self.labels.items()} + + def load_data(self): + return self.data, self.labels + + def __repr__(self): + return f'Random supervised dataset' + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx], self.index_labels(idx) diff --git a/common/darts/datasets/sample.py b/common/darts/datasets/sample.py new file mode 100644 index 00000000..70880405 --- /dev/null +++ b/common/darts/datasets/sample.py @@ -0,0 +1,15 @@ +from sklearn.utils import resample +from darts.api.dataset import Subset + + +def dummy_indices(dataset): + """ Get indexes for the dataset """ + return [x 
for x in range(len(dataset))] + + +def sample(dataset, num_samples, replace=True): + """ Sample the dataset """ + data_idx = dummy_indices(dataset) + sample_idx = resample(data_idx, n_samples=num_samples, replace=replace) + return Subset(dataset, sample_idx) + diff --git a/common/darts/datasets/uno.py b/common/darts/datasets/uno.py new file mode 100644 index 00000000..4109f979 --- /dev/null +++ b/common/darts/datasets/uno.py @@ -0,0 +1,199 @@ +import os +import torch + +import numpy as np +import pandas as pd + +from darts.api import InMemoryDataset +from darts.datasets.utils import ( + download_url, makedir_exist_ok +) + + +class Uno(InMemoryDataset): + """Uno Dataset + + Parameters + ---------- + root str : + Root directory of dataset where ``processed/training.npy`` + ``processed/validation.npy and ``processed/test.npy`` exist. + + partition : str + dataset partition to be loaded. + Either 'train', 'validation', or 'test'. + + download : bool, optional + If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. + """ + urls = [ + 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5', + ] + + training_data_file = 'train_data.pt' + training_label_file = 'train_labels.pt' + test_data_file = 'test_data.pt' + test_label_file = 'test_labels.pt' + + def __init__(self, root, partition, transform=None, + target_transform=None, download=False): + self.root = os.path.expanduser(root) + self.transform = transform + self.target_transform = target_transform + + if download: + self.download() + + if not self._check_exists(): + raise RuntimeError('Dataset not found.' 
+ + ' You can use download=True to download it') + + self.partition = partition + if self.partition == 'train': + data_file = self.training_data_file + label_file = self.training_label_file + elif self.partition == 'test': + data_file = self.test_data_file + label_file = self.test_label_file + else: + raise ValueError("Partition must either be 'train' or 'test'.") + + self.data = torch.load(os.path.join(self.processed_folder, data_file)) + self.targets = torch.load(os.path.join(self.processed_folder, label_file)) + + def __len__(self): + return len(self.data['gene_data']) + + def load_data(self): + return self.data, self.targets + + def read_data(self, data_file, partition): + """ Read in the H5 data """ + if partition == 'train': + gene_data = 'x_train_0' + drug_data = 'x_train_1' + else: + gene_data = 'x_val_0' + drug_data = 'x_val_1' + + gene_data = torch.tensor(pd.read_hdf(data_file, gene_data).values) + drug_data = torch.tensor(pd.read_hdf(data_file, drug_data).values) + data = {'gene_data': gene_data, 'drug_data': drug_data} + + return data + + def read_targets(self, data_file, partition): + """Get dictionary of targets specified by user.""" + if partition == 'train': + label = 'y_train' + else: + label = 'y_val' + + tasks = { + 'response': torch.tensor( + pd.read_hdf(data_file, label)['AUC'].apply(lambda x: 1 if x < 0.5 else 0) + ) + } + + return tasks + + def __getitem__(self, idx): + """ + Parameters + ---------- + index : int + Index of the data to be loaded. + + Returns + ------- + (document, target) : tuple + where target is index of the target class. 
+ """ + data = self.data['gene_data'][idx] + + if self.transform is not None: + data = self.transform(data) + + targets = {} + for key, value in self.targets.items(): + subset = value[idx] + + if self.target_transform is not None: + subset = self.target_transform(subset) + + targets[key] = subset + + return data, targets + + @property + def raw_folder(self): + return os.path.join(self.root, self.__class__.__name__, 'raw') + + @property + def processed_folder(self): + return os.path.join(self.root, self.__class__.__name__, 'processed') + + def _check_exists(self): + return os.path.exists(os.path.join(self.processed_folder, self.training_data_file)) and \ + os.path.exists(os.path.join(self.processed_folder, self.training_label_file)) and \ + os.path.exists(os.path.join(self.processed_folder, self.test_data_file)) and \ + os.path.exists(os.path.join(self.processed_folder, self.test_label_file)) + + @staticmethod + def extract_array(path, remove_finished=False): + print('Extracting {}'.format(path)) + arry = np.load(path) + if remove_finished: + os.unlink(path) + + def download(self): + """Download the Synthetic data if it doesn't exist in processed_folder already.""" + + if self._check_exists(): + return + + makedir_exist_ok(self.raw_folder) + makedir_exist_ok(self.processed_folder) + + # download files + for url in self.urls: + filename = url.rpartition('/')[2] + file_path = os.path.join(self.raw_folder, filename) + download_url(url, root=self.raw_folder, filename=filename, md5=None) + #self.extract_array(path=file_path, remove_finished=False) + + # process and save as numpy files + print('Processing...') + + training_set = ( + self.read_data(os.path.join(self.raw_folder, 'top_21_auc_1fold.uno.h5'), 'train'), + self.read_targets(os.path.join(self.raw_folder, 'top_21_auc_1fold.uno.h5'), 'train') + ) + test_set = ( + self.read_data(os.path.join(self.raw_folder, 'top_21_auc_1fold.uno.h5'), 'test'), + self.read_targets(os.path.join(self.raw_folder, 
'top_21_auc_1fold.uno.h5'), 'test') + ) + + # Save processed training data + train_data_path = os.path.join(self.processed_folder, self.training_data_file) + torch.save(training_set[0], train_data_path) + train_label_path = os.path.join(self.processed_folder, self.training_label_file) + torch.save(training_set[1], train_label_path) + + # Save processed test data + test_data_path = os.path.join(self.processed_folder, self.test_data_file) + torch.save(test_set[0], test_data_path) + test_label_path = os.path.join(self.processed_folder, self.test_label_file) + torch.save(test_set[1], test_label_path) + + print('Done!') + + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + tmp = self.partition + fmt_str += ' Split: {}\n'.format(tmp) + fmt_str += ' Root Location: {}\n'.format(self.root) + return fmt_str diff --git a/common/darts/datasets/utils.py b/common/darts/datasets/utils.py new file mode 100644 index 00000000..39b5a417 --- /dev/null +++ b/common/darts/datasets/utils.py @@ -0,0 +1,119 @@ +import os +import os.path +import hashlib +import errno +from tqdm import tqdm + + +def gen_bar_updater(pbar): + def bar_update(count, block_size, total_size): + if pbar.total is None and total_size: + pbar.total = total_size + progress_bytes = count * block_size + pbar.update(progress_bytes - pbar.n) + + return bar_update + + +def check_integrity(fpath, md5=None): + if md5 is None: + return True + if not os.path.isfile(fpath): + return False + md5o = hashlib.md5() + with open(fpath, 'rb') as f: + # read in 1MB chunks + for chunk in iter(lambda: f.read(1024 * 1024), b''): + md5o.update(chunk) + md5c = md5o.hexdigest() + if md5c != md5: + return False + return True + + +def makedir_exist_ok(dirpath): + """ + Python2 support for os.makedirs(.., exist_ok=True) + """ + try: + os.makedirs(dirpath) + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + + +def 
download_url(url, root, filename, md5): + from six.moves import urllib + + root = os.path.expanduser(root) + fpath = os.path.join(root, filename) + + makedir_exist_ok(root) + + # downloads file + if os.path.isfile(fpath) and check_integrity(fpath, md5): + print('Using downloaded and verified file: ' + fpath) + else: + try: + print('Downloading ' + url + ' to ' + fpath) + urllib.request.urlretrieve( + url, fpath, + reporthook=gen_bar_updater(tqdm(unit='B', unit_scale=True)) + ) + except: + if url[:5] == 'https': + url = url.replace('https:', 'http:') + print('Failed download. Trying https -> http instead.' + ' Downloading ' + url + ' to ' + fpath) + urllib.request.urlretrieve( + url, fpath, + reporthook=gen_bar_updater(tqdm(unit='B', unit_scale=True)) + ) + + +def list_dir(root, prefix=False): + """List all directories at a given root + + Args: + root (str): Path to directory whose folders need to be listed + prefix (bool, optional): If true, prepends the path to each result, otherwise + only returns the name of the directories found + """ + root = os.path.expanduser(root) + directories = list( + filter( + lambda p: os.path.isdir(os.path.join(root, p)), + os.listdir(root) + ) + ) + + if prefix is True: + directories = [os.path.join(root, d) for d in directories] + + return directories + + +def list_files(root, suffix, prefix=False): + """List all files ending with a suffix at a given root + + Args: + root (str): Path to directory whose folders need to be listed + suffix (str or tuple): Suffix of the files to match, e.g. '.png' or ('.jpg', '.png'). 
+ It uses the Python "str.endswith" method and is passed directly + prefix (bool, optional): If true, prepends the path to each result, otherwise + only returns the name of the files found + """ + root = os.path.expanduser(root) + files = list( + filter( + lambda p: os.path.isfile(os.path.join(root, p)) and p.endswith(suffix), + os.listdir(root) + ) + ) + + if prefix is True: + files = [os.path.join(root, d) for d in files] + + return files diff --git a/common/darts/functional.py b/common/darts/functional.py new file mode 100644 index 00000000..fcc65806 --- /dev/null +++ b/common/darts/functional.py @@ -0,0 +1,94 @@ +import torch + + +def flatten(tensor): + """ Flatten a tensor. + + Parameters + ---------- + tensor : torch.tensor + + Returns + ------- + Flattened tensor + + Example + ------- + >>> x = torch.tensor([[0,1],[2,3]]) + >>> x_flattened = flatten(x) + >>> print(x) + >>> tensor([[0, 1], + [2, 3]]) + >>> print(x_flattened) + >>> tensor([0, 1, 2, 3]) + """ + return torch.cat([x.view(-1) for x in tensor]) + + +def multitask_loss(target, logits, criterion, reduce='mean'): + """ Compute multitask loss """ + losses = {} + for task, label in target.items(): + losses[task] = criterion(logits[task], label) + + if reduce: + total = 0 + for _, value in losses.items(): + total += value + + if reduce == "mean": + losses = total / len(losses) + elif reduce == "sum": + losses = total + else: + raise ValueError('Reduced loss must use either `mean` or `sum`!') + + return losses + + +def accuracy(target: torch.tensor, output: torch.tensor,): + """ Computes accuracy + + Args: + output: logits of the model + target: true labels + + Returns: + accuracy of the predictions + """ + return output.argmax(1).eq(target).double().mean().item() + + +def multitask_accuracy(target, output): + """ Compute the accuracy for multitask problems """ + accuracies = {} + for key, value in target.items(): + accuracies[key] = accuracy(target[key], output[key]) + + return accuracies + + +def 
accuracy_topk(target, output, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +def multitask_accuracy_topk(target, output, topk=(1,)): + """Compute the topk accuracy for multitask problems""" + topk_accuracies = {} + for key, value in target.items(): + topk_accuracies[key] = accuracy_topk(output[key], target[key], topk) + + return topk_accuracies diff --git a/common/darts/genotypes.py b/common/darts/genotypes.py new file mode 100644 index 00000000..e96681be --- /dev/null +++ b/common/darts/genotypes.py @@ -0,0 +1,126 @@ +from collections import namedtuple + + +Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat') + + +PRIMITIVES = [ + 'none', + 'max_pool_3', + 'avg_pool_3', + 'skip_connect', + 'sep_conv_3', + 'sep_conv_5', + 'dil_conv_3', + 'dil_conv_5', +] + + +LINEAR_PRIMITIVES = [ + 'linear_block', + 'skip_connect', + 'linear_conv', + 'linear_drop', + 'encoder', + 'none', +] + + +AmoebaNet = Genotype( + normal=[ + ('avg_pool_3', 0), + ('max_pool_3', 1), + ('sep_conv_3', 0), + ('sep_conv_5', 2), + ('sep_conv_3', 0), + ('avg_pool_3', 3), + ('sep_conv_3', 1), + ('skip_connect', 1), + ('skip_connect', 0), + ('avg_pool_3', 1), + ], + normal_concat=[4, 5, 6], + reduce=[ + ('avg_pool_3', 0), + ('sep_conv_3', 1), + ('max_pool_3', 0), + ('sep_conv_7', 2), + ('sep_conv_7', 0), + ('avg_pool_3', 1), + ('max_pool_3', 0), + ('max_pool_3', 1), + ('conv_7x1_1', 0), + ('sep_conv_3', 5), + ], + reduce_concat=[3, 4, 6] +) + + +GradeNet36 = Genotype( + normal=[ + ('sep_conv_5', 1), + ('dil_conv_3', 0), + ('sep_conv_5', 2), + ('max_pool_3', 1), + ('max_pool_3', 
2), + ('max_pool_3', 1), + ('skip_connect', 4), + ('max_pool_3', 1), + ], + normal_concat = [4, 5, 6], + reduce=[ + ('sep_conv_5', 0), + ('sep_conv_5', 1), + ('max_pool_3', 2), + ('sep_conv_3', 1), + ('dil_conv_5', 3), + ('sep_conv_5', 2), + ('sep_conv_5', 3), + ('dil_conv_5', 4) + ], + reduce_concat = [4, 5, 6] +) + + +Multitask = Genotype( + normal=[ + ('avg_pool_3', 1), + ('sep_conv_3', 0), + ('avg_pool_3', 1), + ('sep_conv_5', 2), + ('max_pool_3', 2), + ('max_pool_3', 1), + ('skip_connect', 4), + ('avg_pool_3', 1) + ], + normal_concat = [4, 5, 6], + reduce=[ + ('sep_conv_5', 1), + ('sep_conv_5', 0), + ('sep_conv_5', 2), + ('sep_conv_3', 0), + ('sep_conv_5', 3), + ('sep_conv_5', 2), + ('sep_conv_5', 4), + ('sep_conv_5', 3) + ], + reduce_concat = [4, 5, 6] +) + + +MultitaskN2C3 = Genotype( + normal=[ + ('max_pool_3', 0), + ('sep_conv_5', 1), + ('sep_conv_5', 1), + ('sep_conv_5', 0) + ], + normal_concat = [2, 3, 4], + reduce=[ + ('sep_conv_5', 1), + ('sep_conv_5', 0), + ('sep_conv_5', 1), + ('sep_conv_3', 2) + ], + reduce_concat=[2, 3, 4] +) diff --git a/common/darts/meters/__init__.py b/common/darts/meters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/common/darts/meters/accuracy.py b/common/darts/meters/accuracy.py new file mode 100644 index 00000000..7e2670e2 --- /dev/null +++ b/common/darts/meters/accuracy.py @@ -0,0 +1,42 @@ +import os +import pandas as pd + +from darts.meters.average import AverageMeter + + +class MultitaskAccuracyMeter: + + def __init__(self, tasks): + self.tasks = tasks + self.reset() + + def reset(self): + self.meters = self.create_meters() + + def create_meters(self): + """ Create an average meter for each task """ + meters = {} + for task, _ in self.tasks.items(): + meters[task] = AverageMeter('Acc@1', ':6.2f') + return meters + + def get_avg_accuracy(self, task): + return self.meters[task].avg + + def get_accuracy(self, task): + return self.meters[task].val + + def update(self, accuracies, batch_size): + for 
task, acc in accuracies.items(): + self.meters[task].update(acc[0].item(), batch_size) + + def dataframe(self): + """ Get a dataframe of all task accuracies """ + avg_accuracy = {k: v.avgs for (k, v) in self.meters.items()} + return pd.DataFrame(avg_accuracy) + + def save(self, path, filename): + """ Save the task accuracies as a csv """ + path = os.path.join(path, f'{filename}_accuracy.csv') + self.dataframe().to_csv(path, index=False) + diff --git a/common/darts/meters/average.py b/common/darts/meters/average.py new file mode 100644 index 00000000..e82af8ec --- /dev/null +++ b/common/darts/meters/average.py @@ -0,0 +1,23 @@ +class AverageMeter: + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) diff --git a/common/darts/meters/epoch.py b/common/darts/meters/epoch.py new file mode 100644 index 00000000..b20ee4ac --- /dev/null +++ b/common/darts/meters/epoch.py @@ -0,0 +1,40 @@ +import os +import pandas as pd + +from darts.meters.average import AverageMeter +from darts.meters.accuracy import MultitaskAccuracyMeter + + +class EpochMeter: + """ Track epoch loss and accuracy """ + + def __init__(self, tasks, name='train'): + self.name = name + self.loss_meter = AverageMeter(name) + self.acc_meter = MultitaskAccuracyMeter(tasks) + self.reset() + + def reset(self): + self.loss = [] + self.acc = { task: [] for task, _ in self.acc_meter.meters.items() } + + def update_batch_loss(self, loss, batch_size): + self.loss_meter.update(loss, batch_size) + + def update_batch_accuracy(self, acc, batch_size): + self.acc_meter.update(acc, batch_size) + + def 
update_epoch(self): + self.loss.append(self.loss_meter.avg) + for task, acc in self.acc_meter.meters.items(): + self.acc[task].append(acc.avg) + + def dataframe(self): + results = self.acc + results['loss'] = self.loss + return pd.DataFrame(results) + + def save(self, path): + os.makedirs(path, exist_ok=True) + path = os.path.join(path, f'{self.name}_epoch_results') + self.dataframe().to_csv(path, index=False) diff --git a/common/darts/metrics/__init__.py b/common/darts/metrics/__init__.py new file mode 100644 index 00000000..0e02fb01 --- /dev/null +++ b/common/darts/metrics/__init__.py @@ -0,0 +1,2 @@ +from .topk_accuracy import accuracy_topk +from .topk_accuracy import multitask_accuracy_topk diff --git a/common/darts/metrics/multitask_accuracy.py b/common/darts/metrics/multitask_accuracy.py new file mode 100644 index 00000000..3f89599f --- /dev/null +++ b/common/darts/metrics/multitask_accuracy.py @@ -0,0 +1,18 @@ +import darts.functional as F +from darts.api.metrics.average import MultitaskAveragingSupervisedMetric + + +class MultitaskAccuracy(MultitaskAveragingSupervisedMetric): + """ Multitask Classification accuracy """ + + def __init__(self, scope="train"): + super().__init__("accuracy", scope=scope) + + def _value_function(self, x_input, y_true, y_pred): + """ Return classification accuracy of input """ + return F.multitask_accuracy(y_true, y_pred) + + +def create(): + """ darts factory function """ + return MultitaskAccuracy() diff --git a/common/darts/metrics/multitask_loss.py b/common/darts/metrics/multitask_loss.py new file mode 100644 index 00000000..be2a3c54 --- /dev/null +++ b/common/darts/metrics/multitask_loss.py @@ -0,0 +1,21 @@ +import torch.nn as nn + +import darts.functional as F +from darts.api.metrics.average import MultitaskAveragingSupervisedMetric + + +class MultitaskLoss(MultitaskAveragingSupervisedMetric): + """ Multitask Classification loss """ + + def __init__(self, scope="train", criterion=nn.CrossEntropyLoss()): + 
super().__init__("loss", scope=scope) + self.criterion = criterion + + def _value_function(self, x_input, y_true, y_pred, reduce=None): + """ Return loss value of input """ + return F.multitask_loss(y_true, y_pred, criterion=self.criterion, reduce=reduce) + + +def create(): + """ darts factory function """ + return MultitaskLoss() diff --git a/common/darts/metrics/topk_accuracy.py b/common/darts/metrics/topk_accuracy.py new file mode 100644 index 00000000..59b54142 --- /dev/null +++ b/common/darts/metrics/topk_accuracy.py @@ -0,0 +1,27 @@ +import torch + + +def accuracy_topk(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +def multitask_accuracy_topk(output, target, topk=(1,)): + """Compute the topk accuracy for multitask problems""" + topk_accuracies = {} + for key, value in target.items(): + topk_accuracies[key] = accuracy_topk(output[key], target[key], topk) + + return topk_accuracies diff --git a/common/darts/modules/__init__.py b/common/darts/modules/__init__.py new file mode 100644 index 00000000..e8086a41 --- /dev/null +++ b/common/darts/modules/__init__.py @@ -0,0 +1,3 @@ +from .cell import Cell +from .mixed_layer import MixedLayer +from .network import Network diff --git a/common/darts/modules/cell.py b/common/darts/modules/cell.py new file mode 100644 index 00000000..6ee82d82 --- /dev/null +++ b/common/darts/modules/cell.py @@ -0,0 +1,82 @@ +import torch +import torch.nn as nn + +from darts.api import Model +from darts.modules.mixed_layer import MixedLayer + + +class ConvBlock(Model): + """ ReLu -> Conv2d """ + + def __init__(self, c_in, 
c_out, kernel_size, stride, padding, affine=True): + super(ConvBlock, self).__init__() + self.conv = nn.Conv2d( + c_in, c_out, kernel_size=kernel_size, stride=stride, padding=padding + ) + + def forward(self, x): + return self.conv(x) + + +class Cell(Model): + + def __init__(self, num_nodes, multiplier, cpp, cp, c, primitives, ops): + """ + :param steps: 4, number of layers inside a cell + :param multiplier: 4 + :param cpp: 48 + :param cp: 48 + :param c: 16 + :param reduction: indicates whether to reduce the output maps width + :param reduction_prev: when previous cell reduced width, s1_d = s0_d//2 + in order to keep same shape between s1 and s0, we adopt prep0 layer to + reduce the s0 width by half. + """ + super(Cell, self).__init__() + self.preprocess0 = ConvBlock(cpp, c, 1, 1, 0, affine=False) + # preprocess1 deal with output from prev cell + self.preprocess1 = ConvBlock(cp, c, 1, 1, 0, affine=False) + + # steps inside a cell + self.num_nodes = num_nodes + self.multiplier = multiplier + + self.layers = nn.ModuleList() + + for i in range(self.num_nodes): + # for each i inside cell, it connects with all previous output + # plus previous two cells' output + for j in range(2 + i): + # for reduction cell, it will reduce the heading 2 inputs only + stride = 1 + layer = MixedLayer(c, stride, primitives, ops) + self.layers.append(layer) + + def forward(self, s0, s1, weights): + """ + :param s0: + :param s1: + :param weights: [14, 8] + :return: + """ + #print('s0:', s0.shape,end='=>') + s0 = self.preprocess0(s0) # [40, 48, 32, 32], [40, 16, 32, 32] + #print(s0.shape, self.reduction_prev) + #print('s1:', s1.shape,end='=>') + s1 = self.preprocess1(s1) # [40, 48, 32, 32], [40, 16, 32, 32] + #print(s1.shape) + + states = [s0, s1] + offset = 0 + # for each node, receive input from all previous intermediate nodes and s0, s1 + for i in range(self.num_nodes): # 4 + # [40, 16, 32, 32] + s = sum(self.layers[offset + j](h, weights[offset + j]) for j, h in enumerate(states)) + 
offset += len(states) + # append one state since s is the elem-wise addition of all output + states.append(s) + #print('node:',i, s.shape, self.reduction) + + # concat along dim=channel + return torch.cat(states[-self.multiplier:], dim=1) # 6 of [40, 16, 32, 32] + diff --git a/common/darts/modules/classifier.py b/common/darts/modules/classifier.py new file mode 100644 index 00000000..439c17f0 --- /dev/null +++ b/common/darts/modules/classifier.py @@ -0,0 +1,26 @@ +from typing import Dict +import torch.nn as nn + + +class MultitaskClassifier(nn.Module): + + def __init__(self, input_dim: int, tasks: Dict[str, int]): + super(MultitaskClassifier, self).__init__() + self.tasks = tasks + + for task, num_classes in tasks.items(): + self.add_module( + task, + nn.Linear(input_dim, num_classes) + ) + + def num_classes(self, task): + """ Get number of classes for a task. """ + return self.tasks[task] + + def forward(self, x): + logits = {} + for task, _ in self.tasks.items(): + logits[task] = self._modules[task](x) + + return logits diff --git a/common/darts/modules/conv/__init__.py b/common/darts/modules/conv/__init__.py new file mode 100644 index 00000000..e8b50cc7 --- /dev/null +++ b/common/darts/modules/conv/__init__.py @@ -0,0 +1,4 @@ +from .cell import Cell +from .network import ConvNetwork +from .mixed_layer import MixedLayer + diff --git a/common/darts/modules/conv/cell.py b/common/darts/modules/conv/cell.py new file mode 100644 index 00000000..f902ce52 --- /dev/null +++ b/common/darts/modules/conv/cell.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn + +from darts.api import Model +from darts.modules.conv.mixed_layer import MixedLayer +from darts.modules.operations.conv import ConvBlock, FactorizedReduce + + +class Cell(Model): + + def __init__(self, num_nodes, multiplier, cpp, cp, c, reduction, reduction_prev): + """ + :param steps: 4, number of layers inside a cell + :param multiplier: 4 + :param cpp: 48 + :param cp: 48 + :param c: 16 + :param reduction: 
indicates whether to reduce the output maps width + :param reduction_prev: when previous cell reduced width, s1_d = s0_d//2 + in order to keep same shape between s1 and s0, we adopt prep0 layer to + reduce the s0 width by half. + """ + super(Cell, self).__init__() + + # indicating current cell is reduction or not + self.reduction = reduction + self.reduction_prev = reduction_prev + + # preprocess0 deal with output from prev_prev cell + if reduction_prev: + # if prev cell has reduced channel/double width, + # it will reduce width by half + self.preprocess0 = FactorizedReduce(cpp, c, affine=False) + else: + self.preprocess0 = ConvBlock(cpp, c, 1, 1, 0, affine=False) + # preprocess1 deal with output from prev cell + self.preprocess1 = ConvBlock(cp, c, 1, 1, 0, affine=False) + + # steps inside a cell + self.num_nodes = num_nodes # 4 + self.multiplier = multiplier # 4 + + self.layers = nn.ModuleList() + + for i in range(self.num_nodes): + # for each i inside cell, it connects with all previous output + # plus previous two cells' output + for j in range(2 + i): + # for reduction cell, it will reduce the heading 2 inputs only + stride = 2 if reduction and j < 2 else 1 + layer = MixedLayer(c, stride) + self.layers.append(layer) + + def forward(self, s0, s1, weights): + """ + :param s0: + :param s1: + :param weights: [14, 8] + :return: + """ + #print('s0:', s0.shape,end='=>') + s0 = self.preprocess0(s0) # [40, 48, 32, 32], [40, 16, 32, 32] + #print(s0.shape, self.reduction_prev) + #print('s1:', s1.shape,end='=>') + s1 = self.preprocess1(s1) # [40, 48, 32, 32], [40, 16, 32, 32] + #print(s1.shape) + + states = [s0, s1] + offset = 0 + # for each node, receive input from all previous intermediate nodes and s0, s1 + for i in range(self.num_nodes): # 4 + # [40, 16, 32, 32] + s = sum(self.layers[offset + j](h, weights[offset + j]) for j, h in enumerate(states)) + offset += len(states) + # append one state since s is the elem-wise addition of all output + states.append(s) + 
#print('node:',i, s.shape, self.reduction) + + # concat along dim=channel + return torch.cat(states[-self.multiplier:], dim=1) # 6 of [40, 16, 32, 32] diff --git a/common/darts/modules/conv/mixed_layer.py b/common/darts/modules/conv/mixed_layer.py new file mode 100644 index 00000000..8da4373b --- /dev/null +++ b/common/darts/modules/conv/mixed_layer.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn + +from darts.api import Model +from darts.genotypes import PRIMITIVES +from darts.modules.operations.conv import OPS + + +class MixedLayer(Model): + """ A mixture of 8 unit types + + We use weights to aggregate these outputs while training. + and softmax to select the strongest edges while inference. + """ + def __init__(self, c, stride): + super(MixedLayer, self).__init__() + self.reset(c, stride) + + def reset(self, c, stride): + self.layers = nn.ModuleList() + + for primitive in PRIMITIVES: + layer = OPS[primitive](c, stride, False) + + if 'pool' in primitive: + layer = nn.Sequential(layer, nn.BatchNorm1d(c, affine=False)) + + self.layers.append(layer) + + def forward(self, x, weights): + """ + Parameters + ---------- + x : torch.tensor + Data + + Weights : torch.tensor + alpha, [op_num:8], the output = sum of alpha * op(x) + """ + x = [w * layer(x) for w, layer in zip(weights, self.layers)] + return sum(x) diff --git a/common/darts/modules/conv/network.py b/common/darts/modules/conv/network.py new file mode 100644 index 00000000..5b49222a --- /dev/null +++ b/common/darts/modules/conv/network.py @@ -0,0 +1,248 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from darts.api import Model +from darts.modules.conv import Cell +from darts.modules.classifier import MultitaskClassifier +from darts.genotypes import PRIMITIVES, Genotype + + +class Hyperparameters: + c = 8 + num_nodes = 2 + num_cells = 3 + channel_multiplier = 2 + stem_channel_multiplier = 2 + num_embeddings = 35095 # vocab size + embedding_dim = 1500 + + +class 
ConvNetwork(Model): + """ Collection of cells """ + + def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters()): + super(ConvNetwork, self).__init__() + self.tasks = tasks + self.criterion = criterion + self.device = device + self.c = hyperparams.c + self.num_cells = hyperparams.num_cells + self.num_nodes = hyperparams.num_nodes + self.channel_multiplier = hyperparams.channel_multiplier + + # stem_multiplier is for stem network, + # and multiplier is for general cell + c_curr = hyperparams.stem_channel_multiplier * self.c # 3*16 + # stem network, convert 3 channel to c_curr + self.stem = nn.Sequential( + nn.Embedding( + num_embeddings=hyperparams.num_embeddings, + embedding_dim=hyperparams.embedding_dim + ), + nn.Conv1d(hyperparams.embedding_dim, c_curr, 3, padding=1, bias=False), + nn.BatchNorm1d(c_curr) + ).to(self.device) + + # c_curr means a factor of the output channels of current cell + # output channels = multiplier * c_curr + cpp, cp, c_curr = c_curr, c_curr, self.c + self.cells = nn.ModuleList() + reduction_prev = False + for i in range(hyperparams.num_cells): + + # for layer in the middle [1/3, 2/3], reduce via stride=2 + if i in [hyperparams.num_cells // 3, 2 * hyperparams.num_cells // 3]: + c_curr *= 2 + reduction = True + else: + reduction = False + + # [cp, h, h] => [multiplier*c_curr, h/h//2, h/h//2] + # the output channels = multiplier * c_curr + cell = Cell( + hyperparams.num_nodes, + hyperparams.channel_multiplier, + cpp, + cp, + c_curr, + reduction, + reduction_prev + ).to(self.device) + # update reduction_prev + reduction_prev = reduction + self.cells += [cell] + cpp, cp = cp, hyperparams.channel_multiplier * c_curr + + # adaptive pooling output size to 1x1 + self.global_pooling = nn.AdaptiveAvgPool1d(1) + # since cp records last cell's output channels + # it indicates the input channel number + self.classifier = MultitaskClassifier(cp, tasks) + + # k is the total number of edges inside single cell, 14 + k = sum(1 for i 
in range(self.num_nodes) for j in range(2 + i)) + num_ops = len(PRIMITIVES) # 8 + + self.alpha_normal = nn.Parameter(torch.randn(k, num_ops)) + self.alpha_reduce = nn.Parameter(torch.randn(k, num_ops)) + + with torch.no_grad(): + # initialize to smaller value + self.alpha_normal.mul_(1e-3) + self.alpha_reduce.mul_(1e-3) + + self._arch_parameters = [ + self.alpha_normal, + self.alpha_reduce, + ] + + def new(self): + """ Create a new model initialzed with current alpha parameters. + + Weights are left untouched. + + Returns + ------- + model : Network + New model initialized with current alpha. + """ + model = ConvNetwork( + self.tasks, + self.criterion + ).to(self.device) + + for x, y in zip(model.arch_parameters(), self.arch_parameters()): + x.data.copy_(y.data) + + return model + + def forward(self, x): + """ + in: torch.Size([3, 3, 32, 32]) + stem: torch.Size([3, 48, 32, 32]) + cell: 0 torch.Size([3, 64, 32, 32]) False + cell: 1 torch.Size([3, 64, 32, 32]) False + cell: 2 torch.Size([3, 128, 16, 16]) True + cell: 3 torch.Size([3, 128, 16, 16]) False + cell: 4 torch.Size([3, 128, 16, 16]) False + cell: 5 torch.Size([3, 256, 8, 8]) True + cell: 6 torch.Size([3, 256, 8, 8]) False + cell: 7 torch.Size([3, 256, 8, 8]) False + pool: torch.Size([16, 256, 1, 1]) + linear: [b, 10] + :param x: + :return: + """ + #print('network in:', x.shape) + # s0 & s1 means the last cells' output + s0 = s1 = self.stem(x) # [b, 3, 32, 32] => [b, 48, 32, 32] + #print('network stem:', s0.shape) + #print('network stem1:', s1.shape) + + for i, cell in enumerate(self.cells): + # weights are shared across all reduction cell or normal cell + # according to current cell's type, it choose which architecture parameters + # to use + if cell.reduction: # if current cell is reduction cell + weights = F.softmax(self.alpha_reduce, dim=-1) + else: + weights = F.softmax(self.alpha_normal, dim=-1) # [14, 8] + # execute cell() firstly and then assign s0=s1, s1=result + s0, s1 = s1, cell(s0, s1, weights) # 
[40, 64, 32, 32] + #print('cell:',i, s1.shape, cell.reduction, cell.reduction_prev) + #print('\n') + + # s1 is the last cell's output + out = self.global_pooling(s1) + # logits = {} + # for task, fc in self.classifier.items(): + # logits[task] = fc(out.view(out.size(0), -1)) + logits = self.classifier(out.view(out.size(0), -1)) + + return logits + + def loss(self, data, target, reduce='mean'): + """ Calculate a value of loss function """ + logits = self(data) + + for task, logit in logits.items(): + logits[task] = logit.to(self.device) + + losses = {} + for task, label in target.items(): + label = label.to(self.device) + losses[task] = self.criterion(logits[task], label) + + if reduce: + total = 0 + for _, value in losses.items(): + total += value + + if reduce == "mean": + losses = total / len(losses) + elif reduce == "sum": + losses = total + else: + raise ValueError('Reduced loss must use either `mean` or `sum`!') + + return losses + + def arch_parameters(self): + return self._arch_parameters + + def genotype(self): + """ + :return: + """ + def _parse(weights): + """ + :param weights: [14, 8] + :return: + """ + gene = [] + n = 2 + start = 0 + for i in range(self.num_nodes): # for each node + end = start + n + W = weights[start:end].copy() # [2, 8], [3, 8], ... 
+ edges = sorted(range(i + 2), # i+2 is the number of connection for node i + key=lambda x: -max(W[x][k] # by descending order + for k in range(len(W[x])) # get strongest ops + if k != PRIMITIVES.index('none')) + )[:2] # only has two inputs + for j in edges: # for every input nodes j of current node i + k_best = None + for k in range(len(W[j])): # get strongest ops for current input j->i + if k != PRIMITIVES.index('none'): + if k_best is None or W[j][k] > W[j][k_best]: + k_best = k + gene.append((PRIMITIVES[k_best], j)) # save ops and input node + start = end + n += 1 + return gene + + gene_normal = _parse(F.softmax(self.alpha_normal, dim=-1).data.cpu().numpy()) + gene_reduce = _parse(F.softmax(self.alpha_reduce, dim=-1).data.cpu().numpy()) + + concat = range(2 + self.num_nodes - self.channel_multiplier, self.num_nodes + 2) + genotype = Genotype( + normal=gene_normal, normal_concat=concat, + reduce=gene_reduce, reduce_concat=concat + ) + + return genotype + + +def new(c, num_classes, num_layers, criterion, device, steps=4, multiplier=4, stem_multiplier=3): + """ + create a new model and initialize it with current alpha parameters. + However, its weights are left untouched. 
+ :return: + """ + model = Network(c, num_classes, num_layers, criterion, steps, multiplier, stem_multiplier).to(device) + + for x, y in zip(model_new.arch_parameters(), self.arch_parameters()): + x.data.copy_(y.data) + + return model diff --git a/common/darts/modules/linear/__init__.py b/common/darts/modules/linear/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/common/darts/modules/linear/cell.py b/common/darts/modules/linear/cell.py new file mode 100644 index 00000000..b533c61b --- /dev/null +++ b/common/darts/modules/linear/cell.py @@ -0,0 +1,58 @@ +import torch +import torch.nn as nn + +from darts.api import Model +from darts.modules.linear.mixed_layer import MixedLayer + + +class Cell(Model): + + def __init__(self, num_nodes, multiplier, cpp, cp, c, reduction, reduction_prev): + """ + :param steps: 4, number of layers inside a cell + :param multiplier: 4 + :param cpp: 48 + :param cp: 48 + :param c: 16 + :param reduction: indicates whether to reduce the output maps width + :param reduction_prev: when previous cell reduced width, s1_d = s0_d//2 + in order to keep same shape between s1 and s0, we adopt prep0 layer to + reduce the s0 width by half. 
+ """ + super(Cell, self).__init__() + + # indicating current cell is reduction or not + self.reduction = reduction + self.reduction_prev = reduction_prev + + # steps inside a cell + self.num_nodes = num_nodes # 4 + self.multiplier = multiplier # 4 + + self.layers = nn.ModuleList() + + for i in range(self.num_nodes): + # for each i inside cell, it connects with all previous output + # plus previous two cells' output + for j in range(2 + i): + # for reduction cell, it will reduce the heading 2 inputs only + stride = 2 if reduction and j < 2 else 1 + layer = MixedLayer(c, stride) + self.layers.append(layer) + + def forward(self, s0, s1, weights): + """ + :param s0: + :param s1: + :param weights: [14, 8] + :return: + """ + states = [s0, s1] + offset = 0 + # for each node, receive input from + # all previous intermediate nodes and s0, s1 + for i in range(self.num_nodes): # 4 + offset += len(states) + + # concat along dim=channel + return torch.cat(states[-self.multiplier:], dim=1) diff --git a/common/darts/modules/linear/mixed_layer.py b/common/darts/modules/linear/mixed_layer.py new file mode 100644 index 00000000..085fb56d --- /dev/null +++ b/common/darts/modules/linear/mixed_layer.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn + +from darts.api import Model +from darts.genotypes import LINEAR_PRIMITIVES +from darts.modules.operations.linear import OPS + + +class MixedLayer(Model): + """ A mixture of 8 unit types + + We use weights to aggregate these outputs while training. + and softmax to select the strongest edges while inference. 
+ """ + def __init__(self, c, stride): + super(MixedLayer, self).__init__() + self.reset(c, stride) + + def reset(self, c, stride): + self.layers = nn.ModuleList() + + for primitive in LINEAR_PRIMITIVES: + layer = OPS[primitive](c, stride, False) + + if 'pool' in primitive: + layer = nn.Sequential(layer, nn.BatchNorm1d(c, affine=False)) + + self.layers.append(layer) + + def forward(self, x, weights): + """ + Parameters + ---------- + x : torch.tensor + Data + + Weights : torch.tensor + alpha, [op_num:8], the output = sum of alpha * op(x) + """ + x = [w * layer(x) for w, layer in zip(weights, self.layers)] + return sum(x) diff --git a/common/darts/modules/linear/network.py b/common/darts/modules/linear/network.py new file mode 100644 index 00000000..470d309a --- /dev/null +++ b/common/darts/modules/linear/network.py @@ -0,0 +1,202 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from darts.api import Model +from darts.modules.linear.cell import Cell +from darts.modules.classifier import MultitaskClassifier +from darts.genotypes import LINEAR_PRIMITIVES, Genotype + + +class Hyperparameters: + c = 100 + num_nodes = 2 + num_cells = 3 + channel_multiplier = 1 + stem_channel_multiplier = 1 + intermediate_dim = 100 + + +class LinearNetwork(Model): + """ Collection of cells """ + + def __init__(self, input_dim, tasks, criterion, device='cpu', hyperparams=Hyperparameters()): + super(LinearNetwork, self).__init__() + self.tasks = tasks + self.criterion = criterion + self.device = device + self.c = hyperparams.c + self.num_cells = hyperparams.num_cells + self.num_nodes = hyperparams.num_nodes + self.channel_multiplier = hyperparams.channel_multiplier + + # stem_multiplier is for stem network, + # and multiplier is for general cell + c_curr = hyperparams.stem_channel_multiplier * self.c + + self.stem = nn.Sequential( + nn.Linear( + input_dim, hyperparams.intermediate_dim + ), + ).to(self.device) + + # c_curr means a factor of the output channels of 
current cell + # output channels = multiplier * c_curr + cpp, cp, c_curr = c_curr, c_curr, self.c + self.cells = nn.ModuleList() + reduction_prev = False + for i in range(hyperparams.num_cells): + # for layer in the middle [1/3, 2/3], reduce via stride=2 + if i in [hyperparams.num_cells // 3, 2 * hyperparams.num_cells // 3]: + c_curr *= 2 + reduction = True + else: + reduction = False + + # [cp, h, h] => [multiplier*c_curr, h/h//2, h/h//2] + # the output channels = multiplier * c_curr + cell = Cell( + hyperparams.num_nodes, + hyperparams.channel_multiplier, + cpp, + cp, + c_curr, + reduction, + reduction_prev + ).to(self.device) + # update reduction_prev + reduction_prev = reduction + self.cells += [cell] + cpp, cp = cp, hyperparams.channel_multiplier * c_curr + + self.classifier = MultitaskClassifier(hyperparams.intermediate_dim, tasks) + + # k is the total number of edges inside single cell + k = sum(1 for i in range(self.num_nodes) for j in range(2 + i)) + num_ops = len(LINEAR_PRIMITIVES) # 8 + + self.alpha_normal = nn.Parameter(torch.randn(k, num_ops)) + self.alpha_reduce = nn.Parameter(torch.randn(k, num_ops)) + + with torch.no_grad(): + # initialize to smaller value + self.alpha_normal.mul_(1e-3) + self.alpha_reduce.mul_(1e-3) + + self._arch_parameters = [ + self.alpha_normal, + self.alpha_reduce, + ] + + def fc_layers(self, cp, tasks): + """ Create fully connnected layers for each task """ + fc_layers = {} + for task, dim in tasks.items(): + fc_layers[task] = nn.Linear(cp, dim).to(self.device) + return fc_layers + + def new(self): + """ Create a new model initialzed with current alpha parameters. + + Weights are left untouched. + + Returns + ------- + model : Network + New model initialized with current alpha. 
+ """ + model = LinearNetwork( + self.tasks, + self.criterion + ).to(self.device) + + for x, y in zip(model.arch_parameters(), self.arch_parameters()): + x.data.copy_(y.data) + + return model + + def forward(self, x): + # s0 & s1 means the last cells' output + s0 = s1 = self.stem(x) # [b, 3, 32, 32] => [b, 48, 32, 32] + + for i, cell in enumerate(self.cells): + # weights are shared across all reduction cell or normal cell + # according to current cell's type, it choose which architecture parameters + # to use + if cell.reduction: # if current cell is reduction cell + weights = F.softmax(self.alpha_reduce, dim=-1) + else: + weights = F.softmax(self.alpha_normal, dim=-1) # [14, 8] + # execute cell() firstly and then assign s0=s1, s1=result + s0, s1 = s1, cell(s0, s1, weights) # [40, 64, 32, 32] + + # s1 is the last cell's output + logits = self.classifier(s1.view(s1.size(0), -1)) + + return logits + + def loss_value(self, x_data, y_true, y_pred, reduce='mean'): + """ Calculate a value of loss function """ + y_pred = self(x_data) + + losses = {} + for key, value in y_true.items(): + losses[key] = F.nll_loss(F.log_softmax(y_pred[key], dim=1), y_true[key]) + + if reduce: + total = 0 + for _, value in losses.items(): + total += value + + if reduce == "mean": + losses = total / len(losses) + elif reduce == "sum": + losses = total + + return losses + + def arch_parameters(self): + return self._arch_parameters + + def genotype(self): + """ + :return: + """ + def _parse(weights): + """ + :param weights: [14, 8] + :return: + """ + gene = [] + n = 2 + start = 0 + for i in range(self.num_nodes): # for each node + end = start + n + W = weights[start:end].copy() # [2, 8], [3, 8], ... 
+ edges = sorted(range(i + 2), # i+2 is the number of connection for node i + key=lambda x: -max(W[x][k] # by descending order + for k in range(len(W[x])) # get strongest ops + if k != LINEAR_PRIMITIVES.index('none')) + )[:2] # only has two inputs + for j in edges: # for every input nodes j of current node i + k_best = None + for k in range(len(W[j])): # get strongest ops for current input j->i + if k != LINEAR_PRIMITIVES.index('none'): + if k_best is None or W[j][k] > W[j][k_best]: + k_best = k + gene.append((LINEAR_PRIMITIVES[k_best], j)) # save ops and input node + start = end + n += 1 + return gene + + gene_normal = _parse(F.softmax(self.alpha_normal, dim=-1).data.cpu().numpy()) + gene_reduce = _parse(F.softmax(self.alpha_reduce, dim=-1).data.cpu().numpy()) + + concat = range(2 + self.num_nodes - self.channel_multiplier, self.num_nodes + 2) + genotype = Genotype( + normal=gene_normal, normal_concat=concat, + reduce=gene_reduce, reduce_concat=concat + ) + + return genotype + diff --git a/common/darts/modules/mixed_layer.py b/common/darts/modules/mixed_layer.py new file mode 100644 index 00000000..10bfd6b2 --- /dev/null +++ b/common/darts/modules/mixed_layer.py @@ -0,0 +1,57 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from darts.api import Model + + +class MixedLayer(Model): + """ A mixture of 8 unit types + + We use weights to aggregate these outputs while training. + and softmax to select the strongest edges while inference. 
+ """ + def __init__(self, c, stride, primitives, ops): + super(MixedLayer, self).__init__() + self.reset(c, stride, primitives, ops) + + def reset(self, c, stride, primitives, ops): + self.layers = nn.ModuleList() + + for primitive in primitives: + layer = ops[primitive](c, stride, False) + + if 'pool' in primitive: + layer = nn.Sequential(layer, nn.BatchNorm1d(c, affine=False)) + + self.layers.append(layer) + + def pad(self, tensors): + """ Pad with zeros for mixed layers """ + prev = tensors[0] + padded = [] + for tensor in tensors: + if tensor.shape < prev.shape: + tensor_pad = F.pad( + input=tensor, pad=(1, 1, 1, 1), mode='constant', value=0 + ) + padded.append(tensor_pad) + else: + padded.append(tensor) + prev = tensor + + return padded + + def forward(self, x, weights): + """ + Parameters + ---------- + x : torch.tensor + Data + + Weights : torch.tensor + alpha, [op_num:8], the output = sum of alpha * op(x) + """ + x = [w * layer(x) for w, layer in zip(weights, self.layers)] + x = self.pad(x) + return sum(x) diff --git a/common/darts/modules/network.py b/common/darts/modules/network.py new file mode 100644 index 00000000..1947f837 --- /dev/null +++ b/common/darts/modules/network.py @@ -0,0 +1,200 @@ +from typing import Dict, List, Callable + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from darts.api import Model +from darts.modules import Cell +from darts.modules.classifier import MultitaskClassifier +from darts.genotypes import Genotype + + +class Hyperparameters: + c = 1 + num_nodes = 2 + num_cells = 3 + channel_multiplier = 1 + + +class Network(Model): + """ Collection of cells + + Args: + stem: nn.Module that takes the input data + and outputs `cell_dim` number of features + + classifier_dim: number of features from + Darts.modules.mixed_layer.MixedLayer. This + depends upon the choice of primitives specified + by `ops`. + + ops: Constructor for all of the primitive nn.Modules. 
This + should be a dictionary of lambda function used to construct + your nn.Modules. The parameters of the lamdas must be `c`, the + number of input channels of each primitive, `stride`, the stride for + convolution blocks, and `affine`, whether to use `affine` in + batch norm. + + tasks: a dictionary whose keys are the names of the classification + tasks, and whose keys are the number of classes in each task. + + criterion: Pytorch loss criterion + + device: Either "cpu" or "gpu + + hyperparams: instance of Hyperparameters. This hyperparamters for DARTS. + """ + + def __init__(self, + stem: nn.Module, + cell_dim: int, + classifier_dim: int, + ops: Dict[str, Callable[[int, int, bool], nn.Module]], + tasks: Dict[str, int], + criterion, + device="cpu", + hyperparams=Hyperparameters()): + super(Network, self).__init__() + self.ops = ops + self.cell_dim = cell_dim + self.tasks = tasks + self.criterion = criterion + self.device = device + self.num_cells = hyperparams.num_cells + self.num_nodes = hyperparams.num_nodes + self.primitives = list(ops.keys()) + self.stem = stem + self.channel_multiplier = hyperparams.channel_multiplier + self.c = hyperparams.c + + # c_curr means a factor of the output channels of current cell + c_curr = cell_dim * self.channel_multiplier * hyperparams.c + cpp, cp, c_curr = c_curr, c_curr, hyperparams.c + self.cells = nn.ModuleList() + for i in range(hyperparams.num_cells): + + cell = Cell( + hyperparams.num_nodes, + hyperparams.channel_multiplier, + cpp, + cp, + c_curr, + self.primitives, + self.ops + ).to(self.device) + + self.cells += [cell] + + self.classifier = MultitaskClassifier(classifier_dim, tasks) + + # k is the total number of edges inside single cell, 14 + k = sum(1 for i in range(self.num_nodes) for j in range(2 + i)) + num_ops = len(self.primitives) + + self.alpha_normal = nn.Parameter(torch.randn(k, num_ops)) + + with torch.no_grad(): + # initialize to smaller value + self.alpha_normal.mul_(1e-3) + + self._arch_parameters = [ 
+ self.alpha_normal, + ] + + def new(self): + """ Create a new model initialzed with current alpha parameters. + + Weights are left untouched. + + Returns + ------- + model : Network + New model initialized with current alpha. + """ + model = Network( + self.stem, + self.cell_dim, + self.ops, + self.tasks, + self.criterion + ).to(self.device) + + for x, y in zip(model.arch_parameters(), self.arch_parameters()): + x.data.copy_(y.data) + + return model + + def forward(self, x): + # s0 & s1 means the last cells' output + s0 = s1 = self.stem(x) # [b, 3, 32, 32] => [b, 48, 32, 32] + + for i, cell in enumerate(self.cells): + weights = F.softmax(self.alpha_normal, dim=-1) # [14, 8] + # execute cell() firstly and then assign s0=s1, s1=result + s0, out = s1, cell(s0, s1, weights) # [40, 64, 32, 32] + + logits = self.classifier(out.view(out.size(0), -1)) + + return logits + + def loss_value(self, x_data, y_true, y_pred, reduce='mean'): + """ Calculate a value of loss function """ + y_pred = self(x_data) + + losses = {} + for key, value in y_true.items(): + losses[key] = F.nll_loss(F.log_softmax(y_pred[key], dim=1), y_true[key]) + + if reduce: + total = 0 + for _, value in losses.items(): + total += value + + if reduce == "mean": + losses = total / len(losses) + elif reduce == "sum": + losses = total + + return losses + + def arch_parameters(self): + return self._arch_parameters + + def genotype(self): + """ + :return: + """ + def _parse(weights): + gene = [] + n = 2 + start = 0 + for i in range(self.num_nodes): # for each node + end = start + n + W = weights[start:end].copy() + edges = sorted(range(i + 2), # i+2 is the number of connection for node i + key=lambda x: -max(W[x][k] # by descending order + for k in range(len(W[x])) # get strongest ops + if k != self.primitives.index('none')) + )[:2] # only has two inputs + for j in edges: # for every input nodes j of current node i + k_best = None + for k in range(len(W[j])): # get strongest ops for current input j->i + if k != 
self.primitives.index('none'): + if k_best is None or W[j][k] > W[j][k_best]: + k_best = k + gene.append((self.primitives[k_best], j)) # save ops and input node + start = end + n += 1 + return gene + + gene_normal = _parse(F.softmax(self.alpha_normal, dim=-1).data.cpu().numpy()) + concat = range(2 + self.num_nodes - self.channel_multiplier, self.num_nodes + 2) + + genotype = Genotype( + normal=gene_normal, normal_concat=concat, + reduce=gene_normal, reduce_concat=concat + ) + + return genotype + diff --git a/common/darts/modules/operations/__init__.py b/common/darts/modules/operations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/common/darts/modules/operations/conv.py b/common/darts/modules/operations/conv.py new file mode 100644 index 00000000..9bc5e14b --- /dev/null +++ b/common/darts/modules/operations/conv.py @@ -0,0 +1,167 @@ +""" +CNN NLP operations closely modeled after the original paper's vision task. +""" + +import torch +import torch.nn as nn + +from darts.api import Model + + +OPS = { + 'none' : lambda c, stride, affine: Zero(stride), + 'avg_pool_3' : lambda c, stride, affine: nn.AvgPool1d(3, stride=stride, padding=1, count_include_pad=False), + 'max_pool_3' : lambda c, stride, affine: nn.MaxPool1d(3, stride=stride, padding=1), + 'skip_connect': lambda c, stride, affine: Identity() if stride == 1 else FactorizedReduce(c, c, affine=affine), + 'sep_conv_3' : lambda c, stride, affine: SepConv(c, c, 3, stride, 1, affine=affine), + 'sep_conv_5' : lambda c, stride, affine: SepConv(c, c, 5, stride, 2, affine=affine), + 'sep_conv_7' : lambda c, stride, affine: SepConv(c, c, 7, stride, 3, affine=affine), + 'dil_conv_3' : lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine), + 'dil_conv_5' : lambda c, stride, affine: DilConv(c, c, 5, stride, 4, 2, affine=affine), + 'convblock_7' : lambda c, stride, affine: ConvBlock(c, c, 7, stride, 3, affine=affine), +} + + +class ConvBlock(Model): + """ ReLu -> Conv1d -> BatchNorm 
""" + + def __init__(self, c_in, c_out, kernel_size, stride, padding, affine=True): + super(ConvBlock, self).__init__() + + self.op = nn.Sequential( + nn.ReLU(inplace=False), + nn.Conv1d(c_in, c_out, kernel_size, stride=stride, padding=padding, bias=False), + nn.BatchNorm1d(c_out, affine=affine) + ) + + def forward(self, x): + return self.op(x) + + +class DilConv(Model): + """ ReLU Dilated Convolution """ + + def __init__(self, c_in, c_out, kernel_size, stride, padding, dilation, affine=True): + super(DilConv, self).__init__() + + self.op = nn.Sequential( + nn.ReLU(inplace=False), + + nn.Conv1d( + c_in, + c_in, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=c_in, + bias=False + ), + + nn.Conv1d( + c_in, + c_out, + kernel_size=1, + padding=0, + bias=False + ), + + nn.BatchNorm1d(c_out, affine=affine), + ) + + def forward(self, x): + return self.op(x) + + +class FactorizedReduce(Model): + """ Reduce the feature maps by half, maintaining number of channels + + Example + ------- + x: torch.Size([2, 10, 12]) + out: [batch_size, c_out, d//2] + out: torch.Size([2, 10, 6]) + """ + + def __init__(self, c_in, c_out, affine=True): + super(FactorizedReduce, self).__init__() + assert c_out % 2 == 0 + + self.conv_1 = nn.Conv1d(c_in, c_out // 2, 1, stride=2, padding=0, bias=False) + self.conv_2 = nn.Conv1d(c_in, c_out // 2, 1, stride=2, padding=0, bias=False) + self.bn = nn.BatchNorm1d(c_out, affine=affine) + + def forward(self, x): + x = torch.relu(x) + out = torch.cat([self.conv_1(x), self.conv_2(x[:, :, 1:])], dim=1) + out = self.bn(out) + return out + + +class Identity(Model): + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +class SepConv(Model): + """ Separable Convolution Block """ + def __init__(self, c_in, c_out, kernel_size, stride, padding, affine=True): + super(SepConv, self).__init__() + + self.op = nn.Sequential( + nn.ReLU(inplace=False), + + nn.Conv1d( + c_in, + c_in, + 
kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=c_in, + bias=False + ), + + nn.Conv1d( + c_in, + c_in, + kernel_size=1, + padding=0, + bias=False + ), + + nn.BatchNorm1d(c_in, affine=affine), + nn.ReLU(inplace=False), + + nn.Conv1d( + c_in, + c_in, + kernel_size=kernel_size, + stride=1, + padding=padding, + groups=c_in, + bias=False + ), + + nn.Conv1d(c_in, c_out, kernel_size=1, padding=0, bias=False), + nn.BatchNorm1d(c_out, affine=affine), + ) + + def forward(self, x): + return self.op(x) + + +class Zero(nn.Module): + """ Zero tensor by stride """ + + def __init__(self, stride): + super(Zero, self).__init__() + self.stride = stride + + def forward(self, x): + if self.stride == 1: + return x.mul(0.) + return x[:, :, ::self.stride].mul(0.) diff --git a/common/darts/modules/operations/linear.py b/common/darts/modules/operations/linear.py new file mode 100644 index 00000000..3695b502 --- /dev/null +++ b/common/darts/modules/operations/linear.py @@ -0,0 +1,111 @@ +""" +Linear operations. 
+""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from darts.api import Model + + +OPS = { + 'none' : lambda c, stride, affice: Zero(), + 'skip_connect' : lambda c, stride, affine: Identity(), + 'linear_block' : lambda c, stride, affine: LinearBlock(c, c, affine=affine), + 'linear_conv' : lambda c, stride, affine: LinearConv(c, c, 1), + 'linear_drop' : lambda c, stride, affine: LinearDrop(c, c, 1), + 'encoder' : lambda c, stride, affine: Encoder(c, c, 1), +} + + +class LinearBlock(Model): + """ Linear block consisting of two fully connected layers + + Example + ------- + x: torch.Size([2, 10, 12]) + out: [batch_size, c_out, d//2] + out: torch.Size([2, 10, 6]) + """ + + def __init__(self, c_in, c_out, affine=True): + super(LinearBlock, self).__init__() + assert c_out % 2 == 0 + + self.fc1 = nn.Linear(c_in, c_in * 2) + self.fc2 = nn.Linear(c_in * 2, c_out) + + def forward(self, x): + x = torch.relu(x) + x = self.fc1(x) + out = self.fc2(x) + return out + + +class LinearDrop(Model): + """ Linear block with dropout """ + + def __init__(self, c_in, c_out, affine=True): + super(LinearDrop, self).__init__() + assert c_out % 2 == 0 + + self.fc1 = nn.Linear(c_in, c_in * 2) + self.fc2 = nn.Linear(c_in * 2, c_out) + + def forward(self, x): + x = torch.relu(x) + x = F.dropout(self.fc1(x)) + out = F.dropout(self.fc2(x)) + return out + + +class Encoder(Model): + """ Linear encoder """ + + def __init__(self, c_in, c_out, affine=True): + super(Encoder, self).__init__() + assert c_out % 2 == 0 + + self.fc1 = nn.Linear(c_in, c_in // 2) + self.fc2 = nn.Linear(c_in // 2, c_in) + + def forward(self, x): + x = torch.relu(x) + x = self.fc1(x) + return self.fc2(x) + + +class LinearConv(Model): + """ Linear => Conv => Linear """ + + def __init__(self, c_in, c_out, kernel_size): + super(LinearConv, self).__init__() + self.fc_1 = nn.Linear(c_in, c_in) + self.conv = nn.Conv1d(c_in, c_in, kernel_size) + self.fc_2 = nn.Linear(c_in, c_out) + + def forward(self, x): + 
x = torch.relu(x) + x = self.fc_1(x) + x = self.conv(x) + return x + + +class Identity(Model): + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +class Zero(nn.Module): + """ Zero tensor by stride """ + + def __init__(self): + super(Zero, self).__init__() + + def forward(self, x): + return x diff --git a/common/darts/storage/__init__.py b/common/darts/storage/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/common/darts/storage/genotype.py b/common/darts/storage/genotype.py new file mode 100644 index 00000000..a5c72ad6 --- /dev/null +++ b/common/darts/storage/genotype.py @@ -0,0 +1,92 @@ +import os +import json +from typing import List + +from darts.genotypes import Genotype + + +class GenotypeStorage: + """ Disk storage for Genotypes + + Args: + root: rooth path to save genotype + """ + + def __init__(self, root: str): + self.root = root + + def save_genotype(self, genotype: Genotype, filename='genotype.json') -> None: + """ Save a genotype to disk + + Args: + genotype: genotype to be saved + filename: name of the save file + """ + genotype = self._replace_range(genotype) + os.makedirs(self.root, exist_ok=True) + path = os.path.join(self.root, filename) + with open(path, 'w') as outfile: + json.dump(genotype, outfile) + + def load_genotype(self, filename='genotype.json') -> Genotype: + """ Load a genotype from disk + + Args: + filename: name of the save file + + Returns: + the genotype + """ + path = os.path.join(self.root, filename) + with open(path, 'r') as infile: + saved = json.load(infile) + + genotype = self._convert_serialized(saved) + return genotype + + def _replace_range(self, genotype: Genotype) -> Genotype: + """ Replace the range values with lists + + Python's `range` is not serializable as json objects. + We convert the genotype's ranges to lists first. + + Args: + genotype: the genotype to be serialized + + Returns + genotype: with proper lists. 
+ """ + genotype = genotype._replace(normal_concat=list(genotype.normal_concat)) + genotype = genotype._replace(reduce_concat=list(genotype.reduce_concat)) + return genotype + + def _convert_serialized(self, save: list) -> Genotype: + """ Convert json serialized form to Genotype + + Args: + save: serialized form of the the genotype + + Returns: + the genotype + """ + # Serialized genotypes have a consistent structure + normal = self._convert_to_tuple(save[0]) + normal_concat = save[1] + reduce = self._convert_to_tuple(save[2]) + reduce_concat = save[3] + return Genotype(normal, normal_concat, reduce, reduce_concat) + + def _convert_to_tuple(self, block: list) -> List[tuple]: + """ Convert list to list of tuples + + Used when converting part of a serialized form of + the genotype + + Args: + block: part of the serialized genotype + + Returns: + list of tuples that constitute that block + """ + return [tuple(x) for x in block] + diff --git a/common/darts/utils/__init__.py b/common/darts/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/common/darts/utils/logging.py b/common/darts/utils/logging.py new file mode 100644 index 00000000..a45e106c --- /dev/null +++ b/common/darts/utils/logging.py @@ -0,0 +1,45 @@ +import logging + + +logger = logging.getLogger('DARTS') +fh = logging.FileHandler('darts_accuracy.log') +logger.addHandler(fh) + + +def log_accuracy(accuracy, split: str='train'): + """ Log the average accuracy + + Parameters + ---------- + accuracy: darts.MultitaskAccuracyMeter + Current accuracy meter state + + split: str + Either training of testing + """ + acc_info = ( + f">>> {split.upper()} Accuracy - Subsite: {accuracy.get_avg_accuracy('subsite'):.4f}, " + f"Laterality: {accuracy.get_avg_accuracy('laterality'):.4f}, " + f"Behavior: {accuracy.get_avg_accuracy('behavior'):.4f}, " + f"Grade: {accuracy.get_avg_accuracy('grade'):.4f}" + ) + + logger.info(acc_info) + + +def log_single_accuracy(accuracy, split: str='train'): + """ Log 
the average accuracy for a single task + + Parameters + ---------- + accuracy: darts.MultitaskAccuracyMeter + Current accuracy meter state + + split: str + Either training of testing + """ + acc_info = ( + f">>> {split.upper()} Accuracy - Response: {accuracy.get_avg_accuracy('response'):.4f}, " + ) + + logger.info(acc_info) diff --git a/common/darts/utils/random.py b/common/darts/utils/random.py new file mode 100644 index 00000000..75682a05 --- /dev/null +++ b/common/darts/utils/random.py @@ -0,0 +1,40 @@ +import os +import torch +import random +import numpy as np + + +class Seeds: + pythonhash = 0 + pythonrand = 0 + numpy = 0 + torch = 0 + + +class SeedControl: + + def __init__(self, seeds=Seeds()): + self.s = seeds + + def fix_all_seeds(self, seed: int): + """Fix all seeds to the same seed""" + self.s.pythonhash = seed + self.s.pythonrand = seed + self.s.numpy = seed + self.s.torch = seed + self.set_seeds() + + def set_seeds(self): + os.environ['PYTHONHASHSEED'] = str(self.s.pythonhash) + random.seed(self.s.pythonrand) + np.random.seed(self.s.numpy) + torch.random.manual_seed(self.s.torch) + + def get_seeds(self): + return { + 'PythonHash': self.s.pythonhash, + 'PythonRand': self.s.pythonrand, + 'Numpy': self.s.numpy, + 'Torch': self.s.torch + } + diff --git a/common/darts/utils/tensor.py b/common/darts/utils/tensor.py new file mode 100644 index 00000000..77257713 --- /dev/null +++ b/common/darts/utils/tensor.py @@ -0,0 +1,17 @@ +import torch + + +def to_device(tensor, device: torch.device): + """ Convert tensor-like object to given PyTorch device """ + if tensor is None: + return tensor + elif isinstance(tensor, torch.Tensor): + return tensor.to(device) + elif isinstance(tensor, dict): + return {k: to_device(v, device) for k, v in tensor.items()} + elif isinstance(tensor, list): + return [to_device(v, device) for v in tensor] + elif isinstance(tensor, tuple): + return tuple(to_device(v, device) for v in tensor) + else: + raise NotImplementedError \ No newline at 
end of file diff --git a/common/darts/visualize.py b/common/darts/visualize.py new file mode 100644 index 00000000..bf048ca6 --- /dev/null +++ b/common/darts/visualize.py @@ -0,0 +1,70 @@ +import sys +from graphviz import Digraph +import genotypes + + +def plot(genotype, filename): + """ Plot the graph of a given genotype """ + g = Digraph( + format='pdf', + edge_attr = dict(fontsize='20', fontname="times"), + node_attr = dict( + style='filled', + shape='rect', + align='center', + fontsize='20', + height='0.5', + width='0.5', + penwidth='2', + fontname="times" + ), + engine='dot' + ) + + g.body.extend(['rankdir=LR']) + + g.node("c_{k-2}", fillcolor='darkseagreen2') + g.node("c_{k-1}", fillcolor='darkseagreen2') + + assert len(genotype) % 2 == 0 + steps = len(genotype) // 2 + + for i in range(steps): + g.node(str(i), fillcolor='lightblue') + + for i in range(steps): + for k in [2 * i, 2 * i + 1]: + op, j = genotype[k] + if j == 0: + u = "c_{k-2}" + elif j == 1: + u = "c_{k-1}" + else: + u = str(j - 2) + v = str(i) + g.edge(u, v, label=op, fillcolor="gray") + + g.node("c_{k}", fillcolor='palegoldenrod') + + for i in range(steps): + g.edge(str(i), "c_{k}", fillcolor="gray") + + g.render(filename, view=True) + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print("usage:\n python {} ARCH_NAME".format(sys.argv[0])) + sys.exit(1) + + genotype_name = sys.argv[1] + + try: + genotype = eval('genotypes.{}'.format(genotype_name)) + except AttributeError: + print("{} is not specified in genotypes.py".format(genotype_name)) + sys.exit(1) + + plot(genotype.normal, "normal") + plot(genotype.reduce, "reduction") + diff --git a/common/data_preprocessing_utils.py b/common/data_preprocessing_utils.py new file mode 100644 index 00000000..04cefc58 --- /dev/null +++ b/common/data_preprocessing_utils.py @@ -0,0 +1,133 @@ +import sys +import pandas as pd +import numpy as np +import numpy.linalg as la +from scipy import stats +from collections import Counter + +def 
quantile_normalization(data): + ''' + This function does quantile normalization to input data. After normalization, the samples (rows) in output + data follow the same distribution, which is the average distribution calculated based on all samples. + This function allows missing values, and assume missing values occur at random. + + Parameters: + ----------- + data: numpy array or pandas data frame of numeric values, with a shape of [n_samples, n_features]. + + Returns: + -------- + norm_data: numpy array or pandas data frame containing the data after quantile normalization. + ''' + + colnames = None + rownames = None + if isinstance(data, pd.DataFrame): + colnames = data.columns + rownames = data.index + data = data.values + elif not isinstance(data, np.ndarray): + print('Input data must be a numpy array or pandas data frame') + sys.exit(1) + + norm_data = data.copy() + nan_mask = np.isnan(norm_data) + if np.sum(nan_mask) > 0: + n_samples, n_features = norm_data.shape + for i in range(n_samples): + idi_nan = np.where(np.isnan(norm_data[i, :]))[0] + if len(idi_nan) > 0: + idi = np.setdiff1d(range(n_features), idi_nan) + norm_data[i, idi_nan] = np.random.choice(norm_data[i, idi], size=len(idi_nan), replace=True) + + quantiles = np.mean(np.sort(norm_data, axis=1), axis=0) + ranks = np.apply_along_axis(stats.rankdata, 1, norm_data) + rank_indices = ranks.astype(int) - 1 + norm_data = quantiles[rank_indices] + + if np.sum(nan_mask) > 0: + row_id, col_id = np.where(nan_mask) + norm_data[row_id, col_id] = np.nan + + if colnames is not None and rownames is not None: + norm_data = pd.DataFrame(norm_data, columns=colnames, index=rownames) + + return norm_data + +def generate_cross_validation_partition(group_label, n_folds=5, n_repeats=1, portions=None, random_seed=None): + ''' + This function generates partition indices of samples for cross-validation analysis. + + Parameters: + ----------- + group_label: 1-D array or list of group labels of samples. 
If there are no groups in samples, a list of + sample indices can be supplied for generating partitions based on individual samples rather than sample groups. + n_folds: positive integer larger than 1, indicating the number of folds for cross-validation. Default is 5. + n_repeats: positive integer, indicating how many times the n_folds cross-validation should be repeated. + So the total number of cross-validation trials is n_folds * n_repeats. Default is 1. + portions: 1-D array or list of positive integers, indicating the number of data folds in each set + (e.g. training set, testing set, or validation set) after partitioning. The summation of elements + in portions must be equal to n_folds. Default is [1, n_folds - 1]. + random_seed: positive integer, the seed for random generator. Default is None. + + Returns: + -------- + partition: list of n_folds * n_repeats lists, each of which contains len(portions) sample index lists for + a cross-validation trial. + ''' + + group_counter = Counter(group_label) + unique_label = np.array(list(group_counter.keys())) + n_group = len(unique_label) + if n_group < n_folds: + print('The number of groups in labels can not be smaller than the number of folds.') + sys.exit(1) + sorted_label = np.array(sorted(unique_label, key=lambda x: group_counter[x], reverse=True)) + + if portions is None: + portions = [1, n_folds - 1] + else: + if np.sum(portions) != n_folds: + print('The summation of elements in portions must be equal to n_folds') + sys.exit(1) + + if random_seed is not None: + np.random.seed(random_seed) + + n_set = len(portions) + partition = [] + for r in range(n_repeats): + + if r == 0 and random_seed is None: + label = sorted_label.copy() + else: + idr = np.random.permutation(n_group) + label = sorted_label[idr] + + folds = [[] for _ in range(n_folds)] + fold_size = np.zeros((n_folds, )) + + for g in range(n_group): + f = np.argmin(fold_size) + folds[f].append(label[g]) + fold_size[f] += group_counter[label[g]] + + for f 
in range(n_folds): + folds[f] = list(np.where(np.isin(group_label, folds[f]))[0]) + + a = list(range(n_folds)) + list(range(n_folds)) + for f in range(n_folds): + temp = [] + end = f + for s in range(n_set): + start = end + end = start + portions[s] + t = [] + for i in range(start, end): + t = t + folds[a[i]] + temp.append(sorted(t)) + partition.append(temp) + + return partition + + diff --git a/common/data_utils.py b/common/data_utils.py index c17a3b42..b1a3e613 100644 --- a/common/data_utils.py +++ b/common/data_utils.py @@ -3,7 +3,15 @@ import numpy as np import pandas as pd -from sklearn.preprocessing import Imputer +## Adding conditional import for compatibility between +## sklearn versions +## The second commented line corresponds to a more recent version +#from sklearn.preprocessing import Imputer +#from sklearn.impute import SimpleImputer +try: + from sklearn.impute import SimpleImputer as Imputer +except ImportError: + from sklearn.preprocessing import Imputer from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler from default_utils import DEFAULT_SEED @@ -125,13 +133,168 @@ def impute_and_scale_array(mat, scaling=None): it returns the imputed numpy array. """ - imputer = Imputer(strategy='mean', axis=0, copy=False) +# imputer = Imputer(strategy='mean', axis=0, copy=False) +# imputer = SimpleImputer(strategy='mean', copy=False) + # Next line is from conditional import. axis=0 is default + # in old version so it is not necessary. + imputer = Imputer(strategy='mean', copy=False) imputer.fit_transform(mat) - #mat = imputer.fit_transform(mat) return scale_array(mat, scaling) +def drop_impute_and_scale_dataframe(df, scaling='std', imputing='mean', dropna='all'): + """Impute missing values with mean and scale data included in pandas dataframe. + + Parameters + ---------- + df : pandas dataframe + dataframe to process + scaling : string + String describing type of scaling to apply. 
+ 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional + (Default 'std') + imputing : string + String describing type of imputation to apply. + 'mean' replace missing values with mean value along the column, + 'median' replace missing values with median value along the column, + 'most_frequent' replace missing values with most frequent value along column + (Default: 'mean'). + dropna : string + String describing strategy for handling missing values. + 'all' if all values are NA, drop that column. + 'any' if any NA values are present, dropt that column. + (Default: 'all'). + + Return + ---------- + Returns the data frame after handling missing values and scaling. + + """ + + if dropna: + df = df.dropna(axis=1, how=dropna) + else: + empty_cols = df.columns[df.notnull().sum() == 0] + df[empty_cols] = 0 + + if imputing is None or imputing.lower() == 'none': + mat = df.values + else: +# imputer = Imputer(strategy=imputing, axis=0) +# imputer = SimpleImputer(strategy=imputing) + # Next line is from conditional import. axis=0 is default + # in old version so it is not necessary. + imputer = Imputer(strategy=imputing) + mat = imputer.fit_transform(df.values) + + if scaling is None or scaling.lower() == 'none': + return pd.DataFrame(mat, columns=df.columns) + + if scaling == 'maxabs': + scaler = MaxAbsScaler() + elif scaling == 'minmax': + scaler = MinMaxScaler() + else: + scaler = StandardScaler() + + mat = scaler.fit_transform(mat) + df = pd.DataFrame(mat, columns=df.columns) + + return df + + +def discretize_dataframe(df, col, bins=2, cutoffs=None): + """Discretize values of given column in pandas dataframe. + + Parameters + ---------- + df : pandas dataframe + dataframe to process. + col : int + Index of column to bin. + bins : int + Number of bins for distributing column values. + cutoffs : list + List of bin limits. + If None, the limits are computed as percentiles. + (Default: None). 
def lookup(df, query, ret, keys, match='match'):
    """Dataframe lookup.

    Parameters
    ----------
    df : pandas dataframe
        dataframe for retrieving values.
    query : string
        String for searching.
    ret : int/string or list
        Names or indices of columns to be returned.
    keys : list
        List of strings or integers specifying the names or
        indices of columns to look into. The columns must hold strings
        (they are accessed through the pandas ``.str`` accessor).
    match : string
        String describing strategy for matching keys to query.
        'contains' selects rows where a key column contains the query
        (case-insensitive; the query is handed to pandas ``str.contains``).
        Any other value selects rows where a key column equals the query,
        ignoring case.

    Return
    ----------
    Returns a list of the values in the dataframe whose columns match
    the specified query and have been selected to be returned.
    Duplicates are removed; the order of the list is not defined.

    """

    target = query.upper()
    # Positional mask: it does not depend on the dataframe's index labels,
    # so frames with a non-default (or filtered) index are handled correctly.
    mask = np.zeros(df.shape[0], dtype=bool)
    for key in keys:
        if match == 'contains':
            # na=False: rows with a missing value never match.
            hits = df[key].str.contains(target, case=False, na=False)
        else:
            hits = (df[key].str.upper() == target).fillna(False)
        mask |= np.asarray(hits, dtype=bool)

    return list(set(df[mask][ret].values.flatten().tolist()))
key_set: + if params[i] is True: + flag_count +=1 + if flag_count > 1 : + raise Exception('ERROR ! Conflict in flag specification. ' \ + 'These flags should not be used together: ' + str(sorted(flag_list)) + \ + '... Exiting') + #print("Warning: conflicting flags in ", flag_list) + #exit() + #### IO UTILS def fetch_file(link, subdir, untar=False, md5_hash=None): @@ -113,7 +165,7 @@ def set_up_logger(logfile, logger, verbose): def eval_string_as_list(str_read, separator, dtype): """ Parse a string and convert it into a list of lists. - + Parameters ---------- str_read : string @@ -122,7 +174,7 @@ def eval_string_as_list(str_read, separator, dtype): Character that specifies the separation between the lists dtype : data type Data type to decode the elements of the list - + Return ---------- decoded_list : list @@ -149,7 +201,7 @@ def eval_string_as_list(str_read, separator, dtype): def eval_string_as_list_of_lists(str_read, separator_out, separator_in, dtype): """ Parse a string and convert it into a list of lists. - + Parameters ---------- str_read : string @@ -160,7 +212,7 @@ def eval_string_as_list_of_lists(str_read, separator_out, separator_in, dtype): Character that specifies the separation between the inner level lists dtype : data type Data type to decode the elements of the lists - + Return ---------- decoded_list : list @@ -238,7 +290,7 @@ def __init__(self, option_strings, dest, type, **kwargs): """Initialize a ListOfListsAction object. If no type is specified, an integer is assumed by default as the type for the elements of the list-of-lists. - + Parameters ---------- option_strings : string @@ -257,13 +309,13 @@ def __init__(self, option_strings, dest, type, **kwargs): self.dtype = type if self.dtype is None: self.dtype = np.int32 - + def __call__(self, parser, namespace, values, option_string=None): """This function overrides the __call__ method of the base argparse.Action class. 
def check_file_parameters_exists(params_parser, params_benchmark, params_file):
    """Warn about configuration-file keywords that CANDLE does not recognize.

    A keyword is recognized when it is a core CANDLE keyword, an attribute of
    the parsed command line, or the name of a benchmark-specific parameter.
    Unrecognized keywords only trigger a RuntimeWarning; nothing is raised
    and nothing is returned.

    Parameters
    ----------
    params_parser : namespace object
        Parsed command-line arguments (its attributes are read with vars()).
    params_benchmark : python list
        Additional parameter definitions of the benchmark; each item is a
        dictionary with at least a 'name' entry.
    params_file : python dictionary
        Includes parameters read from the configuration file.

    Global:
    PARAMETERS_CANDLE : python list
        Includes all the core keywords that are specified in CANDLE.
    """
    # Keywords known from the command line (and CANDLE supervisor)
    known = set(vars(params_parser))
    # ... plus the ones declared by the benchmark
    known.update(entry['name'] for entry in params_benchmark)
    # ... plus the CANDLE core keywords
    known.update(PARAMETERS_CANDLE)

    # Whatever the configuration file uses beyond that is unknown
    unknown = sorted(set(params_file) - known)
    if unknown:
        warnings.warn('These keywords used in the configuration file are not defined in CANDLE: ' + str(unknown),
                      RuntimeWarning)
- Command line overwrites config file especifications + Command line overwrites config file specifications """ # Parse common parameters @@ -352,17 +442,22 @@ def initialize_parameters(bmk): else: # a 'config_file' has been set --> use this file conffile = os.path.join(bmk.file_path, conffile_txt) - print("Configuration file: ", conffile) + #print("Configuration file: ", conffile) fileParameters = bmk.read_config_file(conffile)#aux.config_file)#args.config_file) # Get command-line parameters args = bmk.parser.parse_args() #print ('Params:', fileParameters) + # Check keywords from file against CANDLE common and module definitions + bmk_dict = bmk.additional_definitions + check_file_parameters_exists(args, bmk_dict, fileParameters) # Consolidate parameter set. Command-line parameters overwrite file configuration gParameters = args_overwrite_config(args, fileParameters) # Check that required set of parameters has been defined bmk.check_required_exists(gParameters) print ('Params:') pprint(gParameters) + # Check that no keywords conflict + check_flag_conflicts(gParameters) return gParameters @@ -370,7 +465,7 @@ def initialize_parameters(bmk): def get_default_neon_parser(parser): """Parse command-line arguments that are default in neon parser (and are common to all frameworks). Ignore if not present. 
- + Parameters ---------- parser : ArgumentParser object @@ -382,16 +477,16 @@ def get_default_neon_parser(parser): parser.add_argument("-l", "--log", dest='logfile', default=None, help="log file") - + # Logging utilities parser.add_argument("-s", "--save_path", dest='save_path', default=argparse.SUPPRESS, type=str, help="file path to save model snapshots") # General behavior - parser.add_argument("--model_file", dest='model_file', type=str, + parser.add_argument("--model_name", dest='model_name', type=str, default=argparse.SUPPRESS, - help="specify trained model Pickle file") + help="specify model name to use when building filenames for saving") parser.add_argument("-d", "--data_type", dest='data_type', default=argparse.SUPPRESS, choices=['f16', 'f32', 'f64'], @@ -424,17 +519,17 @@ def get_default_neon_parser(parser): def get_common_parser(parser): """Parse command-line arguments. Ignore if not present. - + Parameters ---------- parser : ArgumentParser object Parser for command-line options """ - + # Configuration file parser.add_argument("--config_file", dest='config_file', default=argparse.SUPPRESS, help="specify model configuration file") - + # General behavior parser.add_argument("--train_bool", dest='train_bool', type=str2bool, default=True, @@ -452,7 +547,7 @@ def get_common_parser(parser): parser.add_argument("--home_dir", dest='home_dir', default=argparse.SUPPRESS, type=str, help="set home directory") - + parser.add_argument("--train_data", action="store", default=argparse.SUPPRESS, help="training data filename") @@ -464,7 +559,7 @@ def get_common_parser(parser): parser.add_argument("--output_dir", dest='output_dir', default=argparse.SUPPRESS, type=str, help="output directory") - + parser.add_argument("--data_url", dest='data_url', default=argparse.SUPPRESS, type=str, help="set data source url") @@ -472,7 +567,7 @@ def get_common_parser(parser): parser.add_argument("--experiment_id", default="EXP000", type=str, help="set the experiment unique identifier") 
parser.add_argument("--run_id", default="RUN000", type=str, help="set the run unique identifier") - + # Model definition @@ -489,18 +584,18 @@ def get_common_parser(parser): parser.add_argument("--out_activation", default=argparse.SUPPRESS, help="keras activation function to use in out layer: softmax, linear, ...") - - + + parser.add_argument("--lstm_size", nargs='+', type=int, default= argparse.SUPPRESS, help="integer array describing size of LSTM internal state per layer") parser.add_argument("--recurrent_dropout", action="store", default=argparse.SUPPRESS, type=float, help="ratio of recurrent dropout") - - + + # Processing between layers - parser.add_argument("--drop", type=float, + parser.add_argument("--dropout", type=float, default=argparse.SUPPRESS, help="ratio of dropout used in fully connected layers") parser.add_argument("--pool", type=int, @@ -509,7 +604,7 @@ def get_common_parser(parser): parser.add_argument("--batch_normalization", type=str2bool, default=argparse.SUPPRESS, help="use batch normalization") - + # Model Evaluation parser.add_argument("--loss", default=argparse.SUPPRESS, @@ -521,13 +616,13 @@ def get_common_parser(parser): parser.add_argument("--metrics", default=argparse.SUPPRESS, help="metrics to evaluate performance: accuracy, ...") - + # Data preprocessing parser.add_argument("--scaling", default=argparse.SUPPRESS, choices=['minabs', 'minmax', 'std', 'none'], help="type of feature scaling; 'minabs': to [-1,1]; 'minmax': to [0,1], 'std': standard unit normalization; 'none': no normalization") - + parser.add_argument("--shuffle", type=str2bool, default=False, help="randomly shuffle data set (produces different training and testing partitions each run depending on the seed)") @@ -540,7 +635,13 @@ def get_common_parser(parser): parser.add_argument("--learning_rate", default= argparse.SUPPRESS, type=float, help="overrides the learning rate for training") - + parser.add_argument("--early_stop", type=str2bool, + default= argparse.SUPPRESS, + 
help="activates keras callback for early stopping of training in function of the monitored variable specified") + parser.add_argument("--momentum", + default= argparse.SUPPRESS, type=float, + help="overrides the momentum to use in the SGD optimizer when training") + parser.add_argument("--initialization", default=argparse.SUPPRESS, choices=['constant', 'uniform', 'normal', 'glorot_uniform', 'lecun_uniform', 'he_normal'], @@ -563,13 +664,43 @@ def get_common_parser(parser): parser.add_argument("--val_samples", action="store", default=argparse.SUPPRESS, type=int, help="overrides the number of validation samples if set to nonzero") - - + + # Backend configuration - parser.add_argument("--gpus", action="store", nargs='*', - default=[], type=int, + parser.add_argument("--gpus", nargs="*", + default=argparse.SUPPRESS, + #default=[0], + type=int, help="set IDs of GPUs to use") + # profiling flags + parser.add_argument("-p", "--profiling", type=str2bool, + default = 'false', + help="Turn profiling on or off") + + # cyclic learning rate + parser.add_argument("--clr_flag", + default=argparse.SUPPRESS, + #default=None, + type=str2bool, + help="CLR flag (boolean)") + parser.add_argument("--clr_mode", + default=argparse.SUPPRESS, + #default=None, + type=str, choices=['trng1', 'trng2', 'exp'], + help="CLR mode (default: trng1)") + parser.add_argument("--clr_base_lr", type=float, + default=argparse.SUPPRESS, + #default=1e-4, + help="Base lr for cycle lr.") + parser.add_argument("--clr_max_lr", type=float, + default=argparse.SUPPRESS, + #default=1e-3, + help="Max lr for cycle lr.") + parser.add_argument("--clr_gamma", type=float, + default=argparse.SUPPRESS, + #default=0.999994, + help="Gamma parameter for learning cycle LR.") return parser @@ -578,7 +709,7 @@ def get_common_parser(parser): def args_overwrite_config(args, config): """Overwrite configuration parameters with parameters specified via command-line. 
- + Parameters ---------- args : ArgumentParser object @@ -586,20 +717,20 @@ def args_overwrite_config(args, config): config : python dictionary Parameters read from configuration file """ - + params = config - + args_dict = vars(args) - + for key in args_dict.keys(): params[key] = args_dict[key] - - - if 'datatype' not in params: - params['datatype'] = DEFAULT_DATATYPE + + + if 'data_type' not in params: + params['data_type'] = DEFAULT_DATATYPE else: - if params['datatype'] in set(['f16', 'f32', 'f64']): - params['datatype'] = get_choice(params['datatype']) + if params['data_type'] in set(['f16', 'f32', 'f64']): + params['data_type'] = get_choice(params['datatype']) if 'output_dir' not in params: params['output_dir'] = directory_from_parameters(params) @@ -621,16 +752,16 @@ def get_choice(name): """ Maps name string to the right type of argument """ mapping = {} - + # dtype mapping['f16'] = np.float16 mapping['f32'] = np.float32 mapping['f64'] = np.float64 - + mapped = mapping.get(name) if not mapped: raise Exception('No mapping found for "{}"'.format(name)) - + return mapped @@ -645,7 +776,7 @@ def directory_from_parameters(params, commonroot='Output'): String to specify the common folder to store results. """ - + if commonroot in set(['.', './']): # Same directory --> convert to absolute path outdir = os.path.abspath('.') else: # Create path specified @@ -696,7 +827,7 @@ def __init__(self, filepath, defmodel, framework, prog=None, desc=None, parser=N parser : argparser (default None) if 'neon' framework a NeonArgparser is passed. Otherwise an argparser is constructed. 
""" - + if parser is None: parser = argparse.ArgumentParser(prog=prog, formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=desc, conflict_handler='resolve') @@ -704,11 +835,11 @@ def __init__(self, filepath, defmodel, framework, prog=None, desc=None, parser=N self.file_path = filepath self.default_model = defmodel self.framework = framework - + self.required = set([]) self.additional_definitions = [] self.set_locals() - + def parse_from_common(self): @@ -718,15 +849,15 @@ def parse_from_common(self): 'get_common_parser' which are defined previously(above). If the order changes or they are moved, the calling has to be updated. """ - - + + # Parse has been split between arguments that are common with the default neon parser # and all the other options parser = self.parser if self.framework is not 'neon': parser = get_default_neon_parser(parser) parser = get_common_parser(parser) - + self.parser = parser # Set default configuration file @@ -737,7 +868,7 @@ def parse_from_benchmark(self): """Functionality to parse options specific specific for each benchmark. """ - + for d in self.additional_definitions: if 'type' not in d: d['type'] = None @@ -763,13 +894,13 @@ def parse_from_benchmark(self): self.parser.add_argument('--' + d['name'], choices=d['choices'], default=d['default'], help=d['help']) else: # Non an action, one parameter, no choices self.parser.add_argument('--' + d['name'], type=d['type'], default=d['default'], help=d['help']) - + def format_benchmark_config_arguments(self, dictfileparam): """ Functionality to format the particular parameters of the benchmark. 
- + Parameters ---------- dictfileparam : python dictionary @@ -779,7 +910,7 @@ def format_benchmark_config_arguments(self, dictfileparam): Most of the time command-line overwrites configuration file except when the command-line is using default values and config file defines those values - + """ configOut = dictfileparam.copy() @@ -790,7 +921,7 @@ def format_benchmark_config_arguments(self, dictfileparam): dtype = d['type'] else: dtype = None - + if 'action' in d: if inspect.isclass(d['action']): str_read = dictfileparam[d['name']] @@ -812,7 +943,7 @@ def read_config_file(self, file): config.read(file) section=config.sections() fileParams={} - + # parse specified arguments (minimal validation: if arguments # are written several times in the file, just the first time # will be used) @@ -820,9 +951,8 @@ def read_config_file(self, file): for k,v in config.items(sec): if not k in fileParams: fileParams[k] = eval(v) - fileParams = self.format_benchmark_config_arguments(fileParams) - pprint(fileParams) + #pprint(fileParams) return fileParams @@ -834,11 +964,11 @@ def set_locals(self): - additional_definitions: list of dictionaries describing \ the additional parameters for the benchmark. """ - + pass - + def check_required_exists(self, gparam): """Functionality to verify that the required model parameters have been specified. @@ -847,7 +977,7 @@ def check_required_exists(self, gparam): key_set = set(gparam.keys()) intersect_set = key_set.intersection(self.required) diff_set = self.required.difference(intersect_set) - + if ( len(diff_set) > 0 ): raise Exception('ERROR ! Required parameters are not specified. ' \ 'These required parameters have not been initialized: ' + str(sorted(diff_set)) + \ @@ -859,9 +989,9 @@ def keras_default_config(): """Defines parameters that intervine in different functions using the keras defaults. This helps to keep consistency in parameters between frameworks. """ - + kerasDefaults = {} - + # Optimizers #kerasDefaults['clipnorm']=? 
def select_features_by_missing_values(data, threshold=0.1):
    '''
    Return the indices of the features (columns) whose fraction of missing (NaN) values is strictly
    below the threshold.

    Parameters:
    -----------
    data: numpy array or pandas data frame of numeric values, with a shape of [n_samples, n_features]
    threshold: float in the range of [0, 1]. A feature is kept when its missing rate is smaller than
        threshold. Default is 0.1

    Returns:
    --------
    indices: 1-D numpy array containing the indices of selected features, in increasing order

    Any other input type prints an error message and terminates the program with exit status 1.
    '''

    if isinstance(data, pd.DataFrame):
        values = data.values
    elif isinstance(data, np.ndarray):
        values = data
    else:
        print('Input data must be a numpy array or pandas data frame')
        sys.exit(1)

    # Per-feature share of NaN entries over all samples
    nan_counts = np.sum(np.isnan(values), axis=0)
    nan_share = nan_counts / values.shape[0]

    keep = np.where(nan_share < threshold)[0]

    return np.sort(keep)
'var' indicates variance; + 'std' indicates standard deviation; 'mad' indicates median absolute deviation. Default is 'var'. + threshold: float. Features with a variation larger than threshold will be selected. Default is None. + portion: float in the range of [0, 1]. It is the portion of features to be selected based on variation. + The number of selected features will be the smaller of int(portion * n_features) and the total number of + features with non-missing variations. Default is None. threshold and portion can not take real values + and be used simultaneously. + draw_histogram: boolean, whether to draw a histogram of feature variations. Default is False. + bins: positive integer, the number of bins in the histogram. Default is the smaller of 50 and the number of + features with non-missing variations. + log: boolean, indicating whether the histogram should be drawn on log scale. + + + Returns: + -------- + indices: 1-D numpy array containing the indices of selected features. If both threshold and + portion are None, indices will be an empty array. + ''' + + if isinstance(data, pd.DataFrame): + data = data.values + elif not isinstance(data, np.ndarray): + print('Input data must be a numpy array or pandas data frame') + sys.exit(1) + + if variation_measure == 'std': + v_all = np.nanstd(a=data, axis=0) + elif variation_measure == 'mad': + v_all = median_absolute_deviation(data=data, axis=0, ignore_nan=True) + else: + v_all = np.nanvar(a=data, axis=0) + + indices = np.where(np.invert(np.isnan(v_all)))[0] + v = v_all[indices] + + if draw_histogram: + if len(v) < 50: + print('There must be at least 50 features with variation measures to draw a histogram') + else: + bins = int(min(bins, len(v))) + _ = plt.hist(v, bins=bins, log=log) + plt.show() + + if threshold is None and portion is None: + return np.array([]) + elif threshold is not None and portion is not None: + print('threshold and portion can not be used simultaneously. 
def select_decorrelated_features(data, method='pearson', threshold=None, random_seed=None):
    '''
    This function selects features whose mutual absolute correlation coefficients are smaller than a threshold.
    It allows missing values in data. The correlation coefficient of two features are calculated based on
    the observations that are not missing in both features. Features with only one or no value present and
    features with a zero standard deviation are not considered for selection.

    Features are visited in order; each feature that has not been removed yet removes every later feature
    that is too strongly correlated with it (or identical to it when threshold is None).

    Parameters:
    -----------
    data: numpy array or pandas data frame of numeric values, with a shape of [n_samples, n_features].
    method: string indicating the method used for calculating correlation coefficient. 'pearson' indicates Pearson
        correlation coefficient; 'kendall' indicates Kendall Tau correlation coefficient; 'spearman' indicates
        Spearman rank correlation coefficient. Default is 'pearson'.
    threshold: float. If two features have an absolute correlation coefficient higher than or equal to threshold,
        one of the features is removed. If threshold is None, a feature is removed only when the two features
        are exactly identical (same missing-value pattern and same values). Default is None.
    random_seed: positive integer, seed of random generator for ordering the features. If it is None, features
        are not re-ordered before feature selection and thus the first feature is always selected. Default is None.
        Note: when given, the seed is set on the global numpy random generator (np.random.seed).

    Returns:
    --------
    indices: 1-D numpy array containing the indices of selected features, sorted in increasing order and
        expressed in terms of the columns of the input data.

    Any input that is neither a numpy array nor a pandas data frame prints an error message and terminates
    the program with exit status 1.
    '''

    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data)
    elif not isinstance(data, pd.DataFrame):
        print('Input data must be a numpy array or pandas data frame')
        sys.exit(1)

    # Keep only features with at least two present values ...
    present = np.where(np.sum(np.invert(pd.isna(data)), axis=0) > 1)[0]
    # ... and, among those, only features with a non-zero standard deviation
    present = present[np.where(np.nanstd(data.iloc[:, present].values, axis=0) > 0)[0]]

    data = data.iloc[:, present]

    num_f = data.shape[1]
    if random_seed is not None:
        # Shuffle the feature order so that the surviving feature of a correlated group is not always the first one
        np.random.seed(random_seed)
        random_order = np.random.permutation(num_f)
        data = data.iloc[:, random_order]

    if threshold is not None:
        # Correlation matrix of all remaining features; np.corrcoef is used when there are no missing values
        if np.sum(pd.isna(data).values) == 0 and method == 'pearson':
            cor = np.corrcoef(data.values, rowvar=False)
        else:
            cor = data.corr(method=method).values
    else:
        # Identity comparison works directly on the raw values
        data = data.values

    # rm[k] is True once feature k (in the current, possibly shuffled, order) has been removed
    rm = np.full(num_f, False)
    index = 0
    while index < num_f-1:
        if rm[index]:
            index += 1
            continue
        # Candidates: all later features that have not been removed yet
        idi = np.array(range(index+1, num_f))
        idi = idi[np.where(rm[idi] == False)[0]]
        if len(idi) > 0:
            if threshold is None:
                # Same missing-value pattern as the current feature ...
                idi = idi[np.where(np.sum(np.isnan(data[:, idi]) ^ np.isnan(data[:, index][:, np.newaxis]), axis=0) == 0)[0]]
                if len(idi) > 0:
                    # ... and no difference in any present value
                    idi = idi[np.where(np.nansum(abs(data[:, idi] - data[:, index][:, np.newaxis]), axis=0) == 0)[0]]
            else:
                idi = idi[np.where(abs(cor[index, idi]) >= threshold)[0]]
            if len(idi) > 0:
                rm[idi] = True
        index += 1

    indices = np.where(rm == False)[0]
    if random_seed is not None:
        # Undo the shuffle: map positions back to the pre-shuffle order
        indices = random_order[indices]
    # Map back to the column positions of the original input
    indices = np.sort(present[indices])

    return indices
datadir='../Data/common'): + #md5_hash=None, cache_subdir='common', datadir='../Data/common'): + md5_hash=None, cache_subdir='common', datadir=None): # datadir argument was never actually used so changing it to None """ Downloads a file from a URL if it not already in the cache. Passing the MD5 hash will verify the file after download as well as if it is already present in the cache. @@ -56,15 +59,19 @@ def get_file(fname, origin, untar=False, MD5 hash of the file for verification cache_subdir : string directory being used as the cache + datadir : string + if set, datadir becomes its setting (which could be e.g. an absolute path) and cache_subdir no longer matters Returns ---------- Path to the downloaded file """ - file_path = os.path.dirname(os.path.realpath(__file__)) - datadir_base = os.path.expanduser(os.path.join(file_path, '..', 'Data')) - datadir = os.path.join(datadir_base, cache_subdir) + if datadir is None: + file_path = os.path.dirname(os.path.realpath(__file__)) + datadir_base = os.path.expanduser(os.path.join(file_path, '..', 'Data')) + datadir = os.path.join(datadir_base, cache_subdir) + if not os.path.exists(datadir): os.makedirs(datadir) @@ -80,11 +87,13 @@ def get_file(fname, origin, untar=False, fnamesplit = fname.split('.tgz') untar_fpath = os.path.join(datadir, fnamesplit[0]) untar = True + else: + untar_fpath = None fpath = os.path.join(datadir, fname) download = False - if os.path.exists(fpath): + if os.path.exists(fpath) or (untar_fpath is not None and os.path.exists(untar_fpath)): # file found; verify integrity if a hash was provided if md5_hash is not None: if not validate_file(fpath, md5_hash): @@ -94,6 +103,14 @@ def get_file(fname, origin, untar=False, else: download = True + # fix ftp protocol if needed + ''' + if origin.startswith('ftp://'): + new_url = origin.replace('ftp://','http://') + origin = new_url + print('Origin = ', origin) + ''' + if download: print('Downloading data from', origin) global progbar diff --git 
class MultiGPUCheckpoint(ModelCheckpoint):
    """ModelCheckpoint that saves a nested inner model when one is present.

    If the second-to-last layer of the model handed to the callback is itself
    a keras Model, that inner model is the one that gets checkpointed;
    otherwise the model is used as is.
    NOTE(review): presumably the inner model is the template model wrapped by
    a multi-GPU wrapper -- confirm against the callers.
    """

    def set_model(self, model):
        """Select the model to checkpoint.

        Parameters
        ----------
        model : keras model
            Model the callback is attached to.
        """
        model_layers = model.layers
        # A model with fewer than two layers has no layers[-2]; without this
        # guard the lookup would raise IndexError for such models.
        if len(model_layers) >= 2 and isinstance(model_layers[-2], Model):
            self.model = model_layers[-2]
        else:
            self.model = model
a/common/solr_keras.py b/common/solr_keras.py index a4944b77..eb33c009 100644 --- a/common/solr_keras.py +++ b/common/solr_keras.py @@ -5,7 +5,6 @@ import numpy as np import requests -from keras import backend as K from keras.callbacks import Callback @@ -20,10 +19,17 @@ def compute_trainable_params(model): ---------- python dictionary that contains trainable_params, non_trainable_params and total_params """ + if str(type(model)).startswith(" self.init_abs_epoch: + + current = logs.get(self.monitor) + + if current is None: + warnings.warn( 'Abstention Adapt conditioned on metric `%s` ' 'which is not available. Available metrics are: %s' % (self.monitor, ','.join(list(logs.keys()))), RuntimeWarning) + else: + # modify mu as needed + if current > self.target_acc: #increase abstention penalty + new_mu_val /= self.scale_factor + elif current < self.target_acc: #decrease abstention penalty + new_mu_val *= self.scale_factor + + K.set_value(mu, new_mu_val) + self.muvalues.append( new_mu_val ) + + #print('epoch: %d, mu: %f' % (epoch, new_mu_val)) + + +def modify_labels(numclasses_out, ytrain, ytest, yval): + """ This function generates a categorical representation with a class added for indicating abstention. 
+ + Parameters + ---------- + numclasses_out : integer + Original number of classes + 1 abstention class + ytrain : ndarray + Numpy array of the classes (labels) in the training set + ytest : ndarray + Numpy array of the classes (labels) in the testing set + yval : ndarray + Numpy array of the classes (labels) in the validation set + """ + + classestrain = np.max(ytrain) + 1 + classestest = np.max(ytest) + 1 + classesval = np.max(yval) + 1 + + assert( classestrain == classestest ) + assert( classesval == classestest ) + assert( (classestrain+1) == numclasses_out ) # In this case only one other slot for abstention is created + + labels_train = np_utils.to_categorical( ytrain, numclasses_out ) + labels_test = np_utils.to_categorical( ytest, numclasses_out ) + labels_val = np_utils.to_categorical( yval, numclasses_out ) + + # For sanity check + mask_vec = np.zeros(labels_train.shape) + mask_vec[:,-1] = 1 + i = np.random.choice(range(labels_train.shape[0])) + sanity_check = mask_vec[i,:]*labels_train[i,:] + print(sanity_check.shape) + if ytrain.ndim > 1: + ll = ytrain.shape[1] + else: + ll = 0 + + for i in range( ll ): + for j in range( numclasses_out ): + if sanity_check[i,j] == 1: + print('Problem at ',i,j) + + return labels_train, labels_test, labels_val + +################################################################### + +def add_model_output(modelIn, mode=None, num_add=None, activation=None): + """ This function modifies the last dense layer in the passed keras model. The modification includes adding units and optionally changing the activation function. + + Parameters + ---------- + modelIn : keras model + Keras model to be modified. + mode : string + Mode to modify the layer. It could be: + 'abstain' for adding an arbitrary number of units for the abstention optimization strategy. + 'qtl' for quantile regression which needs the outputs to be tripled. + 'het' for heteroscedastic regression which needs the outputs to be doubled. 
(current implicit default: 'het') + num_add : integer + Number of units to add. This only applies to the 'abstain' mode. + activation : string + String with keras specification of activation function (e.g. 'relu', 'sigomid', 'softmax', etc.) + + Return + ---------- + modelOut : keras model + Keras model after last dense layer has been modified as specified. If there is no mode specified it returns the same model. + """ + + if mode is None: + return modelIn + + numlayers = len(modelIn.layers) + # Find last dense layer + i = -1 + while 'dense' not in (modelIn.layers[i].name) and ((i+numlayers) > 0): + i -= 1 + # Minimal verification about the validity of the layer found + assert ((i + numlayers) >= 0) + assert ('dense' in modelIn.layers[i].name) + + # Compute new output size + if mode is 'abstain': + assert num_add is not None + new_output_size = modelIn.layers[i].output_shape[-1] + num_add + elif mode is 'qtl': # for quantile UQ + new_output_size = 3 * modelIn.layers[i].output_shape[-1] + else: # for heteroscedastic UQ + new_output_size = 2 * modelIn.layers[i].output_shape[-1] + + # Recover current layer options + config = modelIn.layers[i].get_config() + # Update number of units + config['units'] = new_output_size + # Update activation function if requested + if activation is not None: + config['activation'] = activation + # Create new Dense layer + reconstructed_layer = Dense.from_config(config) + # Connect new Dense last layer to previous one-before-last layer + additional = reconstructed_layer(modelIn.layers[i-1].output) + # If the layer to replace is not the last layer, add the remainder layers + if i < -1: + for j in range(i+1, 0): + config_j = modelIn.layers[j].get_config() + aux_j = layers.deserialize({'class_name': modelIn.layers[j].__class__.__name__, + 'config': config_j}) + reconstructed_layer = aux_j.from_config(config_j) + additional = reconstructed_layer(additional) + + modelOut = Model(modelIn.input, additional) + + return modelOut diff --git 
a/common/uq_utils.py b/common/uq_utils.py index 650da687..9497cf94 100644 --- a/common/uq_utils.py +++ b/common/uq_utils.py @@ -1,7 +1,9 @@ from __future__ import absolute_import import numpy as np - +from scipy.stats import pearsonr, spearmanr +from scipy import signal +from scipy.interpolate import InterpolatedUnivariateSpline def generate_index_distribution(numTrain, numTest, numValidation, params): """ Generates a vector of indices to partition the data for training. @@ -75,6 +77,8 @@ def generate_index_distribution_from_fraction(numTrain, numTest, numValidation, Indices for data in testing (if merging) """ + tol = 1e-7 + # Extract required parameters fractionTrain = params['uq_train_fr'] fractionValidation = params['uq_valid_fr'] @@ -88,7 +92,8 @@ def generate_index_distribution_from_fraction(numTrain, numTest, numValidation, raise ValueError('uq_test_fr is not in (0, 1) range. uq_test_fr: ', fractionTest) fractionSum = fractionTrain + fractionValidation + fractionTest - if (fractionSum > 1.) or (fractionSum < 1.): + #if (fractionSum > 1.) or (fractionSum < 1.): + if abs(fractionSum-1.) > tol: raise ValueError('Specified UQ fractions (uq_train_fr, uq_valid_fr, uq_test_fr) do not add up to 1. No cross-validation partition is computed ! sum:', fractionSum) # Determine data size and block size @@ -331,6 +336,751 @@ def fill_array(blocklist, maxsize, numdata, numblocks, blocksize): return indexArray[:offset] +###### UTILS for COMPUTATION OF EMPIRICAL CALIBRATION + +def compute_statistics_homoscedastic(df_data, + col_true=0, + col_pred=6, + col_std_pred=7, + ): + """ Extracts ground truth, mean predition, error and + standard deviation of prediction from inference + data frame. The latter includes the statistics + over all the inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current CANDLE inference + experiments. Indices are hard coded to agree with + current CANDLE version. 
(The inference file usually + has the name: _pred.tsv). + col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 0, index in current CANDLE format). + col_pred : integer + Index of the column in the data frame where the predicted + value is stored (Default: 6, index in current CANDLE format). + col_std_pred : integer + Index of the column in the data frame where the standard + deviation of the predicted values is stored (Default: 7, + index in current CANDLE format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. For homoscedastic inference this corresponds to the + std value computed from prediction (and is equal to the + following returned variable). + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data column or quantity predicted (as extracted + from the data frame using the col_true index).
+ """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred = df_data.iloc[:,col_pred].values + print('Ypred shape: ', Ypred.shape) + Ypred_std = df_data.iloc[:,col_std_pred].values + print('Ypred_std shape: ', Ypred_std.shape) + yerror = Ytrue - Ypred + print('yerror shape: ', yerror.shape) + sigma = Ypred_std # std + MSE = np.mean((Ytrue - Ypred)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred, yerror, sigma, Ypred_std, pred_name + + +def compute_statistics_homoscedastic_all(df_data, + col_true=4, + col_pred_start=6 + ): + """ Extracts ground truth, mean predition, error and + standard deviation of prediction from inference + data frame. The latter includes all the individual + inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current CANDLE inference + experiments. Indices are hard coded to agree with + current CANDLE version. (The inference file usually + has the name: .predicted_INFER.tsv). + col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 4, index in current HOM format). + col_pred_start : integer + Index of the column in the data frame where the first predicted + value is stored. All the predicted values during inference + are stored (Default: 6 index, in current HOM format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. 
For homoscedastic inference this corresponds to the + std value computed from prediction (and is equal to the + following returned variable). + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data colum or quantity predicted (as extracted + from the data frame using the col_true index). + """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred_mean_ = np.mean(df_data.iloc[:,col_pred_start:], axis=1) + Ypred_mean = Ypred_mean_.values + print('Ypred_mean shape: ', Ypred_mean.shape) + Ypred_std_ = np.std(df_data.iloc[:,col_pred_start:], axis=1) + Ypred_std = Ypred_std_.values + print('Ypred_std shape: ', Ypred_std.shape) + yerror = Ytrue - Ypred_mean + print('yerror shape: ', yerror.shape) + sigma = Ypred_std # std + MSE = np.mean((Ytrue - Ypred_mean)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred_mean)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred_mean) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred_mean, yerror, sigma, Ypred_std, pred_name + + +def compute_statistics_heteroscedastic(df_data, + col_true=4, + col_pred_start=6, + col_std_pred_start=7, + ): + """ Extracts ground truth, mean predition, error, standard + deviation of prediction and predicted (learned) standard + deviation from inference data frame. The latter includes + all the individual inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current heteroscedastic inference + experiments. Indices are hard coded to agree with + current version. (The inference file usually + has the name: .predicted_INFER_HET.tsv). 
+ col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 4, index in current HET format). + col_pred_start : integer + Index of the column in the data frame where the first predicted + value is stored. All the predicted values during inference + are stored and are interspersed with standard deviation + predictions (Default: 6 index, step 2, in current HET format). + col_std_pred_start : integer + Index of the column in the data frame where the first predicted + standard deviation value is stored. All the predicted values + during inference are stored and are interspersed with predictions + (Default: 7 index, step 2, in current HET format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. For homoscedastic inference this corresponds to the + std value computed from prediction (and is equal to the + following returned variable). + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data column or quantity predicted (as extracted + from the data frame using the col_true index).
+ """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred_mean_ = np.mean(df_data.iloc[:,col_pred_start::2], axis=1) + Ypred_mean = Ypred_mean_.values + print('Ypred shape: ', Ypred_mean.shape) + Ypred_std_ = np.std(df_data.iloc[:,col_pred_start::2], axis=1) + Ypred_std = Ypred_std_.values + print('Ypred_std shape: ', Ypred_std.shape) + yerror = Ytrue - Ypred_mean + print('yerror shape: ', yerror.shape) + s_ = df_data.iloc[:,col_std_pred_start::2] + s_mean = np.mean(s_, axis=1) + var = np.exp(s_mean.values) # variance + sigma = np.sqrt(var) # std + print('sigma shape: ', sigma.shape) + MSE = np.mean((Ytrue - Ypred_mean)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred_mean)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred_mean) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred_mean, yerror, sigma, Ypred_std, pred_name + + +def compute_statistics_quantile(df_data, + sigma_divisor=2.56, + col_true=4, + col_pred_start=6 + ): + """ Extracts ground truth, 50th percentile mean predition, + low percentile and high percentile mean prediction + (usually 10th percentile and 90th percentile respectively), + error (using 50th percentile), standard deviation of + prediction (using 50th percentile) and predicted (learned) + standard deviation from interdecile range in inference data frame. + The latter includes all the individual inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current quantile inference + experiments. Indices are hard coded to agree with + current version. (The inference file usually + has the name: .predicted_INFER_QTL.tsv). + sigma_divisor : float + Divisor to convert from the intercedile range to the corresponding + standard deviation for a Gaussian distribution. 
+ (Default: 2.56, consistent with an interdecile range computed from + the difference between the 90th and 10th percentiles). + col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 4, index in current QTL format). + col_pred_start : integer + Index of the column in the data frame where the first predicted + value is stored. All the predicted values during inference + are stored and are interspersed with other percentile + predictions (Default: 6 index, step 3, in current QTL format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values (based on the 50th percentile). + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. This corresponds to the interdecile range divided + by the sigma divisor. + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data column or quantity predicted (as extracted + from the data frame using the col_true index). + Ypred_Lp_mean : numpy array + Array with predicted values of the lower percentile + (usually the 10th percentile). + Ypred_Hp_mean : numpy array + Array with predicted values of the higher percentile + (usually the 90th percentile).
+ """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred_50q_mean = np.mean(df_data.iloc[:,col_pred_start::3], axis=1) + Ypred_mean = Ypred_50q_mean.values + print('Ypred shape: ', Ypred_mean.shape) + Ypred_Lp_mean_ = np.mean(df_data.iloc[:,col_pred_start+1::3], axis=1) + Ypred_Hp_mean_ = np.mean(df_data.iloc[:,col_pred_start+2::3], axis=1) + Ypred_Lp_mean = Ypred_Lp_mean_.values + Ypred_Hp_mean = Ypred_Hp_mean_.values + interdecile_range = Ypred_Hp_mean - Ypred_Lp_mean + sigma = interdecile_range / sigma_divisor + print('sigma shape: ', sigma.shape) + yerror = Ytrue - Ypred_mean + print('yerror shape: ', yerror.shape) + Ypred_std_ = np.std(df_data.iloc[:,col_pred_start::3], axis=1) + Ypred_std = Ypred_std_.values + print('Ypred_std shape: ', Ypred_std.shape) + MSE = np.mean((Ytrue - Ypred_mean)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred_mean)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred_mean) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred_mean, yerror, sigma, Ypred_std, pred_name, Ypred_Lp_mean, Ypred_Hp_mean + + +def split_data_for_empirical_calibration(Ytrue, Ypred, sigma, cal_split=0.8): + """ Extracts a portion of the arrays provided for the computation + of the calibration and reserves the remainder portion + for testing. + + Parameters + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + sigma : numpy array + Array with standard deviations learned with deep learning + model (or std value computed from prediction if homoscedastic + inference). + cal_split : float + Split of data to use for estimating the calibration relationship. + It is assumet that it will be a value in (0, 1). + (Default: use 80% of predictions to generate empirical + calibration). 
+ + Return + ---------- + index_perm_total : numpy array + Random permutation of the array indices. The first 'num_cal' + of the indices correspond to the samples that are used for + calibration, while the remainder are the samples reserved + for calibration testing. + pSigma_cal : numpy array + Part of the input sigma array to use for calibration. + pSigma_test : numpy array + Part of the input sigma array to reserve for testing. + pPred_cal : numpy array + Part of the input Ypred array to use for calibration. + pPred_test : numpy array + Part of the input Ypred array to reserve for testing. + true_cal : numpy array + Part of the input Ytrue array to use for calibration. + true_test : numpy array + Part of the input Ytrue array to reserve for testing. + """ + + # shuffle data for calibration + num_pred_total = sigma.shape[0] + num_cal = np.int(num_pred_total * cal_split) + index_perm_total = np.random.permutation(range(num_pred_total)) + + # Permute data + pSigma_perm_all = sigma[index_perm_total] + pPred_perm_all = Ypred[index_perm_total] + true_perm_all = Ytrue[index_perm_total] + + # Split in calibration and testing + pSigma_cal = pSigma_perm_all[:num_cal] + pSigma_test = pSigma_perm_all[num_cal:] + pPred_cal = pPred_perm_all[:num_cal] + pPred_test = pPred_perm_all[num_cal:] + true_cal = true_perm_all[:num_cal] + true_test = true_perm_all[num_cal:] + + print('Size of calibration set: ', true_cal.shape) + print('Size of test set: ', true_test.shape) + + return index_perm_total, pSigma_cal, pSigma_test, pPred_cal, pPred_test, true_cal, true_test + + +def compute_empirical_calibration(pSigma_cal, pPred_cal, true_cal, bins, coverage_percentile): + """ Use the arrays provided to estimate an empirical mapping + between standard deviation and absolute value of error, + both of which have been observed during inference. Since + most of the times the raw statistics per bin are very noisy, + a smoothing step (based on scipy's savgol filter) is performed. 
+ + Parameters + ---------- + pSigma_cal : numpy array + Part of the standard deviations array to use for calibration. + pPred_cal : numpy array + Part of the predictions array to use for calibration. + true_cal : numpy array + Part of the true (observed) values array to use for calibration. + bins : int + Number of bins to split the range of standard deviations + included in pSigma_cal array. + coverage_percentile : float + Value to use for estimating coverage when evaluating the percentiles + of the observed absolute value of errors. + + Return + ---------- + mean_sigma : numpy array + Array with the mean standard deviations computed per bin. + min_sigma : numpy array + Array with the minimum standard deviations computed per bin. + max_sigma : numpy array + Array with the maximum standard deviations computed per bin. + error_thresholds : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin. + err_err : numpy array + Error bars in errors (one standard deviation for a binomial + distribution estimated by bin vs. the other bins) for the + calibration error. + error_thresholds_smooth : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin after a smoothing operation is applied + to the frequently noisy bin-based estimations. + sigma_start_index : non-negative integer + Index in the mean_sigma array that defines the start of + the valid empirical calibration interval (i.e. index to + the smallest std for which a meaningful error mapping + is obtained). + sigma_end_index : non-negative integer + Index in the mean_sigma array that defines the end of + the valid empirical calibration interval (i.e. index to + the largest std for which a meaningful error mapping + is obtained).
+ s_interpolate : scipy.interpolate python object + A python object from scipy.interpolate that computes a + univariate spline (InterpolatedUnivariateSpline) constructed + to express the mapping from standard deviation to error. This + spline is generated during the computational empirical + calibration procedure. + """ + + index_sigma_cal = np.argsort(pSigma_cal) + pSigma_cal_ordered_ = pSigma_cal[index_sigma_cal] + Er_vect_cal_ = np.abs(true_cal - pPred_cal) + Er_vect_cal_orderedSigma_ = Er_vect_cal_[index_sigma_cal] + + minL_sigma = np.min(pSigma_cal_ordered_) + maxL_sigma = np.max(pSigma_cal_ordered_) + print('Complete Sigma range --> Min: %f, Max: %f' % (minL_sigma, maxL_sigma)) + + # Bin statistics for error and sigma + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err = bining_for_calibration(pSigma_cal_ordered_, + minL_sigma, + maxL_sigma, + Er_vect_cal_orderedSigma_, + bins, + coverage_percentile) + + # smooth error function + #scipy.signal.savgol_filter(x, window_length, polyorder, + #deriv=0, delta=1.0, axis=-1, mode='interp', cval=0.0) + #error_thresholds_smooth = signal.savgol_filter(error_thresholds, 5, 1) + error_thresholds_smooth = signal.savgol_filter(error_thresholds, 5, 1, mode='nearest') + + # Build Interpolant over smooth plot (this will become the calibration function) + s_interpolate = InterpolatedUnivariateSpline(mean_sigma, error_thresholds_smooth) + # Determine limits of calibration (i.e. 
monotonicity range) + sigma_start_index, sigma_end_index = computation_of_valid_calibration_interval(error_thresholds, error_thresholds_smooth, err_err) + + print('Range of valid sigma: %.6f --> %.6f' % (mean_sigma[sigma_start_index], mean_sigma[sigma_end_index])) + + return mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate + + + +def bining_for_calibration(pSigma_cal_ordered_, minL_sigma, + maxL_sigma, Er_vect_cal_orderedSigma_, + bins, coverage_percentile): + """ Bin the values of the standard deviations observed during + inference and estimate a specified coverage percentile + in the absolute error (observed during inference as well). + Bins that have less than 50 samples are merged until they + surpass this threshold. + + Parameters + ---------- + pSigma_cal_ordered_ : numpy array + Array of standard deviations ordered in ascending way. + minL_sigma : float + Minimum value of standard deviations included in + pSigma_cal_ordered_ array. + maxL_sigma : numpy array + Maximum value of standard deviations included in + pSigma_cal_ordered_ array. + Er_vect_cal_orderedSigma_ : numpy array + Array ob absolute value of errors corresponding with + the array of ordered standard deviations. + bins : int + Number of bins to split the range of standard deviations + included in pSigma_cal_ordered_ array. + coverage_percentile : float + Value to use for estimating coverage when evaluating the percentiles + of the observed absolute value of errors. + + Return + ---------- + mean_sigma : numpy array + Array with the mean standard deviations computed per bin. + min_sigma : numpy array + Array with the minimum standard deviations computed per bin. + max_sigma : numpy array + Array with the maximum standard deviations computed per bin. + error_thresholds : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin. 
+ err_err : numpy array + Error bars in errors (one standard deviation for a binomial + distribution estimated by bin vs. the other bins) for the + calibration error. + """ + + #thresholds = np.logspace(np.log10(minL_sigma), np.log10(maxL_sigma), num=bins) + thresholds = np.linspace(minL_sigma, maxL_sigma, num=bins) + classes = np.digitize(pSigma_cal_ordered_, thresholds) + Nbin = np.zeros(bins+1) + for i in range(bins+1): + indices = (classes == i) + Nbin[i] = indices.sum() + + # Repair bins + new_thresholds_l = [] + new_nbins_l = [] + sumN = 0 + for i in range(Nbin.shape[0]): + sumN += Nbin[i] + if sumN > 50: + if i > (thresholds.shape[0] - 1): + new_thresholds_l.append(thresholds[-1]) + else: + new_thresholds_l.append(thresholds[i]) + new_nbins_l.append(sumN) + sumN = 0 + new_thresholds = np.array(new_thresholds_l) + new_nbins = np.array(new_nbins_l) + new_thresholds[-1] = thresholds[-1] + new_nbins[-1] += sumN + + # + classes = np.digitize(pSigma_cal_ordered_, new_thresholds[:-1]) + error_thresholds = -1. * np.ones(new_nbins.shape[0]) + mean_sigma = -1. * np.ones(new_nbins.shape[0]) + min_sigma = -1. * np.ones(new_nbins.shape[0]) + max_sigma = -1. * np.ones(new_nbins.shape[0]) + err_err = -1. 
* np.ones(new_nbins.shape[0]) + Ncal = pSigma_cal_ordered_.shape[0] + for i in range(error_thresholds.shape[0]): + indices = (classes == i) + n_aux = indices.sum() + assert n_aux == new_nbins[i] + print('Points in bin %d: %d' % (i, n_aux)) + mean_sigma[i] = np.mean(pSigma_cal_ordered_[indices]) + min_sigma[i] = np.min(pSigma_cal_ordered_[indices]) + max_sigma[i] = np.max(pSigma_cal_ordered_[indices]) + error_thresholds[i] = np.percentile(Er_vect_cal_orderedSigma_[indices], coverage_percentile) + err_err[i] = np.sqrt(new_nbins[i] * (Ncal - new_nbins[i])) / Ncal * error_thresholds[i] + + return mean_sigma, min_sigma, max_sigma, error_thresholds, err_err + + +def computation_of_valid_calibration_interval(error_thresholds, error_thresholds_smooth, err_err): + """ Function that estimates the empirical range in which a + monotonic relation is observed between standard deviation + and coverage of absolute value of error. Since the + statistics computed per bin are relatively noisy, the + application of a greedy criterion (e.g. guarantee a + monotonically increasing relationship) does not yield + good results. Therefore, a softer version is constructed + based on the satisfaction of certain criteria depending + on: the values of the error coverage computed per bin, + a smoothed version of them and the assocatiate error + estimated (based on one standard deviation for a binomial + distribution estimated by bin vs. the other bins). + A minimal validation requiring the end idex to be + largest than the starting index is performed before + the function return. + + Current criteria: + - the smoothed errors are inside the error bars AND + they are almost increasing (a small tolerance is + allowed, so a small wobbliness in the smoother + values is permitted). + OR + - both the raw values for the bins (with a small tolerance) + are increasing, AND the smoothed value is greater than the + raw value. 
+ OR + - the current smoothed value is greater than the previous AND + the smoothed value for the next bin is inside the error + bars. + + Parameters + ---------- + error_thresholds : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin. + error_thresholds_smooth : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin after a smoothing operation is applied + to the frequently noisy bin-based estimations. + err_err : numpy array + Error bars in errors (one standard deviation for a binomial + distribution estimated by bin vs. the other bins) for the + calibration error. + + Return + ---------- + sigma_start_index : non-negative integer + Index estimated in the mean_sigma array corresponding to + the value that defines the start of the valid empirical + calibration interval (i.e. index to the smallest std for + which a meaningful error mapping is obtained, according + to the criteria explained before). + sigma_end_index : non-negative integer + Index estimated in the mean_sigma array corresponding to + the value that defines the end of the valid empirical + calibration interval (i.e. index to the largest std for + which a meaningful error mapping is obtained, according + to the criteria explained before).
+ """ + + # Computation of the calibration interval + limitH = error_thresholds + err_err + limitL = error_thresholds - err_err + + # search for starting point + for i in range(err_err.shape[0]): + if ((error_thresholds_smooth[i] >= limitL[i]) and + (error_thresholds_smooth[i] <= limitH[i])): # Ask if the current is in the interval + sigma_start_index = i + break + sigma_end_index = sigma_start_index - 1 + + restart = max(1, sigma_start_index) + for i in range(restart, err_err.shape[0]-1): + if (((error_thresholds_smooth[i] >= limitL[i]) and + (error_thresholds_smooth[i] <= limitH[i]) and + ((error_thresholds_smooth[i] * 1.005 > error_thresholds_smooth[i-1]) or + ((error_thresholds[i] * 1.01 > error_thresholds[i-1]) and + (error_thresholds_smooth[i] > error_thresholds[i])))) # Ask if the current is in the interval with slightly increasing trend + or # Ask if the current is greater than the previous and the next is in the interval + ((error_thresholds_smooth[i] > error_thresholds_smooth[i-1]) and + ((error_thresholds_smooth[i+1] >= limitL[i+1]) and + (error_thresholds_smooth[i+1] <= limitH[i+1])))): + + sigma_end_index = i + else: # Finalize search for monotonic range + if (sigma_end_index - sigma_start_index) > 4: + break + else: # Reset indices + sigma_start_index = i + 1 + sigma_end_index = i + + print('Range of valid sigma indices (inclusive): %d --> %d' % (sigma_start_index, sigma_end_index)) + + assert (sigma_end_index > sigma_start_index) + + return sigma_start_index, sigma_end_index + + +def applying_calibration(pSigma_test, pPred_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto): + """ Use the empirical mapping between standard deviation and + absolute value of error estimated during calibration (i.e. + apply the univariate spline computed) to estimate the error + for the part of the standard deviation array that was reserved + for testing the empirical calibration. 
The resulting error array + (yp_test) should overestimate the true observed error (eabs_red). + All the computations are restricted to the valid calibration + interval: [minL_sigma_auto, maxL_sigma_auto). + + Parameters + ---------- + pSigma_test : numpy array + Part of the standard deviations array to use for calibration testing. + pPred_test : numpy array + Part of the predictions array to use for calibration testing. + true_test : numpy array + Part of the true (observed) values array to use for calibration testing. + s_interpolate : scipy.interpolate python object + A python object from scipy.interpolate that computes a + univariate spline (InterpolatedUnivariateSpline) expressing + the mapping from standard deviation to error. This + spline is generated during the computational empirical + calibration procedure. + minL_sigma_auto : float + Starting value of the valid empirical calibration interval + (i.e. smallest std for which a meaningful error mapping + is obtained). + maxL_sigma_auto : float + Ending value of the valid empirical calibration interval + (i.e. largest std for which a meaningful error mapping + is obtained). + + Return + ---------- + index_sigma_range_test : numpy array + Boolean mask over the pSigma_test array marking the entries in the + valid calibration interval, given by: + [minL_sigma_auto, maxL_sigma_auto). + xp_test : numpy array + Array with the mean standard deviations in the calibration + testing array. + yp_test : numpy array + Mapping of the given standard deviation to error computed + from the interpolation spline constructed by empirical + calibration. + eabs_red : numpy array + Array with the observed absolute errors in the part of the testing + array for which the observed standard deviations are in the + valid interval of calibration.
+ """ + + # Filter to appropriate range + index_sigma_range_test = (pSigma_test >= minL_sigma_auto) & (pSigma_test < maxL_sigma_auto) + xp_test = pSigma_test[index_sigma_range_test] + yp_test = s_interpolate(xp_test) + Er_vect_ = true_test - pPred_test + eabs_ = np.abs(Er_vect_) + eabs_red = eabs_[index_sigma_range_test] + + return index_sigma_range_test, xp_test, yp_test, eabs_red + + +def overprediction_check(yp_test, eabs_red): + """ Compute the percentage of overestimated absoulte error + predictions for the arrays reserved for calibration testing + and whose corresponding standard deviations are included + in the valid calibration interval. + + Parameters + ---------- + yp_test : numpy array + Mapping of the standard deviation to error computed + from the interpolation spline constructed by empirical + calibration. + eabs_red : numpy array + Array with the observed abolute errors in the part of the testing + array for which the observed standard deviations are in the + valid interval of calibration. 
+ """ + + over_pred_error_index = (yp_test >= eabs_red) + percentage_over_predicted = (over_pred_error_index.sum() / yp_test.shape[0]) + print("percentage over predicted: ", percentage_over_predicted) diff --git a/common/viz_utils.py b/common/viz_utils.py index eb570e37..c4eed5e2 100644 --- a/common/viz_utils.py +++ b/common/viz_utils.py @@ -1,7 +1,10 @@ +from pathlib import Path import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt +import numpy as np + def plot_history(out, history, metric='loss', title=None, width=8, height=6): title = title or 'model {}'.format(metric) val_metric = 'val_{}'.format(metric) @@ -14,6 +17,7 @@ def plot_history(out, history, metric='loss', title=None, width=8, height=6): plt.legend(['train_{}'.format(metric), 'val_{}'.format(metric)], loc='upper center') png = '{}.plot.{}.png'.format(out, metric) plt.savefig(png, bbox_inches='tight') + plt.close() def plot_scatter(data, classes, out, width=10, height=8): cmap = plt.cm.get_cmap('gist_rainbow') @@ -22,6 +26,7 @@ def plot_scatter(data, classes, out, width=10, height=8): plt.colorbar() png = '{}.png'.format(out) plt.savefig(png, bbox_inches='tight') + plt.close() def plot_error(y_true, y_pred, batch, file_ext, file_pre='output_dir', subsample=1000): if batch % 10: @@ -60,3 +65,365 @@ def plot_error(y_true, y_pred, batch, file_ext, file_pre='output_dir', subsample plt.savefig(file_pre+'.diff'+file_ext+'.b'+str(batch)+'.png') plt.close() +###### UTILS for UQ / CALIBRATION VISUALIZATION + +from matplotlib.colors import LogNorm + +def plot_density_observed_vs_predicted(Ytest, Ypred, pred_name=None, figprefix=None): + """Functionality to plot a 2D histogram of the distribution of observed (ground truth) + values vs. predicted values. The plot generated is stored in a png file. + + Parameters + ---------- + Ytest : numpy array + Array with (true) observed values + Ypred : numpy array + Array with predicted values. 
+ pred_name : string + Name of data colum or quantity predicted (e.g. growth, AUC, etc.) + figprefix : string + String to prefix the filename to store the figure generated. + A '_density_predictions.png' string will be appended to the + figprefix given. + """ + + xbins = 51 + + fig = plt.figure(figsize=(24,18)) # (30,16) + ax = plt.gca() + plt.rc('xtick', labelsize=16) # fontsize of the tick labels + ax.plot([Ytest.min(), Ytest.max()], [Ytest.min(), Ytest.max()], 'r--', lw=4.) + plt.hist2d(Ytest, Ypred, bins=xbins, norm=LogNorm()) + cb = plt.colorbar() + ax.set_xlabel('Observed ' + pred_name, fontsize=38, labelpad=15.) + ax.set_ylabel('Mean ' + pred_name + ' Predicted', fontsize=38, labelpad=15.) + ax.axis([Ytest.min()*0.98, Ytest.max()*1.02, Ytest.min()*0.98, Ytest.max()*1.02]) + plt.setp(ax.get_xticklabels(), fontsize=32) + plt.setp(ax.get_yticklabels(), fontsize=32) + cb.ax.set_yticklabels(cb.ax.get_yticklabels(), fontsize=28) + plt.grid(True) + plt.savefig(figprefix + '_density_predictions.png') + plt.close() + print('Generated plot: ', figprefix + '_density_predictions.png') + + +def plot_2d_density_sigma_vs_error(sigma, yerror, method=None, figprefix=None): + """Functionality to plot a 2D histogram of the distribution of + the standard deviations computed for the predictions vs. the + computed errors (i.e. values of observed - predicted). + The plot generated is stored in a png file. + + Parameters + ---------- + sigma : numpy array + Array with standard deviations computed. + yerror : numpy array + Array with errors computed (observed - predicted). + method : string + Method used to comput the standard deviations (i.e. dropout, + heteroscedastic, etc.). + figprefix : string + String to prefix the filename to store the figure generated. + A '_density_sigma_error.png' string will be appended to the + figprefix given. 
+ """ + + xbins = 51 + ybins = 31 + + fig = plt.figure(figsize=(24,12)) # (30,16) + ax = plt.gca() + plt.rc('xtick', labelsize=16) # fontsize of the tick labels + plt.hist2d(sigma, yerror, bins=[xbins,ybins], norm=LogNorm()) + cb = plt.colorbar() + ax.set_xlabel('Sigma (' + method + ')', fontsize=38, labelpad=15.) + ax.set_ylabel('Observed - Mean Predicted', fontsize=38, labelpad=15.) + ax.axis([sigma.min()*0.98, sigma.max()*1.02, -yerror.max(), yerror.max()]) + plt.setp(ax.get_xticklabels(), fontsize=28) + plt.setp(ax.get_yticklabels(), fontsize=28) + cb.ax.set_yticklabels(cb.ax.get_yticklabels(), fontsize=22) + plt.grid(True) + plt.savefig(figprefix + '_density_sigma_error.png') + plt.close() + print('Generated plot: ', figprefix + '_density_sigma_error.png') + + +def plot_histogram_error_per_sigma(sigma, yerror, method=None, figprefix=None): + """Functionality to plot a 1D histogram of the distribution of + computed errors (i.e. values of observed - predicted) observed + for specific values of standard deviations computed. The range of + standard deviations computed is split in xbins values and the + 1D histograms of error distributions for the smallest six + standard deviations are plotted. + The plot generated is stored in a png file. + + Parameters + ---------- + sigma : numpy array + Array with standard deviations computed. + yerror : numpy array + Array with errors computed (observed - predicted). + method : string + Method used to comput the standard deviations (i.e. dropout, + heteroscedastic, etc.). + figprefix : string + String to prefix the filename to store the figure generated. + A '_histogram_error_per_sigma.png' string will be appended to + the figprefix given. 
+ """ + + xbins = 21 + ybins = 31 + + H, xedges, yedges, img = plt.hist2d(sigma, yerror,# normed=True, + bins=[xbins,ybins]) + + fig = plt.figure(figsize=(14,16)) + legend = [] + for ii in range(6):#(H.shape[0]): + if ii is not 1: + plt.plot(yedges[0:H.shape[1]], H[ii,:]/np.sum(H[ii,:]), marker='o', + markersize=12, lw=6.) + legend.append(str((xedges[ii] + xedges[ii+1])/2)) + plt.legend(legend, fontsize=16) + ax = plt.gca() + plt.title('Error Dist. per Sigma for ' + method, fontsize=40) + ax.set_xlabel('Observed - Mean Predicted', fontsize=38, labelpad=15.) + ax.set_ylabel('Density', fontsize=38, labelpad=15.) + plt.setp(ax.get_xticklabels(), fontsize=28) + plt.setp(ax.get_yticklabels(), fontsize=28) + plt.grid(True) + plt.savefig(figprefix + '_histogram_error_per_sigma.png') + plt.close() + print('Generated plot: ', figprefix + '_histogram_error_per_sigma.png') + + +def plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, + method=None, figprefix=None, + steps=False): + """Functionality to plot empirical calibration curves + estimated by binning the statistics of computed + standard deviations and errors. + + Parameters + ---------- + mean_sigma : numpy array + Array with the mean standard deviations computed per bin. + sigma_start_index : non-negative integer + Index of the mean_sigma array that defines the start of + the valid empirical calibration interval (i.e. index to + the smallest std for which a meaningful error is obtained). + sigma_end_index : non-negative integer + Index of the mean_sigma array that defines the end of + the valid empirical calibration interval (i.e. index to + the largest std for which a meaningful error is obtained). + min_sigma : numpy array + Array with the minimum standard deviations computed per bin. + max_sigma : numpy array + Array with the maximum standard deviations computed per bin. 
+ error_thresholds : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin. + error_thresholds_smooth : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin after a smoothed operation is applied + to the frequently noisy bin-based estimations. + err_err : numpy array + Vertical error bars (usually one standard deviation for a binomial + distribution estimated by bin) for the error calibration + computed empirically. + s_interpolate : scipy.interpolate python object + A python object from scipy.interpolate that computes a + univariate spline (InterpolatedUnivariateSpline) constructed + to express the mapping from standard deviation to error. This + spline is generated during the computational empirical + calibration procedure. + coverage_percentile : float + Value used for the coverage in the percentile estimation + of the observed error. + method : string + Method used to comput the standard deviations (i.e. dropout, + heteroscedastic, etc.). + figprefix : string + String to prefix the filename to store the figure generated. + A '_empirical_calibration.png' string will be appended to + the figprefix given. + steps : boolean + Besides the complete empirical calibration (including raw + statistics, error bars and smoothing), also generates partial + plots with only the raw bin statistics (step1) and with only + the raw bin statistics and the smoothing interpolation (step2). + """ + + xp23 = np.linspace(mean_sigma[sigma_start_index], mean_sigma[sigma_end_index], 200) + yp23 = s_interpolate(xp23) + + p_cov = coverage_percentile + if steps: + # Plot raw bin statistics + fig = plt.figure(figsize=(18,12)) + ax = plt.gca() + ax.errorbar(mean_sigma, error_thresholds, + yerr=err_err, + xerr=[mean_sigma-min_sigma, max_sigma-mean_sigma], + fmt='o', ecolor='k', capthick=2, ms=8) + plt.xlabel('Sigma Predicted (' + method + ')', fontsize=24.) 
+ plt.ylabel(str(p_cov) + '% Coverage for ABS Observed - Mean Predicted', fontsize=24.) + plt.title('Calibration', fontsize=28) + ax.axis([0, np.max(max_sigma)*1.1, np.min(error_thresholds)*0.9, np.max(yp23)*1.2]) + plt.grid() + plt.setp(ax.get_xticklabels(), fontsize=22) + plt.setp(ax.get_yticklabels(), fontsize=22) + plt.savefig(figprefix + '_empirical_calibration_step1.png') + plt.close() + print('Generated plot: ', figprefix + '_empirical_calibration_step1.png') + # Plot raw bin statistics and smoothing + fig = plt.figure(figsize=(18,12)) + ax = plt.gca() + ax.plot(mean_sigma, error_thresholds_smooth, 'g^', ms=12) + ax.errorbar(mean_sigma, error_thresholds, + yerr=err_err, + xerr=[mean_sigma-min_sigma, max_sigma-mean_sigma], + fmt='o', ecolor='k', capthick=2, ms=8) + plt.xlabel('Sigma Predicted (' + method + ')', fontsize=24.) + plt.ylabel(str(p_cov) + '% Coverage for ABS Observed - Mean Predicted', fontsize=24.) + plt.title('Calibration', fontsize=28) + ax.axis([0, np.max(max_sigma)*1.1, np.min(error_thresholds)*0.9, np.max(yp23)*1.2]) + plt.grid() + plt.setp(ax.get_xticklabels(), fontsize=22) + plt.setp(ax.get_yticklabels(), fontsize=22) + plt.savefig(figprefix + '_empirical_calibration_step2.png') + plt.close() + print('Generated plot: ', figprefix + '_empirical_calibration_step2.png') + + # Plot raw bin statistics, smoothing and empirical calibration + fig = plt.figure(figsize=(18,12)) + ax = plt.gca() + ax.plot(xp23, yp23, 'rx', ms=20) + ax.plot(mean_sigma, error_thresholds_smooth, 'g^', ms=12) + ax.errorbar(mean_sigma, error_thresholds, + yerr=err_err, + xerr=[mean_sigma-min_sigma, max_sigma-mean_sigma], + fmt='o', ecolor='k', capthick=2, ms=8) + plt.xlabel('Sigma Predicted (' + method + ')', fontsize=24.) + plt.ylabel(str(p_cov) + '% Coverage for ABS Observed - Mean Predicted', fontsize=24.) 
+ plt.title('Calibration', fontsize=28) + ax.axis([0, np.max(max_sigma)*1.1, np.min(error_thresholds)*0.9, np.max(yp23)*1.2]) + plt.grid() + plt.setp(ax.get_xticklabels(), fontsize=22) + plt.setp(ax.get_yticklabels(), fontsize=22) + plt.savefig(figprefix + '_empirical_calibration.png') + plt.close() + print('Generated plot: ', figprefix + '_empirical_calibration.png') + + +def plot_percentile_predictions(Ypred, Ypred_Lp, Ypred_Hp, percentile_list, pred_name=None, figprefix=None): + """Functionality to plot the mean of the percentiles predicted. + The plot generated is stored in a png file. + + Parameters + ---------- + Ypred : numpy array + Array with mid percentile predicted values. + Ypred_Lp : numpy array + Array with low percentile predicted values. + Ypred_Hp : numpy array + Array with high percentile predicted values. + percentile_list : string list + List of percentiles predicted (e.g. '10p', '90p', etc.) + pred_name : string + Name of data colum or quantity predicted (e.g. growth, AUC, etc.) + figprefix : string + String to prefix the filename to store the figure generated. + A '_density_predictions.png' string will be appended to the + figprefix given. + """ + + index_ = np.argsort(Ypred) + fig = plt.figure(figsize=(24,18)) + plt.scatter(range(index_.shape[0]), Ypred[index_]) + plt.scatter(range(index_.shape[0]), Ypred_Lp[index_]) + plt.scatter(range(index_.shape[0]), Ypred_Hp[index_]) + plt.legend(percentile_list, fontsize=20) + plt.xlabel('Index', fontsize=18.) + plt.ylabel(pred_name, fontsize=18.) 
+ plt.title('Predicted ' + pred_name + ' Percentiles', fontsize=28) + plt.grid() + ax = plt.gca() + plt.setp(ax.get_xticklabels(), fontsize=16) + plt.setp(ax.get_yticklabels(), fontsize=16) + plt.savefig(figprefix + '_percentile_predictions.png') + plt.close() + print('Generated plot: ', figprefix + '_percentile_predictions.png') + + +# plot training and validation metrics together and generate one chart per metrics +def plot_metrics(history, title=None, skip_ep=0, outdir='.', add_lr=False): + """ Plots keras training curves history. + Args: + skip_ep: number of epochs to skip when plotting metrics + add_lr: add curve of learning rate progression over epochs + """ + + def capitalize_metric(met): + return ' '.join(s.capitalize() for s in met.split('_')) + + all_metrics = list(history.history.keys()) + pr_metrics = ['_'.join(m.split('_')[1:]) for m in all_metrics if 'val' in m] + + epochs = np.asarray(history.epoch) + 1 + if len(epochs) <= skip_ep: + skip_ep = 0 + eps = epochs[skip_ep:] + hh = history.history + + for p, m in enumerate(pr_metrics): + metric_name = m + metric_name_val = 'val_' + m + + y_tr = hh[metric_name][skip_ep:] + y_vl = hh[metric_name_val][skip_ep:] + + ymin = min(set(y_tr).union(y_vl)) + ymax = max(set(y_tr).union(y_vl)) + lim = (ymax - ymin) * 0.1 + ymin, ymax = ymin - lim, ymax + lim + + # Start figure + fig, ax1 = plt.subplots() + + # Plot metrics + ax1.plot(eps, y_tr, color='b', marker='.', linestyle='-', linewidth=1, alpha=0.6, label=capitalize_metric(metric_name)) + ax1.plot(eps, y_vl, color='r', marker='.', linestyle='--', linewidth=1, alpha=0.6, label=capitalize_metric(metric_name_val)) + ax1.set_xlabel('Epoch') + ax1.set_ylabel(capitalize_metric(metric_name)) + ax1.set_xlim([min(eps) - 1, max(eps) + 1]) + ax1.set_ylim([ymin, ymax]) + ax1.tick_params('y', colors='k') + + # Add learning rate + if (add_lr is True) and ('lr' in hh): + ax2 = ax1.twinx() + ax2.plot(eps, hh['lr'][skip_ep:], color='g', marker='.', linestyle=':', linewidth=1, + 
alpha=0.6, markersize=5, label='LR') + ax2.set_ylabel('Learning rate', color='g', fontsize=12) + + ax2.set_yscale('log') + ax2.tick_params('y', colors='g') + + ax1.grid(True) + legend = ax1.legend(loc='best', prop={'size': 10}) + frame = legend.get_frame() + frame.set_facecolor('0.95') + if title is not None: + plt.title(title) + + figpath = Path(outdir) / (metric_name + '.png') + plt.savefig(figpath, bbox_inches='tight') + plt.close() diff --git a/examples/ADRP/README.md b/examples/ADRP/README.md new file mode 100644 index 00000000..2c6acf58 --- /dev/null +++ b/examples/ADRP/README.md @@ -0,0 +1,144 @@ +# Pilot1 ADRP Benchmark + +## loads a csv file + +Benchmark auto downloads the file below: +http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/ (~500MB) + +## Sample run: + +``` +$ export CUDA_VISIBLE_DEVICES=1 +$ python adrp_baseline_keras2.py +Using TensorFlow backend. +Importing candle utils for keras +Configuration file: /home/jain/CANDLE/fork/Benchmarks/examples/ADRP/adrp_default_model.txt +{'activation': 'relu', + 'batch_normalization': False, + 'batch_size': 32, + 'data_url': 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/', + 'dense': [250, 125, 60, 30], + 'drop': 0.1, + 'early_stop': True, + 'epochs': 1, + 'epsilon_std': 1.0, + 'feature_subsample': 0, + 'in': 'adrp-p1.csv', + 'initialization': 'glorot_uniform', + 'latent_dim': 2, + 'learning_rate': 0.0001, + 'loss': 'mean_squared_error', + 'model_name': 'adrp', + 'momentum': 0.9, + 'nb_classes': 2, + 'optimizer': 'sgd', + 'reduce_lr': True, + 'rng_seed': 2017, + 'save_path': './001/', + 'scaling': 'minmax', + 'timeout': 3600, + 'use_cp': False, + 'validation_split': 0.1} +Params: +{'activation': 'relu', + 'batch_normalization': False, + 'batch_size': 32, + 'data_url': 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/', + 'datatype': , + 'dense': [250, 125, 60, 30], + 'drop': 0.1, + 'early_stop': True, + 'epochs': 400, + 'epsilon_std': 1.0, + 'experiment_id': 
'EXP000', + 'feature_subsample': 0, + 'gpus': [], + 'in': 'adrp-p1.csv', + 'initialization': 'glorot_uniform', + 'latent_dim': 2, + 'learning_rate': 0.0001, + 'logfile': None, + 'loss': 'mean_squared_error', + 'model_name': 'adrp', + 'momentum': 0.9, + 'nb_classes': 2, + 'optimizer': 'sgd', + 'output_dir': '/home/jain/CANDLE/fork/Benchmarks/examples/ADRP/Output/EXP000/RUN000', + 'profiling': False, + 'reduce_lr': True, + 'residual': False, + 'rng_seed': 2017, + 'run_id': 'RUN000', + 'save_path': './001/', + 'scaling': 'minmax', + 'shuffle': False, + 'timeout': 0, + 'train_bool': True, + 'tsne': False, + 'use_cp': False, + 'use_tb': False, + 'validation_split': 0.1, + 'verbose': None, + 'warmup_lr': False} +WARNING:tensorflow:From /home/jain/CANDLE/fork/Benchmarks/common/keras_utils.py:51: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead. + +Params: {'data_url': 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/', 'in': 'adrp-p1.csv', 'model_name': 'adrp', 'dense': [250, 125, 60, 30], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'mean_squared_error', 'optimizer': 'sgd', 'drop': 0.1, 'learning_rate': 0.0001, 'momentum': 0.9, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'save_path': './001/', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/home/jain/CANDLE/fork/Benchmarks/examples/ADRP/Output/EXP000/RUN000'} +processing csv in file adrp-p1.csv +PL= 1614 +X_train shape: (27447, 1613) +X_test shape: (6862, 1613) +Y_train shape: (27447,) +Y_test shape: (6862,) 
+WARNING:tensorflow:From /home/jain/.local/lib/python3.7/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version. +Instructions for updating: +If using Keras pass *_constraint arguments to layers. +Model: "model_1" +_________________________________________________________________ +Layer (type) Output Shape Param # +================================================================= +input_1 (InputLayer) (None, 1613) 0 +_________________________________________________________________ +dense_1 (Dense) (None, 250) 403500 +_________________________________________________________________ +dropout_1 (Dropout) (None, 250) 0 +_________________________________________________________________ +dense_2 (Dense) (None, 125) 31375 +_________________________________________________________________ +dropout_2 (Dropout) (None, 125) 0 +_________________________________________________________________ +dense_3 (Dense) (None, 60) 7560 +_________________________________________________________________ +dropout_3 (Dropout) (None, 60) 0 +_________________________________________________________________ +dense_4 (Dense) (None, 30) 1830 +_________________________________________________________________ +dropout_4 (Dropout) (None, 30) 0 +_________________________________________________________________ +dense_5 (Dense) (None, 1) 31 +================================================================= +Total params: 444,296 +Trainable params: 444,296 +Non-trainable params: 0 +_________________________________________________________________ +/home/jain/.local/lib/python3.7/site-packages/keras/callbacks/callbacks.py:998: UserWarning: `epsilon` argument is deprecated and will be removed, use `min_delta` instead. 
+ warnings.warn('`epsilon` argument is deprecated and ' +2020-03-23 14:36:20.461062: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1 +2020-03-23 14:36:20.463626: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error +2020-03-23 14:36:20.463720: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jain): /proc/driver/nvidia/version does not exist +2020-03-23 14:36:20.464039: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 +2020-03-23 14:36:20.475490: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2808000000 Hz +2020-03-23 14:36:20.475685: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x2dab430 initialized for platform Host (this does not guarantee that XLA will be used). Devices: +2020-03-23 14:36:20.475708: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version +WARNING:tensorflow:From /home/jain/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead. 
+ +Train on 27447 samples, validate on 6862 samples +Epoch 1/1 +27447/27447 [==============================] - 5s 173us/step - loss: 3.4695 - mae: 1.3269 - r2: -2.1720 - val_loss: 1.2343 - val_mae: 0.9235 - val_r2: -0.1880 + +Epoch 00001: val_loss improved from inf to 1.23431, saving model to ./001/agg_adrp.autosave.model.h5 +[1.2343122459159792, 0.9235042333602905, -0.18803702294826508] +dict_keys(['val_loss', 'val_mae', 'val_r2', 'loss', 'mae', 'r2', 'lr']) +saving to path: ./001/ +Test val_loss: 1.2343122459159792 +Test val_mae: 0.9235042333602905 +``` diff --git a/examples/ADRP/adrp.py b/examples/ADRP/adrp.py new file mode 100644 index 00000000..1eabb26b --- /dev/null +++ b/examples/ADRP/adrp.py @@ -0,0 +1,175 @@ +from __future__ import print_function + +import os +import sys +import logging + +import pandas as pd +import numpy as np + +from sklearn.metrics import mean_squared_error +from sklearn.metrics import r2_score +from scipy.stats.stats import pearsonr + +from sklearn.model_selection import train_test_split +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error +from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler + + +file_path = os.path.dirname(os.path.realpath(__file__)) +# lib_path = os.path.abspath(os.path.join(file_path, '..')) +# sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, "..", "..", "common")) +sys.path.append(lib_path2) + +import candle + +logger = logging.getLogger(__name__) +candle.set_parallelism_threads() + +additional_definitions = [ + {"name": "latent_dim", "action": "store", "type": int, "help": "latent dimensions"}, + { + "name": "residual", + "type": candle.str2bool, + "default": False, + "help": "add skip connections to the layers", + }, + { + "name": "reduce_lr", + "type": candle.str2bool, + "default": False, + "help": "reduce learning rate on plateau", + }, + { + "name": "warmup_lr", + "type": candle.str2bool, + "default": False, + "help": "gradually 
increase learning rate on start", + }, + {"name": "base_lr", "type": float, "help": "base learning rate"}, + { + "name": "epsilon_std", + "type": float, + "help": "epsilon std for sampling latent noise", + }, + { + "name": "use_cp", + "type": candle.str2bool, + "default": False, + "help": "checkpoint models with best val_loss", + }, + # {'name':'shuffle', + #'type': candle.str2bool, + #'default': False, + #'help':'shuffle data'}, + { + "name": "use_tb", + "type": candle.str2bool, + "default": False, + "help": "use tensorboard", + }, + { + "name": "tsne", + "type": candle.str2bool, + "default": False, + "help": "generate tsne plot of the latent representation", + }, +] + +required = [ + "activation", + "batch_size", + "dense", + "dropout", + "epochs", + "initialization", + "learning_rate", + "loss", + "optimizer", + "rng_seed", + "scaling", + "latent_dim", + "batch_normalization", + "epsilon_std", + "timeout", +] + + +class BenchmarkAdrp(candle.Benchmark): + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. 
+ """ + + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + + +def extension_from_parameters(params, framework=""): + """Construct string for saving model with annotation of parameters""" + ext = framework + for i, n in enumerate(params["dense"]): + if n: + ext += ".D{}={}".format(i + 1, n) + ext += ".A={}".format(params["activation"]) + ext += ".B={}".format(params["batch_size"]) + ext += ".E={}".format(params["epochs"]) + ext += ".L={}".format(params["latent_dim"]) + ext += ".LR={}".format(params["learning_rate"]) + ext += ".S={}".format(params["scaling"]) + + if params["epsilon_std"] != 1.0: + ext += ".EPS={}".format(params["epsilon_std"]) + if params["dropout"]: + ext += ".DR={}".format(params["dropout"]) + if params["batch_normalization"]: + ext += ".BN" + if params["warmup_lr"]: + ext += ".WU_LR" + if params["reduce_lr"]: + ext += ".Re_LR" + if params["residual"]: + ext += ".Res" + + return ext + + +def load_data(params, seed): + + # start change # + if params["train_data"].endswith("csv") or params["train_data"].endswith("csv"): + print("processing csv in file {}".format(params["train_data"])) + + url = params["data_url"] + file_train = params["train_data"] + train_file = candle.get_file( + file_train, url + file_train, cache_subdir="Pilot1" + ) + df = (pd.read_csv(train_file, skiprows=1).values).astype("float32") + + PL = df.shape[1] + print("PL=", PL) + + PS = PL - 1 + + df_y = df[:, 0].astype("float32") + df_x = df[:, 1:PL].astype(np.float32) + + df_y.shape + df_x.shape + scaler = StandardScaler() + df_x = scaler.fit_transform(df_x) + + X_train, X_test, Y_train, Y_test = train_test_split( + df_x, df_y, test_size=0.20, random_state=42 + ) + else: + print("expecting in file file suffix csv") + sys.exit() + + return X_train, Y_train, X_test, Y_test, PS diff --git a/examples/ADRP/adrp_baseline_keras2.py b/examples/ADRP/adrp_baseline_keras2.py new file mode 
100644 index 00000000..c22d016e --- /dev/null +++ b/examples/ADRP/adrp_baseline_keras2.py @@ -0,0 +1,483 @@ +from __future__ import print_function + +import itertools +import pandas as pd +import numpy as np +import os +import sys +import gzip +import argparse +import sklearn + +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt + +import tensorflow as tf + +import keras as ke +from keras import backend as K + +from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization +from keras.optimizers import SGD, Adam, RMSprop, Adadelta +from keras.models import Sequential, Model, model_from_json, model_from_yaml +from keras.utils import np_utils, multi_gpu_model + +from keras.callbacks import ( + Callback, + ModelCheckpoint, + CSVLogger, + ReduceLROnPlateau, + EarlyStopping, + TensorBoard, +) + +from sklearn.utils.class_weight import compute_class_weight +from sklearn.model_selection import train_test_split +from sklearn.metrics import ( + r2_score, + mean_squared_error, + mean_absolute_error, + roc_auc_score, + confusion_matrix, + balanced_accuracy_score, + classification_report, +) +from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler +from sklearn.metrics import ( + recall_score, + auc, + roc_curve, + f1_score, + precision_recall_curve, +) + +import adrp +import candle + +np.set_printoptions(precision=4) + + +def r2(y_true, y_pred): + SS_res = K.sum(K.square(y_true - y_pred)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return 1 - SS_res / (SS_tot + K.epsilon()) + + +def tf_auc(y_true, y_pred): + auc = tf.metrics.auc(y_true, y_pred)[1] + K.get_session().run(tf.local_variables_initializer()) + return auc + + +# from sklearn.metrics import roc_auc_score +# import tensorflow as tf + + +def auroc(y_true, y_pred): + score = tf.py_func( + lambda y_true, y_pred: roc_auc_score( + y_true, y_pred, average="macro", sample_weight=None + ).astype("float32"), + [y_true, y_pred], + "float32", + 
stateful=False, + name="sklearnAUC", + ) + return score + + +#def covariance(x, y): +# return K.mean(x * y) - K.mean(x) * K.mean(y) + + +def corr(y_true, y_pred): + cov = candle.covariance(y_true, y_pred) + var1 = candle.covariance(y_true, y_true) + var2 = candle.covariance(y_pred, y_pred) + return cov / (K.sqrt(var1 * var2) + K.epsilon()) + + +#def xent(y_true, y_pred): +# return binary_crossentropy(y_true, y_pred) + + +#def mse(y_true, y_pred): +# return mean_squared_error(y_true, y_pred) + + +class MetricHistory(Callback): + def on_epoch_begin(self, epoch, logs=None): + print("\n") + + def on_epoch_end(self, epoch, logs=None): + y_pred = self.model.predict(self.validation_data[0]) + r2 = r2_score(self.validation_data[1], y_pred) + corr, _ = pearsonr(self.validation_data[1].flatten(), y_pred.flatten()) + print("\nval_r2:", r2) + print(y_pred.shape) + print("\nval_corr:", corr, "val_r2:", r2) + print("\n") + + +class LoggingCallback(Callback): + def __init__(self, print_fcn=print): + Callback.__init__(self) + self.print_fcn = print_fcn + + def on_epoch_end(self, epoch, logs={}): + msg = "[Epoch: %i] %s" % ( + epoch, + ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items())), + ) + self.print_fcn(msg) + + +def build_type_classifier(x_train, y_train, x_test, y_test): + y_train = np.argmax(y_train, axis=1) + y_test = np.argmax(y_test, axis=1) + from xgboost import XGBClassifier + + clf = XGBClassifier(max_depth=6, n_estimators=100) + clf.fit( + x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)], verbose=False + ) + y_pred = clf.predict(x_test) + acc = accuracy_score(y_test, y_pred) + print(acc) + return clf + + +def initialize_parameters(default_model="adrp_default_model.txt"): + + # Build benchmark object + adrpBmk = adrp.BenchmarkAdrp( + adrp.file_path, + default_model, + "keras", + prog="adrp_baseline", + desc="Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1", + ) + + # Initialize parameters + gParameters = 
candle.finalize_parameters(adrpBmk) + # adrp.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def save_cache( + cache_file, x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels +): + with h5py.File(cache_file, "w") as hf: + hf.create_dataset("x_train", data=x_train) + hf.create_dataset("y_train", data=y_train) + hf.create_dataset("x_val", data=x_val) + hf.create_dataset("y_val", data=y_val) + hf.create_dataset("x_test", data=x_test) + hf.create_dataset("y_test", data=y_test) + hf.create_dataset( + "x_labels", + (len(x_labels), 1), + "S100", + data=[x.encode("ascii", "ignore") for x in x_labels], + ) + hf.create_dataset( + "y_labels", + (len(y_labels), 1), + "S100", + data=[x.encode("ascii", "ignore") for x in y_labels], + ) + + +def load_cache(cache_file): + with h5py.File(cache_file, "r") as hf: + x_train = hf["x_train"][:] + y_train = hf["y_train"][:] + x_val = hf["x_val"][:] + y_val = hf["y_val"][:] + x_test = hf["x_test"][:] + y_test = hf["y_test"][:] + x_labels = [x[0].decode("unicode_escape") for x in hf["x_labels"][:]] + y_labels = [x[0].decode("unicode_escape") for x in hf["y_labels"][:]] + return x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels + + +def run(params): + args = candle.ArgumentStruct(**params) + seed = args.rng_seed + candle.set_seed(seed) + + # Construct extension to save model + ext = adrp.extension_from_parameters(params, ".keras") + candle.verify_path(params["save_path"]) + prefix = "{}{}".format(params["save_path"], ext) + logfile = params["logfile"] if params["logfile"] else prefix + ".log" + candle.set_up_logger(logfile, adrp.logger, params["verbose"]) + adrp.logger.info("Params: {}".format(params)) + + # Get default parameters for initialization and optimizer functions + keras_defaults = candle.keras_default_config() + + ## + X_train, Y_train, X_test, Y_test, PS = adrp.load_data(params, seed) + + print("X_train shape:", X_train.shape) + print("X_test shape:", X_test.shape) + + 
print("Y_train shape:", Y_train.shape) + print("Y_test shape:", Y_test.shape) + + # Initialize weights and learning rule + initializer_weights = candle.build_initializer( + params["initialization"], keras_defaults, seed + ) + initializer_bias = candle.build_initializer("constant", keras_defaults, 0.0) + + activation = params["activation"] + # TODO: set output_dim + output_dim = 1 + + # TODO: Use dense_layers for creating inputs/outputs + dense_layers = params["dense"] + + inputs = Input(shape=(PS,)) + + if dense_layers != None: + if type(dense_layers) != list: + dense_layers = list(dense_layers) + for i, l in enumerate(dense_layers): + if i == 0: + x = Dense( + l, + activation=activation, + kernel_initializer=initializer_weights, + bias_initializer=initializer_bias, + )(inputs) + else: + x = Dense( + l, + activation=activation, + kernel_initializer=initializer_weights, + bias_initializer=initializer_bias, + )(x) + if params["dropout"]: + x = Dropout(params["dropout"])(x) + output = Dense( + output_dim, + activation=activation, + kernel_initializer=initializer_weights, + bias_initializer=initializer_bias, + )(x) + else: + output = Dense( + output_dim, + activation=activation, + kernel_initializer=initializer_weights, + bias_initializer=initializer_bias, + )(inputs) + + model = Model(inputs=inputs, outputs=output) + + model.summary() + + kerasDefaults = candle.keras_default_config() + if params["momentum"]: + kerasDefaults["momentum_sgd"] = params["momentum"] + + optimizer = candle.build_optimizer( + params["optimizer"], params["learning_rate"], kerasDefaults + ) + + model.compile( + loss=params["loss"], optimizer=optimizer, metrics=["mae", r2], + ) + + # set up a bunch of callbacks to do work during model training.. 
+ + checkpointer = ModelCheckpoint( + filepath=params["save_path"] + "agg_adrp.autosave.model.h5", + verbose=1, + save_weights_only=False, + save_best_only=True, + ) + csv_logger = CSVLogger(params["save_path"] + "agg_adrp.training.log") + reduce_lr = ReduceLROnPlateau( + monitor="val_loss", + factor=0.75, + patience=20, + mode="auto", + epsilon=0.0001, + cooldown=3, + min_lr=0.000000001, + ) + early_stop = EarlyStopping(monitor="val_loss", patience=100, verbose=1, mode="auto") + + # history = parallel_model.fit(X_train, Y_train, + epochs = params["epochs"] + batch_size = params["batch_size"] + + history = model.fit( + X_train, + Y_train, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(X_test, Y_test), + callbacks=[checkpointer, csv_logger, reduce_lr, early_stop], + ) + + score = model.evaluate(X_test, Y_test, verbose=0) + + print(score) + + print(history.history.keys()) + + # see big fuction below, creates plots etc. + # TODO: Break post_process into multiple functions + post_process(params, X_train, X_test, Y_test, score, history, model) + + adrp.logger.handlers = [] + + return history + + +def post_process(params, X_train, X_test, Y_test, score, history, model): + save_path = params["save_path"] + print("saving to path: ", save_path) + + # summarize history for MAE + plt.plot(history.history["mae"]) + plt.plot(history.history["val_mae"]) + plt.title("Model Mean Absolute Error") + plt.ylabel("mae") + plt.xlabel("epoch") + plt.legend(["train", "test"], loc="upper left") + + plt.savefig(save_path + "agg_adrp.mae.png", bbox_inches="tight") + plt.savefig(save_path + "agg_adrp.mae.pdf", bbox_inches="tight") + + plt.close() + + # summarize history for loss + plt.plot(history.history["loss"]) + plt.plot(history.history["val_loss"]) + plt.title("Model Loss") + plt.ylabel("loss") + plt.xlabel("epoch") + plt.legend(["train", "test"], loc="upper left") + + plt.savefig(save_path + "agg_adrp.loss.png", bbox_inches="tight") + plt.savefig(save_path + 
"agg_adrp.loss.pdf", bbox_inches="tight") + + plt.close() + + print("Test val_loss:", score[0]) + print("Test val_mae:", score[1]) + + # serialize model to JSON + model_json = model.to_json() + with open(save_path + "agg_adrp.model.json", "w") as json_file: + json_file.write(model_json) + + # serialize model to YAML + model_yaml = model.to_yaml() + with open(save_path + "agg_adrp.model.yaml", "w") as yaml_file: + yaml_file.write(model_yaml) + + # serialize weights to HDF5 + model.save_weights(save_path + "agg_adrp.model.h5") + print("Saved model to disk") + + # load json and create model + json_file = open(save_path + "agg_adrp.model.json", "r") + loaded_model_json = json_file.read() + json_file.close() + loaded_model_json = model_from_json(loaded_model_json) + + # load yaml and create model + yaml_file = open(save_path + "agg_adrp.model.yaml", "r") + loaded_model_yaml = yaml_file.read() + yaml_file.close() + loaded_model_yaml = model_from_yaml(loaded_model_yaml) + + # load weights into new model + loaded_model_json.load_weights(save_path + "agg_adrp.model.h5") + print("Loaded json model from disk") + + # evaluate json loaded model on test data + loaded_model_json.compile( + loss="binary_crossentropy", optimizer="SGD", metrics=["mean_absolute_error"] + ) + score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) + + print("json Validation loss:", score_json[0]) + print("json Validation mae:", score_json[1]) + + print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1] * 100)) + + # load weights into new model + loaded_model_yaml.load_weights(save_path + "agg_adrp.model.h5") + print("Loaded yaml model from disk") + + # evaluate loaded model on test data + loaded_model_yaml.compile( + loss="binary_crossentropy", optimizer="SGD", metrics=["mean_absolute_error"] + ) + score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) + + print("yaml Validation loss:", score_yaml[0]) + print("yaml Validation mae:", score_yaml[1]) + + 
print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1] * 100)) + + # predict using loaded yaml model on test and training data + + predict_yaml_train = loaded_model_yaml.predict(X_train) + + predict_yaml_test = loaded_model_yaml.predict(X_test) + + print("Yaml_train_shape:", predict_yaml_train.shape) + print("Yaml_test_shape:", predict_yaml_test.shape) + + predict_yaml_train_classes = np.argmax(predict_yaml_train, axis=1) + predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) + + np.savetxt( + save_path + "predict_yaml_train.csv", + predict_yaml_train, + delimiter=",", + fmt="%.3f", + ) + np.savetxt( + save_path + "predict_yaml_test.csv", + predict_yaml_test, + delimiter=",", + fmt="%.3f", + ) + + np.savetxt( + save_path + "predict_yaml_train_classes.csv", + predict_yaml_train_classes, + delimiter=",", + fmt="%d", + ) + np.savetxt( + save_path + "predict_yaml_test_classes.csv", + predict_yaml_test_classes, + delimiter=",", + fmt="%d", + ) + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == "__main__": + main() + if K.backend() == "tensorflow": + K.clear_session() diff --git a/examples/ADRP/adrp_default_model.txt b/examples/ADRP/adrp_default_model.txt new file mode 100644 index 00000000..5f5452de --- /dev/null +++ b/examples/ADRP/adrp_default_model.txt @@ -0,0 +1,27 @@ +[Global_Params] +data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +train_data='adrp-p1.csv' +model_name='adrp' +dense=[250, 125, 60, 30] +batch_size=32 +epochs=400 +activation='relu' +loss='mean_squared_error' +optimizer='sgd' +dropout=0.1 +learning_rate=0.0001 +momentum=0.9 +scaling='minmax' +epsilon_std=1.0 +rng_seed=2017 +initialization='glorot_uniform' +latent_dim=2 +batch_normalization=False +save_path='./001/' +use_cp=False +early_stop=True +reduce_lr=True +feature_subsample=0 + +[Monitor_Params] +timeout=0 diff --git a/examples/ADRP/reg_go2.py b/examples/ADRP/reg_go2.py new file mode 100644 index 
"""Standalone ADRP regression script.

Reads a CSV given with ``--in`` (first column is the target, the remaining
columns are features), trains a small dense network for ``--ep`` epochs, then
saves plots, the serialized model and prediction CSVs in the working
directory.
"""
import pandas as pd
import numpy as np
import os
import sys
import gzip
import argparse

import math
import matplotlib
# Select the non-interactive backend before pyplot is imported.
matplotlib.use('Agg')

import matplotlib.pyplot as plt

import tensorflow as tf

import keras as ke
from keras import backend as K

from keras.layers import Input, Dense, Dropout, Activation
from keras.optimizers import SGD, Adam, RMSprop
from keras.models import Sequential, Model, model_from_json, model_from_yaml
from keras.utils import np_utils, multi_gpu_model

from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping


from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

file_path = os.path.dirname(os.path.realpath(__file__))
lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
sys.path.append(lib_path)

psr = argparse.ArgumentParser(description='input csv file')
psr.add_argument('--in', default='in_file')
psr.add_argument('--ep', type=int, default=400)
args = vars(psr.parse_args())
print(args)

EPOCH = args['ep']
BATCH = 32

data_path = args['in']

# Read a single row only to learn how many columns the file has.
df_toss = pd.read_csv(data_path, nrows=1).values

print('df_toss:', df_toss.shape)

PL = df_toss.size  # total number of columns (target + features)
PS = PL - 1        # number of feature columns

print('PL=', PL)

DR = 0.1  # Dropout rate


def r2(y_true, y_pred):
    """Keras metric: coefficient of determination of the given tensors."""
    SS_res = K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return (1 - SS_res / (SS_tot + K.epsilon()))


class Attention(ke.layers.Layer):
    """Layer that softmaxes ``(V . kernel) * V / sqrt(output_dim)``.

    Not used by the model built below.
    NOTE(review): the element-wise product presumably requires ``output_dim``
    to equal the input width - confirm before using.
    """

    def __init__(self, output_dim, **kwargs):
        self.output_dim = output_dim
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(name='kernel',
                                      shape=(input_shape[1], self.output_dim),
                                      initializer='uniform',
                                      trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, V):
        Q = ke.backend.dot(V, self.kernel)
        Q = Q * V
        Q = Q / math.sqrt(self.output_dim)
        Q = ke.activations.softmax(Q)
        return Q

    def compute_output_shape(self, input_shape):
        return input_shape


def load_data():
    """Load the CSV named by ``--in`` and return a standardized 80/20 split.

    Returns:
        ``(X_train, Y_train, X_test, Y_test)`` as float32 arrays; column 0 of
        the file is the target and columns ``1:PL`` are the features.
    """
    data_path = args['in']

    # ``header=None`` is required together with ``skiprows=1``: otherwise
    # pandas promotes the first *data* row to a header and that sample is
    # silently dropped.
    df = (pd.read_csv(data_path, skiprows=1, header=None).values).astype('float32')

    df_y = df[:, 0].astype('float32')
    df_x = df[:, 1:PL].astype(np.float32)

    # NOTE(review): the scaler is fitted on all rows before the split, so test
    # statistics leak into training; left unchanged to keep results comparable.
    scaler = StandardScaler()
    df_x = scaler.fit_transform(df_x)

    X_train, X_test, Y_train, Y_test = train_test_split(df_x, df_y, test_size=0.20, random_state=42)

    print('x_train shape:', X_train.shape)
    print('x_test shape:', X_test.shape)

    return X_train, Y_train, X_test, Y_test


X_train, Y_train, X_test, Y_test = load_data()

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Y_train shape:', Y_train.shape)
print('Y_test shape:', Y_test.shape)


# Dense stack 250 -> 125 -> 60 -> 30 -> 1 with dropout after each hidden layer.
inputs = Input(shape=(PS,))
x = Dense(250, activation='relu')(inputs)
x = Dropout(DR)(x)
x = Dense(125, activation='relu')(x)
x = Dropout(DR)(x)
x = Dense(60, activation='relu')(x)
x = Dropout(DR)(x)
x = Dense(30, activation='relu')(x)
x = Dropout(DR)(x)
outputs = Dense(1, activation='relu')(x)

model = Model(inputs=inputs, outputs=outputs)

model.summary()

model.compile(loss='mean_squared_error',
              optimizer=SGD(lr=0.0001, momentum=0.9),
              metrics=['mae', r2])

# set up a bunch of callbacks to do work during model training..

checkpointer = ModelCheckpoint(filepath='reg_go.autosave.model.h5', verbose=1, save_weights_only=False, save_best_only=True)
csv_logger = CSVLogger('reg_go.training.log')
# ``min_delta`` is the current name of the deprecated ``epsilon`` keyword.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.75, patience=20, verbose=1, mode='auto', min_delta=0.0001, cooldown=3, min_lr=0.000000001)
early_stop = EarlyStopping(monitor='val_loss', patience=100, verbose=1, mode='auto')

history = model.fit(X_train, Y_train,
                    batch_size=BATCH,
                    epochs=EPOCH,
                    verbose=1,
                    validation_data=(X_test, Y_test),
                    callbacks=[checkpointer, csv_logger, reduce_lr, early_stop])

score = model.evaluate(X_test, Y_test, verbose=0)

print(score)

print(history.history.keys())
# dict_keys(['val_loss', 'val_mae', 'val_r2', 'loss', 'mae', 'r2', 'lr'])

# summarize history for MAE
plt.plot(history.history['mae'])
plt.plot(history.history['val_mae'])

plt.title('Model Mean Absolute Error')
plt.ylabel('mae')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')

plt.savefig('reg_go.mae.png', bbox_inches='tight')
plt.savefig('reg_go.mae.pdf', bbox_inches='tight')

plt.close()

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')

plt.savefig('reg_go.loss.png', bbox_inches='tight')
plt.savefig('reg_go.loss.pdf', bbox_inches='tight')

plt.close()

print('Test val_loss:', score[0])
print('Test val_mae:', score[1])

# serialize model to JSON
model_json = model.to_json()
with open("reg_go.model.json", "w") as json_file:
    json_file.write(model_json)

# serialize model to YAML
model_yaml = model.to_yaml()
with open("reg_go.model.yaml", "w") as yaml_file:
    yaml_file.write(model_yaml)

# serialize weights to HDF5
model.save_weights("reg_go.model.h5")
print("Saved model to disk")

# load json and create model
with open('reg_go.model.json', 'r') as json_file:
    loaded_model_json = model_from_json(json_file.read())

# load yaml and create model
with open('reg_go.model.yaml', 'r') as yaml_file:
    loaded_model_yaml = model_from_yaml(yaml_file.read())

# load weights into new model
loaded_model_json.load_weights("reg_go.model.h5")
print("Loaded json model from disk")

# evaluate json loaded model on test data
loaded_model_json.compile(loss='mean_squared_error', optimizer='SGD', metrics=['mean_absolute_error'])
score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0)

print('json Validation loss:', score_json[0])
print('json Validation mae:', score_json[1])

# load weights into new model
loaded_model_yaml.load_weights("reg_go.model.h5")
print("Loaded yaml model from disk")

# evaluate loaded model on test data
loaded_model_yaml.compile(loss='mean_squared_error', optimizer='SGD', metrics=['mean_absolute_error'])
score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0)

print('yaml Validation loss:', score_yaml[0])
print('yaml Validation mae:', score_yaml[1])

# predict using loaded yaml model on test and training data
predict_yaml_train = loaded_model_yaml.predict(X_train)
predict_yaml_test = loaded_model_yaml.predict(X_test)

# The model has a single output unit; keep that column as a 1-D vector.
pred_train = predict_yaml_train[:, 0]
pred_test = predict_yaml_test[:, 0]

# Both files now use the same delimiter (the train file previously had ".").
np.savetxt("pred_train.csv", pred_train, delimiter=",", newline='\n', fmt="%.3f")
np.savetxt("pred_test.csv", pred_test, delimiter=",", newline='\n', fmt="%.3f")

print('Correlation prediction on test and Y_test:', np.corrcoef(pred_test, Y_test))
print('Correlation prediction on train and Y_train:', np.corrcoef(pred_train, Y_train))
"""Walk-through of CANDLE's data preprocessing and feature selection helpers.

Downloads the small example datasets and the MSigDB gene-set files, then
calls each helper in turn and prints its result.
"""
import os
import sys
import pandas as pd
import numpy as np
import keras
import warnings
warnings.filterwarnings("ignore")


file_path = os.path.dirname(os.path.realpath(__file__))
lib_path2 = os.path.abspath(os.path.join(file_path, "..", "..", "common"))
sys.path.append(lib_path2)


import candle

# Strings treated as missing values in every input table.
NA_VALUES = ['na', '-', '']
# (label used in the printed messages, quantile level)
QUARTILES = (('third quartile', 0.75), ('median', 0.5), ('first quartile', 0.25))


def _read_table(path, index_col):
    """Read one of the tab-separated example tables."""
    return pd.read_csv(path, sep='\t', engine='c', na_values=NA_VALUES, header=0,
                       index_col=index_col, low_memory=False)


def _print_quartile_spread(expression):
    """Print, for each quartile, the max-min spread of its values along axis 0."""
    for label, level in QUARTILES:
        values = expression.quantile(level, axis=0)
        print('Max difference of ' + label + ' between cell lines is '
              + str(np.round(a=np.max(values) - np.min(values), decimals=2)))


def _print_quartile_means(expression, source):
    """Print, for each quartile, the mean of its values along axis 1."""
    for label, level in QUARTILES:
        print('Average ' + label + ' of ' + source + ' cell lines is '
              + str(np.round(a=np.mean(expression.quantile(level, axis=1)), decimals=2)))


# download all the data if needed from the repo
data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/'
file_name = 'small_drug_descriptor_data_unique_samples.txt'
drug_descriptor = candle.get_file(file_name, data_url+file_name, cache_subdir='examples')

file_name = 'small_drug_response_data.txt'
response_data = candle.get_file(file_name, data_url+file_name, cache_subdir='examples')

file_name = 'Gene_Expression_Full_Data_Unique_Samples.txt'
gene_expression = candle.get_file(file_name, data_url+file_name, cache_subdir='examples')

file_name = 'CCLE_NCI60_Gene_Expression_Full_Data.txt'
ccle_nci60 = candle.get_file(file_name, data_url+file_name, cache_subdir='examples')


# download all the gene_set files needed
data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/'
for gene_set_category in ['c2.cgp', 'c2.cp.biocarta', 'c2.cp.kegg', 'c2.cp.pid', 'c2.cp.reactome', 'c5.bp', 'c5.cc', 'c5.mf', 'c6.all']:
    for gene_name_type in ['entrez', 'symbols']:
        file_name = gene_set_category+'.v7.0.'+gene_name_type+'.gmt'
        local_file = candle.get_file(file_name, data_url+file_name, cache_subdir='examples/Gene_Sets/MSigDB.v7.0')
# extract base directory for gene_set data files; joining with '' keeps the
# trailing separator that the previous string-split produced
data_dir = os.path.join(os.path.dirname(local_file), '')
print('Gene Set data is locally stored at ', data_dir)


# Select features based on missing values
print('\n')
print('Testing select_features_by_missing_values')
print('Drug descriptor dataframe includes 10 drugs (rows) and 10 drug descriptor features (columns)')
descriptors = _read_table(drug_descriptor, index_col=0)
print(descriptors)
print('Select features with missing rates smaller than 0.1')
feature_ids = candle.select_features_by_missing_values(descriptors, threshold=0.1)
print('Feature IDs', feature_ids)
print('Select features with missing rates smaller than 0.3')
feature_ids = candle.select_features_by_missing_values(descriptors.values, threshold=0.3)
print('Feature IDs', feature_ids)


# Select features based on variation
print('\n')
print('Testing select_features_by_variation')
print('Select features with a variance larger than 100')
feature_ids = candle.select_features_by_variation(descriptors, variation_measure='var', threshold=100, portion=None, draw_histogram=False)
print('Feature IDs', feature_ids)
print('Select the top 2 features with the largest standard deviation')
feature_ids = candle.select_features_by_variation(descriptors, variation_measure='std', portion=0.2)
print('Feature IDs', feature_ids)


# Select decorrelated features
print('\n')
print('Testing select_decorrelated_features')
print('Select features that are not identical to each other and are not all missing.')
feature_ids = candle.select_decorrelated_features(descriptors, threshold=None, random_seed=None)
print('Feature IDs', feature_ids)
print('Select features whose absolute mutual Spearman correlation coefficient is smaller than 0.8')
feature_ids = candle.select_decorrelated_features(descriptors, method='spearman', threshold=0.8, random_seed=10)
print('Feature IDs', feature_ids)


# Generate cross-validation partitions of data
print('\n')
print('Testing generate_cross_validation_partition')
print('Generate 5-fold cross-validation partition of 10 samples twice')
p = candle.generate_cross_validation_partition(range(10), n_folds=5, n_repeats=2, portions=None, random_seed=None)
print(p)
print('Drug response data of 5 cell lines treated by various drugs.')
responses = _read_table(response_data, index_col=None)
print(responses)
print('Generate partition indices to divide the data into 4 sets without shared cell lines for 5 times.')
p = candle.generate_cross_validation_partition(responses.CELL, n_folds=5, n_repeats=1, portions=[1, 1, 1, 2], random_seed=1)
print(p)


# Quantile normalization of gene expression data
print('\n')
print('Testing quantile_normalization')
print('Gene expression data of 897 cell lines (columns) and 17741 genes (rows).')
expression = _read_table(gene_expression, index_col=[0, 1])
print(expression)
print('Before normalization')
_print_quartile_spread(expression)
# The table is transposed before normalization and transposed back afterwards.
norm_data = candle.quantile_normalization(np.transpose(expression))
norm_data = np.transpose(norm_data)
print('After normalization')
_print_quartile_spread(norm_data)


# Generate gene-set-level data
print('\n')
print('Testing generate_gene_set_data')
# The row index has two levels; level 0 is passed with gene_name_type='entrez'
# and level 1 with gene_name_type='symbols'.
gene_set_data = candle.generate_gene_set_data(np.transpose(norm_data), [i[0] for i in norm_data.index], gene_name_type='entrez',
                                              gene_set_category='c6.all', metric='mean', standardize=True, data_dir=data_dir)
print('Generate gene-set-level data of 897 cell lines and 189 oncogenic signature gene sets')
print(gene_set_data)
gene_set_data = candle.generate_gene_set_data(np.transpose(norm_data), [i[1] for i in norm_data.index], gene_name_type='symbols',
                                              gene_set_category='c2.cp.kegg', metric='sum', standardize=True, data_dir=data_dir)
print('Generate gene-set-level data of 897 cell lines and 186 KEGG pathways')
print(gene_set_data)


# Combat batch normalization on gene expression data
print('\n')
print('Testing combat_batch_effect_removal')
print('Gene expression data of 60 NCI60 cell lines and 1018 CCLE cell lines with 17741 genes.')
combined = _read_table(ccle_nci60, index_col=[0, 1])
print(combined)
# The text before the first '.' in each column name identifies the dataset.
resource = np.array([i.split('.')[0] for i in combined.columns])

print('Before removal of batch effect between NCI60 and CCLE datasets')

# Identify NCI60 cell lines and quantile normalize their gene expression data
columns = np.where(resource == 'NCI60')[0]
norm_data_NCI60 = candle.quantile_normalization(np.transpose(combined.iloc[:, columns]))
_print_quartile_means(norm_data_NCI60, 'NCI60')

# Identify CCLE cell lines and quantile normalize their gene expression data
columns = np.where(resource == 'CCLE')[0]
norm_data_CCLE = candle.quantile_normalization(np.transpose(combined.iloc[:, columns]))
_print_quartile_means(norm_data_CCLE, 'CCLE')

# Combine normalized data of NCI60 cell lines and CCLE cell lines
norm_data = pd.concat((norm_data_NCI60, norm_data_CCLE), axis=0)
norm_data = np.transpose(norm_data)

# Apply ComBat algorithm to remove the batch effect between NCI60 and CCLE
corrected_data = candle.combat_batch_effect_removal(norm_data, pd.Series([i.split('.')[0] for i in norm_data.columns], index=norm_data.columns))

print('After removal of batch effect between NCI60 and CCLE datasets')

# Summarize the corrected NCI60 cell lines
resource = np.array([i.split('.')[0] for i in corrected_data.columns])
columns = np.where(resource == 'NCI60')[0]
corrected_data_NCI60 = np.transpose(corrected_data.iloc[:, columns])
_print_quartile_means(corrected_data_NCI60, 'NCI60')

# Summarize the corrected CCLE cell lines (no further normalization is applied)
columns = np.where(resource == 'CCLE')[0]
corrected_data_CCLE = np.transpose(corrected_data.iloc[:, columns])
_print_quartile_means(corrected_data_CCLE, 'CCLE')
+We also implement and extend the co-expression extrapolation (COXEN) gene selection method for the Pilot 1 project [3], which can select predictive and generalizable genes for predicting drug response in precision oncology applications. + +## General Data Preprocessing Functions + +```generate_cross_validation_partition``` + +To flexibly generate data partitions for cross-validation analysis, such as partitioning of grouped samples into sets that do not share groups. + +## Data Preprocessing Functions Specific to Pilot 1 Applications + +```quantile_normalization``` + +To perform quantile normalization of genomic data [1] with tolerance of missing values. [[see example code]](#quantile-normalization-of-gene-expression-data) + +```combat_batch_effect_removal``` + +To perform ComBat analysis [2] on gene expression data to remove batch effects. [[see example code]](#combat-batch-normalization-on-gene-expression-data) + +```generate_gene_set_data``` + +To calculate genomic representations at gene set level, such as the average expression values of genes in a pathway and the total number of SNP mutations in a genetic pathway. [[see example code]](#generate-gene-set-level-data) + +## General Feature Selection Functions + +```select_features_by_missing_values``` + +To remove features with (many) missing values. [[see example code]](#select-features-based-on-missing-values) + +```select_features_by_variation``` + +To remove features with no or small variations. [[see example code]](#select-features-based-on-variation) + +```select_decorrelated_features``` + +To select a subset of features that are not identical or highly correlated with each other. 
[[see example code]](#select-decorrelated-features) + +## Feature (Gene) Selection Functions Specific to Pilot 1 Applications + +```coxen_single_drug_gene_selection``` + +To perform co-expression extrapolation (COXEN) analysis [3] that selects predictive and generalizable genes for predicting the response of tumor cells to a specific drug. + +```coxen_multi_drug_gene_selection``` + +To extend the COXEN approach for selecting genes to predict the response of tumor cells to multiple drugs in precision oncology applications. + +## Running the example + +The code demonstrates feature selection methods that CANDLE provides. + +It can be run by executing ``` python M16_test.py ``` + +### Download data +Code +```python +# download all the data if needed from the repo +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/' +file_name = 'small_drug_descriptor_data_unique_samples.txt' +drug_descriptor = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') + +file_name = 'small_drug_response_data.txt' +response_data = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') + +file_name = 'Gene_Expression_Full_Data_Unique_Samples.txt' +gene_expression = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') + +file_name = 'CCLE_NCI60_Gene_Expression_Full_Data.txt' +ccle_nci60 = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') +``` +Output +```bash +Importing candle utils for keras +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/small_drug_descriptor_data_unique_samples.txt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/small_drug_response_data.txt +Origin = 
http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/Gene_Expression_Full_Data_Unique_Samples.txt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/CCLE_NCI60_Gene_Expression_Full_Data.txt +``` + +### Download gene set +Code +```python +# download all the gene_set files needed +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/' +for gene_set_category in ['c2.cgp','c2.cp.biocarta','c2.cp.kegg','c2.cp.pid','c2.cp.reactome','c5.bp','c5.cc','c5.mf','c6.all']: + for gene_name_type in ['entrez', 'symbols']: + file_name = gene_set_category+'.v7.0.'+gene_name_type+'.gmt' + local_file = candle.get_file(file_name, data_url+file_name, cache_subdir='examples/Gene_Sets/MSigDB.v7.0') +# extract base directory for gene_set data files +data_dir = local_file.split(file_name)[0] +print('Gene Set data is locally stored at ', data_dir) +``` +Output +``` +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cgp.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cgp.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.biocarta.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.biocarta.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.kegg.v7.0.entrez.gmt +Origin = 
http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.kegg.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.pid.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.pid.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.reactome.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.reactome.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.bp.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.bp.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.cc.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.cc.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.mf.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.mf.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c6.all.v7.0.entrez.gmt +Origin = 
http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c6.all.v7.0.symbols.gmt +Gene Set data is locally stored at /Users/hsyoo/projects/CANDLE/Benchmarks/common/../Data/examples/Gene_Sets/MSigDB.v7.0/ +``` + +### Select features based on missing values +Code +```python +print('Testing select_features_by_missing_values') +print('Drug descriptor dataframe includes 10 drugs (rows) and 10 drug descriptor features (columns)') +data = pd.read_csv(drug_descriptor, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +print(data) +print('Select features with missing rates smaller than 0.1') +id = candle.select_features_by_missing_values(data, threshold=0.1) +print('Feature IDs', id) +print('Select features with missing rates smaller than 0.3') +id = candle.select_features_by_missing_values(data.values, threshold=0.3) +print('Feature IDs', id) +``` +Output +```bash +Testing select_features_by_missing_values +Drug descriptor dataframe includes 10 drugs (rows) and 10 drug descriptor features (columns) + MW AMW Sv Se ... Mv Psi_e_1d Psi_e_1s VE3sign_X +Drug_1 475.40 8.804 34.718 54.523 ... 0.643 NaN NaN NaN +Drug_10 457.71 10.898 29.154 43.640 ... 0.694 NaN NaN -2.752 +Drug_100 561.80 6.688 49.975 83.607 ... 0.595 NaN NaN -4.335 +Drug_1000 362.51 6.840 32.794 52.461 ... 0.619 NaN NaN -9.968 +Drug_1001 628.83 7.763 51.593 81.570 ... 0.637 NaN NaN -2.166 +Drug_1002 377.19 10.777 26.191 36.578 ... 0.748 NaN NaN -1.526 +Drug_1003 371.42 8.254 30.896 45.473 ... 0.687 NaN NaN -4.983 +Drug_1004 453.60 8.100 37.949 55.872 ... 0.678 NaN NaN -4.100 +Drug_1005 277.35 7.704 23.940 35.934 ... 0.665 NaN NaN -5.234 +Drug_1006 409.47 8.189 34.423 50.356 ... 
0.688 NaN NaN -2.513 + +[10 rows x 10 columns] +Select features with missing rates smaller than 0.1 +Feature IDs [0 1 2 3 4 5 6] +Select features with missing rates smaller than 0.3 +Feature IDs [0 1 2 3 4 5 6 9] +``` + +### Select features based on variation +Code +```python +print('Testing select_features_by_variation') +print('Select features with a variance larger than 100') +id = candle.select_features_by_variation(data, variation_measure='var', threshold=100, portion=None, draw_histogram=False) +print('Feature IDs', id) +print('Select the top 2 features with the largest standard deviation') +id = candle.select_features_by_variation(data, variation_measure='std', portion=0.2) +print('Feature IDs', id) +``` + +Output +``` +Testing select_features_by_variation +Select features with a variance larger than 100 +Feature IDs [0 3 5] +Select the top 2 features with the largest standard deviation +Feature IDs [0 5] +``` + +### Select decorrelated features +Code +```python +print('Testing select_decorrelated_features') +print('Select features that are not identical to each other and are not all missing.') +id = candle.select_decorrelated_features(data, threshold=None, random_seed=None) +print('Feature IDs', id) +print('Select features whose absolute mutual Spearman correlation coefficient is smaller than 0.8') +id = candle.select_decorrelated_features(data, method='spearman', threshold=0.8, random_seed=10) +print('Feature IDs', id) +``` +Output +``` +Testing select_decorrelated_features +Select features that are not identical to each other and are not all missing. 
+Feature IDs [0 1 2 3 4 5 6 9] +Select features whose absolute mutual Spearman correlation coefficient is smaller than 0.8 +Feature IDs [0 2 6 9] +``` + +### Generate cross-validation partitions of data +Code +```python +print('Testing generate_cross_validation_partition') +print('Generate 5-fold cross-validation partition of 10 samples twice') +p = candle.generate_cross_validation_partition(range(10), n_folds=5, n_repeats=2, portions=None, random_seed=None) +print(p) +print('Drug response data of 5 cell lines treated by various drugs.') +data = pd.read_csv(response_data, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=None, low_memory=False) +print(data) +print('Generate partition indices to divide the data into 4 sets without shared cell lines for 5 times.') +p = candle.generate_cross_validation_partition(data.CELL, n_folds=5, n_repeats=1, portions=[1, 1, 1, 2], random_seed=1) +print(p) +``` +Output +``` +Testing generate_cross_validation_partition +Generate 5-fold cross-validation partition of 10 samples twice +[[[0, 5], [1, 2, 3, 4, 6, 7, 8, 9]], [[1, 6], [0, 2, 3, 4, 5, 7, 8, 9]], [[2, 7], [0, 1, 3, 4, 5, 6, 8, 9]], [[3, 8], [0, 1, 2, 4, 5, 6, 7, 9]], [[4, 9], [0, 1, 2, 3, 5, 6, 7, 8]], [[5, 8], [0, 1, 2, 3, 4, 6, 7, 9]], [[3, 9], [0, 1, 2, 4, 5, 6, 7, 8]], [[2, 4], [0, 1, 3, 5, 6, 7, 8, 9]], [[1, 7], [0, 2, 3, 4, 5, 6, 8, 9]], [[0, 6], [1, 2, 3, 4, 5, 7, 8, 9]]] +Drug response data of 5 cell lines treated by various drugs. + SOURCE CELL DRUG AUC EC50 EC50se R2fit HS +0 CCLE CCLE.22RV1 CCLE.1 0.7153 5.660 0.6867 0.9533 0.6669 +1 CCLE CCLE.22RV1 CCLE.10 0.9579 7.023 0.7111 0.4332 4.0000 +2 CCLE CCLE.22RV1 CCLE.11 0.4130 7.551 0.0385 0.9948 1.3380 +3 CCLE CCLE.22RV1 CCLE.12 0.8004 5.198 11.7100 0.9944 4.0000 +4 CCLE CCLE.22RV1 CCLE.13 0.5071 7.149 0.3175 0.8069 1.0150 +.. ... ... ... ... ... ... ... ... 
+95 CCLE CCLE.697 CCLE.12 0.7869 5.278 20.1200 0.8856 4.0000 +96 CCLE CCLE.697 CCLE.13 0.4433 7.474 0.0265 0.9978 3.7080 +97 CCLE CCLE.697 CCLE.14 0.4337 7.466 0.0106 0.9996 3.4330 +98 CCLE CCLE.697 CCLE.15 0.8721 3.097 29.1300 0.4884 0.2528 +99 CCLE CCLE.697 CCLE.16 0.7955 7.496 0.1195 0.9396 1.9560 + +[100 rows x 8 columns] +Generate partition indices to divide the data into 4 sets without shared cell lines for 5 times. +[[[68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91], [44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67], [92, 93, 94, 95, 96, 97, 98, 99], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]], [[44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67], [92, 93, 94, 95, 96, 97, 98, 99], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91]], [[92, 93, 94, 95, 96, 97, 98, 99], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43], [44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91]], [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43], [68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91], [44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 92, 
93, 94, 95, 96, 97, 98, 99]], [[24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43], [68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91], [44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 92, 93, 94, 95, 96, 97, 98, 99]]] +Using TensorFlow backend. +... +found 2 batches +found 0 numerical covariates... +found 0 categorical variables: +Standardizing Data across genes. +Fitting L/S model and finding priors +Finding parametric adjustments +``` + +### Quantile normalization of gene expression data +Code +```python +print('Testing quantile_normalization') +print('Gene expression data of 897 cell lines (columns) and 17741 genes (rows).') +data = pd.read_csv(gene_expression, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) +print(data) +print('Before normalization') +third_quartile = data.quantile(0.75, axis=0) +print('Max difference of third quartile between cell lines is ' + str(np.round(a=np.max(third_quartile) - np.min(third_quartile), decimals=2))) +second_quartile = data.quantile(0.5, axis=0) +print('Max difference of median between cell lines is ' + str(np.round(a=np.max(second_quartile) - np.min(second_quartile), decimals=2))) +first_quartile = data.quantile(0.25, axis=0) +print('Max difference of first quartile between cell lines is ' + str(np.round(a=np.max(first_quartile) - np.min(first_quartile), decimals=2))) +norm_data = candle.quantile_normalization(np.transpose(data)) +norm_data = np.transpose(norm_data) +print('After normalization') +third_quartile = norm_data.quantile(0.75, axis=0) +print('Max difference of third quartile between cell lines is ' + str(np.round(a=np.max(third_quartile) - np.min(third_quartile), decimals=2))) +second_quartile = norm_data.quantile(0.5, axis=0) +print('Max difference of 
median between cell lines is ' + str(np.round(a=np.max(second_quartile) - np.min(second_quartile), decimals=2))) +first_quartile = norm_data.quantile(0.25, axis=0) +print('Max difference of first quartile between cell lines is ' + str(np.round(a=np.max(first_quartile) - np.min(first_quartile), decimals=2))) +``` +Output +``` +Testing quantile_normalization +Gene expression data of 897 cell lines (columns) and 17741 genes (rows). + CCL_61 CCL_62 CCL_63 ... CCL_1076 CCL_1077 CCL_1078 +entrezID geneSymbol ... +1 A1BG 0.99 0.03 0.36 ... 2.56 3.55 3.04 +29974 A1CF 4.03 3.03 0.00 ... 0.00 0.03 0.00 +2 A2M 2.68 0.03 0.16 ... 0.77 0.31 1.20 +144568 A2ML1 0.07 0.07 0.01 ... 0.01 0.00 1.09 +127550 A3GALT2 0.15 0.00 0.06 ... 2.34 0.00 0.03 +... ... ... ... ... ... ... ... +440590 ZYG11A 0.41 0.06 1.70 ... 0.75 3.44 2.44 +79699 ZYG11B 4.45 4.23 3.08 ... 4.25 3.61 3.68 +7791 ZYX 4.65 5.72 6.67 ... 7.78 4.12 5.97 +23140 ZZEF1 4.14 3.98 3.90 ... 4.62 3.76 3.54 +26009 ZZZ3 4.77 5.01 3.90 ... 4.38 3.46 3.60 + +[17741 rows x 897 columns] +Before normalization +Max difference of third quartile between cell lines is 1.86 +Max difference of median between cell lines is 2.25 +Max difference of first quartile between cell lines is 0.5 +After normalization +Max difference of third quartile between cell lines is 0.01 +Max difference of median between cell lines is 0.02 +Max difference of first quartile between cell lines is 0.06 +``` + +### Generate gene-set-level data +```python +print('Testing generate_gene_set_data') +gene_set_data = candle.generate_gene_set_data(np.transpose(norm_data), [i[0] for i in norm_data.index], gene_name_type='entrez', + gene_set_category='c6.all', metric='mean', standardize=True, data_dir=data_dir) +print('Generate gene-set-level data of 897 cell lines and 189 oncogenic signature gene sets') +print(gene_set_data) +gene_set_data = candle.generate_gene_set_data(np.transpose(norm_data), [i[1] for i in norm_data.index], gene_name_type='symbols', + 
gene_set_category='c2.cp.kegg', metric='sum', standardize=True, data_dir=data_dir) +print('Generate gene-set-level data of 897 cell lines and 186 KEGG pathways') +print(gene_set_data) +``` +Output +``` +Testing generate_gene_set_data +Generate gene-set-level data of 897 cell lines and 189 oncogenic signature gene sets + GLI1_UP.V1_DN GLI1_UP.V1_UP ... LEF1_UP.V1_DN LEF1_UP.V1_UP +CCL_61 -0.031096 0.283946 ... 0.096461 -0.329343 +CCL_62 0.362855 -0.101684 ... 0.426951 -0.477634 +CCL_63 -0.304989 -0.165160 ... 0.036932 -0.201916 +CCL_64 -0.037737 -0.043124 ... 0.154256 -0.210188 +CCL_65 0.102477 0.438871 ... -0.166487 0.287382 +... ... ... ... ... ... +CCL_1074 0.508978 0.137934 ... 0.148213 0.166717 +CCL_1075 -0.145029 0.216169 ... -0.067391 0.258455 +CCL_1076 -0.357758 0.337235 ... 0.008950 0.186134 +CCL_1077 0.086597 -0.266070 ... 0.217244 -0.276022 +CCL_1078 0.374237 -0.428383 ... 0.312984 -0.303721 + +[897 rows x 189 columns] +Generate gene-set-level data of 897 cell lines and 186 KEGG pathways + KEGG_GLYCOLYSIS_GLUCONEOGENESIS ... KEGG_VIRAL_MYOCARDITIS +CCL_61 6.495365 ... -30.504868 +CCL_62 30.679006 ... -7.205641 +CCL_63 10.534238 ... -5.414998 +CCL_64 6.142140 ... -10.555601 +CCL_65 -0.303868 ... -9.784998 +... ... ... ... +CCL_1074 -1.945281 ... 6.891960 +CCL_1075 -21.373730 ... 0.612092 +CCL_1076 -11.711818 ... -10.353794 +CCL_1077 -11.576702 ... -31.679962 +CCL_1078 -10.355489 ... 
-26.232325 + +[897 rows x 186 columns] +``` + +### Combat batch normalization on gene expression data +Code +```python +print('Testing combat_batch_effect_removal') +print('Gene expression data of 60 NCI60 cell lines and 1018 CCLE cell lines with 17741 genes.') +data = pd.read_csv(ccle_nci60, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) +print(data) +resource = np.array([i.split('.')[0] for i in data.columns]) + +print('Before removal of batch effect between NCI60 and CCLE datasets') +# Identify NCI60 cell lines and quantile normalize their gene expression data +id = np.where(resource == 'NCI60')[0] +norm_data_NCI60 = candle.quantile_normalization(np.transpose(data.iloc[:, id])) +print('Average third quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.75, axis=1)), decimals=2))) +print('Average median of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.25, axis=1)), decimals=2))) + +# Identify CCLE cell lines and quantile normalize their gene expression data +id = np.where(resource == 'CCLE')[0] +norm_data_CCLE = candle.quantile_normalization(np.transpose(data.iloc[:, id])) +print('Average third quartile of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.75, axis=1)), decimals=2))) +print('Average median of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.25, axis=1)), decimals=2))) + +# Combine normalized data of NCI60 cell lines and CCLE cell lines +norm_data = pd.concat((norm_data_NCI60, norm_data_CCLE), axis=0) +norm_data = np.transpose(norm_data) + +# Apply ComBat algorithm to remove the batch effect between NCI60 and CCLE +corrected_data = 
candle.combat_batch_effect_removal(norm_data, pd.Series([i.split('.')[0] for i in norm_data.columns], index=norm_data.columns)) + +print('After removal of batch effect between NCI60 and CCLE datasets') + +resource = np.array([i.split('.')[0] for i in corrected_data.columns]) +id = np.where(resource == 'NCI60')[0] +corrected_data_NCI60 = np.transpose(corrected_data.iloc[:, id]) +print('Average third quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.75, axis=1)), decimals=2))) +print('Average median of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.25, axis=1)), decimals=2))) + +# Identify CCLE cell lines and quantile normalize their gene expression data +id = np.where(resource == 'CCLE')[0] +corrected_data_CCLE = np.transpose(corrected_data.iloc[:, id]) +print('Average third quartile of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.75, axis=1)), decimals=2))) +print('Average median of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.25, axis=1)), decimals=2))) +``` +Output +``` +Testing combat_batch_effect_removal +Gene expression data of 60 NCI60 cell lines and 1018 CCLE cell lines with 17741 genes. + NCI60.786-0|CCL_1 ... CCLE.ZR7530|CCL_1078 +entrezID geneSymbol ... +1 A1BG 0.00 ... 3.04 +29974 A1CF 0.00 ... 0.00 +2 A2M 0.00 ... 1.20 +144568 A2ML1 0.00 ... 1.09 +127550 A3GALT2 0.00 ... 0.03 +... ... ... ... +440590 ZYG11A 0.01 ... 2.44 +79699 ZYG11B 3.37 ... 3.68 +7791 ZYX 7.05 ... 5.97 +23140 ZZEF1 4.05 ... 3.54 +26009 ZZZ3 4.10 ... 
3.60 + +[17741 rows x 1078 columns] +Before removal of batch effect between NCI60 and CCLE datasets +Average third quartile of NCI60 cell lines is 4.0 +Average median of NCI60 cell lines is 1.71 +Average first quartile of NCI60 cell lines is 0.01 +Average third quartile of CCLE cell lines is 4.88 +Average median of CCLE cell lines is 2.75 +Average first quartile of CCLE cell lines is 0.14 +Adjusting data +After removal of batch effect between NCI60 and CCLE datasets +Average third quartile of NCI60 cell lines is 4.81 +Average median of NCI60 cell lines is 2.65 +Average first quartile of NCI60 cell lines is 0.23 +Average third quartile of CCLE cell lines is 4.83 +Average median of CCLE cell lines is 2.72 +Average first quartile of CCLE cell lines is 0.13 +``` + +## References + +1. Bolstad BM, Irizarry RA, Astrand M, et al. \(2003\) *A comparison of normalization methods for high density oligonucleotide array data based on variance and bias* Bioinformatics. 2003 Jan 22;19\(2\):185-93. + +2. Johnson WE, Rabinovic A, and Li C \(2007\) *Adjusting batch effects in microarray expression data using Empirical Bayes methods* Biostatistics 8\(1\):118-127. + +3. Lee JK, Havaleshko DM, Cho H, et al. \(2007\) *A strategy for predicting the chemosensitivity of human cancers and its application to drug discovery* Proc Natl Acad Sci USA, 2007 Aug 7; 104\(32\):13086-91. Epub 2007 Jul 31 + diff --git a/examples/darts/README.rst b/examples/darts/README.rst new file mode 100644 index 00000000..3a03eac0 --- /dev/null +++ b/examples/darts/README.rst @@ -0,0 +1,78 @@ +============== +DARTS Examples +============== + +Differentiable architecture search + +TLDR +---- + +Our recommended ordering of examples: + +1. **Uno**: learn how to use the neural network building blocks in DARTS to + define a fully connected model using DARTS. + +2. **Advanced**: how to define our own neural network primitives to be optimized + by DARTS. + +Setup +----- + +Darts makes use of Pytorch. 
You can find binaries for both Pytorch and Torchvision (used in the advanced +example) at the `pytorch website`_. + +The Algorithm +------------- + +This is an adaptation of Hanxiao Liu et al.'s DARTS algorithm, extending +the work to handle convolutional neural networks for NLP problems and more. +Details of the original authors' approach can be found in their 2019 ICLR paper_. + +DARTS works by composing various neural net primitives, defined as Pytorch *nn.Modules*, +to create a larger directed acyclic graph (DAG) that is to be your model. This +composition is differentiable as we take the softmax of the choice of primitive types +at each layer of the network. To make this more clear, let's first define a few abstractions +in the algorithm: + +1. **Primitive**: this is the fundamental block of computation, defined as an *nn.Module*. + At each layer of your network, one of these primitives will be chosen by taking the + softmax of all possible primitives at that layer. Examples could be a convolution block, + a linear layer, a skip connection, or anything that you can come up with (subject to a few + constraints). + +2. **Cell**: this is an abstraction that holds each of the primitive types for a level of your + network. This is where we perform the softmax over the possible primitive types. + +3. **Nodes**: this is the level of abstraction that would normally be considered a layer in + your network. It can contain one or more *Cells*. + +4. **Architecture**: The abstraction that contains all nodes in the graph. This computes a + Hessian product with respect to the *alpha* parameters as defined in the paper. + +5. **Genotype**: genotypes are instances of a particular configuration of the graph. As the + optimization runs, and each cell computes the softmax over their primitive types, the final + configuration of all nodes with their resulting primitive is a genotype. 
+In the DARTS algorithm, we define a number of primitives that we would like to compose together +to form our neural network. The original paper started with 8 primitive types. These types +were originally designed for a vision task, and largely consist of convolution type operations. +We have since adapted these types for the *P3B5* benchmark, creating 1D convolution types for +our NLP tasks. If you would like to see how these primitives are defined, along with their +necessary constructors used by DARTS, you can find them in +`darts.modules.operations.conv.py`_. + +These primitives are then contained within a cell, and one or more cells are contained within a +node in the graph. DARTS then works by composing these nodes together and taking the softmax over +their primitives in each cell. Finally, the *Architecture* abstraction contains all nodes, and is +responsible for differentiating the composition of the nodes with respect to two *alpha* parameters +as defined in the paper. The end result is that we have a differentiable model that composes its +components as the model is training. + +As the optimization runs, the model will print the resulting loss with respect to a given *Genotype*. +The final model will be the *Genotype* corresponding to the lowest loss. + +.. References +.. ---------- +.. _paper: https://openreview.net/forum?id=S1eYHoC5FX +.. _darts.modules.operations.conv.py: ../../common/darts/modules/operations/conv.py +.. _pytorch website: https://pytorch.org/ diff --git a/examples/darts/advanced/README.rst b/examples/darts/advanced/README.rst new file mode 100644 index 00000000..0da11fdf --- /dev/null +++ b/examples/darts/advanced/README.rst @@ -0,0 +1,143 @@ +============== +DARTS Advanced +============== + +In this example we will take a look at how to define our own primitives to be handled by DARTS. If +you have not read the `Uno example`_, I would recommend taking a look at that first. 
There we showed +how we can use the built-in primitives of DARTS. For reference, you can also look to see how those +built-in primitives are defined in `darts.modules.operations.linear.py`_ and +`darts.modules.operations.conv.py`_. + +In order to define custom networks to be handled by DARTS, you need to define a few things: + +1. **Network Stem**: This is an *nn.Module* that takes in your input data, processes it in some way, + and feeds its features of size *cell_dim* to your remaining network primitives. The parameter + *cell_dim* must be the input size for all of your primitives. Since DARTS can compose your primitives + in *any* order, the input and output dimension of all of your primitives must be of size *cell_dim*. + +2. **Primitives**: These *nn.Modules* are the basic building blocks for your network. They can be anything + that you dream of, so long as their input and output dimensions are of size *cell_dim*. + +3. **A constructor for your primitives**: This is a dictionary of lambda functions used to construct your + network primitives. By convention, this is a dictionary called *OPS*. We will look at this a bit closer + below. + +Defining our Components +----------------------- + +Let's take a look at the various pieces that we need to define. All of these components can be found in +`operations.py`_. + +Network Stem +------------ + +As we mentioned above, this is the module that is defined at the beginning of your network, mapping your +input data to *cell_dim*. + +.. code-block:: python + + class Stem(nn.Module): + """ Network stem + + This will always be the beginning of the network. + DARTS will only recompose modules after the stem. + For this reason, we define this separate from the + other modules in the network. + + Args: + input_dim: the input dimension for your data + + cell_dim: the intermediate dimension size for + the remaining modules of the network. 
+ """ + def __init__(self, in_channels: int=1, cell_dim: int=100, kernel_size=3): + super(Stem, self).__init__() + self.stem = nn.Conv2d(in_channels, cell_dim, kernel_size) + + def forward(self, x): + return self.stem(x) + +Primitives +---------- + +DARTS primitives are Pytorch *nn.Modules*. For this example, we have defined three primitives: *ConvBlock*, +*DilConv*, and the *Identity* (a skip layer). It is important to remember that DARTS will try many different +orderings of these primitives between *nodes*. Therefore, the input and output dimensions of each of these +primitives must be of size *cell_dim*. + +It is also important to know that DARTS expects the *Identity* function to be included in the primitives. +This is so that DARTS can account for varying depths of neural networks. Since at each node, DARTS must choose +one primitive (choosing meaning taking the softmax over the primitives), having the no-op *Identity* means +that we can optimize over the depth of the network. It would be possible to define a 100 layer network and +have the output *Genotype* be only a few layers deep. If we were to not include the *Identity*, every layer +would be some transformation of the previous layer's features, and we could run the risk of overparameterizing +our network. + +A Constructor for our Primitives +-------------------------------- + +Since DARTS does not control what primitives you define, we need to provide it with a constructor for those +primitives. By convention, this is handled by a dictionary of lambda functions called *OPS*. The keys of this +dictionary are the names of our primitives, and the values of the dictionary are lambda functions that +construct those primitives. Let's take a look at the example's *OPS*: + +.. 
code-block:: python + + """ DARTS operations constructor """ + OPS = { + 'none' : lambda c, stride, affine: Identity(), + 'conv_3' : lambda c, stride, affine: ConvBlock(c, c, 3, stride), + 'dil_conv': lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine) + } + +As mentioned, the keys of *OPS* are the names we give to each of our primitives. These keys will be +what DARTS uses when defining *Genotypes*. Note that the lambda functions take three parameters: +(1) *c*, the number of channels (or features) of the layer; (2) *stride*, the stride for convolutions; and +(3) *affine*, whether to use affine transforms in batch normalization. These parameters are the default +implementation of DARTS, and must be present. Any other hyperparameters of our custom primitives must be +given default values. One last thing to note: in order to keep things consistent, DARTS reserves the keyword +*none* for the *Identity* primitive. Again, this primitive must be included in any custom primitive set, and +its key must be *none*. This method of constructing our primitives could be changed in future versions of +DARTS to better accommodate fancier primitives. As always, pull requests are welcome! + +Putting it all Together +----------------------- + +Once we have defined our stem, primitives, and our *OPS* constructor, we can then hand them over to DARTS: + +.. code-block:: python + + model = darts.Network( + stem, cell_dim=100, classifier_dim=676, + ops=OPS, tasks=tasks, criterion=criterion, device=device + ).to(device) + + architecture = darts.Architecture(model, args, device=device) + +Note that we must specify *classifier_dim*, the number of input features from our primitives. Since each +of the primitives must have the same number of input and output features, this will be the flattened number +of features from any of your primitives. 
Since DARTS cannot know ahead of time what your primitives will be,
+we must specify how many features will go into our final fully connected layer of the network.
+
+Run the Example
+---------------
+
+First, make sure that you can get the example data by installing `torchvision`:
+
+.. code-block::
+
+    pip install torchvision
+
+Then run the example with
+
+.. code-block::
+
+    python advanced_example.py
+
+.. References
+.. ----------
+.. _paper: https://openreview.net/forum?id=S1eYHoC5FX
+.. _darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py
+.. _darts.modules.operations.linear.py: ../../../common/darts/modules/operations/linear.py
+.. _operations.py: ./operations.py
+.. _Uno example: ../uno
diff --git a/examples/darts/advanced/advanced_example.py b/examples/darts/advanced/advanced_example.py
new file mode 100644
index 00000000..dbadfed2
--- /dev/null
+++ b/examples/darts/advanced/advanced_example.py
@@ -0,0 +1,284 @@
+import torch
+import torch.nn as nn
+from torch import optim
+from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
+
+import logging
+
+import example_setup as bmk
+import darts
+import candle
+
+from operations import (
+    Stem, OPS
+)
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("darts_advanced")
+
+
+def initialize_parameters():
+    """ Initialize the parameters for the Advanced example
+
+    Returns:
+        the parameters produced by `candle.finalize_parameters` for
+        the benchmark described in `default_model.txt`
+    """
+    advanced_example = bmk.AdvancedExample(
+        bmk.file_path,
+        'default_model.txt',
+        'pytorch',
+        prog='advanced_example',
+        desc='Differentiable Architecture Search - Advanced example',
+    )
+
+    return candle.finalize_parameters(advanced_example)
+
+
+def _mnist_loader(train, batch_size):
+    """ Build a shuffled DataLoader over one split of MNIST
+
+    Args:
+        train: True for the training split, False for the test split
+
+        batch_size: number of samples per batch
+
+    Returns:
+        a DataLoader over the normalized images; the data is
+        downloaded into `./data` when it is not already there
+    """
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.1307,), (0.3081,))
+    ])
+    dataset = datasets.MNIST(
+        './data', train=train, download=True, transform=transform
+    )
+    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+
+def run(params):
+    """ Run the DARTS architecture search on MNIST
+
+    Args:
+        params: mapping of hyperparameters, unpacked into a
+            `candle.ArgumentStruct`
+    """
+    args = candle.ArgumentStruct(**params)
+
+    args.cuda = torch.cuda.is_available()
+    device = torch.device("cuda" if args.cuda else "cpu")
+    darts.banner(device=device)
+
+    trainloader = _mnist_loader(train=True, batch_size=args.batch_size)
+    validloader = _mnist_loader(train=False, batch_size=args.batch_size)
+
+    tasks = {
+        'digits': 10,
+    }
+
+    criterion = nn.CrossEntropyLoss().to(device)
+
+    stem = Stem(cell_dim=100)
+
+    model = darts.Network(
+        stem, cell_dim=100, classifier_dim=676,
+        ops=OPS, tasks=tasks, criterion=criterion, device=device
+    ).to(device)
+
+    architecture = darts.Architecture(model, args, device=device)
+
+    optimizer = optim.SGD(
+        model.parameters(),
+        args.learning_rate,
+        momentum=args.momentum,
+        weight_decay=args.weight_decay
+    )
+
+    scheduler = optim.lr_scheduler.CosineAnnealingLR(
+        optimizer,
+        float(args.epochs),
+        eta_min=args.learning_rate_min
+    )
+
+    train_meter = darts.EpochMeter(tasks, 'train')
+    valid_meter = darts.EpochMeter(tasks, 'valid')
+
+    for epoch in range(args.epochs):
+
+        # Read the rate held by the optimizer; `scheduler.get_lr()` is
+        # only meant to be called from within `scheduler.step()`.
+        lr = optimizer.param_groups[0]['lr']
+        logger.info(f'\nEpoch: {epoch} lr: {lr}')
+
+        genotype = model.genotype()
+        logger.info(f'Genotype: {genotype}\n')
+
+        train(
+            trainloader,
+            model,
+            architecture,
+            criterion,
+            optimizer,
+            scheduler,
+            args,
+            tasks,
+            train_meter,
+            device
+        )
+
+        validate(validloader, model, criterion, args, tasks, valid_meter, device)
+
+
+def train(trainloader,
+          model,
+          architecture,
+          criterion,
+          optimizer,
+          scheduler,
+          args,
+          tasks,
+          meter,
+          device):
+    """ Train for one epoch, updating alpha and then the weights
+
+    Args:
+        trainloader: loader that supplies both the weight batches and
+            the architecture search batches
+
+        model: network being searched
+
+        architecture: object whose `step` updates alpha
+
+        criterion: loss passed to `darts.multitask_loss`
+
+        optimizer: optimizer of the network weights
+
+        scheduler: learning rate scheduler, stepped after every batch
+
+        args: hyperparameters; `grad_clip`, `log_interval` and
+            `save_path` are read here
+
+        tasks: dictionary of tasks (not used in this function)
+
+        meter: epoch meter that accumulates loss and accuracy
+
+        device: device the batches are moved to
+    """
+    # NOTE(review): the search batches come from the training loader
+    # too -- confirm that a held-out split is not intended.
+    search_iter = iter(trainloader)
+
+    for step, (data, target) in enumerate(trainloader):
+        batch_size = data.size(0)
+        model.train()
+        target = _wrap_target(target)
+        data = darts.to_device(data, device)
+        target = darts.to_device(target, device)
+
+        x_search, target_search = next(search_iter)
+        target_search = _wrap_target(target_search)
+        x_search = darts.to_device(x_search, device)
+        target_search = darts.to_device(target_search, device)
+
+        # Current learning rate, read from the optimizer itself
+        lr = optimizer.param_groups[0]['lr']
+
+        # 1. update alpha
+        architecture.step(
+            data,
+            target,
+            x_search,
+            target_search,
+            lr,
+            optimizer,
+            unrolled=False
+        )
+
+        logits = model(data)
+        loss = darts.multitask_loss(target, logits, criterion, reduce='mean')
+
+        # 2. update weight
+        optimizer.zero_grad()
+        loss.backward()
+        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
+        optimizer.step()
+        # NOTE(review): stepped per batch, while the cosine period set
+        # in `run` is the number of epochs -- confirm this is intended.
+        scheduler.step()
+
+        prec1 = darts.multitask_accuracy_topk(logits, target, topk=(1,))
+        meter.update_batch_loss(loss.item(), batch_size)
+        meter.update_batch_accuracy(prec1, batch_size)
+
+        if step % args.log_interval == 0:
+            logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}')
+
+    meter.update_epoch()
+    meter.save(args.save_path)
+
+
+def validate(validloader, model, criterion, args, tasks, meter, device):
+    """ Evaluate the model on the validation data for one epoch
+
+    Gradients are disabled and the model is put in evaluation mode.
+
+    Args:
+        validloader: loader that supplies the validation batches
+
+        model: network being evaluated
+
+        criterion: loss passed to `darts.multitask_loss`
+
+        args: hyperparameters; `log_interval` and `save_path` are
+            read here
+
+        tasks: dictionary of tasks (not used in this function)
+
+        meter: epoch meter that accumulates loss and accuracy
+
+        device: device the batches are moved to
+    """
+    model.eval()
+    with torch.no_grad():
+        for step, (data, target) in enumerate(validloader):
+            target = _wrap_target(target)
+
+            data = darts.to_device(data, device)
+            target = darts.to_device(target, device)
+
+            batch_size = data.size(0)
+
+            logits = model(data)
+            loss = darts.multitask_loss(target, logits, criterion, reduce='mean')
+
+            prec1 = darts.multitask_accuracy_topk(logits, target, topk=(1,))
+            meter.update_batch_loss(loss.item(), batch_size)
+            meter.update_batch_accuracy(prec1, batch_size)
+
+            if step % args.log_interval == 0:
+                logger.info(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}')
+
+    meter.update_epoch()
+    meter.save(args.save_path)
+
+
+def _wrap_target(target):
+    """ Wrap the MNIST target in a dictionary
+
+    The multitask classifier of DARTS expects a
+    dictionary of target tasks. Here we simply wrap
+    MNIST's target in a dictionary.
+ """ + return {'digits': target} + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__=='__main__': + main() diff --git a/examples/darts/advanced/default_model.txt b/examples/darts/advanced/default_model.txt new file mode 100644 index 00000000..13a35b75 --- /dev/null +++ b/examples/darts/advanced/default_model.txt @@ -0,0 +1,14 @@ +[Global_Params] +model_name = 'darts_uno' +data_url = 'ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +save_path = './results' +log_interval = 100 +train_data = 'top_21_auc_1fold.uno.h5' +learning_rate = 0.025 +learning_rate_min = 0.001 +momentum = 0.9 +weight_decay = 3e-4 +grad_clip = 5 +batch_size = 100 +epochs = 10 +rng_seed = 13 diff --git a/examples/darts/advanced/example_setup.py b/examples/darts/advanced/example_setup.py new file mode 100644 index 00000000..53f46fac --- /dev/null +++ b/examples/darts/advanced/example_setup.py @@ -0,0 +1,46 @@ +import os +import sys + + +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', '..', 'common')) +sys.path.append(lib_path2) + + +import candle + +additional_definitions = [ + {'name':'grad_clip','type':int}, + {'name':'learning_rate_min','type':float, 'help':'Minimum learning rate'}, + {'name':'log_interval','type':int, 'help':'Logging interval'}, + {'name':'unrolled','type':candle.str2bool}, + {'name':'weight_decay','type':float}, + {'name':'grad_clip','type':int} +] + +REQUIRED = [ + 'learning_rate', + 'learning_rate_min', + 'momentum', + 'weight_decay', + 'grad_clip', + 'rng_seed', + 'batch_size', + 'epochs', +] + + +class AdvancedExample(candle.Benchmark): + """ Example for Advanced use of DARTS """ + + def set_locals(self): + """ Set parameters for the benchmark. + + Args: + required: set of required parameters for the benchmark. 
+ """ + if REQUIRED is not None: + self.required = set(REQUIRED) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + diff --git a/examples/darts/advanced/operations.py b/examples/darts/advanced/operations.py new file mode 100644 index 00000000..d07c878d --- /dev/null +++ b/examples/darts/advanced/operations.py @@ -0,0 +1,94 @@ +import torch.nn as nn +import torch.nn.functional as F + + +""" DARTS operations contstructor """ +OPS = { + 'none' : lambda c, stride, affine: Identity(), + 'conv_3' : lambda c, stride, affine: ConvBlock(c, c, 3, stride), + 'dil_conv': lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine) +} + + +class Stem(nn.Module): + """ Network stem + + This will always be the beginning of the network. + DARTS will only recompose modules after the stem. + For this reason, we define this separate from the + other modules in the network. + + Args: + input_dim: the input dimension for your data + + cell_dim: the intermediate dimension size for + the remaining modules of the network. 
+ """ + def __init__(self, in_channels: int=1, cell_dim: int=100, kernel_size=3): + super(Stem, self).__init__() + self.stem = nn.Conv2d(in_channels, cell_dim, kernel_size) + + def forward(self, x): + x = self.stem(x) +# print(f'stem: {x.shape}') + return x + + +class ConvBlock(nn.Module): + """ ReLu -> Conv2d """ + + def __init__(self, c_in, c_out, kernel_size, stride, affine=True): + super(ConvBlock, self).__init__() + self.conv = nn.Conv2d( + c_in, c_out, kernel_size=kernel_size, stride=stride + ) + + def forward(self, x): + return self.conv(F.relu(x)) + + +class DilConv(nn.Module): + """ ReLU Dilated Convolution """ + + def __init__(self, c_in, c_out, kernel_size, + stride, padding, dilation, affine=True): + super(DilConv, self).__init__() + + self.op = nn.Sequential( + nn.ReLU(inplace=False), + + nn.Conv2d( + c_in, + c_in, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=c_in, + bias=False + ), + + nn.Conv2d( + c_in, + c_out, + kernel_size=1, + padding=0, + bias=False + ), + + nn.BatchNorm2d(c_out, affine=affine), + ) + + def forward(self, x): + return self.op(x) + + +class Identity(nn.Module): + """ Identity module """ + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + diff --git a/examples/darts/uno/.gitignore b/examples/darts/uno/.gitignore new file mode 100644 index 00000000..8d5ef26b --- /dev/null +++ b/examples/darts/uno/.gitignore @@ -0,0 +1 @@ +ftp.mcs.anl.gov/ diff --git a/examples/darts/uno/README.rst b/examples/darts/uno/README.rst new file mode 100644 index 00000000..1e421e98 --- /dev/null +++ b/examples/darts/uno/README.rst @@ -0,0 +1,77 @@ +========= +DARTS UNO +========= + +Let's take a look at a look at using DARTS for the Pilot 1 Uno example. In the Uno +problem the task is to classify tumor dose response with respect to a few different +data sources. For simplicity, we will use one source, Uno's gene data, to be used +for this classification. 
+ +The Uno models are typically fully connected deep networks. DARTS provides some basic linear network +primitives which can be found in `darts.modules.operations.linear.py`_. For simplicity, we will make +use of those primitives for this example. To see how we can define new primitives, see the `advanced`_ +example. + +There are two main abstractions that we need to instantiate in order to get up and running: + +* **LinearNetwork**: + +.. code-block:: python + + LinearNetwork(input_dim, tasks, criterion, device) + +The *LinearNetwork* takes a few parameters: + +1. *input_dim* (int): the data input dimension +2. *tasks* (Dict[str, int]): a dictionary of classification tasks where the keys are the task names + and the values are the number of classes for that task. +3. *criterion*: a Pytorch loss function +4. *device* (str): either "cpu" or "gpu" + +* **Architecture**: + +.. code-block:: python + + Architecture(model, args, device) + +The *Architecture* expects the following arguments: + +1. *model*: an instance of the *LinearNetwork* +2. *args*: an instance of argparse args containing the weight decay and momentum parameters for the + *Architecture*'s optimizer controlling the Hessian optimization. +3. *device* (str): "cpu" or "gpu" + +Model training should be familiar to those that are accustomed to using Pytorch with one small difference: + +.. code-block:: python + + # ... + for step, (data, target) in enumerate(trainloader): + #... + architecture.step( + data, target, x_search, target_search, lr, optimizer, unrolled + ) + # ... + # ... + +To understand what is going on here, recall that DARTS is a bi-level optimization procedure, +where there are two Pytorch optimizers, one for the normal gradient step for our model weights, +and another for our *Architecture* to step in the composition of our neural net's nodes. The +*architecture.step* function is then taking that composition step. 
It expects that we pass it our +data and labels of the training set, but also the data and labels of our validation set. For +simplicity of this tutorial, *x_search* and *target_search* are from our training set, but these +would normally use a separate validation set. + +Run the Example +--------------- + +.. code-block:: + + python uno_example.py + +.. References +.. ---------- +.. _paper: https://openreview.net/forum?id=S1eYHoC5FX +.. _darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py +.. _darts.modules.operations.linear.py: ../../../common/darts/modules.operations.linear.py +.. _advanced: ../advanced diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt new file mode 100644 index 00000000..cb69d184 --- /dev/null +++ b/examples/darts/uno/default_model.txt @@ -0,0 +1,14 @@ +[Global_Params] +model_name = 'darts_uno' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +save_path = './results' +log_interval = 100 +train_data = 'top_21_auc_1fold.uno.h5' +learning_rate = 0.025 +learning_rate_min = 0.001 +momentum = 0.9 +weight_decay = 3e-4 +grad_clip = 5 +batch_size = 100 +epochs = 10 +rng_seed = 13 diff --git a/examples/darts/uno/example_setup.py b/examples/darts/uno/example_setup.py new file mode 100644 index 00000000..75c36fbe --- /dev/null +++ b/examples/darts/uno/example_setup.py @@ -0,0 +1,48 @@ +import os +import sys + + +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', '..', 'common')) +sys.path.append(lib_path2) + + +import candle + +additional_definitions = [ +{'name':'grad_clip','type':int}, +{'name':'learning_rate_min','type':float, 'help':'Minimum learning rate'}, +{'name':'log_interval','type':int, 'help':'Logging interval'}, +{'name':'unrolled','type':candle.str2bool}, +{'name':'weight_decay','type':float}, +{'name':'grad_clip','type':int} +] + +REQUIRED = [ + 'learning_rate', + 
'learning_rate_min', + 'momentum', + 'weight_decay', + 'grad_clip', + 'rng_seed', + 'batch_size', + 'epochs', +] + + +class UnoExample(candle.Benchmark): + """ Example for Uno """ + + def set_locals(self): + """ Set parameters for the benchmark. + + Args: + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. + """ + if REQUIRED is not None: + self.required = set(REQUIRED) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py new file mode 100644 index 00000000..9b4bd3be --- /dev/null +++ b/examples/darts/uno/uno_example.py @@ -0,0 +1,199 @@ +import logging + +import torch +import torch.nn as nn +from torch import optim +from torch.utils.data import DataLoader +import logging + +import example_setup as bmk +import darts +import candle + +logging.basicConfig(level = logging.INFO) +logger = logging.getLogger("darts_uno") + + +def initialize_parameters(): + """ Initialize the parameters for the Uno example """ + + uno_example = bmk.UnoExample( + bmk.file_path, + 'default_model.txt', + 'pytorch', + prog='uno_example', + desc='Differentiable Architecture Search - Uno example', + ) + + # Initialize parameters + gParameters = candle.finalize_parameters(uno_example) + return gParameters + + +def run(params): + args = candle.ArgumentStruct(**params) + + args.cuda = torch.cuda.is_available() + device = torch.device(f"cuda" if args.cuda else f"cpu") + darts.banner(device=device) + + train_data = darts.Uno('./data', 'train', download=True) + valid_data = darts.Uno('./data', 'test') + + trainloader = DataLoader(train_data, batch_size=args.batch_size) + validloader = DataLoader(valid_data, batch_size=args.batch_size) + + criterion = nn.CrossEntropyLoss().to(device) + + tasks = { + 'response': 2, + } + + model = darts.LinearNetwork( + 
input_dim=942, tasks=tasks, criterion=criterion, device=device + ).to(device) + + architecture = darts.Architecture(model, args, device=device) + + optimizer = optim.SGD( + model.parameters(), + args.learning_rate, + momentum=args.momentum, + weight_decay=args.weight_decay + ) + + scheduler = optim.lr_scheduler.CosineAnnealingLR( + optimizer, + float(args.epochs), + eta_min=args.learning_rate_min + ) + + train_meter = darts.EpochMeter(tasks, 'train') + valid_meter = darts.EpochMeter(tasks, 'valid') + + genotype_store = darts.GenotypeStorage(root=args.save_path) + + for epoch in range(args.epochs): + + lr = scheduler.get_lr()[0] + logger.info(f'\nEpoch: {epoch} lr: {lr}') + + genotype = model.genotype() + logger.info(f'Genotype: {genotype}\n') + + train( + trainloader, + model, + architecture, + criterion, + optimizer, + scheduler, + args, + tasks, + train_meter, + genotype, + genotype_store, + device + ) + + validate(validloader, model, criterion, args, tasks, valid_meter, device) + + +def train(trainloader, + model, + architecture, + criterion, + optimizer, + scheduler, + args, + tasks, + meter, + genotype, + genotype_store, + device): + + valid_iter = iter(trainloader) + min_accuracy = 0.0 + for step, (data, target) in enumerate(trainloader): + + batch_size = data.size(0) + model.train() + + data = darts.to_device(data, device) + target = darts.to_device(target, device) + + x_search, target_search = next(valid_iter) + x_search = darts.to_device(x_search, device) + target_search = darts.to_device(target_search, device) + + lr = scheduler.get_lr()[0] + + # 1. update alpha + architecture.step( + data, + target, + x_search, + target_search, + lr, + optimizer, + unrolled=False + ) + + logits = model(data) + loss = darts.multitask_loss(target, logits, criterion, reduce='mean') + + # 2. 
update weight + optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) + optimizer.step() + scheduler.step() + + prec1 = darts.multitask_accuracy_topk(logits, target, topk=(1,)) + meter.update_batch_loss(loss.item(), batch_size) + meter.update_batch_accuracy(prec1, batch_size) + + accuracy_avg = meter.acc_meter.get_avg_accuracy('response') + if accuracy_avg > min_accuracy: + genotype_store.save_genotype(genotype) + min_accuracy = accuracy_avg + + if step % args.log_interval == 0: + logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}') + + meter.update_epoch() + meter.save(args.save_path) + + + +def validate(validloader, model, criterion, args, tasks, meter, device): + model.eval() + with torch.no_grad(): + for step, (data, target) in enumerate(validloader): + + data = darts.to_device(data, device) + target = darts.to_device(target, device) + + batch_size = data.size(0) + + logits = model(data) + loss = darts.multitask_loss(target, logits, criterion, reduce='mean') + + prec1 = darts.multitask_accuracy_topk(logits, target, topk=(1,)) + meter.update_batch_loss(loss.item(), batch_size) + meter.update_batch_accuracy(prec1, batch_size) + + if step % args.log_interval == 0: + logger.info(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') + + meter.update_epoch() + meter.save(args.save_path) + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__=='__main__': + main() diff --git a/examples/mnist/README.md b/examples/mnist/README.md new file mode 100644 index 00000000..b9f5ab31 --- /dev/null +++ b/examples/mnist/README.md @@ -0,0 +1,59 @@ +# MNIST Example + +This example demonstrate how to convert keras code into CANDLE compliant. +Please refer [tutorial](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html) for more detail. 
+ +Here is the list of files: + +- mnist.py: CANDLE class +- mnist_cnn.py and mnist_mlp.py: original mnist implementation from keras project +- mnist_cnn_candle.py: mnist_cnn.py converted in CANDLE compliant mode +- mnist_mlp_candle.py: mnist_mlp.py converted in CANDLE compliant mode +- mnist_params.txt: model parameters are stored in a file for reproducibility + + +``` +$ python mnist_cnn_candle.py -e 3 +Using TensorFlow backend. + +Importing candle utils for keras +Params: +{'activation': 'relu', +'batch_size': 128, +'data_type': , +'epochs': 3, +'experiment_id': 'EXP000', +'gpus': [], +'logfile': None, +'optimizer': 'rmsprop', +'output_dir': '/Users/hsyoo/projects/CANDLE/Benchmarks/examples/mnist/Output/EXP000/RUN000', +'profiling': False, +'rng_seed': 7102, +'run_id': 'RUN000', +'shuffle': False, +'timeout': -1, +'train_bool': True, +'verbose': None} +Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz +11493376/11490434 [==============================] - 2s 0us/step +x_train shape: (60000, 28, 28, 1) +60000 train samples +10000 test samples +Instructions for updating: +Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`. + +Instructions for updating: +Use tf.where in 2.0, which has the same broadcast rule as np.where +Train on 60000 samples, validate on 10000 samples +Epoch 1/3 +2020-05-13 11:53:17.373979: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA +To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags. +2020-05-13 11:53:17.374474: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 16. Tune using inter_op_parallelism_threads for best performance. 
+60000/60000 [==============================] - 56s 932us/step - loss: 0.2719 - acc: 0.9157 - val_loss: 0.0683 - val_acc: 0.9774 +Epoch 2/3 +60000/60000 [==============================] - 55s 909us/step - loss: 0.0904 - acc: 0.9733 - val_loss: 0.0411 - val_acc: 0.9872 +Epoch 3/3 +60000/60000 [==============================] - 55s 909us/step - loss: 0.0666 - acc: 0.9808 - val_loss: 0.0339 - val_acc: 0.9893 +Test loss: 0.03386178284487105 +Test accuracy: 0.9893 +``` diff --git a/examples/mnist/mnist_cnn_candle.py b/examples/mnist/mnist_cnn_candle.py index bfade296..126869db 100644 --- a/examples/mnist/mnist_cnn_candle.py +++ b/examples/mnist/mnist_cnn_candle.py @@ -13,7 +13,7 @@ def initialize_parameters(): ) # Initialize parameters - gParameters = candle.initialize_parameters(mnist_common) + gParameters = candle.finalize_parameters(mnist_common) csv_logger = CSVLogger('{}/params.log'.format(gParameters)) return gParameters diff --git a/examples/mnist/mnist_mlp_candle.py b/examples/mnist/mnist_mlp_candle.py index e6288100..d3a0cd34 100644 --- a/examples/mnist/mnist_mlp_candle.py +++ b/examples/mnist/mnist_mlp_candle.py @@ -13,7 +13,7 @@ def initialize_parameters(): ) # Initialize parameters - gParameters = candle.initialize_parameters(mnist_common) + gParameters = candle.finalize_parameters(mnist_common) csv_logger = CSVLogger('{}/params.log'.format(gParameters)) return gParameters diff --git a/examples/unet/unet_candle.py b/examples/unet/unet_candle.py index 3c7a6c11..cb15f156 100644 --- a/examples/unet/unet_candle.py +++ b/examples/unet/unet_candle.py @@ -13,7 +13,7 @@ def initialize_parameters(): ) # Initialize parameters - gParameters = candle.initialize_parameters(unet_common) + gParameters = candle.finalize_parameters(unet_common) return gParameters def run(gParameters):