Merge pull request #374 from parthgvora/staging

Implement SPORF
neurodata · Dec 15, 2020 · df734df · df734df
2 parents 2dc6db6 + 6e1205e
commit df734df
Show file tree

Hide file tree

Showing 9 changed files with 1,810 additions and 6 deletions.
diff --git a/docs/tutorials.rst b/docs/tutorials.rst
@@ -13,6 +13,8 @@ The following tutorials highlight what one can do with the ``ProgLearn`` package
     tutorials/random_class_exp
     tutorials/rotation_cifar
     tutorials/spiral_exp
+    tutorials/sporf_datasets
+    tutorials/sporf_decision_boundaries
     tutorials/uncertaintyforest_running_example
     tutorials/uncertaintyforest_posteriorestimates
     tutorials/uncertaintyforest_conditionalentropyestimates

diff --git a/docs/tutorials/functions/sporf_datasets_functions.py b/docs/tutorials/functions/sporf_datasets_functions.py
@@ -0,0 +1,90 @@
+import sys
+import numpy as np
+import pandas as pd
+import csv
+from numpy import genfromtxt
+
+from proglearn.progressive_learner import ProgressiveLearner
+from proglearn.voters import TreeClassificationVoter
+from proglearn.transformers import TreeClassificationTransformer
+from proglearn.transformers import ObliqueTreeClassificationTransformer
+from proglearn.deciders import SimpleArgmaxAverage
+
+from sklearn.model_selection import train_test_split, cross_val_score
+
+def load_simulated_data(file):
+    """Read a comma-delimited numeric table and split off its last column.
+
+    Parameters
+    ----------
+    file : str, path-like, file-like, or list of str
+        Forwarded unchanged as the first argument of ``numpy.genfromtxt``.
+
+    Returns
+    -------
+    X : ndarray
+        Every column of the table except the last one.
+    y : ndarray
+        The last column of the table.
+    """
+    table = genfromtxt(file, delimiter=',')
+    features, labels = table[:, :-1], table[:, -1]
+    return features, labels
+
+def load_data(data_file, task_num):
+    """Load a supported dataset and return a stratified 90/10 train/test split.
+
+    The dataset is chosen by substring match on ``data_file``:
+
+    * ``"acute"`` -- the file is read with ``pd.read_table`` as UTF-16,
+      "yes"/"no" cells are mapped to 1/0, and the first column (numbers
+      written with a decimal comma) is converted to float.
+    * ``"Hill_Valley"`` -- the file is read with ``pd.read_csv``; the last
+      column is the label and all remaining columns are features.
+
+    Parameters
+    ----------
+    data_file : str
+        Path of the data file; its text decides which loader is used.
+    task_num : int
+        Only used for the "acute" data: ``1`` selects column 6 as the label,
+        any other value selects column 7.
+
+    Returns
+    -------
+    X_train, X_test, y_train, y_test : ndarray
+        Output of ``train_test_split`` with ``test_size=0.1``, shuffling
+        enabled and stratification on the labels.
+    n_classes : int
+        Number of distinct label values in the full (unsplit) label vector.
+
+    Raises
+    ------
+    ValueError
+        If ``data_file`` contains neither "Hill_Valley" nor "acute".
+    """
+    # "acute" is tested first so that a path containing both markers still
+    # resolves to the acute data, which is what two independent ``if``
+    # statements produced before.
+    if "acute" in data_file:
+        df = pd.read_table(data_file, encoding='utf-16')
+        df[df == "no"] = 0
+        df[df == "yes"] = 1
+
+        data = df.to_numpy()
+        # The first column holds numbers written with a decimal comma.
+        data[:, 0] = np.array(
+            [float(temp_str.replace(",", ".")) for temp_str in data[:, 0]]
+        )
+
+        # NOTE(review): features are columns 0-4 and labels are columns 6/7,
+        # so column 5 is never read -- confirm this is intentional.
+        X = np.array(data[:, :5], dtype=float)
+
+        # 6 for task 1, 7 for task 2
+        label_col = 6 if task_num == 1 else 7
+        y = np.array(data[:, label_col], dtype=float)
+    elif "Hill_Valley" in data_file:
+        df = pd.read_csv(data_file)
+        X = df[df.columns[:-1]].to_numpy()
+        y = df[df.columns[-1]].to_numpy()
+    else:
+        # Previously this fell through to the return statement and failed
+        # with an UnboundLocalError that did not mention the real cause.
+        raise ValueError(
+            "Unsupported data file %r: expected a path containing "
+            "'Hill_Valley' or 'acute'." % (data_file,)
+        )
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.1, shuffle=True, stratify=y
+    )
+
+    return X_train, X_test, y_train, y_test, len(np.unique(y))
+
+
+def test(data_file, reps, n_trees, task_num,
+         default_transformer_class, default_transformer_kwargs):
+    """Repeatedly train and score a ProgressiveLearner on one dataset.
+
+    Each repetition calls ``load_data`` for a fresh split, fits a single
+    task with ``n_trees`` transformers, predicts on the test part, prints
+    the accuracy, and records a chance-corrected score
+    ``(acc - 1/n_classes) / (1 - 1/n_classes)``.
+
+    Parameters
+    ----------
+    data_file : str
+        Passed to ``load_data``.
+    reps : int
+        Number of repetitions.
+    n_trees : int
+        Passed to ``add_task`` as ``num_transformers``.
+    task_num : int
+        Passed to ``load_data``.
+    default_transformer_class, default_transformer_kwargs
+        Forwarded unchanged to the ``ProgressiveLearner`` constructor.
+
+    Returns
+    -------
+    mean_pct : float
+        Mean of the per-repetition scores, times 100.
+    sem_pct : float
+        ``np.std`` of the scores times 100, divided by ``sqrt(reps)``.
+    """
+    voter_kwargs = {}
+    scores = np.zeros(reps)
+
+    for rep in range(reps):
+        X_train, X_test, y_train, y_test, n_classes = load_data(data_file, task_num)
+
+        learner = ProgressiveLearner(
+            default_transformer_class=default_transformer_class,
+            default_transformer_kwargs=default_transformer_kwargs,
+            default_voter_class=TreeClassificationVoter,
+            default_voter_kwargs=voter_kwargs,
+            default_decider_class=SimpleArgmaxAverage,
+            default_decider_kwargs={"classes": np.arange(n_classes)},
+        )
+        learner.add_task(X_train, y_train, num_transformers=n_trees)
+        predictions = learner.predict(X_test, task_id=0)
+
+        accuracy = np.sum(y_test == predictions) / len(y_test)
+        print("Accuracy after iteration ", rep, ": ", accuracy)
+
+        # Rescale so that guessing uniformly at random scores 0 and a
+        # perfect classifier scores 1.
+        chance = 1 / n_classes
+        scores[rep] = (accuracy - chance) / (1 - chance)
+
+    mean_pct = np.mean(scores) * 100
+    sem_pct = (np.std(scores) * 100) / np.sqrt(reps)
+    return mean_pct, sem_pct
diff --git a/docs/tutorials/functions/sporf_decision_boundaries_functions.py b/docs/tutorials/functions/sporf_decision_boundaries_functions.py
@@ -0,0 +1,96 @@
+from rerf.rerfClassifier import rerfClassifier
+
+import numpy as np
+np.random.seed(42)
+
+import matplotlib.pyplot as plt
+from matplotlib.colors import ListedColormap
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.datasets import make_moons, make_circles, make_classification
+from sklearn.ensemble import RandomForestClassifier
+
+from proglearn.forest import LifelongClassificationForest
+from proglearn.voters import TreeClassificationVoter
+from proglearn.transformers import TreeClassificationTransformer
+from proglearn.transformers import ObliqueTreeClassificationTransformer
+from proglearn.deciders import SimpleArgmaxAverage
+
+def test(NT, h, names, classifiers, datasets):
+    """Draw a grid of decision-boundary plots, one row per dataset.
+
+    Each row starts with a scatter plot of the standardized data, followed
+    by one panel per classifier showing its decision surface, the train and
+    test points, and its test score in the panel corner. Panels are created
+    with ``plt.subplot``; nothing is returned.
+
+    Parameters
+    ----------
+    NT : int
+        Passed as ``n_estimators`` to ``add_task`` for "Proglearn" entries.
+    h : float
+        Step size of the mesh on which the decision surface is evaluated.
+    names : sequence of str
+        Panel titles, paired with ``classifiers`` by position. A name that
+        contains "Proglearn" selects the LifelongClassificationForest path.
+    classifiers : sequence
+        Estimators providing ``fit``, ``score`` and either
+        ``decision_function`` or ``predict_proba``. The entry paired with a
+        "Proglearn" name is not used; a new forest is built instead.
+    datasets : sequence of (X, y)
+        Feature/label pairs; only the first two columns of X are plotted.
+    """
+    # Running 1-based index of the next subplot cell, in row-major order.
+    i = 1
+    # iterate over datasets
+    for ds_cnt, ds in enumerate(datasets):
+        # preprocess dataset, split into training and test part
+        X, y = ds
+        X = StandardScaler().fit_transform(X)
+        X_train, X_test, y_train, y_test = \
+            train_test_split(X, y, test_size=.4, random_state=42)
+
+        # Mesh covering the data range with a 0.5 margin on every side.
+        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
+        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
+        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                             np.arange(y_min, y_max, h))
+
+        # just plot the dataset first
+        cm = plt.cm.RdBu
+        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
+        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
+        # Column titles are only written on the first row.
+        if ds_cnt == 0:
+            ax.set_title("Input data")
+        # Plot the training points
+        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
+                   edgecolors='k')
+        # Plot the testing points
+        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
+                   edgecolors='k')
+        ax.set_xlim(xx.min(), xx.max())
+        ax.set_ylim(yy.min(), yy.max())
+        ax.set_xticks(())
+        ax.set_yticks(())
+        i += 1
+
+        # iterate over classifiers
+        for name, clf in zip(names, classifiers):
+            ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
+
+            if "Proglearn" in name:
+
+                # The classifier passed in for this name is discarded and a
+                # new oblique forest is trained for every dataset.
+                clf = LifelongClassificationForest(oblique=True,
+                                                   default_feature_combinations=1, default_density=0.5)
+                clf.add_task(X_train, y_train, n_estimators=NT)
+                y_hat = clf.predict(X_test, task_id=0)
+                score = np.sum(y_hat == y_test) / len(y_test)
+
+            else:
+                clf.fit(X_train, y_train)
+                score = clf.score(X_test, y_test)
+
+            # Plot the decision boundary. For that, we will assign a color to each
+            # point in the mesh [x_min, x_max]x[y_min, y_max].
+            # The decision_function check runs first for every classifier,
+            # including the forest built above.
+            if hasattr(clf, "decision_function"):
+                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+            elif "Proglearn" in name:
+                # Column 1 is presumably the probability of the second
+                # class -- TODO confirm against predict_proba's ordering.
+                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()], task_id=0)[:, 1]
+            else:
+                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
+
+            # Put the result into a color plot
+            Z = Z.reshape(xx.shape)
+            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
+
+            # Plot the training points
+            ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
+                       edgecolors='k')
+            # Plot the testing points
+            ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
+                       edgecolors='k', alpha=0.6)
+
+            ax.set_xlim(xx.min(), xx.max())
+            ax.set_ylim(yy.min(), yy.max())
+            ax.set_xticks(())
+            ax.set_yticks(())
+            if ds_cnt == 0:
+                ax.set_title(name)
+            # Test score in the lower-right corner, formatted to two
+            # decimals with leading zeros stripped.
+            ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
+                    size=15, horizontalalignment='right')
+            i += 1