Skip to content

Commit

Permalink
Merge pull request #374 from parthgvora/staging
Browse files Browse the repository at this point in the history
Implement SPORF
  • Loading branch information
levinwil authored Dec 15, 2020
2 parents 2dc6db6 + 6e1205e commit df734df
Show file tree
Hide file tree
Showing 9 changed files with 1,810 additions and 6 deletions.
2 changes: 2 additions & 0 deletions docs/tutorials.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ The following tutorials highlight what one can do with the ``ProgLearn`` package
tutorials/random_class_exp
tutorials/rotation_cifar
tutorials/spiral_exp
tutorials/sporf_datasets
tutorials/sporf_decision_boundaries
tutorials/uncertaintyforest_running_example
tutorials/uncertaintyforest_posteriorestimates
tutorials/uncertaintyforest_conditionalentropyestimates
Expand Down
90 changes: 90 additions & 0 deletions docs/tutorials/functions/sporf_datasets_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import sys
import numpy as np
import pandas as pd
import csv
from numpy import genfromtxt

from proglearn.progressive_learner import ProgressiveLearner
from proglearn.voters import TreeClassificationVoter
from proglearn.transformers import TreeClassificationTransformer
from proglearn.transformers import ObliqueTreeClassificationTransformer
from proglearn.deciders import SimpleArgmaxAverage

from sklearn.model_selection import train_test_split, cross_val_score

def load_simulated_data(file):
    """Read a comma-delimited numeric table and split it into features and labels.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``numpy.genfromtxt``.

    Returns
    -------
    X : ndarray
        Every column except the last one.
    y : ndarray
        The last column.
    """
    table = genfromtxt(file, delimiter=',')
    # The final column holds the label; everything before it is a feature.
    features, labels = table[:, :-1], table[:, -1]
    return features, labels

def _load_hill_valley(data_file):
    """Read a Hill_Valley CSV: the last column is the label, the rest are features."""
    df = pd.read_csv(data_file)
    X = df[df.columns[:-1]].to_numpy()
    y = df[df.columns[-1]].to_numpy()
    return X, y


def _load_acute(data_file, task_num):
    """Read the UTF-16, tab-separated "acute" table and pick the label column for the task."""
    df = pd.read_table(data_file, encoding='utf-16')
    # Encode the yes/no answers numerically.
    df[df == "no"] = 0
    df[df == "yes"] = 1

    data = df.to_numpy()

    # The first column uses a decimal comma ("35,5"); normalise it to a float.
    # Going through str() also tolerates values that were already parsed as
    # numbers.
    data[:, 0] = np.array(
        [float(str(temp).replace(",", ".")) for temp in data[:, 0]]
    )

    # NOTE(review): only columns 0-4 are used as features while the labels are
    # read from columns 6 and 7, so column 5 is never used -- confirm that
    # this is intended before changing it (it would alter experiment results).
    X = np.array(data[:, :5], dtype=float)

    # Column 6 holds the label for task 1, column 7 for any other task.
    label_column = 6 if task_num == 1 else 7
    y = np.array(data[:, label_column], dtype=float)
    return X, y


def load_data(data_file, task_num):
    """Load a supported dataset and split it into train and test sets.

    The dataset type is inferred from the file path: a path containing
    ``"acute"`` is read as the acute table, a path containing
    ``"Hill_Valley"`` as a Hill_Valley CSV. If both substrings are present
    the acute reader wins, which matches the previous behaviour.

    Parameters
    ----------
    data_file : str
        Path of the data file; also used to select the reader.
    task_num : int
        Only used for the acute data: ``1`` selects label column 6, any other
        value selects label column 7.

    Returns
    -------
    X_train, X_test, y_train, y_test : ndarray
        A shuffled, stratified split with 10% of the samples held out.
    n_classes : int
        Number of distinct labels in the full dataset.

    Raises
    ------
    ValueError
        If ``data_file`` matches none of the supported datasets.
    """
    if "acute" in data_file:
        X, y = _load_acute(data_file, task_num)
    elif "Hill_Valley" in data_file:
        X, y = _load_hill_valley(data_file)
    else:
        # Previously this fell through to the return statement and failed
        # with an opaque UnboundLocalError.
        raise ValueError(
            "Unsupported data file %r: expected a path containing "
            "'Hill_Valley' or 'acute'" % (data_file,)
        )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, shuffle=True, stratify=y
    )

    return X_train, X_test, y_train, y_test, len(np.unique(y))


def test(data_file, reps, n_trees, task_num,
         default_transformer_class, default_transformer_kwargs):
    """Repeatedly train and score a ProgressiveLearner on one dataset.

    Every repetition reloads the data through ``load_data`` (producing a new
    train/test split), fits a learner with ``n_trees`` transformers on the
    training part and measures accuracy on the held-out part. The accuracy is
    rescaled against the chance level ``1 / n_classes``.

    Parameters
    ----------
    data_file : str
        Path forwarded to ``load_data``.
    reps : int
        Number of repetitions.
    n_trees : int
        Passed as ``num_transformers`` to ``add_task``.
    task_num : int
        Forwarded to ``load_data``.
    default_transformer_class, default_transformer_kwargs
        Transformer configuration forwarded to ``ProgressiveLearner``.

    Returns
    -------
    tuple of float
        Mean of the rescaled accuracies times 100, and their standard
        deviation times 100 divided by ``sqrt(reps)``.
    """
    voter_kwargs = {}
    kappas = np.zeros(reps)

    for rep in range(reps):
        X_train, X_test, y_train, y_test, n_classes = load_data(data_file, task_num)

        learner = ProgressiveLearner(
            default_transformer_class=default_transformer_class,
            default_transformer_kwargs=default_transformer_kwargs,
            default_voter_class=TreeClassificationVoter,
            default_voter_kwargs=voter_kwargs,
            default_decider_class=SimpleArgmaxAverage,
            default_decider_kwargs={"classes": np.arange(n_classes)},
        )
        learner.add_task(X_train, y_train, num_transformers=n_trees)

        predictions = learner.predict(X_test, task_id=0)
        accuracy = np.sum(y_test == predictions) / len(y_test)
        print("Accuracy after iteration ", rep, ": ", accuracy)

        # Rescale so that chance-level accuracy maps to 0 and perfect to 1.
        chance_level = 1 / n_classes
        kappas[rep] = (accuracy - chance_level) / (1 - chance_level)

    mean_percent = np.mean(kappas) * 100
    stderr_percent = (np.std(kappas) * 100) / np.sqrt(reps)
    return mean_percent, stderr_percent
96 changes: 96 additions & 0 deletions docs/tutorials/functions/sporf_decision_boundaries_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from rerf.rerfClassifier import rerfClassifier

import numpy as np
# Seed NumPy's global random number generator when this module is imported so
# that code drawing from np.random afterwards gives repeatable results.
np.random.seed(42)

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.ensemble import RandomForestClassifier

from proglearn.forest import LifelongClassificationForest
from proglearn.voters import TreeClassificationVoter
from proglearn.transformers import TreeClassificationTransformer
from proglearn.transformers import ObliqueTreeClassificationTransformer
from proglearn.deciders import SimpleArgmaxAverage

def test(NT, h, names, classifiers, datasets):
    """Draw a grid of subplots comparing classifier decision surfaces.

    Each dataset gets one panel showing its standardized points followed by
    one panel per (name, classifier) pair showing that classifier's decision
    surface, the same points, and the test accuracy in the lower-right corner.
    Everything is drawn through ``plt.subplot`` on the current figure; nothing
    is returned.

    Parameters
    ----------
    NT : int
        Passed as ``n_estimators`` when adding the task to the Proglearn forest.
    h : float
        Step size of the mesh on which the decision surface is evaluated.
    names : sequence of str
        Panel titles, paired with ``classifiers``. A name containing
        "Proglearn" makes the panel use a newly built oblique
        ``LifelongClassificationForest`` instead of the paired classifier.
    classifiers : sequence
        Estimators providing ``fit`` and ``score`` plus either
        ``decision_function`` or ``predict_proba``.
    datasets : sequence of (X, y)
        Columns 0 and 1 of ``X`` are plotted and classifiers are evaluated on
        a two-column mesh, so ``X`` presumably has exactly two features --
        confirm against the caller.
    """
    n_rows = len(datasets)
    n_cols = len(classifiers) + 1

    def _draw_points(axis, cmap, train_X, train_y, test_X, test_y):
        # Training points are opaque, test points are semi-transparent.
        axis.scatter(train_X[:, 0], train_X[:, 1], c=train_y, cmap=cmap,
                     edgecolors='k')
        axis.scatter(test_X[:, 0], test_X[:, 1], c=test_y, cmap=cmap,
                     alpha=0.6, edgecolors='k')

    def _frame_axes(axis, grid_x, grid_y):
        # Clip the view to the mesh extent and hide the tick marks.
        axis.set_xlim(grid_x.min(), grid_x.max())
        axis.set_ylim(grid_y.min(), grid_y.max())
        axis.set_xticks(())
        axis.set_yticks(())

    # Running 1-based subplot index, advanced once per drawn panel.
    panel = 1
    for row, (X, y) in enumerate(datasets):
        is_first_row = row == 0

        # Standardize the features, then hold out 40% for testing.
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=.4, random_state=42)

        # Mesh covering the data range with a margin of 0.5 on every side.
        x_lo, x_hi = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_lo, y_hi = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_lo, x_hi, h),
                             np.arange(y_lo, y_hi, h))

        surface_cmap = plt.cm.RdBu
        point_cmap = ListedColormap(['#FF0000', '#0000FF'])

        # First panel of the dataset: the raw points only.
        ax = plt.subplot(n_rows, n_cols, panel)
        if is_first_row:
            ax.set_title("Input data")
        _draw_points(ax, point_cmap, X_train, y_train, X_test, y_test)
        _frame_axes(ax, xx, yy)
        panel += 1

        # Remaining panels: one per classifier.
        for name, clf in zip(names, classifiers):
            ax = plt.subplot(n_rows, n_cols, panel)
            uses_proglearn = "Proglearn" in name

            if uses_proglearn:
                # The paired classifier is replaced by a new oblique forest.
                clf = LifelongClassificationForest(
                    oblique=True,
                    default_feature_combinations=1,
                    default_density=0.5)
                clf.add_task(X_train, y_train, n_estimators=NT)
                y_hat = clf.predict(X_test, task_id=0)
                score = np.sum(y_hat == y_test) / len(y_test)
            else:
                clf.fit(X_train, y_train)
                score = clf.score(X_test, y_test)

            # Evaluate the classifier on every mesh point; prefer the
            # decision function when the estimator offers one.
            mesh_points = np.c_[xx.ravel(), yy.ravel()]
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(mesh_points)
            elif uses_proglearn:
                Z = clf.predict_proba(mesh_points, task_id=0)[:, 1]
            else:
                Z = clf.predict_proba(mesh_points)[:, 1]

            # Colour the mesh by the classifier output.
            ax.contourf(xx, yy, Z.reshape(xx.shape), cmap=surface_cmap,
                        alpha=.8)

            _draw_points(ax, point_cmap, X_train, y_train, X_test, y_test)
            _frame_axes(ax, xx, yy)

            if is_first_row:
                ax.set_title(name)
            # Accuracy label without its leading zero, e.g. ".93".
            ax.text(xx.max() - .3, yy.min() + .3,
                    ('%.2f' % score).lstrip('0'),
                    size=15, horizontalalignment='right')
            panel += 1
Loading

0 comments on commit df734df

Please sign in to comment.