From fc95e6b9166a59f0ceef30fd5960aeb54c138b46 Mon Sep 17 00:00:00 2001 From: ngow210 Date: Wed, 17 Jul 2024 11:36:35 +1200 Subject: [PATCH 1/8] Ensemble updated --- _episodes/04-ensemble-methods.md | 90 ++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/_episodes/04-ensemble-methods.md b/_episodes/04-ensemble-methods.md index f4dc28a..cdd24c1 100644 --- a/_episodes/04-ensemble-methods.md +++ b/_episodes/04-ensemble-methods.md @@ -168,6 +168,96 @@ plt.show() There is still some overfitting indicated by the regions that contain only single points but using the same hyper-parameter settings used to fit the decision tree classifier, we can see that overfitting is reduced. +## Stacking: classification +import seaborn as sns +penguins = sns.load_dataset('penguins') + +feature_names = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'] +penguins.dropna(subset=feature_names, inplace=True) + +species_names = penguins['species'].unique() + +# Define data and targets +X = penguins[feature_names] + +y = penguins.species + +# Split data in training and test set +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5) + +print(f'train size: {X_train.shape}') +print(f'test size: {X_test.shape}') + +from sklearn.ensemble import ( + GradientBoostingClassifier, + RandomForestClassifier, + VotingClassifier, +) +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process.kernels import RBF +from sklearn.tree import DecisionTreeClassifier + +# training estimators +rf_clf = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=1, random_state=5) +gb_clf = GradientBoostingClassifier(random_state=5) +gp_clf = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=5) +dt_clf = DecisionTreeClassifier(max_depth=5, random_state=5) + +voting_reg = VotingClassifier([("rf", rf_clf), ("gb", gb_clf), ("gp", 
gp_clf), ("dt", dt_clf)]) + +# fit voting estimator +voting_reg.fit(X_train, y_train) + +# lets also train the individual models for comparison +rf_clf.fit(X_train, y_train) +gb_clf.fit(X_train, y_train) +gp_clf.fit(X_train, y_train) +dt_clf.fit(X_train, y_train) + +import matplotlib.pyplot as plt + +# make predictions +X_test_20 = X_test[:20] # first 20 for visualisation + +rf_pred = rf_clf.predict(X_test_20) +gb_pred = gb_clf.predict(X_test_20) +gp_pred = gp_clf.predict(X_test_20) +dt_pred = dt_clf.predict(X_test_20) +voting_pred = voting_reg.predict(X_test_20) + +print(rf_pred) +print(gb_pred) +print(gp_pred) +print(dt_pred) +print(voting_pred) + +plt.figure() +plt.plot(gb_pred, "o", color="green", label="GradientBoostingClassifier") +plt.plot(rf_pred, "o", color="blue", label="RandomForestClassifier") +plt.plot(gp_pred, "o", color="darkblue", label="GuassianProcessClassifier") +plt.plot(dt_pred, "o", color="lightblue", label="DecisionTreeClassifier") +plt.plot(voting_pred, "x", color="red", ms=10, label="VotingRegressor") + +plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False) +plt.ylabel("predicted") +plt.xlabel("training samples") +plt.legend(loc="best") +plt.title("Regressor predictions and their average") + +plt.show() + +print(f'random forest: {rf_clf.score(X_test, y_test)}') + +print(f'gradient boost: {gb_clf.score(X_test, y_test)}') + +print(f'guassian process: {gp_clf.score(X_test, y_test)}') + +print(f'decision tree: {dt_clf.score(X_test, y_test)}') + +print(f'voting regressor: {voting_reg.score(X_test, y_test)}') + ## Stacking a regression problem We've had a look at a bagging approach but we'll now take a look at a stacking approach and apply it to a regression problem. We'll also introduce a new dataset to play around with. 
From a481ff51d95e1391898cb9ab1988954aab1cb8cf Mon Sep 17 00:00:00 2001 From: ngow210 Date: Wed, 17 Jul 2024 11:42:32 +1200 Subject: [PATCH 2/8] Ensemble updated regression house price --- _episodes/04-ensemble-methods.md | 75 ++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/_episodes/04-ensemble-methods.md b/_episodes/04-ensemble-methods.md index cdd24c1..118ce23 100644 --- a/_episodes/04-ensemble-methods.md +++ b/_episodes/04-ensemble-methods.md @@ -262,6 +262,81 @@ print(f'voting regressor: {voting_reg.score(X_test, y_test)}') We've had a look at a bagging approach but we'll now take a look at a stacking approach and apply it to a regression problem. We'll also introduce a new dataset to play around with. +### California house price prediction + +import sklearn +from sklearn.datasets import fetch_california_housing +from sklearn.model_selection import train_test_split +X, y = fetch_california_housing(return_X_y=True, as_frame=True) + +print(X.shape) +print(y.shape) + +print(X.head()) +print("======================================") +## Target is in units of 100,000 +print(y.head()) + +# split into train and test sets +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5) + +print(f'train size: {X_train.shape}') +print(f'test size: {X_test.shape}') + +from sklearn.ensemble import ( + GradientBoostingRegressor, + RandomForestRegressor, + VotingRegressor, +) +from sklearn.linear_model import LinearRegression + +# training estimators +rf_reg = RandomForestRegressor(random_state=5) +gb_reg = GradientBoostingRegressor(random_state=5) +linear_reg = LinearRegression() +voting_reg = VotingRegressor([("rf", rf_reg), ("gb", gb_reg), ("lr", linear_reg)]) + +# fit voting estimator +voting_reg.fit(X_train, y_train) + +# lets also train the individual models for comparison +rf_reg.fit(X_train, y_train) +gb_reg.fit(X_train, y_train) +linear_reg.fit(X_train, y_train) + +import matplotlib.pyplot as plt + +# make 
predictions +X_test_20 = X_test[:20] # first 20 for visualisation + +rf_pred = rf_reg.predict(X_test_20) +gb_pred = gb_reg.predict(X_test_20) +linear_pred = linear_reg.predict(X_test_20) +voting_pred = voting_reg.predict(X_test_20) + +plt.figure() +plt.plot(gb_pred, "o", color="navy", label="GradientBoostingRegressor") +plt.plot(rf_pred, "o", color="blue", label="RandomForestRegressor") +plt.plot(linear_pred, "o", color="skyblue", label="LinearRegression") +plt.plot(voting_pred, "x", color="red", ms=10, label="VotingRegressor") + +plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False) +plt.ylabel("predicted") +plt.xlabel("training samples") +plt.legend(loc="best") +plt.title("Regressor predictions and their average") + +plt.show() + +print(f'random forest: {rf_reg.score(X_test, y_test)}') + +print(f'gradient boost: {gb_reg.score(X_test, y_test)}') + +print(f'linear regression: {linear_reg.score(X_test, y_test)}') + +print(f'voting regressor: {voting_reg.score(X_test, y_test)}') + + ### The diabetes dataset The diabetes dataset, contains 10 baseline variables for 442 diabetes patients where the target attribute is quantitative measure of disease progression one year after baseline. For more information see [Efron et al., (2004)](https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf). The useful thing about this data it is available as part of the [sci-kit learn library](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset). We'll start by loading the dataset to very briefly inspect the attributes by printing them out. 
From bf329fbfbecabdd92b3daf8ecc34ab460a7c2196 Mon Sep 17 00:00:00 2001 From: ngow210 Date: Wed, 17 Jul 2024 15:58:10 +1200 Subject: [PATCH 3/8] voting regressor house price --- fig/house_price_voting_regressor.svg | 1194 ++++++++++++++++++++++++++ 1 file changed, 1194 insertions(+) create mode 100644 fig/house_price_voting_regressor.svg diff --git a/fig/house_price_voting_regressor.svg b/fig/house_price_voting_regressor.svg new file mode 100644 index 0000000..c6936b8 --- /dev/null +++ b/fig/house_price_voting_regressor.svg @@ -0,0 +1,1194 @@ + + + + + + + + 2024-07-17T15:55:56.397301 + image/svg+xml + + + Matplotlib v3.7.1, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 14bf8b6d9497ea8d16e2cf258debd1238f8df8cb Mon Sep 17 00:00:00 2001 From: ngow210 Date: Tue, 23 Jul 2024 11:19:35 +1200 Subject: [PATCH 4/8] voting regressor house price content change --- _episodes/04-ensemble-methods.md | 101 ++----------------------------- 1 file 
changed, 6 insertions(+), 95 deletions(-) diff --git a/_episodes/04-ensemble-methods.md b/_episodes/04-ensemble-methods.md index 118ce23..d195cbb 100644 --- a/_episodes/04-ensemble-methods.md +++ b/_episodes/04-ensemble-methods.md @@ -166,104 +166,14 @@ plt.show() ![random forest clf space](../fig/EM_rf_clf_space.png) -There is still some overfitting indicated by the regions that contain only single points but using the same hyper-parameter settings used to fit the decision tree classifier, we can see that overfitting is reduced. - -## Stacking: classification -import seaborn as sns -penguins = sns.load_dataset('penguins') - -feature_names = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'] -penguins.dropna(subset=feature_names, inplace=True) - -species_names = penguins['species'].unique() - -# Define data and targets -X = penguins[feature_names] - -y = penguins.species - -# Split data in training and test set -from sklearn.model_selection import train_test_split - -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5) - -print(f'train size: {X_train.shape}') -print(f'test size: {X_test.shape}') - -from sklearn.ensemble import ( - GradientBoostingClassifier, - RandomForestClassifier, - VotingClassifier, -) -from sklearn.gaussian_process import GaussianProcessClassifier -from sklearn.gaussian_process.kernels import RBF -from sklearn.tree import DecisionTreeClassifier - -# training estimators -rf_clf = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=1, random_state=5) -gb_clf = GradientBoostingClassifier(random_state=5) -gp_clf = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=5) -dt_clf = DecisionTreeClassifier(max_depth=5, random_state=5) - -voting_reg = VotingClassifier([("rf", rf_clf), ("gb", gb_clf), ("gp", gp_clf), ("dt", dt_clf)]) - -# fit voting estimator -voting_reg.fit(X_train, y_train) - -# lets also train the individual models for comparison -rf_clf.fit(X_train, 
y_train) -gb_clf.fit(X_train, y_train) -gp_clf.fit(X_train, y_train) -dt_clf.fit(X_train, y_train) - -import matplotlib.pyplot as plt - -# make predictions -X_test_20 = X_test[:20] # first 20 for visualisation - -rf_pred = rf_clf.predict(X_test_20) -gb_pred = gb_clf.predict(X_test_20) -gp_pred = gp_clf.predict(X_test_20) -dt_pred = dt_clf.predict(X_test_20) -voting_pred = voting_reg.predict(X_test_20) - -print(rf_pred) -print(gb_pred) -print(gp_pred) -print(dt_pred) -print(voting_pred) - -plt.figure() -plt.plot(gb_pred, "o", color="green", label="GradientBoostingClassifier") -plt.plot(rf_pred, "o", color="blue", label="RandomForestClassifier") -plt.plot(gp_pred, "o", color="darkblue", label="GuassianProcessClassifier") -plt.plot(dt_pred, "o", color="lightblue", label="DecisionTreeClassifier") -plt.plot(voting_pred, "x", color="red", ms=10, label="VotingRegressor") - -plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False) -plt.ylabel("predicted") -plt.xlabel("training samples") -plt.legend(loc="best") -plt.title("Regressor predictions and their average") - -plt.show() - -print(f'random forest: {rf_clf.score(X_test, y_test)}') - -print(f'gradient boost: {gb_clf.score(X_test, y_test)}') - -print(f'guassian process: {gp_clf.score(X_test, y_test)}') - -print(f'decision tree: {dt_clf.score(X_test, y_test)}') - -print(f'voting regressor: {voting_reg.score(X_test, y_test)}') +There is still some overfitting indicated by the regions that contain only single points but using the same hyper-parameter settings used to fit the decision tree classifier, we can see that overfitting is reduced. ## Stacking a regression problem We've had a look at a bagging approach but we'll now take a look at a stacking approach and apply it to a regression problem. We'll also introduce a new dataset to play around with. 
### California house price prediction - +~~~ import sklearn from sklearn.datasets import fetch_california_housing from sklearn.model_selection import train_test_split @@ -335,7 +245,8 @@ print(f'gradient boost: {gb_reg.score(X_test, y_test)}') print(f'linear regression: {linear_reg.score(X_test, y_test)}') print(f'voting regressor: {voting_reg.score(X_test, y_test)}') - +~~~ +{: .language-python} ### The diabetes dataset The diabetes dataset, contains 10 baseline variables for 442 diabetes patients where the target attribute is quantitative measure of disease progression one year after baseline. For more information see [Efron et al., (2004)](https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf). The useful thing about this data it is available as part of the [sci-kit learn library](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset). We'll start by loading the dataset to very briefly inspect the attributes by printing them out. @@ -459,8 +370,8 @@ print(f'voting regressor: {voting_reg.score(X_test, y_test)}') ~~~ {: .language-python} -Each of our models score a pretty poor 0.52-0.53, which is barely better than a coin flip. However what we can see is that the stacked result generated by the voting regressor produces a slightly improved score of 0.55, which is better than any of the three models/estimators taken individually. The whole model is greater than the sum of the individual parts. And of course, we could try and improve our accuracy score by tweaking with our indivdual model hyperparameters, or adjusting our training data features and train-test-split data. - +## Review this +Each of our models score 0.61-0.82, which is a good accuracy score, do note that the toy datasets are not representative of real world data. However what we can see is that the stacked result generated by the voting regressor fits different sub-models and then averages the individual predictions to form a final prediction. 
The benefit of this approach is that it reduces overfitting and increases generalizability. Of course, we could try and improve our accuracy score by tweaking with our indivdual model hyperparameters, using more advaced boosted models or adjusting our training data features and train-test-split data. > ## Exercise: Stacking a classification problem. > Sci-kit learn also has method for stacking ensemble classifiers ```sklearn.ensemble.VotingClassifier``` do you think you could apply a stack to the penguins dataset using a random forest, SVM and decision tree classifier, or a selection of any other classifier estimators available in sci-kit learn? From 15762c20bbf0d00e2cd9acf1a122364cb626d27b Mon Sep 17 00:00:00 2001 From: ngow210 Date: Tue, 23 Jul 2024 13:42:19 +1200 Subject: [PATCH 5/8] voting regressor house price page changed --- _episodes/04-ensemble-methods.md | 99 ++++++++++++++++---------------- 1 file changed, 48 insertions(+), 51 deletions(-) diff --git a/_episodes/04-ensemble-methods.md b/_episodes/04-ensemble-methods.md index d195cbb..2e0b164 100644 --- a/_episodes/04-ensemble-methods.md +++ b/_episodes/04-ensemble-methods.md @@ -170,29 +170,54 @@ There is still some overfitting indicated by the regions that contain only singl ## Stacking a regression problem -We've had a look at a bagging approach but we'll now take a look at a stacking approach and apply it to a regression problem. We'll also introduce a new dataset to play around with. +We've had a look at a bagging approach, but we'll now take a look at a stacking approach and apply it to a regression problem. We'll also introduce a new dataset to play around with. ### California house price prediction +The California housing dataset for regression problems contains 8 features such as, Median Income, House Age, Average Rooms, Average Bedrooms etc. for 20,640 properties. The target variable is the median house value for those 20,640 properties, note that all prices are in units of $100,000. 
This toy dataset is available as part of the [scikit learn library](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html). We'll start by loading the dataset to very briefly inspect the attributes by printing them out. + ~~~ import sklearn from sklearn.datasets import fetch_california_housing -from sklearn.model_selection import train_test_split + +# load the dataset X, y = fetch_california_housing(return_X_y=True, as_frame=True) +## All price variables are in units of $100,000 print(X.shape) -print(y.shape) - print(X.head()) -print("======================================") -## Target is in units of 100,000 + +print("Housing price as the target: ") + +## Target is in units of $100,000 print(y.head()) +print(y.shape) +~~~ +{: .language-python} + +For the the purposes of learning how to create and use ensemble methods and since it is a toy dataset, we will blindly use this dataset without inspecting it, cleaning or pre-processing it further. + +> ## Exercise: Investigate and visualise the dataset +> For this episode we simply want to learn how to build and use an Ensemble rather than actually solve a regression problem. To build up your skills as an ML practitioner, investigate and visualise this dataset. What can you say about the dataset itself, and what can you summarise about about any potential relationships or prediction problems? +{: .challenge} + +Lets start by splitting the dataset into training and testing subsets: + +~~~ +# split into train and test sets, We are selecting an 80%-20% train-test split. +from sklearn.model_selection import train_test_split -# split into train and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5) print(f'train size: {X_train.shape}') print(f'test size: {X_test.shape}') +~~~ +{: .language-python} + +Lets stack a series of regression models. 
In the same way the RandomForest classifier derives a results from a series of trees, we will combine the results from a series of different models in our stack. This is done using what's called an ensemble meta-estimator called a VotingRegressor. + +We'll apply a Voting regressor to a random forest, gradient boosting and linear regressor. +~~~ from sklearn.ensemble import ( GradientBoostingRegressor, RandomForestRegressor, @@ -200,20 +225,23 @@ from sklearn.ensemble import ( ) from sklearn.linear_model import LinearRegression -# training estimators +# Initialize estimators rf_reg = RandomForestRegressor(random_state=5) gb_reg = GradientBoostingRegressor(random_state=5) linear_reg = LinearRegression() voting_reg = VotingRegressor([("rf", rf_reg), ("gb", gb_reg), ("lr", linear_reg)]) -# fit voting estimator +# fit/train voting estimator voting_reg.fit(X_train, y_train) -# lets also train the individual models for comparison +# lets also fit/train the individual models for comparison rf_reg.fit(X_train, y_train) gb_reg.fit(X_train, y_train) linear_reg.fit(X_train, y_train) +~~~ +{: .language-python} +~~~ import matplotlib.pyplot as plt # make predictions @@ -237,48 +265,18 @@ plt.legend(loc="best") plt.title("Regressor predictions and their average") plt.show() - -print(f'random forest: {rf_reg.score(X_test, y_test)}') - -print(f'gradient boost: {gb_reg.score(X_test, y_test)}') - -print(f'linear regression: {linear_reg.score(X_test, y_test)}') - -print(f'voting regressor: {voting_reg.score(X_test, y_test)}') ~~~ {: .language-python} -### The diabetes dataset -The diabetes dataset, contains 10 baseline variables for 442 diabetes patients where the target attribute is quantitative measure of disease progression one year after baseline. For more information see [Efron et al., (2004)](https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf). 
The useful thing about this data it is available as part of the [sci-kit learn library](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset). We'll start by loading the dataset to very briefly inspect the attributes by printing them out. ~~~ -from sklearn.datasets import load_diabetes - -print(load_diabetes()) -~~~ -{: .language-python} - -For more details on this SKLearn dataset see [this link for details.](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset) - -For the the purposes of learning how to create and use ensemble methods we are about to commit a cardinal sin of machine learning and blindly use this dataset without inspecting it any further. - -> ## Exercise: Investigate and visualise the dataset -> For this episode we simply want to learn how to build and use an Ensemble rather than actually solve a regression problem. To build up your skills as an ML practitioner, investigate and visualise this dataset. What can you say about the dataset itself, and what can you summarise about about any potential relationships or prediction problems? 
-{: .challenge} - -Lets start by splitting the dataset into training and testing subsets: - -~~~ -from sklearn.model_selection import train_test_split +print(f'random forest: {rf_reg.score(X_test, y_test)}') -# load in data -X, y = load_diabetes(return_X_y=True) +print(f'gradient boost: {gb_reg.score(X_test, y_test)}') -# split into train and test sets -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5) +print(f'linear regression: {linear_reg.score(X_test, y_test)}') -print(f'train size: {X_train.shape}') -print(f'test size: {X_test.shape}') +print(f'voting regressor: {voting_reg.score(X_test, y_test)}') ~~~ {: .language-python} @@ -321,7 +319,6 @@ voting_reg.fit(X_train, y_train) rf_reg.fit(X_train, y_train) gb_reg.fit(X_train, y_train) linear_reg.fit(X_train, y_train) - ~~~ {: .language-python} @@ -339,9 +336,9 @@ linear_pred = linear_reg.predict(X_test_20) voting_pred = voting_reg.predict(X_test_20) plt.figure() -plt.plot(rf_pred, "o", color="navy", label="GradientBoostingRegressor") -plt.plot(gb_pred, "o", color="blue", label="RandomForestRegressor") -plt.plot(linear_pred, "o", color="skyblue", label="LinearRegression") +plt.plot(gb_pred, "o", color="black", label="GradientBoostingRegressor") +plt.plot(rf_pred, "o", color="blue", label="RandomForestRegressor") +plt.plot(linear_pred, "o", color="green", label="LinearRegression") plt.plot(voting_pred, "x", color="red", ms=10, label="VotingRegressor") plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False) @@ -350,11 +347,12 @@ plt.xlabel("training samples") plt.legend(loc="best") plt.title("Regressor predictions and their average") + plt.show() ~~~ {: .language-python} -![Regressor predictions and average from stack](../fig/EM_stacked_plot.png) +![Regressor predictions and average from stack](../fig/house_price_voting_regressor.svg) FInally, lets see how the average compares against each single estimator in the stack? 
@@ -370,11 +368,10 @@ print(f'voting regressor: {voting_reg.score(X_test, y_test)}') ~~~ {: .language-python} -## Review this Each of our models score 0.61-0.82, which is a good accuracy score, do note that the toy datasets are not representative of real world data. However what we can see is that the stacked result generated by the voting regressor fits different sub-models and then averages the individual predictions to form a final prediction. The benefit of this approach is that it reduces overfitting and increases generalizability. Of course, we could try and improve our accuracy score by tweaking with our indivdual model hyperparameters, using more advaced boosted models or adjusting our training data features and train-test-split data. > ## Exercise: Stacking a classification problem. -> Sci-kit learn also has method for stacking ensemble classifiers ```sklearn.ensemble.VotingClassifier``` do you think you could apply a stack to the penguins dataset using a random forest, SVM and decision tree classifier, or a selection of any other classifier estimators available in sci-kit learn? +> Scikit learn also has method for stacking ensemble classifiers ```sklearn.ensemble.VotingClassifier``` do you think you could apply a stack to the penguins dataset using a random forest, SVM and decision tree classifier, or a selection of any other classifier estimators available in sci-kit learn? 
> > ~~~ > penguins = sns.load_dataset('penguins') From efc80c36fa1ac229c8b0ed0822cbaa7580a40f03 Mon Sep 17 00:00:00 2001 From: ngow210 Date: Wed, 24 Jul 2024 10:04:59 +1200 Subject: [PATCH 6/8] dimensionality reduction refactor and ensemble clean up --- _episodes/04-ensemble-methods.md | 81 +++------------------ _episodes/06-dimensionality-reduction.md | 90 +++++++++++------------- 2 files changed, 51 insertions(+), 120 deletions(-) diff --git a/_episodes/04-ensemble-methods.md b/_episodes/04-ensemble-methods.md index 2e0b164..f261bbd 100644 --- a/_episodes/04-ensemble-methods.md +++ b/_episodes/04-ensemble-methods.md @@ -173,7 +173,7 @@ There is still some overfitting indicated by the regions that contain only singl We've had a look at a bagging approach, but we'll now take a look at a stacking approach and apply it to a regression problem. We'll also introduce a new dataset to play around with. ### California house price prediction -The California housing dataset for regression problems contains 8 features such as, Median Income, House Age, Average Rooms, Average Bedrooms etc. for 20,640 properties. The target variable is the median house value for those 20,640 properties, note that all prices are in units of $100,000. This toy dataset is available as part of the [scikit learn library](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html). We'll start by loading the dataset to very briefly inspect the attributes by printing them out. +The California housing dataset for regression problems contains 8 training features, such as Median Income, House Age, Average Rooms and Average Bedrooms, for 20,640 census block groups (districts). The target variable is the median house value for each of those 20,640 districts; note that all prices are in units of $100,000. This toy dataset is available as part of the [scikit-learn library](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html).
We'll start by loading the dataset to very briefly inspect the attributes by printing them out. ~~~ import sklearn @@ -217,69 +217,6 @@ Lets stack a series of regression models. In the same way the RandomForest class We'll apply a Voting regressor to a random forest, gradient boosting and linear regressor. -~~~ -from sklearn.ensemble import ( - GradientBoostingRegressor, - RandomForestRegressor, - VotingRegressor, -) -from sklearn.linear_model import LinearRegression - -# Initialize estimators -rf_reg = RandomForestRegressor(random_state=5) -gb_reg = GradientBoostingRegressor(random_state=5) -linear_reg = LinearRegression() -voting_reg = VotingRegressor([("rf", rf_reg), ("gb", gb_reg), ("lr", linear_reg)]) - -# fit/train voting estimator -voting_reg.fit(X_train, y_train) - -# lets also fit/train the individual models for comparison -rf_reg.fit(X_train, y_train) -gb_reg.fit(X_train, y_train) -linear_reg.fit(X_train, y_train) -~~~ -{: .language-python} - -~~~ -import matplotlib.pyplot as plt - -# make predictions -X_test_20 = X_test[:20] # first 20 for visualisation - -rf_pred = rf_reg.predict(X_test_20) -gb_pred = gb_reg.predict(X_test_20) -linear_pred = linear_reg.predict(X_test_20) -voting_pred = voting_reg.predict(X_test_20) - -plt.figure() -plt.plot(gb_pred, "o", color="navy", label="GradientBoostingRegressor") -plt.plot(rf_pred, "o", color="blue", label="RandomForestRegressor") -plt.plot(linear_pred, "o", color="skyblue", label="LinearRegression") -plt.plot(voting_pred, "x", color="red", ms=10, label="VotingRegressor") - -plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False) -plt.ylabel("predicted") -plt.xlabel("training samples") -plt.legend(loc="best") -plt.title("Regressor predictions and their average") - -plt.show() -~~~ -{: .language-python} - - -~~~ -print(f'random forest: {rf_reg.score(X_test, y_test)}') - -print(f'gradient boost: {gb_reg.score(X_test, y_test)}') - -print(f'linear regression: {linear_reg.score(X_test, 
y_test)}') - -print(f'voting regressor: {voting_reg.score(X_test, y_test)}') -~~~ -{: .language-python} - Lets stack a series of regression models. In the same way the RandomForest classifier derives a results from a series of trees, we will combine the results from a series of different models in our stack. This is done using what's called an ensemble meta-estimator called a VotingRegressor. We'll apply a Voting regressor to a random forest, gradient boosting and linear regressor. @@ -306,23 +243,23 @@ from sklearn.ensemble import ( ) from sklearn.linear_model import LinearRegression -# training estimators +# Initialize estimators rf_reg = RandomForestRegressor(random_state=5) gb_reg = GradientBoostingRegressor(random_state=5) linear_reg = LinearRegression() -voting_reg = VotingRegressor([("gb", rf_reg), ("rf", gb_reg), ("lr", linear_reg)]) +voting_reg = VotingRegressor([("rf", rf_reg), ("gb", gb_reg), ("lr", linear_reg)]) -# fit voting estimator +# fit/train voting estimator voting_reg.fit(X_train, y_train) -# lets also train the individual models for comparison +# lets also fit/train the individual models for comparison rf_reg.fit(X_train, y_train) gb_reg.fit(X_train, y_train) linear_reg.fit(X_train, y_train) ~~~ {: .language-python} -We fit the voting regressor in the same way we would fit a single model. When the voting regressor is instantiated we pass it a parameter containing a list of tuples that contain the estimators we wish to stack: in this case the random forest, gradient boosting and linear regressors. To get a sense of what this is doing lets predict the first 20 samples in the test portion of the data and plot the results. +We fit the voting regressor in the same way we would fit a single model. When the voting regressor is instantiated we pass it a parameter containing a list of tuples that contain the estimators we wish to stack: in this case the random forest, gradient boosting and linear regressors. 
To get a sense of what this is doing lets predict the first 20 samples in the test portion of the data and plot the results. ~~~ import matplotlib.pyplot as plt @@ -347,15 +284,13 @@ plt.xlabel("training samples") plt.legend(loc="best") plt.title("Regressor predictions and their average") - plt.show() ~~~ {: .language-python} ![Regressor predictions and average from stack](../fig/house_price_voting_regressor.svg) - -FInally, lets see how the average compares against each single estimator in the stack? +Finally, let's see how the average compares against each single estimator in the stack. ~~~ print(f'random forest: {rf_reg.score(X_test, y_test)}') @@ -368,7 +303,7 @@ print(f'voting regressor: {voting_reg.score(X_test, y_test)}') ~~~ {: .language-python} -Each of our models score 0.61-0.82, which is a good accuracy score, do note that the toy datasets are not representative of real world data. However what we can see is that the stacked result generated by the voting regressor fits different sub-models and then averages the individual predictions to form a final prediction. The benefit of this approach is that it reduces overfitting and increases generalizability. Of course, we could try and improve our accuracy score by tweaking with our indivdual model hyperparameters, using more advaced boosted models or adjusting our training data features and train-test-split data. +Each of our models scores between 0.61 and 0.82 (for regressors, `score` returns the R-squared coefficient of determination), which at the high end is good, but at the low end is a pretty poor prediction score. Do note that the toy datasets are not representative of real-world data. However, what we can see is that the voting regressor fits different sub-models and then averages the individual predictions to form a final prediction. The benefit of this approach is that it reduces overfitting and increases generalizability.
Of course, we could try to improve our accuracy score by tweaking our individual model hyperparameters, using more advanced boosted models or adjusting our training data features and train-test-split data. > ## Exercise: Stacking a classification problem. > Scikit learn also has method for stacking ensemble classifiers ```sklearn.ensemble.VotingClassifier``` do you think you could apply a stack to the penguins dataset using a random forest, SVM and decision tree classifier, or a selection of any other classifier estimators available in sci-kit learn? diff --git a/_episodes/06-dimensionality-reduction.md b/_episodes/06-dimensionality-reduction.md index 05b4b7b..bbe48e8 100644 --- a/_episodes/06-dimensionality-reduction.md +++ b/_episodes/06-dimensionality-reduction.md @@ -28,7 +28,40 @@ The MNIST dataset contains 70,000 images of handwritten numbers, and are labelle To make this episode a bit less computationally intensive, the Scikit-Learn example that we will work with is a smaller sample of 1797 images. Each image is 8x8 in size for a total of 64 pixels per image, resulting in 64 features for us to work with. The pixels can take a value between 0-15 (4bits). 
Let's retrieve and inspect the Scikit-Learn dataset with the following code: ~~~ -from sklearn import datasets +import numpy as np +import matplotlib.pyplot as plt +import sklearn.cluster as skl_cluster +from sklearn import manifold, decomposition, datasets + +# Let's define these here to avoid repetitive code +def plots(x_manifold): + tx = x_manifold[:, 0] + ty = x_manifold[:, 1] + + # without labels + fig = plt.figure(1, figsize=(4, 4)) + plt.scatter(tx, ty, edgecolor='k',label=labels) + plt.show() + +def plot_clusters(x_manifold, clusters): + tx = x_manifold[:, 0] + ty = x_manifold[:, 1] + fig = plt.figure(1, figsize=(4, 4)) + plt.scatter(tx, ty, s=5, linewidth=0, c=clusters) + for cluster_x, cluster_y in Kmean.cluster_centers_: + plt.scatter(cluster_x, cluster_y, s=100, c='r', marker='x') + plt.show() + +def plot_clusters_labels(x_manifold, labels): + tx = x_manifold[:, 0] + ty = x_manifold[:, 1] + + # with labels + fig = plt.figure(1, figsize=(5, 4)) + plt.scatter(tx, ty, c=labels, cmap="nipy_spectral", + edgecolor='k', label=labels) + plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10)) + plt.show() # load in dataset as a Pandas Dataframe, return X and Y features, labels = datasets.load_digits(return_X_y=True, as_frame=True) @@ -49,8 +82,6 @@ As humans we are pretty good at object and pattern recognition. 
We can look at t > > > ## Solution > > ~~~ -> > import matplotlib.pyplot as plt -> > import numpy as np > > > > print(features.iloc[0]) > > image_1D = features.iloc[0] @@ -107,12 +138,9 @@ For more in depth explanations of PCA please see the following links: Let's apply PCA to the MNIST dataset and retain the two most-major components: ~~~ -from sklearn import decomposition - # PCA with 2 components pca = decomposition.PCA(n_components=2) -pca.fit(features) -x_pca = pca.transform(features) +x_pca = pca.fit_transform(features) print(x_pca.shape) ~~~ @@ -121,16 +149,7 @@ print(x_pca.shape) This returns us an array of 1797x2 where the 2 remaining columns(our new "features" or "dimensions") contain vector representations of the first principle components (column 0) and second principle components (column 1) for each of the images. We can plot these two new features against each other: ~~~ -import numpy as np -import matplotlib.pyplot as plt - -tx = x_pca[:, 0] -ty = x_pca[:, 1] - -# without labels -fig = plt.figure(1, figsize=(4, 4)) -plt.scatter(tx, ty, edgecolor='k',label=labels) -plt.show() +plots(x_pca) ~~~ {: .language-python} @@ -139,18 +158,10 @@ plt.show() We now have a 2D representation of our 64D dataset that we can work with instead. Let's try some quick K-means clustering on our 2D representation of the data. Because we already have some knowledge about our data we can set `k=10` for the 10 digits present in the dataset. 
~~~ -import sklearn.cluster as skl_cluster - Kmean = skl_cluster.KMeans(n_clusters=10) - Kmean.fit(x_pca) clusters = Kmean.predict(x_pca,labels) - -fig = plt.figure(1, figsize=(4, 4)) -plt.scatter(tx, ty, s=5, linewidth=0, c=clusters) -for cluster_x, cluster_y in Kmean.cluster_centers_: - plt.scatter(cluster_x, cluster_y, s=100, c='r', marker='x') -plt.show() +plot_clusters(x_pca, clusters) ~~~ {: .language-python} @@ -159,6 +170,9 @@ plt.show() And now we can compare how these clusters look against our actual image labels by colour coding our first scatter plot: ~~~ +tx = x_pca[:, 0] +ty = x_pca[:, 1] + fig = plt.figure(1, figsize=(5, 4)) plt.scatter(tx, ty, c=labels, cmap="nipy_spectral", edgecolor='k',label=labels) @@ -186,45 +200,27 @@ For more in depth explanations of t-SNE and manifold learning please see the fol Scikit-Learn allows us to apply t-SNE in a relatively simple way. Lets code and apply t-SNE to the MNIST dataset in the same manner that we did for the PCA example, and reduce the data down from 64D to 2D again: ~~~ -from sklearn import manifold - # t-SNE embedding # initialising with "pca" explicitly preserves global structure tsne = manifold.TSNE(n_components=2, init='pca', random_state = 0) x_tsne = tsne.fit_transform(features) - -fig = plt.figure(1, figsize=(4, 4)) -plt.scatter(x_tsne[:, 0], x_tsne[:, 1], edgecolor='k') -plt.show() +plots(x_tsne) ~~~ {: .language-python} ![Reduction using PCA](../fig/tsne_unlabelled.png) - It looks like t-SNE has done a much better job of splitting our data up into clusters using only a 2D representation of the data. 
Once again, let's run a simple k-means clustering on this new 2D representation, and compare with the actual color-labelled data: ~~~ -import sklearn.cluster as skl_cluster - Kmean = skl_cluster.KMeans(n_clusters=10) Kmean.fit(x_tsne) clusters = Kmean.predict(x_tsne,labels) -fig = plt.figure(1, figsize=(4, 4)) -plt.scatter(x_tsne[:,0], x_tsne[:,1], s=5, linewidth=0, c=clusters) -for cluster_x, cluster_y in Kmean.cluster_centers_: - plt.scatter(cluster_x, cluster_y, s=100, c='r', marker='x') -plt.show() - -# with labels -fig = plt.figure(1, figsize=(5, 4)) -plt.scatter(x_tsne[:, 0], x_tsne[:, 1], c=labels, cmap="nipy_spectral", - edgecolor='k',label=labels) -plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10)) -plt.show() +plot_clusters(x_tsne, clusters) +plot_clusters_labels(x_tsne, labels) ~~~ {: .language-python} From 3c2ad8a7e4e60e547d252a294c3f548564313705 Mon Sep 17 00:00:00 2001 From: ngow210 Date: Wed, 24 Jul 2024 13:32:24 +1200 Subject: [PATCH 7/8] clean up ep 04,06 and fig --- _episodes/06-dimensionality-reduction.md | 9 +- fig/house_price_voting_regressor.svg | 210 +++++++++++------------ 2 files changed, 106 insertions(+), 113 deletions(-) diff --git a/_episodes/06-dimensionality-reduction.md b/_episodes/06-dimensionality-reduction.md index bbe48e8..2070ae3 100644 --- a/_episodes/06-dimensionality-reduction.md +++ b/_episodes/06-dimensionality-reduction.md @@ -170,14 +170,7 @@ plot_clusters(x_pca, clusters) And now we can compare how these clusters look against our actual image labels by colour coding our first scatter plot: ~~~ -tx = x_pca[:, 0] -ty = x_pca[:, 1] - -fig = plt.figure(1, figsize=(5, 4)) -plt.scatter(tx, ty, c=labels, cmap="nipy_spectral", - edgecolor='k',label=labels) -plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10)) -plt.show() +plot_clusters_labels(x_pca, labels) ~~~ {: .language-python} diff --git a/fig/house_price_voting_regressor.svg b/fig/house_price_voting_regressor.svg index c6936b8..753cde5 
100644 --- a/fig/house_price_voting_regressor.svg +++ b/fig/house_price_voting_regressor.svg @@ -6,7 +6,7 @@ - 2024-07-17T15:55:56.397301 + 2024-07-24T13:20:26.580306 image/svg+xml @@ -331,12 +331,12 @@ z - - + @@ -395,7 +395,7 @@ z - + @@ -437,7 +437,7 @@ z - + @@ -478,7 +478,7 @@ z - + @@ -493,7 +493,7 @@ z - + @@ -542,7 +542,7 @@ z - + @@ -557,7 +557,7 @@ z - + @@ -593,7 +593,7 @@ z - + @@ -671,7 +671,7 @@ z - +" style="stroke: #000000"/> - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - +" style="stroke: #008000"/> - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + @@ -971,7 +971,7 @@ z - + @@ -1065,7 +1065,7 @@ z - + @@ -1111,7 +1111,7 @@ z - + @@ -1148,7 +1148,7 @@ z - + @@ -1187,7 +1187,7 @@ z - + From db8362f10a80eaba4966fc0f2c10c1d957649f60 Mon Sep 17 00:00:00 2001 From: ngow210 Date: Wed, 24 Jul 2024 14:08:18 +1200 Subject: [PATCH 8/8] ensemble penguins classification added --- _episodes/ensemble_classification.md | 89 ++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 _episodes/ensemble_classification.md diff --git a/_episodes/ensemble_classification.md b/_episodes/ensemble_classification.md new file mode 100644 index 0000000..1499a4d --- /dev/null +++ b/_episodes/ensemble_classification.md @@ -0,0 +1,89 @@ +## Stacking: classification +import seaborn as sns +penguins = sns.load_dataset('penguins') + +feature_names = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'] +penguins.dropna(subset=feature_names, inplace=True) + +species_names = penguins['species'].unique() + +# Define data and targets +X = penguins[feature_names] + +y = penguins.species + +# Split data in training and test set +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, 
y_test = train_test_split(X, y, test_size=0.2, random_state=5) + +print(f'train size: {X_train.shape}') +print(f'test size: {X_test.shape}') + +from sklearn.ensemble import ( + GradientBoostingClassifier, + RandomForestClassifier, + VotingClassifier, +) +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process.kernels import RBF +from sklearn.tree import DecisionTreeClassifier + +# training estimators +rf_clf = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=1, random_state=5) +gb_clf = GradientBoostingClassifier(random_state=5) +gp_clf = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=5) +dt_clf = DecisionTreeClassifier(max_depth=5, random_state=5) + +voting_reg = VotingClassifier([("rf", rf_clf), ("gb", gb_clf), ("gp", gp_clf), ("dt", dt_clf)]) + +# fit voting estimator +voting_reg.fit(X_train, y_train) + +# lets also train the individual models for comparison +rf_clf.fit(X_train, y_train) +gb_clf.fit(X_train, y_train) +gp_clf.fit(X_train, y_train) +dt_clf.fit(X_train, y_train) + +import matplotlib.pyplot as plt + +# make predictions +X_test_20 = X_test[:20] # first 20 for visualisation + +rf_pred = rf_clf.predict(X_test_20) +gb_pred = gb_clf.predict(X_test_20) +gp_pred = gp_clf.predict(X_test_20) +dt_pred = dt_clf.predict(X_test_20) +voting_pred = voting_reg.predict(X_test_20) + +print(rf_pred) +print(gb_pred) +print(gp_pred) +print(dt_pred) +print(voting_pred) + +plt.figure() +plt.plot(gb_pred, "o", color="green", label="GradientBoostingClassifier") +plt.plot(rf_pred, "o", color="blue", label="RandomForestClassifier") +plt.plot(gp_pred, "o", color="darkblue", label="GuassianProcessClassifier") +plt.plot(dt_pred, "o", color="lightblue", label="DecisionTreeClassifier") +plt.plot(voting_pred, "x", color="red", ms=10, label="VotingRegressor") + +plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False) +plt.ylabel("predicted") +plt.xlabel("training samples") 
+plt.legend(loc="best") +plt.title("Classifier predictions and their majority vote") + +plt.show() + +print(f'random forest: {rf_clf.score(X_test, y_test)}') + +print(f'gradient boost: {gb_clf.score(X_test, y_test)}') + +print(f'gaussian process: {gp_clf.score(X_test, y_test)}') + +print(f'decision tree: {dt_clf.score(X_test, y_test)}') + +print(f'voting classifier: {voting_reg.score(X_test, y_test)}') \ No newline at end of file