Merge pull request #21 from NidhiGowdra/gh-pages

Changes to classification
carpentries-incubator · Jul 30, 2024 · 8fb9d3c · 8fb9d3c
2 parents 3f8a217 + 66930c7
commit 8fb9d3c
Show file tree

Hide file tree

Showing 16 changed files with 93 additions and 64 deletions.
diff --git a/_episodes/03-classification.md b/_episodes/03-classification.md
@@ -19,7 +19,14 @@ Classification is a supervised method to recognise and group data objects into a
 In this episode we are going to introduce the concept of supervised classification by classifying penguin data into different species of penguins using Scikit-Learn.
 
 ## The penguins dataset
-We're going to be using the penguins dataset of Allison Horst, published [here](https://github.com/allisonhorst/palmerpenguins) in 2020, which is comprised of 342 observations of three species of penguins: Adelie, Chinstrap & Gentoo. For each penguin we have measurements of bill length and depth (mm), flipper length (mm), body mass (g), and information on species, island, and sex.
+We're going to be using the penguins dataset of Allison Horst, published [here](https://github.com/allisonhorst/palmerpenguins). The dataset contains 344 size measurements for three penguin species (Chinstrap, Gentoo and Adélie) observed on three islands in the Palmer Archipelago, Antarctica.
+
+![*Artwork by @allison_horst*](../fig/palmer_penguins.png)
+
+The physical attributes measured are flipper length, beak length, beak width, body mass, and sex.
+![*Artwork by @allison_horst*](../fig/culmen_depth.png)
+
+In other words, the dataset contains 344 rows with 7 features, i.e. 5 physical attributes, the species, and the island where the observations were made.
 
 ~~~
 import seaborn as sns
@@ -39,7 +46,7 @@ The above table contains multiple categorical objects such as species. If we att
 
 ### Preprocessing our data
 
-Lets do some pre-processing on our dataset and specify our `X` features and `Y` labels:
+Let's do some pre-processing on our dataset and specify our `X` features and `y` labels:
 
 ~~~
 # Extract the data we need
@@ -49,8 +56,7 @@ dataset.dropna(subset=feature_names, inplace=True)
 class_names = dataset['species'].unique()
 
 X = dataset[feature_names]
-
-Y = dataset['species']
+y = dataset['species']
 ~~~
 {: .language-python}
 
@@ -82,19 +88,19 @@ In the previous regression episode we created the penguin training data by takin
 ~~~
 from sklearn.model_selection import train_test_split
 
-x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
 ~~~
 {: .language-python}
 
-We'll use `x_train` and `y_train` to develop our model, and only look at `x_test` and `y_test` when it's time to evaluate its performance.
+We'll use `X_train` and `y_train` to develop our model, and only look at `X_test` and `y_test` when it's time to evaluate its performance.
 
 ### Visualising the data
 In order to better understand how a model might classify this data, we can first take a look at the data visually, to see what patterns we might identify.
 
 ~~~
 import matplotlib.pyplot as plt
 
-fig01 = sns.scatterplot(x_train, x=feature_names[0], y=feature_names[1], hue=dataset['species'])
+fig01 = sns.scatterplot(X_train, x=feature_names[0], y=feature_names[1], hue=dataset['species'])
 plt.show()
 ~~~
 {: .language-python}
@@ -125,9 +131,9 @@ Training and using a decision tree in Scikit-Learn is straightforward:
 from sklearn.tree import DecisionTreeClassifier, plot_tree
 
 clf = DecisionTreeClassifier(max_depth=2)
-clf.fit(x_train, y_train)
+clf.fit(X_train, y_train)
 
-clf.predict(x_test)
+clf.predict(X_test)
 ~~~
 {: .language-python}
 
@@ -138,7 +144,7 @@ clf.predict(x_test)
 We can conveniently check how our model did with the .score() function, which will make predictions and report what proportion of them were accurate:
 
 ~~~
-clf_score = clf.score(x_test, y_test)
+clf_score = clf.score(X_test, y_test)
 print(clf_score)
 ~~~
 {: .language-python}
@@ -174,11 +180,11 @@ f1 = feature_names[0]
 f2 = feature_names[3]
 
 clf = DecisionTreeClassifier(max_depth=2)
-clf.fit(x_train[[f1, f2]], y_train)
+clf.fit(X_train[[f1, f2]], y_train)
 
-d = DecisionBoundaryDisplay.from_estimator(clf, x_train[[f1, f2]])
+d = DecisionBoundaryDisplay.from_estimator(clf, X_train[[f1, f2]])
 
-sns.scatterplot(x_train, x=f1, y=f2, hue=y_train, palette="husl")
+sns.scatterplot(X_train, x=f1, y=f2, hue=y_train, palette="husl")
 plt.show()
 ~~~
 {: .language-python}
@@ -199,8 +205,8 @@ max_depths = [1, 2, 3, 4, 5]
 accuracy = []
 for i, d in enumerate(max_depths):
     clf = DecisionTreeClassifier(max_depth=d)
-    clf.fit(x_train, y_train)
-    acc = clf.score(x_test, y_test)
+    clf.fit(X_train, y_train)
+    acc = clf.score(X_test, y_test)
 
     accuracy.append((d, acc))
 
@@ -221,7 +227,7 @@ Let's reuse our fitting and plotting codes from above to inspect a decision tree
 
 ~~~
 clf = DecisionTreeClassifier(max_depth=5)
-clf.fit(x_train, y_train)
+clf.fit(X_train, y_train)
 
 fig = plt.figure(figsize=(12, 10))
 plot_tree(clf, class_names=class_names, feature_names=feature_names, filled=True, ax=fig.gca())
@@ -237,11 +243,11 @@ f1 = feature_names[0]
 f2 = feature_names[3]
 
 clf = DecisionTreeClassifier(max_depth=5)
-clf.fit(x_train[[f1, f2]], y_train)
+clf.fit(X_train[[f1, f2]], y_train)
 
-d = DecisionBoundaryDisplay.from_estimator(clf, x_train[[f1, f2]])
+d = DecisionBoundaryDisplay.from_estimator(clf, X_train[[f1, f2]])
 
-sns.scatterplot(x_train, x=f1, y=f2, hue=y_train, palette='husl')
+sns.scatterplot(X_train, x=f1, y=f2, hue=y_train, palette='husl')
 plt.show()
 ~~~
 {: .language-python}
@@ -266,9 +272,9 @@ from sklearn import preprocessing
 import pandas as pd
 
 scalar = preprocessing.StandardScaler()
-scalar.fit(x_train)
-x_train_scaled = pd.DataFrame(scalar.transform(x_train), columns=x_train.columns, index=x_train.index)
-x_test_scaled = pd.DataFrame(scalar.transform(x_test), columns=x_test.columns, index=x_test.index)
+scalar.fit(X_train)
+X_train_scaled = pd.DataFrame(scalar.transform(X_train), columns=X_train.columns, index=X_train.index)
+X_test_scaled = pd.DataFrame(scalar.transform(X_test), columns=X_test.columns, index=X_test.index)
 ~~~
 {: .language-python}
 
@@ -280,9 +286,9 @@ With this scaled data, training the models works exactly the same as before.
 from sklearn import svm
 
 SVM = svm.SVC(kernel='poly', degree=3, C=1.5)
-SVM.fit(x_train_scaled, y_train)
+SVM.fit(X_train_scaled, y_train)
 
-svm_score = SVM.score(x_test_scaled, y_test)
+svm_score = SVM.score(X_test_scaled, y_test)
 print("Decision tree score is ", clf_score)
 print("SVM score is ", svm_score)
 ~~~
@@ -291,7 +297,7 @@ print("SVM score is ", svm_score)
 We can again visualise the decision space produced, also using only two parameters:
 
 ~~~
-x2 = x_train_scaled[[feature_names[0], feature_names[1]]]
+x2 = X_train_scaled[[feature_names[0], feature_names[1]]]
 
 SVM = svm.SVC(kernel='poly', degree=3, C=1.5)
 SVM.fit(x2, y_train)

diff --git a/_episodes/04-ensemble-methods.md b/_episodes/04-ensemble-methods.md
@@ -119,8 +119,7 @@ forest = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=
 # train our model
 forest.fit(X_train, y_train)
 
-# compare our models
-print(tree.score(X_test, y_test))
+# Score our model
 print(forest.score(X_test, y_test))
 ~~~
 {: .language-python}
@@ -153,10 +152,16 @@ We can see the first 5 (of 100) trees that were fitted as part of the forest.
 If we train the random forest estimator using the same two parameters used to plot the classification space for the decision tree classifier what do we think the plot will look like?
 
 ~~~
+# lets train a random forest for only two features (body mass and bill length)
+from sklearn.inspection import DecisionBoundaryDisplay
+f1 = feature_names[0]
+f2 = feature_names[3]
+
 # plot classification space for body mass and bill length with random forest
 forest_2d = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=1, random_state=5)
 forest_2d.fit(X_train[[f1, f2]], y_train)
 
+# Lets plot the decision boundaries made by the model for the two trained features
 d = DecisionBoundaryDisplay.from_estimator(forest_2d, X_train[[f1, f2]])
 
 sns.scatterplot(X_train, x=f1, y=f2, hue=y_train, palette="husl")

diff --git a/_episodes/05-clustering.md b/_episodes/05-clustering.md
@@ -75,9 +75,31 @@ Now lets create some random blobs using the `make_blobs` function. The `n_sample
 ~~~
 import matplotlib.pyplot as plt
 
+#Lets define some functions here to avoid repetitive code
+def plots_labels(data, labels):
+    tx = data[:, 0]
+    ty = data[:, 1]
+
+    fig = plt.figure(1, figsize=(4, 4))
+    plt.scatter(tx, ty, edgecolor='k', c=labels)
+    plt.show()
+
+def plot_clusters(data, clusters, Kmean):
+    tx = data[:, 0]
+    ty = data[:, 1]
+    fig = plt.figure(1, figsize=(4, 4))
+    plt.scatter(tx, ty, s=5, linewidth=0, c=clusters)
+    for cluster_x, cluster_y in Kmean.cluster_centers_:
+        plt.scatter(cluster_x, cluster_y, s=100, c='r', marker='x')
+    plt.show()
+~~~
+{: .language-python}
+
+Let's create the clusters.
+
+~~~
 data, cluster_id = skl_datasets.make_blobs(n_samples=400, cluster_std=0.75, centers=4, random_state=1)
-plt.scatter(data[:,0], data[:,1], s=5, linewidth=0)
-plt.show()
+plots_labels(data, cluster_id)
 ~~~
 {: .language-python}
 
@@ -95,15 +117,14 @@ clusters = Kmean.predict(data)
 The data can now be plotted to show all the points we randomly generated. To make it clearer which cluster points have been classified we can set the colours (the c parameter) to use the `clusters` list that was returned by the `predict` function. The Kmeans algorithm also lets us know where it identified the centre of each cluster. These are stored as a list called 'cluster_centers_' inside the `Kmean` object. Let's plot the points from the clusters, colouring them by the output from the K-means algorithm, and also plot the centres of each cluster as a red X.
 
 ~~~
-plt.scatter(data[:, 0], data[:, 1], s=5, linewidth=0, c=clusters)
-for cluster_x, cluster_y in Kmean.cluster_centers_:
-    plt.scatter(cluster_x, cluster_y, s=100, c='r', marker='x')
-plt.show()
+plot_clusters(data, clusters, Kmean)
 ~~~
 {: .language-python}
 
 ![Plot of the fitted random clusters](../fig/random_clusters_centre.png)
 
+Here is the code all in a single block.
+
 ~~~
 import sklearn.cluster as skl_cluster
 import sklearn.datasets as skl_datasets
@@ -115,10 +136,7 @@ Kmean = skl_cluster.KMeans(n_clusters=4)
 Kmean.fit(data)
 clusters = Kmean.predict(data)
 
-plt.scatter(data[:, 0], data[:, 1], s=5, linewidth=0, c=clusters)
-for cluster_x, cluster_y in Kmean.cluster_centers_:
-    plt.scatter(cluster_x, cluster_y, s=100, c='r', marker='x')
-plt.show()
+plot_clusters(data, clusters, Kmean)
 ~~~
 {: .language-python}
 
@@ -207,8 +225,7 @@ Lets try out using Scikit-Learn's spectral clustering. To make the concentric ci
 import sklearn.datasets as skl_data
 
 circles, circles_clusters = skl_data.make_circles(n_samples=400, noise=.01, random_state=0)
-plt.scatter(circles[:, 0], circles[:, 1], s=15, linewidth=0)
-plt.show()
+plots_labels(circles, circles_clusters)
 ~~~
 {: .language-python}
 
@@ -222,8 +239,7 @@ The SpectralClustering class combines the fit and predict functions into a singl
 
 ~~~
 labels = model.fit_predict(circles)
-plt.scatter(circles[:, 0], circles[:, 1], s=15, linewidth=0, c=labels, cmap='flag')
-plt.show()
+plots_labels(circles, labels)
 ~~~
 {: .language-python}
 
@@ -241,14 +257,12 @@ Kmean.fit(circles)
 clusters = Kmean.predict(circles)
 
 # plot the data, colouring it by cluster
-plt.scatter(circles[:, 0], circles[:, 1], s=15, linewidth=0.1, c=clusters,cmap='flag')
-plt.show()
+plot_clusters(circles, clusters, Kmean)
 
 # cluster with spectral clustering
 model = skl_cluster.SpectralClustering(n_clusters=2, affinity='nearest_neighbors', assign_labels='kmeans')
 labels = model.fit_predict(circles)
-plt.scatter(circles[:, 0], circles[:, 1], s=15, linewidth=0, c=labels, cmap='flag')
-plt.show()
+plots_labels(circles, labels)
 ~~~
 {: .language-python}
 

diff --git a/_episodes/06-dimensionality-reduction.md b/_episodes/06-dimensionality-reduction.md
@@ -34,35 +34,38 @@ import sklearn.cluster as skl_cluster
 from sklearn import manifold, decomposition, datasets
 
 # Let's define these here to avoid repetitive code
-def plots(x_manifold):
-    tx = x_manifold[:, 0]
-    ty = x_manifold[:, 1]
-
-    # without labels
+def plots_labels(data, labels):
+    tx = data[:, 0]
+    ty = data[:, 1]
+    
     fig = plt.figure(1, figsize=(4, 4))
-    plt.scatter(tx, ty, edgecolor='k',label=labels)
+    plt.scatter(tx, ty, edgecolor='k', c=labels)
     plt.show()
 
-def plot_clusters(x_manifold, clusters):
-    tx = x_manifold[:, 0]
-    ty = x_manifold[:, 1]
+def plot_clusters(data, clusters, Kmean):
+    tx = data[:, 0]
+    ty = data[:, 1]
     fig = plt.figure(1, figsize=(4, 4))
     plt.scatter(tx, ty, s=5, linewidth=0, c=clusters)
     for cluster_x, cluster_y in Kmean.cluster_centers_:
         plt.scatter(cluster_x, cluster_y, s=100, c='r', marker='x')
     plt.show()
 
-def plot_clusters_labels(x_manifold, labels):
-    tx = x_manifold[:, 0]
-    ty = x_manifold[:, 1]
+def plot_clusters_labels(data, labels):
+    tx = data[:, 0]
+    ty = data[:, 1]
 
     # with labels
     fig = plt.figure(1, figsize=(5, 4))
     plt.scatter(tx, ty, c=labels, cmap="nipy_spectral", 
             edgecolor='k', label=labels)
     plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
     plt.show()
+~~~
+{: .language-python}
 
+Next, let's load in the digits dataset:
+~~~
 # load in dataset as a Pandas Dataframe, return X and Y
 features, labels = datasets.load_digits(return_X_y=True, as_frame=True)
 
@@ -149,7 +152,8 @@ print(x_pca.shape)
 This returns us an array of 1797x2 where the 2 remaining columns (our new "features" or "dimensions") contain vector representations of the first principal component (column 0) and second principal component (column 1) for each of the images. We can plot these two new features against each other:
 
 ~~~
-plots(x_pca)
+# We are passing None because it is an unlabelled plot
+plots_labels(x_pca, None)
 ~~~
 {: .language-python}
 
@@ -160,8 +164,8 @@ We now have a 2D representation of our 64D dataset that we can work with instead
 ~~~
 Kmean = skl_cluster.KMeans(n_clusters=10)
 Kmean.fit(x_pca)
-clusters = Kmean.predict(x_pca,labels)
-plot_clusters(x_pca, clusters)
+clusters = Kmean.predict(x_pca)
+plot_clusters(x_pca, clusters, Kmean)
 ~~~
 {: .language-python}
 
@@ -184,7 +188,7 @@ It's worth noting that PCA does not handle outlier data well primarily due to gl
 
 t-SNE is a powerful example of manifold learning - a non-deterministic non-linear approach to dimensionality reduction. Manifold learning tasks are based on the idea that the dimension of many datasets is artificially high. This is likely the case for our MNIST dataset, as the corner pixels of our images are unlikely to contain digit data, and thus those dimensions are almost negligible compared with others.
 
-The versatility of the algorithm in transforming the underlying structural information into lower-order projections makes t-SNE applicable to a wide range of research domains.  
+The versatility of the algorithm in transforming the underlying structural information into lower-order projections makes t-SNE applicable to a wide range of research domains.
 
 For more in-depth explanations of t-SNE and manifold learning please see the following links which also contain some very nice visual examples of manifold learning in action:
 * [https://thedatafrog.com/en/articles/visualizing-datasets/](https://thedatafrog.com/en/articles/visualizing-datasets/)
@@ -198,7 +202,7 @@ Scikit-Learn allows us to apply t-SNE in a relatively simple way. Lets code and
 tsne = manifold.TSNE(n_components=2, init='pca', random_state = 0)
 x_tsne = tsne.fit_transform(features)
 
-plots(x_tsne)
+plots_labels(x_tsne, None)
 ~~~
 {: .language-python}
 
@@ -210,9 +214,9 @@ It looks like t-SNE has done a much better job of splitting our data up into clu
 Kmean = skl_cluster.KMeans(n_clusters=10)
 
 Kmean.fit(x_tsne)
-clusters = Kmean.predict(x_tsne,labels)
+clusters = Kmean.predict(x_tsne)
 
-plot_clusters(x_tsne, clusters)
+plot_clusters(x_tsne, clusters, Kmean)
 plot_clusters_labels(x_tsne, labels)
 ~~~
 {: .language-python}

diff --git a/fig/culmen_depth.png b/fig/culmen_depth.png
diff --git a/fig/kmeans_concentric_circle.png b/fig/kmeans_concentric_circle.png
diff --git a/fig/kmeans_overlapping_clusters.png b/fig/kmeans_overlapping_clusters.png
diff --git a/fig/palmer_penguins.png b/fig/palmer_penguins.png
diff --git a/fig/pca_clustered.png b/fig/pca_clustered.png
diff --git a/fig/pca_labelled.png b/fig/pca_labelled.png
diff --git a/fig/pca_unlabelled.png b/fig/pca_unlabelled.png
diff --git a/fig/random_clusters.png b/fig/random_clusters.png
diff --git a/fig/random_clusters_centre.png b/fig/random_clusters_centre.png
diff --git a/fig/tsne_clustered.png b/fig/tsne_clustered.png
diff --git a/fig/tsne_labelled.png b/fig/tsne_labelled.png
diff --git a/fig/tsne_unlabelled.png b/fig/tsne_unlabelled.png