Skip to content

Commit

Permalink
Merge pull request #94 from nickmao1994/main
Browse files Browse the repository at this point in the history
Improve plots, tables and final reports
  • Loading branch information
kphaterp authored Dec 11, 2021
2 parents f3dfa90 + 9af1d41 commit 99c47fc
Show file tree
Hide file tree
Showing 7 changed files with 461 additions and 142 deletions.
271 changes: 193 additions & 78 deletions docs/Project_report_milestone2.ipynb

Large diffs are not rendered by default.

Binary file added models/coeff.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified results/model/coeff_bar.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified results/model/cv_result.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
268 changes: 208 additions & 60 deletions src/models/model_building.ipynb

Large diffs are not rendered by default.

60 changes: 56 additions & 4 deletions src/models/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
from docopt import docopt
import matplotlib.pyplot as plt
import mglearn
from mglearn.plot_2d_separator import (plot_2d_separator, plot_2d_classification,
plot_2d_scores)
from mglearn.plot_helpers import cm2 as cm, discrete_scatter

import numpy as np
import pandas as pd
import pickle
Expand Down Expand Up @@ -100,15 +104,63 @@ def coeff_plot(best_model, out_dir):
"""
logger.info("Drawing bar plot for coefficents...")
feature_names = np.array(best_model[:-1].get_feature_names_out())
name = []
for n in feature_names.tolist():
name.append(n.split('__')[1])
coeffs = best_model.named_steps["logisticregression"].coef_.flatten()
coeff_df = pd.DataFrame(coeffs, index=feature_names, columns=["Coefficient"])
coeff_df = pd.DataFrame(coeffs, index=name, columns=["Coefficient"])
coeff_df_sorted = coeff_df.sort_values(by="Coefficient", ascending=False)
coeff_df_sorted.to_html(os.path.join(out_dir, "coeff_sorted.html"), escape=False)
mglearn.tools.visualize_coefficients(coeffs, feature_names, n_top_features=5)
plt.savefig(os.path.join(out_dir, "coeff_bar.png"))
visualize_coefficients(coeffs, feature_names, n_top_features=5)
plt.savefig(os.path.join(out_dir, "coeff_bar.png"), bbox_inches = 'tight')
logger.info("Bar plot for coefficents saved")



def visualize_coefficients(coefficients, feature_names, n_top_features=25):
    """Draw a bar plot of the extreme coefficients of a linear model.

    The bars are drawn on a new matplotlib figure; the caller is
    responsible for saving or showing it.

    Parameters
    ----------
    coefficients : nd-array, shape (n_features,)
        Model coefficients. A row or column vector is accepted and is
        flattened to one dimension.
    feature_names : list or nd-array of strings, shape (n_features,)
        Feature names for labeling the coefficients.
    n_top_features : int, default=25
        How many features to show. The function will show the largest (most
        positive) and smallest (most negative) n_top_features coefficients,
        for a total of 2 * n_top_features coefficients. When fewer
        coefficients are available, only the available ones are drawn.

    Raises
    ------
    ValueError
        If ``coefficients`` is not a 1d array or a row/column vector, or if
        its length differs from the length of ``feature_names``.
    """
    coef = np.asarray(coefficients).squeeze()
    if coef.ndim > 1:
        # this is not a row or column vector
        raise ValueError("coefficients must be 1d array or column vector, got"
                         " shape {}".format(coef.shape))
    # squeeze() turns a single-element input into a 0-d array; ravel()
    # brings it back to one dimension so len() and indexing work.
    coef = coef.ravel()

    if len(coef) != len(feature_names):
        raise ValueError("Number of coefficients {} doesn't match number of"
                         " feature names {}.".format(len(coef),
                                                     len(feature_names)))

    # Sort once and take both ends of the ordering.
    order = np.argsort(coef)
    negative_coefficients = order[:n_top_features]
    # An explicit start index is used because order[-0:] would select
    # every coefficient instead of none when n_top_features is 0.
    positive_start = max(len(order) - n_top_features, 0)
    positive_coefficients = order[positive_start:]
    interesting_coefficients = np.hstack([negative_coefficients,
                                          positive_coefficients])

    # Bar positions follow the number of coefficients actually selected,
    # which is smaller than 2 * n_top_features when there are few features.
    positions = np.arange(len(interesting_coefficients))
    selected = coef[interesting_coefficients]

    # plot them
    plt.figure(figsize=(15, 5))
    colors = [cm(1) if c < 0 else cm(0) for c in selected]
    plt.bar(positions, selected, color=colors)
    plt.subplots_adjust(bottom=0.3)
    plt.xticks(positions,
               np.array(feature_names)[interesting_coefficients],
               rotation=60, ha="right")
    plt.ylabel("Coefficient magnitude")
    plt.xlabel("Feature")


if __name__ == "__main__":

# Parse command line parameters
Expand Down
4 changes: 4 additions & 0 deletions src/models/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@ def train_plot(train_results, out_dir):
"""
logger.info("Making train results plot...")
train_results.plot(x="param_logisticregression__C", y="mean_test_score")
plt.plot(100, 0.826403, marker="o", markersize=10, markeredgecolor="red", markerfacecolor="red")
plt.xlabel("Hyperparameter of logistic regression C")
plt.ylabel("Mean test score")
plt.legend(["Mean test score", "Best estimator"])
plt.xscale("log")
plt.savefig(out_dir)
logger.info(f"Train results plot saved to {out_dir}")
Expand Down

0 comments on commit 99c47fc

Please sign in to comment.