From c9c197edf41eec86d6caa4082a3c1b637eb76bf4 Mon Sep 17 00:00:00 2001
From: Anna Foix
Date: Mon, 21 Oct 2024 12:28:15 +0100
Subject: [PATCH] Added logistic regression for linear probing in evaluation

---
 scripts/shapeembed/evaluation.py | 68 +++++++++++++++++++-------------
 scripts/shapeembed/shapeembed.py |  7 ++--
 2 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py
index d530e9f6..40aeec52 100644
--- a/scripts/shapeembed/evaluation.py
+++ b/scripts/shapeembed/evaluation.py
@@ -5,6 +5,7 @@
 from sklearn.cluster import KMeans
 from sklearn.pipeline import Pipeline
 from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
 from sklearn.discriminant_analysis import StandardScaler
 from sklearn import metrics
 from sklearn.metrics import make_scorer
@@ -67,37 +68,50 @@ def score_dataframe( df, name
     , "f1": make_scorer(metrics.f1_score, average="macro")
   #, "roc_auc": make_scorer(metrics.roc_auc_score, average="macro")
   }
+  #, ("pca", PCA(n_components=0.95, whiten=True, random_state=rand_seed))
+  #, ("clf", DummyClassifier())
+  # Create a linear classifier
+  lin_pipeline = Pipeline([
+    ("scaler", StandardScaler())
+  #, ("clf", LinearRegression())
+  , ("clf", LogisticRegression())
+  ])
   # Create a random forest classifier
-  pipeline = Pipeline([
+  randforest_pipeline = Pipeline([
     ("scaler", StandardScaler())
-  #, ("pca", PCA(n_components=0.95, whiten=True, random_state=rand_seed))
   , ("clf", RandomForestClassifier())
-  #, ("clf", DummyClassifier())
   ])
-  # build confusion matrix
-  clean_df.columns = clean_df.columns.astype(str) # only string column names
-  lbl_pred = cross_val_predict( pipeline
-                              , clean_df.drop('class', axis=1)
-                              , clean_df['class'])
-  conf_mat = confusion_matrix(clean_df['class'], lbl_pred)
-  # Perform k-fold cross-validation
-  cv_results = cross_validate(
-      estimator=pipeline
-    , X=clean_df.drop('class', axis=1)
-    , y=clean_df['class']
-    , cv=StratifiedKFold(n_splits=k_folds)
-    , scoring=scoring
-    , n_jobs=-1
-    , return_train_score=False
-  )
-  # Put the results into a DataFrame
-  df = pandas.DataFrame(cv_results)
-  df = df.drop(["fit_time", "score_time"], axis=1)
-  df.insert(loc=0, column='trial', value=name)
-  tag_columns.reverse()
-  for tag_col_name, tag_col_value in tag_columns:
-    df.insert(loc=0, column=tag_col_name, value=tag_col_value)
-  return conf_mat, df
+  dfs = []
+  conf_mats = {}
+  for pipename, pipeline in [ (    'linear', lin_pipeline)
+                            , ('randforest', randforest_pipeline) ]:
+  #for pipename, pipeline in [ ('randforest', randforest_pipeline) ]:
+    # build confusion matrix
+    clean_df.columns = clean_df.columns.astype(str) # only string column names
+    lbl_pred = cross_val_predict( pipeline
+                                , clean_df.drop('class', axis=1)
+                                , clean_df['class'])
+    conf_mat = confusion_matrix(clean_df['class'], lbl_pred)
+    # Perform k-fold cross-validation
+    cv_results = cross_validate(
+        estimator=pipeline
+      , X=clean_df.drop('class', axis=1)
+      , y=clean_df['class']
+      , cv=StratifiedKFold(n_splits=k_folds)
+      , scoring=scoring
+      , n_jobs=-1
+      , return_train_score=False
+    )
+    # Put the results into a DataFrame
+    df = pandas.DataFrame(cv_results)
+    df = df.drop(["fit_time", "score_time"], axis=1)
+    df.insert(loc=0, column='trial', value=f'{name}_{pipename}')
+    tag_columns.reverse()
+    for tag_col_name, tag_col_value in tag_columns:
+      df.insert(loc=0, column=tag_col_name, value=tag_col_value)
+    dfs.append(df)
+    conf_mats[pipename] = conf_mat
+  return conf_mats, pandas.concat(dfs)
 
 def confusion_matrix_plot( cm, name, outputdir
                          , figsize=(10,7) ):

diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py
index e62dbd59..58d59059 100755
--- a/scripts/shapeembed/shapeembed.py
+++ b/scripts/shapeembed/shapeembed.py
@@ -355,11 +355,12 @@ def main_process(params):
   logger.info(f'-- generate shapeembed umap --')
   umap_plot(shapeembed_df, f'{pfx}-shapeembed', outputdir=params.output_dir)
   logger.info(f'-- score shape embed --')
-  shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)+[(k, v.item()) for k, v in model.metrics.items()])
+  shapeembed_cms, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)+[(k, v.item()) for k, v in model.metrics.items()])
   logger.info(f'-- shapeembed on {params.dataset.name}, score\n{shapeembed_score_df}')
   shapeembed_score_df.to_csv(f"{params.output_dir}/{pfx}-shapeembed-score_df.csv")
-  logger.info(f'-- confusion matrix:\n{shapeembed_cm}')
-  confusion_matrix_plot(shapeembed_cm, f'{pfx}-shapeembed', params.output_dir)
+  for kind, shapeembed_cm in shapeembed_cms.items():
+    logger.info(f'-- {kind} confusion matrix:\n{shapeembed_cm}')
+    confusion_matrix_plot(shapeembed_cm, f'{pfx}-{kind}-shapeembed', params.output_dir)
   # XXX TODO move somewhere else if desired XXX
   ## combined shapeembed + efd + regionprops
   #logger.info(f'-- shapeembed + efd + regionprops --')
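
The evaluation scheme this patch sets up boils down to: scale the features, fit both a linear probe (LogisticRegression) and a RandomForestClassifier on the same embedding table, and collect an out-of-fold confusion matrix plus stratified k-fold scores per pipeline. For readers who want to try that scheme outside the shapeembed scripts, here is a minimal self-contained sketch. The run_probes helper and the synthetic dataset are illustrative only, not part of the repository; max_iter=1000 is an added assumption to avoid LogisticRegression convergence warnings, and StandardScaler is imported from its canonical sklearn.preprocessing location rather than the sklearn.discriminant_analysis re-export used in evaluation.py.

import pandas
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_predict
from sklearn.metrics import confusion_matrix, make_scorer, f1_score

def run_probes(clean_df, k_folds=5):
  # hypothetical helper: score one feature table with both pipelines
  X = clean_df.drop('class', axis=1)
  y = clean_df['class']
  pipelines = {
      'linear': Pipeline([ ('scaler', StandardScaler())
                         , ('clf', LogisticRegression(max_iter=1000)) ])
    , 'randforest': Pipeline([ ('scaler', StandardScaler())
                             , ('clf', RandomForestClassifier()) ])
  }
  dfs, conf_mats = [], {}
  for pipename, pipeline in pipelines.items():
    # confusion matrix built from out-of-fold predictions
    conf_mats[pipename] = confusion_matrix(y, cross_val_predict(pipeline, X, y))
    # stratified k-fold cross-validation; macro-F1, as in the patch's scoring dict
    cv_results = cross_validate(pipeline, X, y
                               , cv=StratifiedKFold(n_splits=k_folds)
                               , scoring={'f1': make_scorer(f1_score, average='macro')}
                               , n_jobs=-1)
    df = pandas.DataFrame(cv_results).drop(['fit_time', 'score_time'], axis=1)
    df.insert(loc=0, column='trial', value=pipename)
    dfs.append(df)
  return conf_mats, pandas.concat(dfs)

# synthetic stand-in for an embedding table with a 'class' column
X, y = make_classification(n_samples=200, n_classes=3, n_informative=5, random_state=0)
demo_df = pandas.DataFrame(X)
demo_df.columns = demo_df.columns.astype(str)  # sklearn expects string column names
demo_df['class'] = y
conf_mats, score_df = run_probes(demo_df)
print(score_df.groupby('trial').mean())

Returning the confusion matrices as a dict keyed by pipeline name mirrors what the patch does in score_dataframe, and is what lets the shapeembed.py hunk above iterate over shapeembed_cms.items() to log and plot each matrix separately.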