Added logistic regression for linear probing in evaluation

uhlmanngroup · Oct 21, 2024 · c9c197e · c9c197e
1 parent c0232c7
commit c9c197e
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 30 deletions.
diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py
@@ -5,6 +5,7 @@
 from sklearn.cluster import KMeans
 from sklearn.pipeline import Pipeline
 from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
 from sklearn.discriminant_analysis import StandardScaler
 from sklearn import metrics
 from sklearn.metrics import make_scorer
@@ -67,37 +68,50 @@ def score_dataframe( df, name
   , "f1": make_scorer(metrics.f1_score, average="macro")
   #, "roc_auc": make_scorer(metrics.roc_auc_score, average="macro")
   }
+  #, ("pca", PCA(n_components=0.95, whiten=True, random_state=rand_seed))
+  #, ("clf", DummyClassifier())
+  # Create a linear classifier
+  lin_pipeline = Pipeline([
+    ("scaler", StandardScaler())
+  #, ("clf", LinearRegression())
+  , ("clf", LogisticRegression())
+  ])
   # Create a random forest classifier
-  pipeline = Pipeline([
+  randforest_pipeline = Pipeline([
     ("scaler", StandardScaler())
-  #, ("pca", PCA(n_components=0.95, whiten=True, random_state=rand_seed))
   , ("clf", RandomForestClassifier())
-  #, ("clf", DummyClassifier())
   ])
-  # build confusion matrix
-  clean_df.columns = clean_df.columns.astype(str) # only string column names
-  lbl_pred = cross_val_predict( pipeline
-                              , clean_df.drop('class', axis=1)
-                              , clean_df['class'])
-  conf_mat = confusion_matrix(clean_df['class'], lbl_pred)
-  # Perform k-fold cross-validation
-  cv_results = cross_validate(
-    estimator=pipeline
-  , X=clean_df.drop('class', axis=1)
-  , y=clean_df['class']
-  , cv=StratifiedKFold(n_splits=k_folds)
-  , scoring=scoring
-  , n_jobs=-1
-  , return_train_score=False
-  )
-  # Put the results into a DataFrame
-  df = pandas.DataFrame(cv_results)
-  df = df.drop(["fit_time", "score_time"], axis=1)
-  df.insert(loc=0, column='trial', value=name)
-  tag_columns.reverse()
-  for tag_col_name, tag_col_value in tag_columns:
-    df.insert(loc=0, column=tag_col_name, value=tag_col_value)
-  return conf_mat, df
+  dfs = []
+  conf_mats = {}
+  for pipename, pipeline in [ (    'linear',        lin_pipeline)
+                            , ('randforest', randforest_pipeline) ]:
+  #for pipename, pipeline in [ ('randforest', randforest_pipeline) ]:
+    # build confusion matrix
+    clean_df.columns = clean_df.columns.astype(str) # only string column names
+    lbl_pred = cross_val_predict( pipeline
+                                , clean_df.drop('class', axis=1)
+                                , clean_df['class'])
+    conf_mat = confusion_matrix(clean_df['class'], lbl_pred)
+    # Perform k-fold cross-validation
+    cv_results = cross_validate(
+      estimator=pipeline
+    , X=clean_df.drop('class', axis=1)
+    , y=clean_df['class']
+    , cv=StratifiedKFold(n_splits=k_folds)
+    , scoring=scoring
+    , n_jobs=-1
+    , return_train_score=False
+    )
+    # Put the results into a DataFrame
+    df = pandas.DataFrame(cv_results)
+    df = df.drop(["fit_time", "score_time"], axis=1)
+    df.insert(loc=0, column='trial', value=f'{name}_{pipename}')
+    tag_columns.reverse()
+    for tag_col_name, tag_col_value in tag_columns:
+      df.insert(loc=0, column=tag_col_name, value=tag_col_value)
+    dfs.append(df)
+    conf_mats[pipename] = conf_mat
+  return conf_mats, pandas.concat(dfs)
 
 def confusion_matrix_plot( cm, name, outputdir
                          , figsize=(10,7) ):

diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py
@@ -355,11 +355,12 @@ def main_process(params):
   logger.info(f'-- generate shapeembed umap --')
   umap_plot(shapeembed_df, f'{pfx}-shapeembed', outputdir=params.output_dir)
   logger.info(f'-- score shape embed --')
-  shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)+[(k, v.item()) for k, v in model.metrics.items()])
+  shapeembed_cms, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)+[(k, v.item()) for k, v in model.metrics.items()])
   logger.info(f'-- shapeembed on {params.dataset.name}, score\n{shapeembed_score_df}')
   shapeembed_score_df.to_csv(f"{params.output_dir}/{pfx}-shapeembed-score_df.csv")
-  logger.info(f'-- confusion matrix:\n{shapeembed_cm}')
-  confusion_matrix_plot(shapeembed_cm, f'{pfx}-shapeembed', params.output_dir)
+  for kind, shapeembed_cm in shapeembed_cms.items():
+    logger.info(f'-- {kind} confusion matrix:\n{shapeembed_cm}')
+    confusion_matrix_plot(shapeembed_cm, f'{pfx}-{kind}-shapeembed', params.output_dir)
   # XXX TODO move somewhere else if desired XXX
   ## combined shapeembed + efd + regionprops
   #logger.info(f'-- shapeembed + efd + regionprops --')