Skip to content

Commit

Permalink
Added logistic regression for linear probing in evaluation
Browse files Browse the repository at this point in the history
  • Loading branch information
afoix committed Oct 21, 2024
1 parent c0232c7 commit c9c197e
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 30 deletions.
68 changes: 41 additions & 27 deletions scripts/shapeembed/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import StandardScaler
from sklearn import metrics
from sklearn.metrics import make_scorer
Expand Down Expand Up @@ -67,37 +68,50 @@ def score_dataframe( df, name
, "f1": make_scorer(metrics.f1_score, average="macro")
#, "roc_auc": make_scorer(metrics.roc_auc_score, average="macro")
}
#, ("pca", PCA(n_components=0.95, whiten=True, random_state=rand_seed))
#, ("clf", DummyClassifier())
# Create a linear classifier
lin_pipeline = Pipeline([
("scaler", StandardScaler())
#, ("clf", LinearRegression())
, ("clf", LogisticRegression())
])
# Create a random forest classifier
pipeline = Pipeline([
randforest_pipeline = Pipeline([
("scaler", StandardScaler())
#, ("pca", PCA(n_components=0.95, whiten=True, random_state=rand_seed))
, ("clf", RandomForestClassifier())
#, ("clf", DummyClassifier())
])
# build confusion matrix
clean_df.columns = clean_df.columns.astype(str) # only string column names
lbl_pred = cross_val_predict( pipeline
, clean_df.drop('class', axis=1)
, clean_df['class'])
conf_mat = confusion_matrix(clean_df['class'], lbl_pred)
# Perform k-fold cross-validation
cv_results = cross_validate(
estimator=pipeline
, X=clean_df.drop('class', axis=1)
, y=clean_df['class']
, cv=StratifiedKFold(n_splits=k_folds)
, scoring=scoring
, n_jobs=-1
, return_train_score=False
)
# Put the results into a DataFrame
df = pandas.DataFrame(cv_results)
df = df.drop(["fit_time", "score_time"], axis=1)
df.insert(loc=0, column='trial', value=name)
tag_columns.reverse()
for tag_col_name, tag_col_value in tag_columns:
df.insert(loc=0, column=tag_col_name, value=tag_col_value)
return conf_mat, df
dfs = []
conf_mats = {}
for pipename, pipeline in [ ( 'linear', lin_pipeline)
, ('randforest', randforest_pipeline) ]:
#for pipename, pipeline in [ ('randforest', randforest_pipeline) ]:
# build confusion matrix
clean_df.columns = clean_df.columns.astype(str) # only string column names
lbl_pred = cross_val_predict( pipeline
, clean_df.drop('class', axis=1)
, clean_df['class'])
conf_mat = confusion_matrix(clean_df['class'], lbl_pred)
# Perform k-fold cross-validation
cv_results = cross_validate(
estimator=pipeline
, X=clean_df.drop('class', axis=1)
, y=clean_df['class']
, cv=StratifiedKFold(n_splits=k_folds)
, scoring=scoring
, n_jobs=-1
, return_train_score=False
)
# Put the results into a DataFrame
df = pandas.DataFrame(cv_results)
df = df.drop(["fit_time", "score_time"], axis=1)
df.insert(loc=0, column='trial', value=f'{name}_{pipename}')
tag_columns.reverse()
for tag_col_name, tag_col_value in tag_columns:
df.insert(loc=0, column=tag_col_name, value=tag_col_value)
dfs.append(df)
conf_mats[pipename] = conf_mat
return conf_mats, pandas.concat(dfs)

def confusion_matrix_plot( cm, name, outputdir
, figsize=(10,7) ):
Expand Down
7 changes: 4 additions & 3 deletions scripts/shapeembed/shapeembed.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,11 +355,12 @@ def main_process(params):
logger.info(f'-- generate shapeembed umap --')
umap_plot(shapeembed_df, f'{pfx}-shapeembed', outputdir=params.output_dir)
logger.info(f'-- score shape embed --')
shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)+[(k, v.item()) for k, v in model.metrics.items()])
shapeembed_cms, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)+[(k, v.item()) for k, v in model.metrics.items()])
logger.info(f'-- shapeembed on {params.dataset.name}, score\n{shapeembed_score_df}')
shapeembed_score_df.to_csv(f"{params.output_dir}/{pfx}-shapeembed-score_df.csv")
logger.info(f'-- confusion matrix:\n{shapeembed_cm}')
confusion_matrix_plot(shapeembed_cm, f'{pfx}-shapeembed', params.output_dir)
for kind, shapeembed_cm in shapeembed_cms.items():
logger.info(f'-- {kind} confusion matrix:\n{shapeembed_cm}')
confusion_matrix_plot(shapeembed_cm, f'{pfx}-{kind}-shapeembed', params.output_dir)
# XXX TODO move somewhere else if desired XXX
## combined shapeembed + efd + regionprops
#logger.info(f'-- shapeembed + efd + regionprops --')
Expand Down

0 comments on commit c9c197e

Please sign in to comment.