From dfe529d91d370650c53f184082352dd3c954f3fc Mon Sep 17 00:00:00 2001 From: timovdk <5330531+timovdk@users.noreply.github.com> Date: Wed, 29 Jan 2025 10:18:39 +0100 Subject: [PATCH] setup old-defaults study --- asreview2-optuna/classifiers.py | 2 +- asreview2-optuna/feature_extractors.py | 17 +++++++++-------- asreview2-optuna/main.py | 8 ++++---- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/asreview2-optuna/classifiers.py b/asreview2-optuna/classifiers.py index 8abf18f..a733073 100644 --- a/asreview2-optuna/classifiers.py +++ b/asreview2-optuna/classifiers.py @@ -9,7 +9,7 @@ def naive_bayes_params(trial: optuna.trial.FrozenTrial): # Use logarithmic normal distribution for alpha (alpha effect is non-linear) - alpha = trial.suggest_float("alpha", 0.5, 50, log=True) + alpha = 3.822 #trial.suggest_float("alpha", 0.5, 50, log=True) #alpha = trial.suggest_float("nb__alpha", 1.0, 15.0) return {"alpha": alpha} diff --git a/asreview2-optuna/feature_extractors.py b/asreview2-optuna/feature_extractors.py index 8e9c8c1..7a4b7bd 100644 --- a/asreview2-optuna/feature_extractors.py +++ b/asreview2-optuna/feature_extractors.py @@ -5,21 +5,22 @@ def tfidf_params(trial: optuna.trial.FrozenTrial): #max_features = trial.suggest_int("tfidf__max_features", 15_000, 50_000) - max_df = trial.suggest_float("tfidf__max_df", 0.7, 1.0) + #max_df = trial.suggest_float("tfidf__max_df", 0.7, 1.0) - min_df = trial.suggest_int("tfidf__min_df", 2, 4) + #min_df = trial.suggest_int("tfidf__min_df", 2, 4) #max_ngram_range = trial.suggest_int("tfidf__max_ngram_range", 1, 3) - ngram_range = (1, 2) + #ngram_range = (1, 2) - sublinear_tf = True#trial.suggest_categorical("tfidf__sublinear_tf", [True, False]) + #sublinear_tf = True#trial.suggest_categorical("tfidf__sublinear_tf", [True, False]) return { #"max_features": max_features, - "max_df": max_df, - "min_df": min_df, - "ngram_range": ngram_range, - "sublinear_tf": sublinear_tf, + #"max_df": max_df, + #"min_df": min_df, + #"ngram_range": ngram_range, + #sublinear_tf": sublinear_tf, + #"ngram_range": (1, 1), } diff --git a/asreview2-optuna/main.py b/asreview2-optuna/main.py index bb5a0df..6973535 100644 --- a/asreview2-optuna/main.py +++ b/asreview2-optuna/main.py @@ -19,12 +19,12 @@ from feature_extractors import feature_extractor_params, feature_extractors # Study variables -VERSION = 1 +VERSION = 2 STUDY_SET = "full" CLASSIFIER_TYPE = "nb" # Options: "nb", "log", "svm", "rf" -FEATURE_EXTRACTOR_TYPE = "labse" # Options: "tfidf", "onehot", "labse", "bge-m3" +FEATURE_EXTRACTOR_TYPE = "tfidf" # Options: "tfidf", "onehot", "labse", "bge-m3" PICKLE_FOLDER_PATH = Path("synergy-dataset", f"pickles_{FEATURE_EXTRACTOR_TYPE}") -PRE_PROCESSED_FMS = True # False = on the fly +PRE_PROCESSED_FMS = False # False = on the fly PARALLELIZE_OBJECTIVE = True # Optuna variables @@ -164,7 +164,7 @@ def process_row(row, clf_params, fe_params, ratio): def objective_report(report_order): def objective(trial): # Use normal distribution for ratio (ratio effect is linear) - ratio = trial.suggest_float("ratio", 1.0, 2.0) + ratio = trial.suggest_float("ratio", 1.0, 10.0) # ratio = 1.5 clf_params = classifier_params[CLASSIFIER_TYPE](trial) fe_params = (