Add all simulation options to Makita (#48)

* Add all simulation options to Makita
* Update README.md
asreview · Nov 30, 2023 · a9c6fac · a9c6fac
1 parent f19a7ec
commit a9c6fac
Show file tree

Hide file tree

Showing 8 changed files with 194 additions and 58 deletions.
diff --git a/README.md b/README.md
@@ -70,7 +70,13 @@ Your simulation study is now properly set up and ready for use. To start the sim
 sh jobs.sh
 ```
 
-The `jobs.sh` script is a shell script that runs all jobs in the project folder.
+Or on Windows:
+
+```console
+jobs.bat
+```
+
+The `jobs` script runs all jobs in the project folder.
 
 ### Platform support
 
@@ -86,12 +92,15 @@ By default, the job file depends on the platform. Windows users will see a `jobs
 asreview makita template basic --job_file my_jobs_file.my_ext
 ```
 
-
 ## Templates
 
-The following table gives an overview of the available templates. See [Getting started](#getting-started) for instructions on usage.
+The following table gives an overview of the available templates.
+See [Getting started](#getting-started) for instructions on usage.
 
-> Note: If no seed is set with the template command, the default seed is used. While this is important for the reproducibility of the results, it may lead to long-term bias. To avoid seed-related bias across different simulation studies, a seed should be for the prior records and models.
+> Note: If no seed is set with the template command, the default seed is used.
+> While this is important for the reproducibility of the results, it may lead to
+> long-term bias. To avoid seed-related bias across different simulation
+> studies, a seed should be set for the prior records and models.
 
 ### Basic template
 
@@ -102,14 +111,22 @@ The basic template prepares a script for conducting a simulation study with one
 optional arguments:
 
 ```console
-  -h, --help                                       show this help message and exit
-  -f OUTPUT_FILE                                   File with jobs
-  -s DATA_FOLDER                                   Dataset folder
-  -o OUTPUT_FOLDER                                 Output folder
-  --init_seed INIT_SEED                            Seed of the priors. Seed is set by default!
-  --model_seed MODEL_SEED                          Seed of the models. Seed is set by default!
-  --template TEMPLATE                              Overwrite template with template file path.
-  --n_runs N_RUNS                                  Number of runs
+  -h, --help                                show this help message and exit
+  --job_file JOB_FILE, -f JOB_FILE          The name of the file with jobs.                 Default jobs.bat for Windows, otherwise jobs.sh.
+  -s DATA_FOLDER                            Dataset folder
+  -o OUTPUT_FOLDER                          Output folder
+  --init_seed INIT_SEED                     Seed of the priors.                             Seed is set to 535 by default.
+  --model_seed MODEL_SEED                   Seed of the models.                             Seed is set to 165 by default.
+  --template TEMPLATE                       Overwrite template with template file path.
+  --platform PLATFORM                       Platform to run jobs: Windows, Darwin, Linux.   Default: the system of rendering templates.
+  --n_runs N_RUNS                           Number of runs.                                 Default: 1.
+  --no_wordclouds                           Disables the generation of wordclouds.
+  --classifier CLASSIFIER                   Classifier to use.                              Default: nb.
+  --feature_extractor FEATURE_EXTRACTOR     Feature_extractor to use.                       Default: tfidf.
+  --query_strategy QUERY_STRATEGY           Query strategy to use.                          Default: max.
+  --balance_strategy BALANCE_STRATEGY       Balance strategy to use.                        Default: double.
+  --instances_per_query INSTANCES_PER_QUERY Number of instances per query.                  Default: 1.
+  --stop_if STOP_IF                         The number of label actions to simulate.        Default 'min' will stop simulating when all relevant records are found.
 ```
 
 ### ARFI template
@@ -121,14 +138,22 @@ The ARFI template (All relevant, fixed irrelevant) prepares a script for running
 optional arguments:
 
 ```console
-  -h, --help                                       show this help message and exit
-  -f OUTPUT_FILE                                   File with jobs
-  -s DATA_FOLDER                                   Dataset folder
-  -o OUTPUT_FOLDER                                 Output folder
-  --init_seed INIT_SEED                            Seed of the priors. Seed is set by default!
-  --model_seed MODEL_SEED                          Seed of the models. Seed is set by default!
-  --template TEMPLATE                              Overwrite template with template file path.
-  --n_priors N_PRIORS                              Number of priors
+  -h, --help                                show this help message and exit
+  --job_file JOB_FILE, -f JOB_FILE          The name of the file with jobs.                 Default jobs.bat for Windows, otherwise jobs.sh.
+  -s DATA_FOLDER                            Dataset folder
+  -o OUTPUT_FOLDER                          Output folder
+  --init_seed INIT_SEED                     Seed of the priors.                             Seed is set to 535 by default.
+  --model_seed MODEL_SEED                   Seed of the models.                             Seed is set to 165 by default.
+  --template TEMPLATE                       Overwrite template with template file path.
+  --platform PLATFORM                       Platform to run jobs: Windows, Darwin, Linux.   Default: the system of rendering templates.
+  --n_priors N_PRIORS                       Number of priors.                               Default: 10.
+  --no_wordclouds                           Disables the generation of wordclouds.
+  --classifier CLASSIFIER                   Classifier to use.                              Default: nb.
+  --feature_extractor FEATURE_EXTRACTOR     Feature_extractor to use.                       Default: tfidf.
+  --query_strategy QUERY_STRATEGY           Query strategy to use.                          Default: max.
+  --balance_strategy BALANCE_STRATEGY       Balance strategy to use.                        Default: double.
+  --instances_per_query INSTANCES_PER_QUERY Number of instances per query.                  Default: 1.
+  --stop_if STOP_IF                         The number of label actions to simulate.        Default 'min' will stop simulating when all relevant records are found.
 ```
 
 ### Multiple models template
@@ -140,34 +165,30 @@ The multiple model template prepares a script for running a simulation study com
 optional arguments:
 
 ```console
-  -h, --help                                       Show this help message and exit
-  -f OUTPUT_FILE                                   File with jobs
-  -s DATA_FOLDER                                   Dataset folder
-  -o OUTPUT_FOLDER                                 Output folder
-  --init_seed INIT_SEED                            Seed of the priors. Seed is set by default!
-  --model_seed MODEL_SEED                          Seed of the models. Seed is set by default!
-  --template TEMPLATE                              Overwrite template with template file path.
-  --classifiers CLASSIFIERS [CLASSIFIERS ...]                           Classifiers to use
-  --feature_extractors FEATURE_EXTRACTOR [FEATURE_EXTRACTORS ...]   Feature extractors to use
-  --impossible_models IMPOSSIBLE_MODELS [IMPOSSIBLE_MODELS ...]         Model combinations to exclude
-```
-
-The default models are:
-
-```python
-classifiers           ["logistic", "nb", "rf", "svm"]
-feature_extractors   ["doc2vec", "sbert", "tfidf"]
-impossible_models     [["nb", "doc2vec"], ["nb", "sbert"]]
-```
-
->Example command: If you want to generate a multiple models template with classifiers `logistic`
-and `nb`, and feature extraction `tfidf`, you can use the following command:
-
-```console
-asreview makita template multiple_models --classifiers logistic nb --feature_extractors tfidf
+  -h, --help                                show this help message and exit
+  --job_file JOB_FILE, -f JOB_FILE          The name of the file with jobs.                 Default jobs.bat for Windows, otherwise jobs.sh.
+  -s DATA_FOLDER                            Dataset folder
+  -o OUTPUT_FOLDER                          Output folder
+  --init_seed INIT_SEED                     Seed of the priors.                             Seed is set to 535 by default.
+  --model_seed MODEL_SEED                   Seed of the models.                             Seed is set to 165 by default.
+  --template TEMPLATE                       Overwrite template with template file path.
+  --platform PLATFORM                       Platform to run jobs: Windows, Darwin, Linux.   Default: the system of rendering templates.
+  --n_runs N_RUNS                           Number of runs.                                 Default: 1.
+  --no_wordclouds                           Disables the generation of wordclouds.
+  --query_strategy QUERY_STRATEGY           Query strategy to use.                          Default: max.
+  --balance_strategy BALANCE_STRATEGY       Balance strategy to use.                        Default: double.
+  --instances_per_query INSTANCES_PER_QUERY Number of instances per query.                  Default: 1.
+  --stop_if STOP_IF                         The number of label actions to simulate.        Default 'min' will stop simulating when all relevant records are found.
+  --classifiers CLASSIFIERS                 Classifiers to use                              Default: ['logistic', 'nb', 'rf', 'svm']
+  --feature_extractors FEATURE_EXTRACTOR    Feature extractors to use                       Default: ['doc2vec', 'sbert', 'tfidf']
+  --impossible_models IMPOSSIBLE_MODELS     Model combinations to exclude                   Default: ['nb,doc2vec', 'nb,sbert']
 ```
 
->If you want to specify certain combinations of classifiers and feature extractors that should not be used, you can use the `--impossible_models` option. For instance, if you want to exclude the combinations of `nb` with `doc2vec` and `logistic` with `tfidf`, use the following command:
+If you want to specify certain combinations of classifiers and feature
+extractors that should and should not be used, you can use the `--classifiers`,
+`--feature_extractors`, and `--impossible_models` options. For instance, if you
+want to exclude the combinations of `nb` with `doc2vec` and `logistic` with
+`tfidf`, use the following command:
 
 ```console
 asreview makita template multiple_models --classifiers logistic nb --feature_extractors tfidf doc2vec --impossible_models nb,doc2vec logistic,tfidf
@@ -227,9 +248,9 @@ The following scripts are available:
 - merge_tds.py
 - split_data_with_multiple_labels.py [DEPRECATED]
 
-#### Time to Discovery Tables 
+#### Time to Discovery Tables
 
-The 'merge_tds.py' script creates a table of the time to discovery (TD) values for each dataset, with each row corresponding to each record ID of the relevant records in a dataset, and the columns correspond to each simulation run (e.g, for the multiple models template each column corresponds to a simualtion run with each active learning model). Additionally, the tables includes the average-record-TD values (the average of the TD values for a record across multiple simulation runs), and the average-simulation-TD values (the average of the TD values across all records for a single simulation run). 
+The 'merge_tds.py' script creates a table of the time to discovery (TD) values for each dataset, with each row corresponding to each record ID of the relevant records in a dataset, and the columns correspond to each simulation run (e.g., for the multiple models template each column corresponds to a simulation run with each active learning model). Additionally, the tables include the average-record-TD values (the average of the TD values for a record across multiple simulation runs), and the average-simulation-TD values (the average of the TD values across all records for a single simulation run).
 
 #### Run Makita via Docker
 

diff --git a/asreviewcontrib/makita/entrypoint.py b/asreviewcontrib/makita/entrypoint.py
@@ -67,13 +67,13 @@ def execute(self, argv):  # noqa: C901
             "--init_seed",
             type=int,
             default=535,
-            help="Seed of the priors. Seed is set by default!",
+            help="Seed of the priors. Seed is set to 535 by default.",
         )
         parser_template.add_argument(
             "--model_seed",
             type=int,
             default=165,
-            help="Seed of the models. Seed is set by default!",
+            help="Seed of the models. Seed is set to 165 by default.",
         )
         parser_template.add_argument(
             "--template", type=str, help="Overwrite template with template file path."
@@ -88,31 +88,83 @@ def execute(self, argv):  # noqa: C901
             "--n_runs",
             type=int,
             default=1,
-            help="Number of runs. Only for templates 'basic' and 'multiple_models'.",
+            help="Number of runs. Only for templates 'basic' and 'multiple_models'. "
+            "Default: 1.",
         )
         parser_template.add_argument(
             "--n_priors",
             type=int,
             default=10,
-            help="Number of priors. Only for template 'arfi'.",
+            help="Number of priors. Only for template 'arfi'. "
+            "Default: 10.",
+        )
+        parser_template.add_argument(
+            "--no_wordclouds",
+            action="store_false",
+            help="Disables the generation of wordclouds. "
+        )
+        parser_template.add_argument(
+            "--classifier",
+            type=str,
+            default="nb",
+            help="Classifier to use. Only for template 'basic' and 'arfi'. "
+            "Default: nb.",
+        )
+        parser_template.add_argument(
+            "--feature_extractor",
+            type=str,
+            default="tfidf",
+            help="Feature_extractor to use. Only for template 'basic' and 'arfi'. "
+            "Default: tfidf.",
+        )
+        parser_template.add_argument(
+            "--query_strategy",
+            type=str,
+            default="max",
+            help="Query strategy to use. "
+            "Default: max.",
+        )
+        parser_template.add_argument(
+            "--balance_strategy",
+            type=str,
+            default="double",
+            help="Balance strategy to use. "
+            "Default: double.",
+        )
+        parser_template.add_argument(
+            "--instances_per_query",
+            type=int,
+            default=1,
+            help="Number of instances per query. "
+            "Default: 1.",
+        )
+        parser_template.add_argument(
+            "--stop_if",
+            type=str,
+            default="min",
+            help="The number of label actions to simulate. "
+            "Default 'min' will stop simulating when all relevant records are found.",
         )
         parser_template.add_argument(
             "--classifiers",
             nargs="+",
             default=["logistic", "nb", "rf", "svm"],
-            help="Classifiers to use. Only for template 'multiple_models'.",
+            help="Classifiers to use. Only for template 'multiple_models'. "
+            "Default: ['logistic', 'nb', 'rf', 'svm']",
         )
         parser_template.add_argument(
             "--feature_extractors",
             nargs="+",
             default=["doc2vec", "sbert", "tfidf"],
-            help="Feature extractors to use. Only for template 'multiple_models'.",
+            help="Feature extractors to use. Only for template 'multiple_models'. "
+            "Default: ['doc2vec', 'sbert', 'tfidf']",
         )
         parser_template.add_argument(
             "--impossible_models",
             nargs="+",
             default=["nb,doc2vec", "nb,sbert"],
-            help="Model combinations to exclude. Only for template 'multiple_models'.",
+            help="Model combinations to exclude. Only for template 'multiple_models'. "
+            "Default: ['nb,doc2vec', 'nb,sbert']",
         )
 
         parser_template.set_defaults(func=self._template_cli)
@@ -165,9 +217,16 @@ def _template(self, args):
             job = render_jobs_basic(
                 datasets,
                 output_folder=Path(args.o),
+                create_wordclouds=args.no_wordclouds,
                 n_runs=args.n_runs,
                 init_seed=args.init_seed,
                 model_seed=args.model_seed,
+                classifier=args.classifier,
+                feature_extractor=args.feature_extractor,
+                query_strategy=args.query_strategy,
+                balance_strategy=args.balance_strategy,
+                instances_per_query=args.instances_per_query,
+                stop_if=args.stop_if,
                 fp_template=fp_template,
                 job_file=args.job_file,
                 platform_sys=args.platform,
@@ -178,9 +237,16 @@ def _template(self, args):
             job = render_jobs_arfi(
                 datasets,
                 output_folder=Path(args.o),
+                create_wordclouds=args.no_wordclouds,
                 n_priors=args.n_priors,
                 init_seed=args.init_seed,
                 model_seed=args.model_seed,
+                classifier=args.classifier,
+                feature_extractor=args.feature_extractor,
+                query_strategy=args.query_strategy,
+                balance_strategy=args.balance_strategy,
+                instances_per_query=args.instances_per_query,
+                stop_if=args.stop_if,
                 fp_template=fp_template,
                 job_file=args.job_file,
                 platform_sys=args.platform,
@@ -191,12 +257,17 @@ def _template(self, args):
             job = render_jobs_multiple_models(
                 datasets,
                 output_folder=Path(args.o),
+                create_wordclouds=args.no_wordclouds,
                 n_runs=args.n_runs,
                 init_seed=args.init_seed,
                 model_seed=args.model_seed,
                 all_classifiers=args.classifiers,
                 all_feature_extractors=args.feature_extractors,
                 impossible_models=args.impossible_models,
+                query_strategy=args.query_strategy,
+                balance_strategy=args.balance_strategy,
+                instances_per_query=args.instances_per_query,
+                stop_if=args.stop_if,
                 fp_template=fp_template,
                 job_file=args.job_file,
                 platform_sys=args.platform,

diff --git a/asreviewcontrib/makita/template_arfi.py b/asreviewcontrib/makita/template_arfi.py
@@ -17,9 +17,16 @@ def render_jobs_arfi(
     datasets,
     output_folder="output",
     scripts_folder="scripts",
+    create_wordclouds=True,
     n_priors=10,
     init_seed=535,
     model_seed=165,
+    classifier="nb",
+    feature_extractor="tfidf",
+    query_strategy="max",
+    balance_strategy="double",
+    instances_per_query=1,
+    stop_if='min',
     fp_template=None,
     job_file=None,
     platform_sys=None,
@@ -88,6 +95,13 @@ def render_jobs_arfi(
     return template.render(
         {
             "datasets": params,
+            "create_wordclouds": create_wordclouds,
+            "classifier": classifier,
+            "feature_extractor": feature_extractor,
+            "query_strategy": query_strategy,
+            "balance_strategy": balance_strategy,
+            "instances_per_query": instances_per_query,
+            "stop_if": stop_if,
             "init_seed": init_seed,
             "output_folder": output_folder,
             "scripts_folder": scripts_folder,