Add template to use and compare prior records from file (#62)
jteijema authored Sep 26, 2024
1 parent a52d639 commit 700888a
Showing 19 changed files with 19,741 additions and 34 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/ci-workflow.yml
@@ -28,11 +28,13 @@ jobs:
run: |
ruff check .
- name: Create directories using Python
-run: python -c "import os; [os.makedirs(path, exist_ok=True) for path in ['./tmp/basic/data-test', './tmp/arfi/data', './tmp/multimodel/data', './tmp/scripts', './tmp/synergy/data']]"
+run: python -c "import os; [os.makedirs(path, exist_ok=True) for path in ['./tmp/basic/data-test', './tmp/arfi/data', './tmp/prior/data', './tmp/multimodel/data', './tmp/scripts', './tmp/synergy/data']]"
- name: set up environment
run: |
cp .github/workflows/test_data/labels.csv ./tmp/basic/data-test/labels.csv
cp .github/workflows/test_data/labels.csv ./tmp/arfi/data/labels.csv
+cp .github/workflows/test_data/labels.csv ./tmp/prior/data/labels.csv
+cp .github/workflows/test_data/labels.csv ./tmp/prior/data/prior_labels.csv
cp .github/workflows/test_data/labels.csv ./tmp/multimodel/data/labels.csv
- name: Render makita templates
run: |
@@ -42,6 +44,9 @@
cd ../arfi
asreview makita template arfi | tee output.txt
grep -q "ERROR" output.txt && exit 1 || true
+cd ../prior
+asreview makita template prior | tee output.txt
+grep -q "ERROR" output.txt && exit 1 || true
cd ../multimodel
asreview makita template multimodel | tee output.txt
grep -q "ERROR" output.txt && exit 1 || true
44 changes: 44 additions & 0 deletions README.md
@@ -193,6 +208,8 @@ optional arguments:
--impossible_models IMPOSSIBLE_MODELS Model combinations to exclude. Default: ['nb,doc2vec', 'nb,sbert']
```

#### Example usage

If you want to specify certain combinations of classifiers and feature
extractors that should or should not be used, you can use the `--classifiers`,
`--feature_extractors`, `--query_strategies`, `--balance_strategies`, and `--impossible_models` options. For instance, if you
@@ -203,6 +205,48 @@ want to exclude the combinations of `nb` with `doc2vec` and `logistic` with
asreview makita template multimodel --classifiers logistic nb --feature_extractors tfidf doc2vec --query_strategies max max_random max_uncertainty cluster --impossible_models nb,doc2vec logistic,tfidf
```

### Prior template

command: `prior`

The prior template evaluates how large amounts of prior knowledge affect simulation performance. It expects two kinds of labeled datasets in the data folder: dataset(s) to be simulated and dataset(s) to be used as prior knowledge. The filenames of the datasets containing prior knowledge must start with the prefix `prior_` (for example, `prior_labels.csv`).
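
For example, a minimal data folder could look like this (filenames are illustrative; only the `prior_`/`priors_` prefix matters):

```console
data/
├── labels.csv          # dataset to be simulated
└── prior_labels.csv    # records to be used as prior knowledge
```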

The template runs two simulations: the first uses all records from the `prior_` dataset(s) as prior knowledge; the second uses one relevant and one irrelevant record (1+1), chosen at random from the non-prior dataset(s). Both runs simulate performance on the combined non-prior dataset(s).
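
Conceptually, the generated jobs file runs two `asreview simulate` calls along these lines (a sketch only; the exact commands, paths, and record indices come from the rendered template):

```console
# run 1: every record from the prior_ dataset(s) pinned as prior knowledge
asreview simulate generated_data/dataset_with_priors.csv --prior_idx 0 1 2 --init_seed 535

# run 2: 1+1 random prior records drawn from the non-prior records
asreview simulate generated_data/dataset_without_priors.csv --n_prior_included 1 --n_prior_excluded 1 --init_seed 535
```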

Running this template creates a `generated_data` folder containing two datasets: `dataset_with_priors.csv` and `dataset_without_priors.csv`. The simulations specified in the generated jobs file use these datasets.

optional arguments:

```console
-h, --help show this help message and exit
--job_file JOB_FILE, -f JOB_FILE The name of the file with jobs. Default jobs.bat for Windows, otherwise jobs.sh.
-s DATA_FOLDER Dataset folder
-o OUTPUT_FOLDER Output folder
--init_seed INIT_SEED Seed of the priors. Seed is set to 535 by default.
--model_seed MODEL_SEED Seed of the models. Seed is set to 165 by default.
--template TEMPLATE Overwrite template with template file path.
--platform PLATFORM Platform to run jobs: Windows, Darwin, Linux. Default: the system of rendering templates.
--n_runs N_RUNS Number of runs. Default: 1.
--skip_wordclouds Disables the generation of wordclouds.
--overwrite Automatically accepts all overwrite requests.
--classifier CLASSIFIER Classifier to use. Default: nb.
--feature_extractor FEATURE_EXTRACTOR Feature_extractor to use. Default: tfidf.
--query_strategy QUERY_STRATEGY Query strategy to use. Default: max.
--balance_strategy BALANCE_STRATEGY Balance strategy to use. Default: double.
--instances_per_query INSTANCES_PER_QUERY Number of instances per query. Default: 1.
--stop_if STOP_IF The number of label actions to simulate. Default 'min' will stop simulating when all relevant records are found.
```

#### Example usage

Put at least two datasets in the data folder: one starting with the `prior_` prefix, and one without it.

> Note: the `priors_` prefix also works.

```console
asreview makita template prior --classifier logistic --feature_extractor tfidf
```
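
After rendering, execute the generated jobs file to run both simulations (`jobs.sh` by default, `jobs.bat` on Windows; see `--job_file`):

```console
sh jobs.sh
```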

## Advanced usage

### Create and use custom templates
30 changes: 15 additions & 15 deletions asreviewcontrib/makita/entrypoint.py
@@ -69,78 +69,78 @@ def execute(self, argv): # noqa: C901
"--instances_per_query",
type=int,
default=ASREVIEW_CONFIG.DEFAULT_N_INSTANCES,
help="Number of instances per query. ",
help="Number of instances per query.",
)
parser_template.add_argument(
"--stop_if",
type=str,
default="min",
help="The number of label actions to simulate. ",
help="The number of label actions to simulate.",
)
parser_template.add_argument(
"--n_runs",
type=int,
help="Number of runs. Only for templates 'basic' and 'multimodel'. ",
help="Number of runs.",
)
parser_template.add_argument(
"--n_priors",
type=int,
help="Number of priors. Only for template 'arfi'.",
help="Number of priors.",
)
parser_template.add_argument(
"--skip_wordclouds",
action="store_true",
help="Disables the generation of wordclouds. ",
help="Disables the generation of wordclouds.",
)
parser_template.add_argument(
"--overwrite",
action="store_true",
help="Overwrite existing files in the output folder. ",
help="Overwrite existing files in the output folder.",
)
parser_template.add_argument(
"--classifier",
type=str,
help="Classifier to use. Only for template 'basic' and 'arfi'. ",
help="Classifier to use.",
)
parser_template.add_argument(
"--feature_extractor",
type=str,
help="Feature_extractor to use. Only for template 'basic' and 'arfi'. ",
help="Feature_extractor to use.",
)
parser_template.add_argument(
"--query_strategy",
type=str,
help="Query strategy to use. Only for template 'basic' and 'arfi'. ",
help="Query strategy to use.",
)
parser_template.add_argument(
"--balance_strategy",
type=str,
help="Balance strategy to use. Only for template 'basic' and 'arfi'. ",
help="Balance strategy to use.",
)
parser_template.add_argument(
"--classifiers",
nargs="+",
help="Classifiers to use. Only for template 'multimodel'. ",
help="Classifiers to use.",
)
parser_template.add_argument(
"--feature_extractors",
nargs="+",
help="Feature extractors to use. Only for template 'multimodel'. ",
help="Feature extractors to use.",
)
parser_template.add_argument(
"--query_strategies",
nargs="+",
help="Query strategies to use. Only for template 'multimodel'. ",
help="Query strategies to use.",
)
parser_template.add_argument(
"--balance_strategies",
nargs="+",
help="Balancing strategies to use. Only for template 'multimodel'. ",
help="Balancing strategies to use.",
)
parser_template.add_argument(
"--impossible_models",
nargs="+",
help="Model combinations to exclude. Only for template 'multimodel'.",
help="Model combinations to exclude.",
)

parser_template.set_defaults(func=self._template_cli)
184 changes: 184 additions & 0 deletions asreviewcontrib/makita/template_prior.py
@@ -0,0 +1,184 @@
import warnings
from pathlib import Path

import pandas as pd
from asreview import config as ASREVIEW_CONFIG
from asreview.data import load_data

from asreviewcontrib.makita.template_base import TemplateBase

# Suppress FutureWarning messages
warnings.simplefilter(action="ignore", category=FutureWarning)


class TemplatePrior(TemplateBase):
template_file = "template_prior.txt.template"

def __init__(
self,
classifier,
feature_extractor,
query_strategy,
n_runs,
**kwargs,
):
self.classifier = classifier
self.feature_extractor = feature_extractor
self.query_strategy = query_strategy
self.n_runs = n_runs
self.prior_makita_datasets = []
super().__init__(**kwargs)

self._prior_dataset_count = self._non_prior_dataset_count = 0

def get_dataset_specific_params(self, index, fp_dataset):
"""Prepare dataset-specific parameters. These parameters are provided to the
template once for each dataset."""

# Load the dataset using load_data
asreview_data = load_data(fp_dataset)

# Create a DataFrame with the desired columns: label, abstract, and title
dataset = pd.DataFrame(
{
"title": asreview_data.title,
"abstract": asreview_data.abstract,
"label": asreview_data.labels.astype(int),
}
)

# Add the 'makita_priors' column
if fp_dataset.name.startswith(("prior_", "priors_")):
dataset["makita_priors"] = 1
self._prior_dataset_count += 1
else:
dataset["makita_priors"] = 0
self._non_prior_dataset_count += 1

if -1 in dataset.label.values:
row = dataset.label[dataset.label.values == -1].index[0]
raise ValueError(
f"Dataset {fp_dataset} contains an unlabeled record at row {row}."
f"\nTitle: '{dataset.title[row]}'"
)

# Add the dataset to the list
self.prior_makita_datasets.append(dataset)

return {}

def get_template_specific_params(self, params):
"""Prepare template-specific parameters. These parameters are provided to the
template only once."""

classifier = (
self.classifier
if self.classifier is not None
else ASREVIEW_CONFIG.DEFAULT_MODEL
)
feature_extractor = (
self.feature_extractor
if self.feature_extractor is not None
else ASREVIEW_CONFIG.DEFAULT_FEATURE_EXTRACTION
)
query_strategy = (
self.query_strategy
if self.query_strategy is not None
else ASREVIEW_CONFIG.DEFAULT_QUERY_STRATEGY
)
balance_strategy = (
self.balance_strategy
if self.balance_strategy is not None
else ASREVIEW_CONFIG.DEFAULT_BALANCE_STRATEGY
)
n_runs = self.n_runs if self.n_runs is not None else 1

# Check if at least one dataset with prior knowledge is present
if self._prior_dataset_count == 0:
raise ValueError(
"At least one dataset with prior knowledge (prefix 'prior_' or \
'priors_') is required."
)

# Check if at least one dataset without prior knowledge is present
if self._non_prior_dataset_count == 0:
raise ValueError(
"At least one dataset without prior knowledge is required."
)

# Print the number of datasets with and without prior knowledge
print(f"\nTotal datasets with prior knowledge: {self._prior_dataset_count}")
print(
f"Total datasets without prior knowledge: {self._non_prior_dataset_count}"
)

# Create a directory for generated data if it doesn't already exist
generated_folder = Path("generated_data")
generated_folder.mkdir(parents=True, exist_ok=True)

# Set file paths for datasets with and without prior knowledge
filepath_with_priors = generated_folder / "dataset_with_priors.csv"
filepath_without_priors = generated_folder / "dataset_without_priors.csv"

# Combine all datasets into one DataFrame and remove rows where label is -1
combined_dataset = pd.concat(self.prior_makita_datasets, ignore_index=True)
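# (Defensive: get_dataset_specific_params already raises on unlabeled rows.)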
combined_dataset.drop(
combined_dataset[combined_dataset.label == -1].index, inplace=True
)

# Calculate the total number of rows with and without prior knowledge
total_rows_with_priors = combined_dataset[
combined_dataset["makita_priors"] == 1
].shape[0]
total_rows_without_priors = combined_dataset[
combined_dataset["makita_priors"] == 0
].shape[0]

# Print the number of rows with and without prior knowledge
print(f"Total rows of prior knowledge: {total_rows_with_priors}")
print(f"Total rows of non-prior knowledge: {total_rows_without_priors}")

# Save the combined dataset to the appropriate file paths
combined_dataset.to_csv(
filepath_with_priors, index=True, index_label="record_id"
)
combined_dataset[combined_dataset["makita_priors"] != 1].to_csv(
filepath_without_priors, index=True, index_label="record_id"
)

# Create a string of indices for rows with prior knowledge
prior_idx_list = combined_dataset[
combined_dataset["makita_priors"] == 1
].index.tolist()
if len(prior_idx_list) != total_rows_with_priors:
raise ValueError(
"prior_idx list is not equal in length to rows of prior \
knowledge"
)
prior_idx = " ".join(map(str, prior_idx_list))

return {
"classifier": classifier,
"feature_extractor": feature_extractor,
"query_strategy": query_strategy,
"balance_strategy": balance_strategy,
"n_runs": n_runs,
"datasets": params,
"skip_wordclouds": self.skip_wordclouds,
"instances_per_query": self.instances_per_query,
"stop_if": self.stop_if,
"output_folder": self.output_folder,
"scripts_folder": self.scripts_folder,
"version": self.__version__,
"model_seed": self.model_seed,
"init_seed": self.init_seed,
"filepath_with_priors": filepath_with_priors,
"filepath_with_priors_stem": filepath_with_priors.stem,
"filepath_without_priors": filepath_without_priors,
"filepath_without_priors_stem": filepath_without_priors.stem,
"prior_idx": prior_idx,
}
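
To make the index bookkeeping concrete, here is a minimal standalone sketch (toy data, not part of the template) of how concatenation yields the `prior_idx` string handed to the simulation:

```python
import pandas as pd

# Toy stand-ins for one prior_ dataset and one regular dataset.
prior = pd.DataFrame({"title": ["p1", "p2"], "label": [1, 0], "makita_priors": [1, 1]})
regular = pd.DataFrame({"title": ["r1", "r2", "r3"], "label": [0, 1, 0], "makita_priors": [0, 0, 0]})

# ignore_index=True assigns a fresh 0-based index, so row positions double
# as the record_id values written by to_csv in the template.
combined = pd.concat([prior, regular], ignore_index=True)

# Rows flagged as priors keep their positions in the combined frame;
# joining them gives the space-separated prior_idx string.
prior_idx = " ".join(map(str, combined[combined["makita_priors"] == 1].index))
print(prior_idx)  # -> "0 1"
```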