Add template to use and compare prior records from file (#62)
jteijema authored Sep 26, 2024
1 parent a52d639 commit 700888a
Showing 19 changed files with 19,741 additions and 34 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/ci-workflow.yml
@@ -28,11 +28,13 @@ jobs:
run: |
ruff check .
- name: Create directories using Python
-run: python -c "import os; [os.makedirs(path, exist_ok=True) for path in ['./tmp/basic/data-test', './tmp/arfi/data', './tmp/multimodel/data', './tmp/scripts', './tmp/synergy/data']]"
+run: python -c "import os; [os.makedirs(path, exist_ok=True) for path in ['./tmp/basic/data-test', './tmp/arfi/data', './tmp/prior/data', './tmp/multimodel/data', './tmp/scripts', './tmp/synergy/data']]"
- name: set up environment
run: |
cp .github/workflows/test_data/labels.csv ./tmp/basic/data-test/labels.csv
cp .github/workflows/test_data/labels.csv ./tmp/arfi/data/labels.csv
+cp .github/workflows/test_data/labels.csv ./tmp/prior/data/labels.csv
+cp .github/workflows/test_data/labels.csv ./tmp/prior/data/prior_labels.csv
cp .github/workflows/test_data/labels.csv ./tmp/multimodel/data/labels.csv
- name: Render makita templates
run: |
@@ -42,6 +44,9 @@
cd ../arfi
asreview makita template arfi | tee output.txt
grep -q "ERROR" output.txt && exit 1 || true
+cd ../prior
+asreview makita template prior | tee output.txt
+grep -q "ERROR" output.txt && exit 1 || true
cd ../multimodel
asreview makita template multimodel | tee output.txt
grep -q "ERROR" output.txt && exit 1 || true
44 changes: 44 additions & 0 deletions README.md
@@ -193,6 +208,8 @@ optional arguments:
--impossible_models IMPOSSIBLE_MODELS Model combinations to exclude. Default: ['nb,doc2vec', 'nb,sbert']
```

#### Example usage

If you want to specify certain combinations of classifiers and feature
extractors that should or should not be used, you can use the `--classifiers`,
`--feature_extractors`, `--query_strategies`, `--balance_strategies`, and `--impossible_models` options. For instance, if you
@@ -203,6 +205,48 @@ want to exclude the combinations of `nb` with `doc2vec` and `logistic` with
asreview makita template multimodel --classifiers logistic nb --feature_extractors tfidf doc2vec --query_strategies max max_random max_uncertainty cluster --impossible_models nb,doc2vec logistic,tfidf
```

### Prior template

command: `prior`

The prior template evaluates how large amounts of prior knowledge affect simulation performance. It expects two kinds of labeled datasets in the data folder: dataset(s) to be simulated and dataset(s) to be used as prior knowledge. The filenames of the datasets containing prior knowledge must start with the prefix `prior_` (for example, `prior_labels.csv`).
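
For example, a minimal data folder could look like this (filenames are illustrative; only the `prior_`/`priors_` prefix matters):

```console
data/
├── labels.csv          # dataset to be simulated
└── prior_labels.csv    # records to be used as prior knowledge
```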

The template runs two simulations: the first uses all records from the `prior_` dataset(s) as prior knowledge; the second uses one relevant and one irrelevant record (1+1), chosen at random from the non-prior dataset(s). Both runs simulate performance on the combined non-prior dataset(s).
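
Conceptually, the generated jobs file runs two `asreview simulate` calls along these lines (a sketch only; the exact commands, paths, and record indices come from the rendered template):

```console
# run 1: every record from the prior_ dataset(s) pinned as prior knowledge
asreview simulate generated_data/dataset_with_priors.csv --prior_idx 0 1 2 --init_seed 535

# run 2: 1+1 random prior records drawn from the non-prior records
asreview simulate generated_data/dataset_without_priors.csv --n_prior_included 1 --n_prior_excluded 1 --init_seed 535
```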

Running this template creates a `generated_data` folder containing two datasets: `dataset_with_priors.csv` and `dataset_without_priors.csv`. The simulations specified in the generated jobs file use these datasets.

optional arguments:

```console
-h, --help show this help message and exit
--job_file JOB_FILE, -f JOB_FILE The name of the file with jobs. Default jobs.bat for Windows, otherwise jobs.sh.
-s DATA_FOLDER Dataset folder
-o OUTPUT_FOLDER Output folder
--init_seed INIT_SEED Seed of the priors. Seed is set to 535 by default.
--model_seed MODEL_SEED Seed of the models. Seed is set to 165 by default.
--template TEMPLATE Overwrite template with template file path.
--platform PLATFORM Platform to run jobs: Windows, Darwin, Linux. Default: the system of rendering templates.
--n_runs N_RUNS Number of runs. Default: 1.
--skip_wordclouds Disables the generation of wordclouds.
--overwrite Automatically accepts all overwrite requests.
--classifier CLASSIFIER Classifier to use. Default: nb.
--feature_extractor FEATURE_EXTRACTOR Feature_extractor to use. Default: tfidf.
--query_strategy QUERY_STRATEGY Query strategy to use. Default: max.
--balance_strategy BALANCE_STRATEGY Balance strategy to use. Default: double.
--instances_per_query INSTANCES_PER_QUERY Number of instances per query. Default: 1.
--stop_if STOP_IF The number of label actions to simulate. Default 'min' will stop simulating when all relevant records are found.
```

#### Example usage

Put at least two datasets in the data folder: one starting with the `prior_` prefix, and one without it.

> Note: the `priors_` prefix also works.

```console
asreview makita template prior --classifier logistic --feature_extractor tfidf
```
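
After rendering, execute the generated jobs file to run both simulations (`jobs.sh` by default, `jobs.bat` on Windows; see `--job_file`):

```console
sh jobs.sh
```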

## Advanced usage

### Create and use custom templates
30 changes: 15 additions & 15 deletions asreviewcontrib/makita/entrypoint.py
@@ -69,78 +69,78 @@ def execute(self, argv): # noqa: C901
"--instances_per_query",
type=int,
default=ASREVIEW_CONFIG.DEFAULT_N_INSTANCES,
help="Number of instances per query. ",
help="Number of instances per query.",
)
parser_template.add_argument(
"--stop_if",
type=str,
default="min",
help="The number of label actions to simulate. ",
help="The number of label actions to simulate.",
)
parser_template.add_argument(
"--n_runs",
type=int,
help="Number of runs. Only for templates 'basic' and 'multimodel'. ",
help="Number of runs.",
)
parser_template.add_argument(
"--n_priors",
type=int,
help="Number of priors. Only for template 'arfi'.",
help="Number of priors.",
)
parser_template.add_argument(
"--skip_wordclouds",
action="store_true",
help="Disables the generation of wordclouds. ",
help="Disables the generation of wordclouds.",
)
parser_template.add_argument(
"--overwrite",
action="store_true",
help="Overwrite existing files in the output folder. ",
help="Overwrite existing files in the output folder.",
)
parser_template.add_argument(
"--classifier",
type=str,
help="Classifier to use. Only for template 'basic' and 'arfi'. ",
help="Classifier to use.",
)
parser_template.add_argument(
"--feature_extractor",
type=str,
help="Feature_extractor to use. Only for template 'basic' and 'arfi'. ",
help="Feature_extractor to use.",
)
parser_template.add_argument(
"--query_strategy",
type=str,
help="Query strategy to use. Only for template 'basic' and 'arfi'. ",
help="Query strategy to use.",
)
parser_template.add_argument(
"--balance_strategy",
type=str,
help="Balance strategy to use. Only for template 'basic' and 'arfi'. ",
help="Balance strategy to use.",
)
parser_template.add_argument(
"--classifiers",
nargs="+",
help="Classifiers to use. Only for template 'multimodel'. ",
help="Classifiers to use.",
)
parser_template.add_argument(
"--feature_extractors",
nargs="+",
help="Feature extractors to use. Only for template 'multimodel'. ",
help="Feature extractors to use.",
)
parser_template.add_argument(
"--query_strategies",
nargs="+",
help="Query strategies to use. Only for template 'multimodel'. ",
help="Query strategies to use.",
)
parser_template.add_argument(
"--balance_strategies",
nargs="+",
help="Balancing strategies to use. Only for template 'multimodel'. ",
help="Balancing strategies to use.",
)
parser_template.add_argument(
"--impossible_models",
nargs="+",
help="Model combinations to exclude. Only for template 'multimodel'.",
help="Model combinations to exclude.",
)

parser_template.set_defaults(func=self._template_cli)
184 changes: 184 additions & 0 deletions asreviewcontrib/makita/template_prior.py
@@ -0,0 +1,184 @@
import warnings
from pathlib import Path

import pandas as pd
from asreview import config as ASREVIEW_CONFIG
from asreview.data import load_data

from asreviewcontrib.makita.template_base import TemplateBase

# Suppress FutureWarning messages
warnings.simplefilter(action="ignore", category=FutureWarning)


class TemplatePrior(TemplateBase):
template_file = "template_prior.txt.template"

def __init__(
self,
classifier,
feature_extractor,
query_strategy,
n_runs,
**kwargs,
):
self.classifier = classifier
self.feature_extractor = feature_extractor
self.query_strategy = query_strategy
self.n_runs = n_runs
self.prior_makita_datasets = []
super().__init__(**kwargs)

self._prior_dataset_count = self._non_prior_dataset_count = 0

def get_dataset_specific_params(self, index, fp_dataset):
"""Prepare dataset-specific parameters. These parameters are provided to the
template once for each dataset."""

# Load the dataset using load_data
asreview_data = load_data(fp_dataset)

# Create a DataFrame with the desired columns: label, abstract, and title
dataset = pd.DataFrame(
{
"title": asreview_data.title,
"abstract": asreview_data.abstract,
"label": asreview_data.labels.astype(int),
}
)

# Add the 'makita_priors' column
if fp_dataset.name.startswith(("prior_", "priors_")):
dataset["makita_priors"] = 1
self._prior_dataset_count += 1
else:
dataset["makita_priors"] = 0
self._non_prior_dataset_count += 1

if -1 in dataset.label.values:
row = dataset.label[dataset.label.values == -1].index[0]
raise ValueError(
f"Dataset {fp_dataset} contains an unlabeled record at row {row}."
f"\nTitle: '{dataset.title[row]}'"
)

# Add the dataset to the list
self.prior_makita_datasets.append(dataset)

return {}

def get_template_specific_params(self, params):
"""Prepare template-specific parameters. These parameters are provided to the
template only once."""

classifier = (
self.classifier
if self.classifier is not None
else ASREVIEW_CONFIG.DEFAULT_MODEL
)
feature_extractor = (
self.feature_extractor
if self.feature_extractor is not None
else ASREVIEW_CONFIG.DEFAULT_FEATURE_EXTRACTION
)
query_strategy = (
self.query_strategy
if self.query_strategy is not None
else ASREVIEW_CONFIG.DEFAULT_QUERY_STRATEGY
)
balance_strategy = (
self.balance_strategy
if self.balance_strategy is not None
else ASREVIEW_CONFIG.DEFAULT_BALANCE_STRATEGY
)
n_runs = self.n_runs if self.n_runs is not None else 1

# Check if at least one dataset with prior knowledge is present
if self._prior_dataset_count == 0:
raise ValueError(
"At least one dataset with prior knowledge (prefix 'prior_' or \
'priors_') is required."
)

# Check if at least one dataset without prior knowledge is present
if self._non_prior_dataset_count == 0:
raise ValueError(
"At least one dataset without prior knowledge is required."
)

# Print the number of datasets with and without prior knowledge
print(f"\nTotal datasets with prior knowledge: {self._prior_dataset_count}")
print(
f"Total datasets without prior knowledge: {self._non_prior_dataset_count}"
)

# Create a directory for generated data if it doesn't already exist
generated_folder = Path("generated_data")
generated_folder.mkdir(parents=True, exist_ok=True)

# Set file paths for datasets with and without prior knowledge
filepath_with_priors = generated_folder / "dataset_with_priors.csv"
filepath_without_priors = generated_folder / "dataset_without_priors.csv"

# Combine all datasets into one DataFrame and remove rows where label is -1
combined_dataset = pd.concat(self.prior_makita_datasets, ignore_index=True)
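# (Defensive: get_dataset_specific_params already raises on unlabeled rows.)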
combined_dataset.drop(
combined_dataset[combined_dataset.label == -1].index, inplace=True
)

# Calculate the total number of rows with and without prior knowledge
total_rows_with_priors = combined_dataset[
combined_dataset["makita_priors"] == 1
].shape[0]
total_rows_without_priors = combined_dataset[
combined_dataset["makita_priors"] == 0
].shape[0]

# Print the number of rows with and without prior knowledge
print(f"Total rows of prior knowledge: {total_rows_with_priors}")
print(f"Total rows of non-prior knowledge: {total_rows_without_priors}")

# Save the combined dataset to the appropriate file paths
combined_dataset.to_csv(
filepath_with_priors, index=True, index_label="record_id"
)
combined_dataset[combined_dataset["makita_priors"] != 1].to_csv(
filepath_without_priors, index=True, index_label="record_id"
)

# Create a string of indices for rows with prior knowledge
prior_idx_list = combined_dataset[
combined_dataset["makita_priors"] == 1
].index.tolist()
if len(prior_idx_list) != total_rows_with_priors:
raise ValueError(
"prior_idx list is not equal in length to rows of prior \
knowledge"
)
prior_idx = " ".join(map(str, prior_idx_list))

return {
"classifier": classifier,
"feature_extractor": feature_extractor,
"query_strategy": query_strategy,
"balance_strategy": balance_strategy,
"n_runs": n_runs,
"datasets": params,
"skip_wordclouds": self.skip_wordclouds,
"instances_per_query": self.instances_per_query,
"stop_if": self.stop_if,
"output_folder": self.output_folder,
"scripts_folder": self.scripts_folder,
"version": self.__version__,
"model_seed": self.model_seed,
"init_seed": self.init_seed,
"filepath_with_priors": filepath_with_priors,
"filepath_with_priors_stem": filepath_with_priors.stem,
"filepath_without_priors": filepath_without_priors,
"filepath_without_priors_stem": filepath_without_priors.stem,
"prior_idx": prior_idx,
}
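
To make the index bookkeeping concrete, here is a minimal standalone sketch (toy data, not part of the template) of how concatenation yields the `prior_idx` string handed to the simulation:

```python
import pandas as pd

# Toy stand-ins for one prior_ dataset and one regular dataset.
prior = pd.DataFrame({"title": ["p1", "p2"], "label": [1, 0], "makita_priors": [1, 1]})
regular = pd.DataFrame({"title": ["r1", "r2", "r3"], "label": [0, 1, 0], "makita_priors": [0, 0, 0]})

# ignore_index=True assigns a fresh 0-based index, so row positions double
# as the record_id values written by to_csv in the template.
combined = pd.concat([prior, regular], ignore_index=True)

# Rows flagged as priors keep their positions in the combined frame;
# joining them gives the space-separated prior_idx string.
prior_idx = " ".join(map(str, combined[combined["makita_priors"] == 1].index))
print(prior_idx)  # -> "0 1"
```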