Warnings and error messages per #141 #142 #146 (#147)

* #141 note about memory in README * #141 warning about memory in the docs * #142 add warning messages if labels are all the same * Add error message when predicates are specified using only strings (includes ??? case) Closes #141, #142, and #146
justin13601 · Oct 25, 2024 · 2c57b85 · 2c57b85
1 parent c56e60b
commit 2c57b85
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -218,14 +218,16 @@ Fields for a "plain" predicate:
 
 - `code` (required): Must be one of the following:
   - a string with `//` sequence separating the column name and column value.
-  - a list of strings as above in the form of {any: \[???, ???, ...\]}, which will match any of the listed codes.
-  - a regex in the form of {regex: "???"}, which will match any code that matches that regular expression.
+  - a list of strings as above in the form of `{any: \[???, ???, ...\]}`, which will match any of the listed codes.
+  - a regex in the form of `{regex: "???"}`, which will match any code that matches that regular expression.
 - `value_min` (optional): Must be float or integer specifying the minimum value of the predicate, if the variable is presented as numerical values.
 - `value_max` (optional): Must be float or integer specifying the maximum value of the predicate, if the variable is presented as numerical values.
 - `value_min_inclusive` (optional): Must be a boolean specifying whether `value_min` is inclusive or not.
 - `value_max_inclusive` (optional): Must be a boolean specifying whether `value_max` is inclusive or not.
 - `other_cols` (optional): Must be a 1-to-1 dictionary of column name and column value, which places additional constraints on further columns.
 
+**Note**: For memory optimization, we strongly recommend using either the List of Values or Regular Expression formats whenever possible, especially when needing to match multiple values. Defining each code as an individual string will increase memory usage significantly, as each code generates a separate predicate column. Using a list or regex consolidates multiple matching codes under a single column, reducing the overall memory footprint.
+
 #### Derived Predicates
 
 "Derived" predicates combine existing "plain" predicates using `and` / `or` keywords and have exactly 1 required `expr` field: For instance, the following defines a predicate representing either death or discharge (by combining "plain" predicates of `death` and `discharge`):

diff --git a/docs/source/configuration.md b/docs/source/configuration.md
@@ -49,20 +49,29 @@ These configs consist of the following four fields:
   The field can additionally be a dictionary with either a `regex` key and the value being a regular
   expression (satisfied if the regular expression evaluates to True), or a `any` key and the value being a
   list of strings (satisfied if there is an occurrence for any code in the list).
+
+  **Note**: Each individual definition of `PlainPredicateConfig` and `code` will generate a separate predicate
+  column. Thus, for memory optimization, it is strongly recommended to match multiple values using either the
+  List of Values or Regular Expression formats whenever possible.
+
 - `value_min`: If specified, an observation will only satisfy this predicate if the occurrence of the
   underlying `code` with a reported numerical value that is either greater than or greater than or equal to
   `value_min` (with these options being decided on the basis of `value_min_inclusive`, where
   `value_min_inclusive=True` indicating that an observation satisfies this predicate if its value is greater
   than or equal to `value_min`, and `value_min_inclusive=False` indicating a greater than but not equal to
   will be used).
+
 - `value_max`: If specified, an observation will only satisfy this predicate if the occurrence of the
   underlying `code` with a reported numerical value that is either less than or less than or equal to
   `value_max` (with these options being decided on the basis of `value_max_inclusive`, where
   `value_max_inclusive=True` indicating that an observation satisfies this predicate if its value is less
   than or equal to `value_max`, and `value_max_inclusive=False` indicating a less than but not equal to
   will be used).
+
 - `value_min_inclusive`: See `value_min`
+
 - `value_max_inclusive`: See `value_max`
+
 - `other_cols`: This optional field accepts a 1-to-1 dictionary of column names to column values, and can be
   used to specify further constraints on other columns (ie., not `code`) for this predicate.
 

diff --git a/src/aces/config.py b/src/aces/config.py
@@ -1195,6 +1195,23 @@ def load(cls, config_path: str | Path, predicates_path: str | Path = None) -> Ta
                                                     start_inclusive=True, end_inclusive=True, has={},
                                                     label=None, index_timestamp=None)},
                                 label_window=None, index_timestamp_window=None)
+            >>> predicates_dict = {
+            ...     "metadata": {'description': 'A test predicates file'},
+            ...     "description": 'this is a test',
+            ...     "patient_demographics": {"brown_eyes": {"code": "eye_color//BR"}},
+            ...     "predicates": {'admission': "invalid"},
+            ... }
+            >>> with (tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as config_fp,
+            ...      tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as pred_fp):
+            ...     config_path = Path(config_fp.name)
+            ...     pred_path = Path(pred_fp.name)
+            ...     yaml.dump(no_predicates_config, config_fp)
+            ...     yaml.dump(predicates_dict, pred_fp)
+            ...     cfg = TaskExtractorConfig.load(config_path, pred_path) # doctest: +NORMALIZE_WHITESPACE
+            Traceback (most recent call last):
+                ...
+            ValueError: Predicate 'admission' is not defined correctly in the configuration file. Currently
+            defined as the string: invalid. Please refer to the documentation for the supported formats.
         """
         if isinstance(config_path, str):
             config_path = Path(config_path)
@@ -1295,6 +1312,12 @@ def load(cls, config_path: str | Path, predicates_path: str | Path = None) -> Ta
             if "expr" in p:
                 predicate_objs[n] = DerivedPredicateConfig(**p)
             else:
+                if isinstance(p, str):
+                    raise ValueError(
+                        f"Predicate '{n}' is not defined correctly in the configuration file. "
+                        f"Currently defined as the string: {p}. "
+                        "Please refer to the documentation for the supported formats."
+                    )
                 config_data = {k: v for k, v in p.items() if k in PlainPredicateConfig.__dataclass_fields__}
                 other_cols = {k: v for k, v in p.items() if k not in config_data}
                 predicate_objs[n] = PlainPredicateConfig(**config_data, other_cols=other_cols)

diff --git a/src/aces/query.py b/src/aces/query.py
@@ -4,6 +4,8 @@
 """
 
 
+from collections import Counter
+
 import polars as pl
 from bigtree import preorder_iter
 from loguru import logger
@@ -137,6 +139,20 @@ def query(cfg: TaskExtractorConfig, predicates_df: pl.DataFrame) -> pl.DataFrame
         )
         to_return_cols.insert(1, "label")
 
+        if result["label"].n_unique() == 1:
+            logger.warning(
+                f"All labels in the extracted cohort are the same: '{result['label'][0]}'. "
+                "This may indicate an issue with the task logic. "
+                "Please double-check your configuration file if this is not expected."
+            )
+        else:
+            unique_labels = result["label"].n_unique()
+            label_distribution = Counter(result["label"])
+            logger.info(
+                f"Found {unique_labels} unique labels in the extracted cohort: "
+                f"{dict(label_distribution)}."
+            )
+
     # add index_timestamp column if specified
     if cfg.index_timestamp_window:
         logger.info(