diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b19cd2..fe96d88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,18 @@ # Changelog All notable changes to this project will be documented in this file. -## [Unreleased] +## [0.13.0] - 2018-07-25 +### Fixed +- Crash while computing metrics when either actual or predicted intent is unknown + +### Removed +- APIs depending implicitely on Snips NLU: + - `compute_cross_val_nlu_metrics` + - `compute_train_test_nlu_metrics` + +### Changed +- Use flexible version specifiers for dependencies + ## [0.12.0] - 2018-03-29 ### Added @@ -10,5 +21,6 @@ All notable changes to this project will be documented in this file. - New option to exclude slot metrics in the output - Samples -[Unreleased]: https://github.com/snipsco/snips-nlu-metrics/compare/0.12.0...HEAD + +[0.13.0]: https://github.com/snipsco/snips-nlu-metrics/compare/0.12.0...0.13.0 [0.12.0]: https://github.com/snipsco/snips-nlu-metrics/compare/0.11.1...0.12.0 \ No newline at end of file diff --git a/README.rst b/README.rst index 29d123a..72f2283 100644 --- a/README.rst +++ b/README.rst @@ -22,7 +22,7 @@ Install .. code-block:: console - pip install snips_nlu_metrics + $ pip install snips_nlu_metrics NLU Metrics API @@ -40,24 +40,64 @@ The metrics output (json) provides detailed information about: * parsing errors * `confusion matrix`_ +Data +---- + +Some sample datasets, that can be used to compute metrics, are available +`here `_. Alternatively, you can create your own dataset either by +using ``snips-nlu``'s `dataset generation tool`_ or by going on the +`Snips console`_. + Examples -------- -The Snips NLU metrics library can be used either with `Snips NLU`_ or with a -custom intent parsing pipeline. +The Snips NLU metrics library can be used with any NLU pipeline which satisfies +the ``Engine`` API: + +.. code-block:: python + + from builtins import object + + class Engine(object): + def fit(self, dataset): + # Perform training ... + return self + + def parse(self, text): + # extract intent and slots ... + return { + "input": text, + "intent": { + "intentName": intent_name, + "probability": probability + }, + "slots": slots + } + ---------------- Snips NLU Engine ---------------- -Here is how you can use the metrics API to compute metrics for the Snips NLU -pipeline: +This library can be used to benchmark NLU solutions such as `Snips NLU`_. To +install the ``snips-nlu`` python library, and fetch the language resources for +english, run the following commands: + +.. code-block:: bash + + $ pip install snips-nlu + $ snips-nlu download en + + +Then, you can compute metrics for the ``snips-nlu`` pipeline using the metrics +API as follows: .. code-block:: python - from snips_nlu import SnipsNLUEngine + from snips_nlu import load_resources, SnipsNLUEngine from snips_nlu_metrics import compute_train_test_metrics, compute_cross_val_metrics + load_resources("en") tt_metrics = compute_train_test_metrics(train_dataset="samples/train_dataset.json", test_dataset="samples/test_dataset.json", @@ -67,16 +107,6 @@ pipeline: engine_class=SnipsNLUEngine, nb_folds=5) -Some `sample code and datasets `_ are also available, you can have an -overview of the metrics output by running: - -.. code-block:: bash - - git clone https://github.com/snipsco/snips-nlu-metrics.git - cd snips-nlu-metrics - pip install -e ".[samples]" - python samples/sample.py train-test - ----------------- Custom NLU Engine ----------------- @@ -128,4 +158,6 @@ This library is provided by `Snips `_ as Open Source softw .. 
_train/test: https://en.wikipedia.org/wiki/Training,_test,_and_validation_sets .. _Snips NLU: https://github.com/snipsco/snips-nlu .. _precision, recall and f1 scores: https://en.wikipedia.org/wiki/Precision_and_recall -.. _confusion matrix: https://en.wikipedia.org/wiki/Confusion_matrix \ No newline at end of file +.. _confusion matrix: https://en.wikipedia.org/wiki/Confusion_matrix +.. _dataset generation tool: http://snips-nlu.readthedocs.io/en/latest/tutorial.html#snips-dataset-format +.. _Snips console: https://console.snips.ai \ No newline at end of file diff --git a/samples/__init__.py b/samples/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/samples/sample.py b/samples/sample.py deleted file mode 100644 index 49deedc..0000000 --- a/samples/sample.py +++ /dev/null @@ -1,48 +0,0 @@ -import argparse -import json -import os -import sys - -from snips_nlu import SnipsNLUEngine, load_resources - -from snips_nlu_metrics import (compute_train_test_metrics, - compute_cross_val_metrics) - -SAMPLES_DIR = os.path.dirname(os.path.abspath(__file__)) -TRAIN_DATASET_PATH = os.path.join(SAMPLES_DIR, "train_dataset.json") -TEST_DATASET_PATH = os.path.join(SAMPLES_DIR, "test_dataset.json") -CROSS_VAL_DATASET_PATH = os.path.join(SAMPLES_DIR, "cross_val_dataset.json") - - -def compute_sample_train_test_metrics(): - load_resources("en") - return compute_train_test_metrics( - train_dataset=TRAIN_DATASET_PATH, - test_dataset=TEST_DATASET_PATH, - engine_class=SnipsNLUEngine) - - -def compute_sample_cross_val_metrics(): - load_resources("en") - return compute_cross_val_metrics(dataset=CROSS_VAL_DATASET_PATH, - engine_class=SnipsNLUEngine, - nb_folds=5) - - -def main_metrics(): - parser = argparse.ArgumentParser( - description="Compute sample metrics on the Snips NLU parsing pipeline") - parser.add_argument("metrics_type", type=str, - choices=["train-test", "cross-val"], - metavar="metrics_type", - help="Type of metrics to compute") - args = parser.parse_args(sys.argv[1:]) - if args.metrics_type == "train_test": - metrics = compute_sample_train_test_metrics() - else: - metrics = compute_sample_cross_val_metrics() - print(json.dumps(metrics, indent=2)) - - -if __name__ == '__main__': - main_metrics() diff --git a/setup.py b/setup.py index b890fb4..624f9e4 100644 --- a/setup.py +++ b/setup.py @@ -19,17 +19,14 @@ install_requires = [ "future", - "numpy==1.14.0", - "scipy==1.0.0", - "scikit-learn==0.19.1", + "numpy>=1.7,<2.0", + "scipy>=1.0,<2.0", + "scikit-learn>=0.19,<0.20", ] extras_require = { "test": [ - "mock==2.0.0", - ], - "samples": [ - "snips-nlu==0.12.1" + "mock>=2.0,<3.0", ] } diff --git a/snips_nlu_metrics/__init__.py b/snips_nlu_metrics/__init__.py index 1cf231f..9947a1d 100644 --- a/snips_nlu_metrics/__init__.py +++ b/snips_nlu_metrics/__init__.py @@ -2,6 +2,4 @@ from snips_nlu_metrics.engine import Engine from snips_nlu_metrics.metrics import (compute_train_test_metrics, - compute_train_test_nlu_metrics, - compute_cross_val_metrics, - compute_cross_val_nlu_metrics) + compute_cross_val_metrics) diff --git a/snips_nlu_metrics/__version__ b/snips_nlu_metrics/__version__ index d33c3a2..51de330 100644 --- a/snips_nlu_metrics/__version__ +++ b/snips_nlu_metrics/__version__ @@ -1 +1 @@ -0.12.0 \ No newline at end of file +0.13.0 \ No newline at end of file diff --git a/snips_nlu_metrics/engine.py b/snips_nlu_metrics/engine.py index 6df3f51..293a37e 100644 --- a/snips_nlu_metrics/engine.py +++ b/snips_nlu_metrics/engine.py @@ -1,20 +1,10 @@ from __future__ import unicode_literals 
-import io -import json -import os -import zipfile from abc import ABCMeta, abstractmethod from builtins import object -from builtins import str -from copy import deepcopy from future.utils import with_metaclass -from snips_nlu_metrics.utils.temp_utils import tempdir_ctx - -TRAINED_ENGINE_FILENAME = "trained_assistant.json" - class Engine(with_metaclass(ABCMeta, object)): """Abstract class which represents an engine that can be used in the @@ -28,50 +18,3 @@ def fit(self, dataset): @abstractmethod def parse(self, text): pass - - -def build_nlu_engine_class(training_class, inference_class, - training_config=None): - _training_config = deepcopy(training_config) - - class NLUEngine(Engine): - def __init__(self): - self.inference_engine = None - self.training_config = _training_config - - def fit(self, dataset): - if self.training_config is not None: - training_engine = training_class(config=self.training_config) - else: - training_engine = training_class() - training_engine.fit(dataset) - trained_engine_dict = training_engine.to_dict() - self.inference_engine = get_inference_nlu_engine( - trained_engine_dict, inference_class) - - def parse(self, text): - return self.inference_engine.parse(text) - - return NLUEngine - - -def get_trained_nlu_engine(train_dataset, training_engine_class): - language = train_dataset["language"] - engine = training_engine_class(language) - engine.fit(train_dataset) - return engine - - -def get_inference_nlu_engine(trained_engine_dict, inference_engine_class): - with tempdir_ctx() as engine_dir: - trained_engine_path = os.path.join(engine_dir, TRAINED_ENGINE_FILENAME) - archive_path = os.path.join(engine_dir, 'assistant.zip') - - with io.open(trained_engine_path, mode='w', encoding='utf8') as f: - f.write(str(json.dumps(trained_engine_dict))) - with zipfile.ZipFile(archive_path, 'w') as zf: - zf.write(trained_engine_path, arcname=TRAINED_ENGINE_FILENAME) - with io.open(archive_path, mode='rb') as f: - data_zip = bytearray(f.read()) - - return inference_engine_class(data_zip=data_zip) diff --git a/snips_nlu_metrics/metrics.py b/snips_nlu_metrics/metrics.py index 30c1a4c..b46f263 100644 --- a/snips_nlu_metrics/metrics.py +++ b/snips_nlu_metrics/metrics.py @@ -1,65 +1,17 @@ -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals +from __future__ import division, print_function, unicode_literals import io import json from past.builtins import basestring -from snips_nlu_metrics.engine import build_nlu_engine_class from snips_nlu_metrics.utils.constants import ( - INTENTS, UTTERANCES, INTENT_UTTERANCES, PARSING_ERRORS, METRICS, - CONFUSION_MATRIX) + CONFUSION_MATRIX, INTENTS, INTENT_UTTERANCES, METRICS, PARSING_ERRORS, + UTTERANCES) from snips_nlu_metrics.utils.exception import NotEnoughDataError from snips_nlu_metrics.utils.metrics_utils import ( - create_shuffle_stratified_splits, compute_engine_metrics, - aggregate_metrics, compute_precision_recall_f1, aggregate_matrices) - - -def compute_cross_val_nlu_metrics(dataset, training_engine_class, - inference_engine_class, nb_folds=5, - train_size_ratio=1.0, - drop_entities=False, - include_slot_metrics=True, - slot_matching_lambda=None, - progression_handler=None): - """Compute pure NLU metrics on the dataset using cross validation - - Args: - dataset (dict or str): Dataset or path to dataset - training_engine_class: Python class to use for training - inference_engine_class: Python class to use for inference - nb_folds (int, optional): Number of folds to use for cross 
validation - train_size_ratio (float, optional): Ratio of intent utterances to use - for training - drop_entities (bool, false): Specify whether not all entity values - should be removed from training data - include_slot_metrics (bool, true): If false, the slots metrics and the - slots parsing errors will not be reported. - slot_matching_lambda (lambda, optional): - lambda expected_slot, actual_slot -> bool, - if defined, this function will be use to match slots when computing - metrics, otherwise exact match will be used. - `expected_slot` corresponds to the slot as defined in the dataset, - and `actual_slot` corresponds to the slot as returned by the NLU - progression_handler (lambda, optional): handler called at each - progression (%) step - - Returns - dict: Metrics results containing the following data - - - "metrics": the computed metrics - - "parsing_errors": the list of parsing errors - - """ - engine_class = build_nlu_engine_class(training_engine_class, - inference_engine_class) - return compute_cross_val_metrics(dataset, engine_class, nb_folds, - train_size_ratio, drop_entities, - include_slot_metrics, - slot_matching_lambda, - progression_handler) + aggregate_matrices, aggregate_metrics, compute_engine_metrics, + compute_precision_recall_f1, create_shuffle_stratified_splits) def compute_cross_val_metrics(dataset, engine_class, nb_folds=5, @@ -144,42 +96,6 @@ class must inherit from `Engine` } -def compute_train_test_nlu_metrics(train_dataset, test_dataset, - training_engine_class, - inference_engine_class, - include_slot_metrics=True, - slot_matching_lambda=None): - """Compute pure NLU metrics on `test_dataset` after having trained on - `train_dataset` - - Args - train_dataset (dict or str): Dataset or path to dataset used for - training - test_dataset (dict or str): Dataset or path to dataset used for testing - training_engine_class: Python class to use for training - inference_engine_class: Python class to use for inference - include_slot_metrics (bool, true): If false, the slots metrics and the - slots parsing errors will not be reported. - slot_matching_lambda (lambda, optional): - lambda expected_slot, actual_slot -> bool, - if defined, this function will be use to match slots when computing - metrics, otherwise exact match will be used. 
- `expected_slot` corresponds to the slot as defined in the dataset, - and `actual_slot` corresponds to the slot as returned by the NLU - - Returns - dict: Metrics results containing the following data - - - "metrics": the computed metrics - - "parsing_errors": the list of parsing errors - """ - engine_class = build_nlu_engine_class(training_engine_class, - inference_engine_class) - return compute_train_test_metrics(train_dataset, test_dataset, - engine_class, include_slot_metrics, - slot_matching_lambda) - - def compute_train_test_metrics(train_dataset, test_dataset, engine_class, include_slot_metrics=True, slot_matching_lambda=None): diff --git a/snips_nlu_metrics/tests/engine_config.py b/snips_nlu_metrics/tests/engine_config.py deleted file mode 100644 index f79b52c..0000000 --- a/snips_nlu_metrics/tests/engine_config.py +++ /dev/null @@ -1,163 +0,0 @@ -from __future__ import unicode_literals - -NLU_CONFIG = { - "unit_name": "nlu_engine", - "intent_parsers_configs": [ - { - "unit_name": "deterministic_intent_parser", - "max_queries": 50, - "max_entities": 200 - }, - { - "unit_name": "probabilistic_intent_parser", - "intent_classifier_config": { - "data_augmentation_config": { - "min_utterances": 20, - "unknown_words_replacement_string": None, - "noise_factor": 5, - "unknown_word_prob": 0 - }, - "unit_name": "log_reg_intent_classifier", - "featurizer_config": { - "sublinear_tf": False - }, - "random_seed": None, - "log_reg_args": { - "penalty": "l2", - "loss": "log", - "n_iter": 5, - "n_jobs": -1, - "class_weight": "balanced" - } - }, - "slot_filler_config": { - "data_augmentation_config": { - "capitalization_ratio": 0.2, - "min_utterances": 200 - }, - "unit_name": "crf_slot_filler", - "entities_offsets": [ - -2, - -1, - 0 - ], - "crf_args": { - "c2": 0.1, - "c1": 0.1, - "algorithm": "lbfgs" - }, - "tagging_scheme": 1, - "random_seed": None, - "feature_factory_configs": [ - { - "args": { - "common_words_gazetteer_name": None, - "use_stemming": True, - "n": 1 - }, - "factory_name": "ngram", - "offsets": [ - -2, - -1, - 0, - 1, - 2 - ] - }, - { - "args": { - "common_words_gazetteer_name": None, - "use_stemming": True, - "n": 2 - }, - "factory_name": "ngram", - "offsets": [ - -2, - 1 - ] - }, - { - "args": {}, - "factory_name": "is_digit", - "offsets": [ - -1, - 0, - 1 - ] - }, - { - "args": {}, - "factory_name": "is_first", - "offsets": [ - -2, - -1, - 0 - ] - }, - { - "args": {}, - "factory_name": "is_last", - "offsets": [ - 0, - 1, - 2 - ] - }, - { - "args": { - "n": 1 - }, - "factory_name": "shape_ngram", - "offsets": [ - 0 - ] - }, - { - "args": { - "n": 2 - }, - "factory_name": "shape_ngram", - "offsets": [ - -1, - 0 - ] - }, - { - "args": { - "n": 3 - }, - "factory_name": "shape_ngram", - "offsets": [ - -1 - ] - }, - { - "args": { - "tagging_scheme_code": 2, - "use_stemming": True - }, - "factory_name": "entity_match", - "drop_out": 0.1, - "offsets": [ - -2, - -1, - 0 - ] - }, - { - "args": { - "tagging_scheme_code": 1 - }, - "factory_name": "builtin_entity_match", - "offsets": [ - -2, - -1, - 0 - ] - } - ], - "exhaustive_permutations_threshold": 64 - } - } - ] -} diff --git a/snips_nlu_metrics/tests/mock_engine.py b/snips_nlu_metrics/tests/mock_engine.py index c39277c..22ad3f5 100644 --- a/snips_nlu_metrics/tests/mock_engine.py +++ b/snips_nlu_metrics/tests/mock_engine.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -from builtins import object - from snips_nlu_metrics import Engine @@ -13,26 +11,6 @@ def dummy_parsing_result(text): } -class MockTrainingEngine(object): - 
def __init__(self, config=None): - self.training_config = config - self.fitted = False - - def fit(self, dataset): - self.fitted = True - - def to_dict(self): - return dict() - - -class MockInferenceEngine(object): - def __init__(self, data_zip): - pass - - def parse(self, text): - return dummy_parsing_result(text) - - class MockEngine(Engine): def __init__(self): self.fitted = False diff --git a/snips_nlu_metrics/tests/test_metrics.py b/snips_nlu_metrics/tests/test_metrics.py index 672b45d..dfe71ad 100644 --- a/snips_nlu_metrics/tests/test_metrics.py +++ b/snips_nlu_metrics/tests/test_metrics.py @@ -3,33 +3,27 @@ import os import unittest -from mock import patch - -from snips_nlu_metrics.engine import build_nlu_engine_class from snips_nlu_metrics.metrics import (compute_cross_val_metrics, - compute_train_test_metrics, - compute_cross_val_nlu_metrics, - compute_train_test_nlu_metrics) -from snips_nlu_metrics.tests.engine_config import NLU_CONFIG -from snips_nlu_metrics.tests.mock_engine import (MockTrainingEngine, - MockInferenceEngine) + compute_train_test_metrics) +from snips_nlu_metrics.tests.mock_engine import MockEngine from snips_nlu_metrics.utils.constants import METRICS, PARSING_ERRORS class TestMetrics(unittest.TestCase): - def test_cross_val_nlu_metrics(self): + def test_compute_cross_val_metrics(self): # Given dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "beverage_dataset.json") - # When + with io.open(dataset_path, encoding="utf8") as f: + dataset = json.load(f) + + # When/Then try: - res = compute_cross_val_nlu_metrics( - dataset=dataset_path, training_engine_class=MockTrainingEngine, - inference_engine_class=MockInferenceEngine, nb_folds=2) + res = compute_cross_val_metrics( + dataset=dataset, engine_class=MockEngine, nb_folds=2) except Exception as e: self.fail(e.args[0]) - # Then expected_metrics = { "null": { "intent": { @@ -97,24 +91,7 @@ def test_cross_val_nlu_metrics(self): self.assertDictEqual(expected_metrics, res["metrics"]) - def test_cross_val_metrics_should_skip_when_not_enough_data(self): - # Given - dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "resources", "beverage_dataset.json") - - # When - result = compute_cross_val_nlu_metrics( - dataset=dataset_path, training_engine_class=MockTrainingEngine, - inference_engine_class=MockInferenceEngine, nb_folds=11) - - # Then - expected_result = { - METRICS: None, - PARSING_ERRORS: [] - } - self.assertDictEqual(expected_result, result) - - def test_end_to_end_cross_val_metrics(self): + def test_compute_cross_val_metrics_without_slot_metrics(self): # Given dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "beverage_dataset.json") @@ -123,35 +100,67 @@ def test_end_to_end_cross_val_metrics(self): # When/Then try: - engine_class = build_nlu_engine_class(MockTrainingEngine, - MockInferenceEngine) - compute_cross_val_metrics(dataset=dataset, - engine_class=engine_class, nb_folds=5) + res = compute_cross_val_metrics( + dataset=dataset, engine_class=MockEngine, nb_folds=2, + include_slot_metrics=False) except Exception as e: self.fail(e.args[0]) - @patch("snips_nlu_metrics.metrics.compute_train_test_metrics") - def test_train_test_nlu_metrics(self, mocked_train_test_metrics): + expected_metrics = { + "null": { + "intent": { + "true_positive": 0, + "false_positive": 11, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "intent_utterances": 0 + }, + "MakeCoffee": { + "intent": { + "true_positive": 0, 
+ "false_positive": 0, + "false_negative": 7, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "intent_utterances": 7 + }, + "MakeTea": { + "intent": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 4, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "intent_utterances": 4 + } + } + + self.assertDictEqual(expected_metrics, res["metrics"]) + + def test_cross_val_metrics_should_skip_when_not_enough_data(self): # Given - mocked_metrics_result = {"metrics": "ok"} - mocked_train_test_metrics.return_value = mocked_metrics_result dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "beverage_dataset.json") - with io.open(dataset_path, encoding="utf8") as f: - dataset = json.load(f) - # When/Then - try: - res = compute_train_test_nlu_metrics( - train_dataset=dataset, test_dataset=dataset, - training_engine_class=MockTrainingEngine, - inference_engine_class=MockInferenceEngine) - except Exception as e: - self.fail(e.args[0]) + # When + result = compute_cross_val_metrics( + dataset=dataset_path, engine_class=MockEngine, nb_folds=11) - self.assertDictEqual(mocked_metrics_result, res) + # Then + expected_result = { + METRICS: None, + PARSING_ERRORS: [] + } + self.assertDictEqual(expected_result, result) - def test_end_to_end_train_test_metrics(self): + def test_compute_train_test_metrics(self): # Given dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "beverage_dataset.json") @@ -160,15 +169,78 @@ def test_end_to_end_train_test_metrics(self): # When/Then try: - engine_class = build_nlu_engine_class(MockTrainingEngine, - MockInferenceEngine) - compute_train_test_metrics( + res = compute_train_test_metrics( train_dataset=dataset, test_dataset=dataset, - engine_class=engine_class) + engine_class=MockEngine) except Exception as e: self.fail(e.args[0]) - def test_end_to_end_train_test_metrics_with_training_config(self): + expected_metrics = { + "MakeCoffee": { + "intent": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 7, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "slots": { + "number_of_cups": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + } + }, + "intent_utterances": 7 + }, + "null": { + "intent": { + "true_positive": 0, + "false_positive": 11, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0}, + "slots": {}, + "intent_utterances": 0 + }, "MakeTea": { + "intent": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 4, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "slots": { + "number_of_cups": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "beverage_temperature": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + } + }, + "intent_utterances": 4 + } + } + + self.assertDictEqual(expected_metrics, res["metrics"]) + + def test_compute_train_test_metrics_without_slots_metrics(self): # Given dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "beverage_dataset.json") @@ -177,11 +249,44 @@ def test_end_to_end_train_test_metrics_with_training_config(self): # When/Then try: - engine_class = build_nlu_engine_class(MockTrainingEngine, - MockInferenceEngine, - training_config=NLU_CONFIG) - compute_train_test_metrics( + res = compute_train_test_metrics( train_dataset=dataset, 
test_dataset=dataset, - engine_class=engine_class) + engine_class=MockEngine, include_slot_metrics=False) except Exception as e: self.fail(e.args[0]) + + expected_metrics = { + "MakeCoffee": { + "intent": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 7, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "intent_utterances": 7 + }, + "null": { + "intent": { + "true_positive": 0, + "false_positive": 11, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0}, + "intent_utterances": 0 + }, "MakeTea": { + "intent": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 4, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "intent_utterances": 4 + } + } + + self.assertDictEqual(expected_metrics, res["metrics"]) diff --git a/snips_nlu_metrics/tests/test_nlu_engine.py b/snips_nlu_metrics/tests/test_nlu_engine.py deleted file mode 100644 index d72f4eb..0000000 --- a/snips_nlu_metrics/tests/test_nlu_engine.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import unicode_literals - -import unittest - -from snips_nlu_metrics.engine import ( - get_trained_nlu_engine, get_inference_nlu_engine) -from snips_nlu_metrics.tests.mock_engine import ( - MockTrainingEngine, MockInferenceEngine) - - -class TestNLUEngine(unittest.TestCase): - def test_get_trained_engine_should_use_provided_engine_class(self): - # Given - _dataset = { - "language": "en", - "intents": { - "intent1": { - "utterances": [ - {"data": [{"text": "text1"}]}, - ] - }, - "intent2": { - "utterances": [ - {"data": [{"text": "text2"}]}, - ] - }, - }, - "entities": {}, - "snips_nlu_version": "0.1.0" - } - - # When - engine = get_trained_nlu_engine(_dataset, MockTrainingEngine) - - # Then - self.assertTrue(engine.fitted, 1) - - def test_get_inference_engine_should_use_provided_engine_class(self): - # When - inference_engine = get_inference_nlu_engine(dict(), - MockInferenceEngine) - - # Then - self.assertIsInstance(inference_engine, MockInferenceEngine) diff --git a/snips_nlu_metrics/utils/metrics_utils.py b/snips_nlu_metrics/utils/metrics_utils.py index 5fb6c5c..98f0c32 100644 --- a/snips_nlu_metrics/utils/metrics_utils.py +++ b/snips_nlu_metrics/utils/metrics_utils.py @@ -114,8 +114,12 @@ def compute_engine_metrics(engine, test_utterances, intent_list, predicted_slots = [] if parsing["slots"] is None else parsing["slots"] - i = intents_idx[actual_intent] - j = intents_idx[predicted_intent] + i = intents_idx.get(actual_intent) + j = intents_idx.get(predicted_intent) + + if i is None or j is None: + continue + confusion_matrix["matrix"][i][j] += 1 utterance_metrics = compute_utterance_metrics( @@ -230,9 +234,10 @@ def compute_precision_recall_f1(metrics): prec_rec_metrics = _compute_precision_recall_f1( intent_metrics["intent"]) intent_metrics["intent"].update(prec_rec_metrics) - for slot_metrics in intent_metrics["slots"].values(): - prec_rec_metrics = _compute_precision_recall_f1(slot_metrics) - slot_metrics.update(prec_rec_metrics) + if "slots" in intent_metrics: + for slot_metrics in intent_metrics["slots"].values(): + prec_rec_metrics = _compute_precision_recall_f1(slot_metrics) + slot_metrics.update(prec_rec_metrics) return metrics
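
For reference, here is a minimal end-to-end sketch of the ``Engine`` API documented in the README changes above: a toy keyword-matching engine trained and evaluated with ``compute_train_test_metrics``. The ``KeywordEngine`` class and the dataset paths are illustrative placeholders, not part of the library; any class implementing ``fit`` and ``parse`` (here it inherits from ``snips_nlu_metrics.Engine``) can be plugged in the same way.

.. code-block:: python

    from snips_nlu_metrics import Engine, compute_train_test_metrics


    class KeywordEngine(Engine):
        """Toy engine: tags an utterance with the intent of any known keyword."""

        def __init__(self):
            self.keywords = dict()  # word -> intent name

        def fit(self, dataset):
            # Index every word of the training utterances under its intent
            for intent_name, intent in dataset["intents"].items():
                for utterance in intent["utterances"]:
                    for chunk in utterance["data"]:
                        for word in chunk["text"].lower().split():
                            self.keywords[word] = intent_name
            return self

        def parse(self, text):
            intent_name = next(
                (self.keywords[w] for w in text.lower().split()
                 if w in self.keywords), None)
            return {
                "input": text,
                "intent": {"intentName": intent_name, "probability": 1.0},
                "slots": []
            }


    metrics = compute_train_test_metrics(
        train_dataset="path/to/train_dataset.json",  # placeholder paths
        test_dataset="path/to/test_dataset.json",
        engine_class=KeywordEngine)
    print(metrics["metrics"])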
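
Since ``compute_cross_val_nlu_metrics`` and ``compute_train_test_nlu_metrics`` are removed in this release, callers that previously passed separate training and inference classes can wrap the two classes in a single ``Engine`` subclass instead. The sketch below loosely adapts the removed ``build_nlu_engine_class`` helper; it is an assumption, not the library's API: ``build_engine_class``, ``WrappedEngine``, ``MyTrainer`` and ``MyParser`` are hypothetical names, and the trained engine dict is handed to the inference class directly, whereas the removed helper serialized it through a zipped JSON archive, so the handoff must be adapted to the actual engine classes.

.. code-block:: python

    from snips_nlu_metrics import Engine, compute_cross_val_metrics


    def build_engine_class(training_class, inference_class, training_config=None):
        """Wrap separate training/inference classes into a single metrics Engine.

        Loose replacement sketch for the removed ``build_nlu_engine_class``.
        """

        class WrappedEngine(Engine):
            def __init__(self):
                self.inference_engine = None

            def fit(self, dataset):
                if training_config is not None:
                    trainer = training_class(config=training_config)
                else:
                    trainer = training_class()
                trainer.fit(dataset)
                # Assumed handoff: the removed helper dumped ``trainer.to_dict()``
                # into a zipped JSON archive before building the inference
                # engine; adapt this line to your inference class's constructor.
                self.inference_engine = inference_class(trainer.to_dict())
                return self

            def parse(self, text):
                return self.inference_engine.parse(text)

        return WrappedEngine


    # Usage, with MyTrainer / MyParser standing in for your own classes:
    # metrics = compute_cross_val_metrics(
    #     dataset="path/to/dataset.json",
    #     engine_class=build_engine_class(MyTrainer, MyParser),
    #     nb_folds=5)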