diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9b19cd2..fe96d88 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,18 @@
# Changelog
All notable changes to this project will be documented in this file.
-## [Unreleased]
+## [0.13.0] - 2018-07-25
+### Fixed
+- Crash while computing metrics when either actual or predicted intent is unknown
+
+### Removed
+- APIs depending implicitly on Snips NLU:
+  - `compute_cross_val_nlu_metrics`
+  - `compute_train_test_nlu_metrics`
+
+### Changed
+- Use flexible version specifiers for dependencies
+
## [0.12.0] - 2018-03-29
### Added
@@ -10,5 +21,6 @@ All notable changes to this project will be documented in this file.
- New option to exclude slot metrics in the output
- Samples
-[Unreleased]: https://github.com/snipsco/snips-nlu-metrics/compare/0.12.0...HEAD
+
+[0.13.0]: https://github.com/snipsco/snips-nlu-metrics/compare/0.12.0...0.13.0
[0.12.0]: https://github.com/snipsco/snips-nlu-metrics/compare/0.11.1...0.12.0
\ No newline at end of file
diff --git a/README.rst b/README.rst
index 29d123a..72f2283 100644
--- a/README.rst
+++ b/README.rst
@@ -22,7 +22,7 @@ Install
.. code-block:: console
- pip install snips_nlu_metrics
+ $ pip install snips_nlu_metrics
NLU Metrics API
@@ -40,24 +40,64 @@ The metrics output (json) provides detailed information about:
* parsing errors
* `confusion matrix`_
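+
+For reference, here is a minimal sketch of the output structure, based on the
+docstrings and tests of this repository (the numbers are illustrative; actual
+values depend on your dataset and engine):
+
+.. code-block:: python
+
+    {
+        "metrics": {
+            "MakeCoffee": {
+                "intent": {
+                    "true_positive": 7,
+                    "false_positive": 0,
+                    "false_negative": 0,
+                    "precision": 1.0,
+                    "recall": 1.0,
+                    "f1": 1.0
+                },
+                "slots": {},
+                "intent_utterances": 7
+            }
+        },
+        "parsing_errors": []
+    }
+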
+Data
+----
+
+Some sample datasets that can be used to compute metrics are available
+`here `_. Alternatively, you can create your own dataset either with
+``snips-nlu``'s `dataset generation tool`_ or through the
+`Snips console`_.
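+
+For reference, a dataset is a json document; here is a minimal sketch of its
+structure, taken from this repository's test fixtures:
+
+.. code-block:: python
+
+    dataset = {
+        "language": "en",
+        "intents": {
+            "intent1": {
+                "utterances": [
+                    {"data": [{"text": "text1"}]}
+                ]
+            }
+        },
+        "entities": {}
+    }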
+
Examples
--------
-The Snips NLU metrics library can be used either with `Snips NLU`_ or with a
-custom intent parsing pipeline.
+The Snips NLU metrics library can be used with any NLU pipeline that satisfies
+the ``Engine`` API:
+
+.. code-block:: python
+
+ from builtins import object
+
+ class Engine(object):
+ def fit(self, dataset):
+ # Perform training ...
+ return self
+
+ def parse(self, text):
+ # Extract the intent and slots; intent_name, probability and
+ # slots are placeholders for values computed by your parser ...
+ return {
+ "input": text,
+ "intent": {
+ "intentName": intent_name,
+ "probability": probability
+ },
+ "slots": slots
+ }
+
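+As a minimal usage sketch (not part of the original examples), assuming the
+``Engine`` class defined above and the sample dataset paths used later in this
+README, such a class is passed directly as ``engine_class``:
+
+.. code-block:: python
+
+    from snips_nlu_metrics import compute_train_test_metrics
+
+    # any class exposing fit() and parse() can serve as the engine here
+    tt_metrics = compute_train_test_metrics(
+        train_dataset="samples/train_dataset.json",
+        test_dataset="samples/test_dataset.json",
+        engine_class=Engine)
+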
----------------
Snips NLU Engine
----------------
-Here is how you can use the metrics API to compute metrics for the Snips NLU
-pipeline:
+This library can be used to benchmark NLU solutions such as `Snips NLU`_. To
+install the ``snips-nlu`` Python library and fetch the language resources for
+English, run the following commands:
+
+.. code-block:: bash
+
+ $ pip install snips-nlu
+ $ snips-nlu download en
+
+
+Then, you can compute metrics for the ``snips-nlu`` pipeline using the metrics
+API as follows:
.. code-block:: python
- from snips_nlu import SnipsNLUEngine
+ from snips_nlu import load_resources, SnipsNLUEngine
from snips_nlu_metrics import compute_train_test_metrics, compute_cross_val_metrics
+ load_resources("en")
tt_metrics = compute_train_test_metrics(train_dataset="samples/train_dataset.json",
test_dataset="samples/test_dataset.json",
@@ -67,16 +107,6 @@ pipeline:
engine_class=SnipsNLUEngine,
nb_folds=5)
-Some `sample code and datasets `_ are also available, you can have an
-overview of the metrics output by running:
-
-.. code-block:: bash
-
- git clone https://github.com/snipsco/snips-nlu-metrics.git
- cd snips-nlu-metrics
- pip install -e ".[samples]"
- python samples/sample.py train-test
-
-----------------
Custom NLU Engine
-----------------
@@ -128,4 +158,6 @@ This library is provided by `Snips `_ as Open Source softw
.. _train/test: https://en.wikipedia.org/wiki/Training,_test,_and_validation_sets
.. _Snips NLU: https://github.com/snipsco/snips-nlu
.. _precision, recall and f1 scores: https://en.wikipedia.org/wiki/Precision_and_recall
-.. _confusion matrix: https://en.wikipedia.org/wiki/Confusion_matrix
\ No newline at end of file
+.. _confusion matrix: https://en.wikipedia.org/wiki/Confusion_matrix
+.. _dataset generation tool: http://snips-nlu.readthedocs.io/en/latest/tutorial.html#snips-dataset-format
+.. _Snips console: https://console.snips.ai
\ No newline at end of file
diff --git a/samples/__init__.py b/samples/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/samples/sample.py b/samples/sample.py
deleted file mode 100644
index 49deedc..0000000
--- a/samples/sample.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import argparse
-import json
-import os
-import sys
-
-from snips_nlu import SnipsNLUEngine, load_resources
-
-from snips_nlu_metrics import (compute_train_test_metrics,
- compute_cross_val_metrics)
-
-SAMPLES_DIR = os.path.dirname(os.path.abspath(__file__))
-TRAIN_DATASET_PATH = os.path.join(SAMPLES_DIR, "train_dataset.json")
-TEST_DATASET_PATH = os.path.join(SAMPLES_DIR, "test_dataset.json")
-CROSS_VAL_DATASET_PATH = os.path.join(SAMPLES_DIR, "cross_val_dataset.json")
-
-
-def compute_sample_train_test_metrics():
- load_resources("en")
- return compute_train_test_metrics(
- train_dataset=TRAIN_DATASET_PATH,
- test_dataset=TEST_DATASET_PATH,
- engine_class=SnipsNLUEngine)
-
-
-def compute_sample_cross_val_metrics():
- load_resources("en")
- return compute_cross_val_metrics(dataset=CROSS_VAL_DATASET_PATH,
- engine_class=SnipsNLUEngine,
- nb_folds=5)
-
-
-def main_metrics():
- parser = argparse.ArgumentParser(
- description="Compute sample metrics on the Snips NLU parsing pipeline")
- parser.add_argument("metrics_type", type=str,
- choices=["train-test", "cross-val"],
- metavar="metrics_type",
- help="Type of metrics to compute")
- args = parser.parse_args(sys.argv[1:])
- if args.metrics_type == "train_test":
- metrics = compute_sample_train_test_metrics()
- else:
- metrics = compute_sample_cross_val_metrics()
- print(json.dumps(metrics, indent=2))
-
-
-if __name__ == '__main__':
- main_metrics()
diff --git a/setup.py b/setup.py
index b890fb4..624f9e4 100644
--- a/setup.py
+++ b/setup.py
@@ -19,17 +19,14 @@
install_requires = [
"future",
- "numpy==1.14.0",
- "scipy==1.0.0",
- "scikit-learn==0.19.1",
+ "numpy>=1.7,<2.0",
+ "scipy>=1.0,<2.0",
+ "scikit-learn>=0.19,<0.20",
]
extras_require = {
"test": [
- "mock==2.0.0",
- ],
- "samples": [
- "snips-nlu==0.12.1"
+ "mock>=2.0,<3.0",
]
}
diff --git a/snips_nlu_metrics/__init__.py b/snips_nlu_metrics/__init__.py
index 1cf231f..9947a1d 100644
--- a/snips_nlu_metrics/__init__.py
+++ b/snips_nlu_metrics/__init__.py
@@ -2,6 +2,4 @@
from snips_nlu_metrics.engine import Engine
from snips_nlu_metrics.metrics import (compute_train_test_metrics,
- compute_train_test_nlu_metrics,
- compute_cross_val_metrics,
- compute_cross_val_nlu_metrics)
+ compute_cross_val_metrics)
diff --git a/snips_nlu_metrics/__version__ b/snips_nlu_metrics/__version__
index d33c3a2..51de330 100644
--- a/snips_nlu_metrics/__version__
+++ b/snips_nlu_metrics/__version__
@@ -1 +1 @@
-0.12.0
\ No newline at end of file
+0.13.0
\ No newline at end of file
diff --git a/snips_nlu_metrics/engine.py b/snips_nlu_metrics/engine.py
index 6df3f51..293a37e 100644
--- a/snips_nlu_metrics/engine.py
+++ b/snips_nlu_metrics/engine.py
@@ -1,20 +1,10 @@
from __future__ import unicode_literals
-import io
-import json
-import os
-import zipfile
from abc import ABCMeta, abstractmethod
from builtins import object
-from builtins import str
-from copy import deepcopy
from future.utils import with_metaclass
-from snips_nlu_metrics.utils.temp_utils import tempdir_ctx
-
-TRAINED_ENGINE_FILENAME = "trained_assistant.json"
-
class Engine(with_metaclass(ABCMeta, object)):
"""Abstract class which represents an engine that can be used in the
@@ -28,50 +18,3 @@ def fit(self, dataset):
@abstractmethod
def parse(self, text):
pass
-
-
-def build_nlu_engine_class(training_class, inference_class,
- training_config=None):
- _training_config = deepcopy(training_config)
-
- class NLUEngine(Engine):
- def __init__(self):
- self.inference_engine = None
- self.training_config = _training_config
-
- def fit(self, dataset):
- if self.training_config is not None:
- training_engine = training_class(config=self.training_config)
- else:
- training_engine = training_class()
- training_engine.fit(dataset)
- trained_engine_dict = training_engine.to_dict()
- self.inference_engine = get_inference_nlu_engine(
- trained_engine_dict, inference_class)
-
- def parse(self, text):
- return self.inference_engine.parse(text)
-
- return NLUEngine
-
-
-def get_trained_nlu_engine(train_dataset, training_engine_class):
- language = train_dataset["language"]
- engine = training_engine_class(language)
- engine.fit(train_dataset)
- return engine
-
-
-def get_inference_nlu_engine(trained_engine_dict, inference_engine_class):
- with tempdir_ctx() as engine_dir:
- trained_engine_path = os.path.join(engine_dir, TRAINED_ENGINE_FILENAME)
- archive_path = os.path.join(engine_dir, 'assistant.zip')
-
- with io.open(trained_engine_path, mode='w', encoding='utf8') as f:
- f.write(str(json.dumps(trained_engine_dict)))
- with zipfile.ZipFile(archive_path, 'w') as zf:
- zf.write(trained_engine_path, arcname=TRAINED_ENGINE_FILENAME)
- with io.open(archive_path, mode='rb') as f:
- data_zip = bytearray(f.read())
-
- return inference_engine_class(data_zip=data_zip)
diff --git a/snips_nlu_metrics/metrics.py b/snips_nlu_metrics/metrics.py
index 30c1a4c..b46f263 100644
--- a/snips_nlu_metrics/metrics.py
+++ b/snips_nlu_metrics/metrics.py
@@ -1,65 +1,17 @@
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
+from __future__ import division, print_function, unicode_literals
import io
import json
from past.builtins import basestring
-from snips_nlu_metrics.engine import build_nlu_engine_class
from snips_nlu_metrics.utils.constants import (
- INTENTS, UTTERANCES, INTENT_UTTERANCES, PARSING_ERRORS, METRICS,
- CONFUSION_MATRIX)
+ CONFUSION_MATRIX, INTENTS, INTENT_UTTERANCES, METRICS, PARSING_ERRORS,
+ UTTERANCES)
from snips_nlu_metrics.utils.exception import NotEnoughDataError
from snips_nlu_metrics.utils.metrics_utils import (
- create_shuffle_stratified_splits, compute_engine_metrics,
- aggregate_metrics, compute_precision_recall_f1, aggregate_matrices)
-
-
-def compute_cross_val_nlu_metrics(dataset, training_engine_class,
- inference_engine_class, nb_folds=5,
- train_size_ratio=1.0,
- drop_entities=False,
- include_slot_metrics=True,
- slot_matching_lambda=None,
- progression_handler=None):
- """Compute pure NLU metrics on the dataset using cross validation
-
- Args:
- dataset (dict or str): Dataset or path to dataset
- training_engine_class: Python class to use for training
- inference_engine_class: Python class to use for inference
- nb_folds (int, optional): Number of folds to use for cross validation
- train_size_ratio (float, optional): Ratio of intent utterances to use
- for training
- drop_entities (bool, false): Specify whether not all entity values
- should be removed from training data
- include_slot_metrics (bool, true): If false, the slots metrics and the
- slots parsing errors will not be reported.
- slot_matching_lambda (lambda, optional):
- lambda expected_slot, actual_slot -> bool,
- if defined, this function will be use to match slots when computing
- metrics, otherwise exact match will be used.
- `expected_slot` corresponds to the slot as defined in the dataset,
- and `actual_slot` corresponds to the slot as returned by the NLU
- progression_handler (lambda, optional): handler called at each
- progression (%) step
-
- Returns
- dict: Metrics results containing the following data
-
- - "metrics": the computed metrics
- - "parsing_errors": the list of parsing errors
-
- """
- engine_class = build_nlu_engine_class(training_engine_class,
- inference_engine_class)
- return compute_cross_val_metrics(dataset, engine_class, nb_folds,
- train_size_ratio, drop_entities,
- include_slot_metrics,
- slot_matching_lambda,
- progression_handler)
+ aggregate_matrices, aggregate_metrics, compute_engine_metrics,
+ compute_precision_recall_f1, create_shuffle_stratified_splits)
def compute_cross_val_metrics(dataset, engine_class, nb_folds=5,
@@ -144,42 +96,6 @@ class must inherit from `Engine`
}
-def compute_train_test_nlu_metrics(train_dataset, test_dataset,
- training_engine_class,
- inference_engine_class,
- include_slot_metrics=True,
- slot_matching_lambda=None):
- """Compute pure NLU metrics on `test_dataset` after having trained on
- `train_dataset`
-
- Args
- train_dataset (dict or str): Dataset or path to dataset used for
- training
- test_dataset (dict or str): Dataset or path to dataset used for testing
- training_engine_class: Python class to use for training
- inference_engine_class: Python class to use for inference
- include_slot_metrics (bool, true): If false, the slots metrics and the
- slots parsing errors will not be reported.
- slot_matching_lambda (lambda, optional):
- lambda expected_slot, actual_slot -> bool,
- if defined, this function will be use to match slots when computing
- metrics, otherwise exact match will be used.
- `expected_slot` corresponds to the slot as defined in the dataset,
- and `actual_slot` corresponds to the slot as returned by the NLU
-
- Returns
- dict: Metrics results containing the following data
-
- - "metrics": the computed metrics
- - "parsing_errors": the list of parsing errors
- """
- engine_class = build_nlu_engine_class(training_engine_class,
- inference_engine_class)
- return compute_train_test_metrics(train_dataset, test_dataset,
- engine_class, include_slot_metrics,
- slot_matching_lambda)
-
-
def compute_train_test_metrics(train_dataset, test_dataset, engine_class,
include_slot_metrics=True,
slot_matching_lambda=None):
diff --git a/snips_nlu_metrics/tests/engine_config.py b/snips_nlu_metrics/tests/engine_config.py
deleted file mode 100644
index f79b52c..0000000
--- a/snips_nlu_metrics/tests/engine_config.py
+++ /dev/null
@@ -1,163 +0,0 @@
-from __future__ import unicode_literals
-
-NLU_CONFIG = {
- "unit_name": "nlu_engine",
- "intent_parsers_configs": [
- {
- "unit_name": "deterministic_intent_parser",
- "max_queries": 50,
- "max_entities": 200
- },
- {
- "unit_name": "probabilistic_intent_parser",
- "intent_classifier_config": {
- "data_augmentation_config": {
- "min_utterances": 20,
- "unknown_words_replacement_string": None,
- "noise_factor": 5,
- "unknown_word_prob": 0
- },
- "unit_name": "log_reg_intent_classifier",
- "featurizer_config": {
- "sublinear_tf": False
- },
- "random_seed": None,
- "log_reg_args": {
- "penalty": "l2",
- "loss": "log",
- "n_iter": 5,
- "n_jobs": -1,
- "class_weight": "balanced"
- }
- },
- "slot_filler_config": {
- "data_augmentation_config": {
- "capitalization_ratio": 0.2,
- "min_utterances": 200
- },
- "unit_name": "crf_slot_filler",
- "entities_offsets": [
- -2,
- -1,
- 0
- ],
- "crf_args": {
- "c2": 0.1,
- "c1": 0.1,
- "algorithm": "lbfgs"
- },
- "tagging_scheme": 1,
- "random_seed": None,
- "feature_factory_configs": [
- {
- "args": {
- "common_words_gazetteer_name": None,
- "use_stemming": True,
- "n": 1
- },
- "factory_name": "ngram",
- "offsets": [
- -2,
- -1,
- 0,
- 1,
- 2
- ]
- },
- {
- "args": {
- "common_words_gazetteer_name": None,
- "use_stemming": True,
- "n": 2
- },
- "factory_name": "ngram",
- "offsets": [
- -2,
- 1
- ]
- },
- {
- "args": {},
- "factory_name": "is_digit",
- "offsets": [
- -1,
- 0,
- 1
- ]
- },
- {
- "args": {},
- "factory_name": "is_first",
- "offsets": [
- -2,
- -1,
- 0
- ]
- },
- {
- "args": {},
- "factory_name": "is_last",
- "offsets": [
- 0,
- 1,
- 2
- ]
- },
- {
- "args": {
- "n": 1
- },
- "factory_name": "shape_ngram",
- "offsets": [
- 0
- ]
- },
- {
- "args": {
- "n": 2
- },
- "factory_name": "shape_ngram",
- "offsets": [
- -1,
- 0
- ]
- },
- {
- "args": {
- "n": 3
- },
- "factory_name": "shape_ngram",
- "offsets": [
- -1
- ]
- },
- {
- "args": {
- "tagging_scheme_code": 2,
- "use_stemming": True
- },
- "factory_name": "entity_match",
- "drop_out": 0.1,
- "offsets": [
- -2,
- -1,
- 0
- ]
- },
- {
- "args": {
- "tagging_scheme_code": 1
- },
- "factory_name": "builtin_entity_match",
- "offsets": [
- -2,
- -1,
- 0
- ]
- }
- ],
- "exhaustive_permutations_threshold": 64
- }
- }
- ]
-}
diff --git a/snips_nlu_metrics/tests/mock_engine.py b/snips_nlu_metrics/tests/mock_engine.py
index c39277c..22ad3f5 100644
--- a/snips_nlu_metrics/tests/mock_engine.py
+++ b/snips_nlu_metrics/tests/mock_engine.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-from builtins import object
-
from snips_nlu_metrics import Engine
@@ -13,26 +11,6 @@ def dummy_parsing_result(text):
}
-class MockTrainingEngine(object):
- def __init__(self, config=None):
- self.training_config = config
- self.fitted = False
-
- def fit(self, dataset):
- self.fitted = True
-
- def to_dict(self):
- return dict()
-
-
-class MockInferenceEngine(object):
- def __init__(self, data_zip):
- pass
-
- def parse(self, text):
- return dummy_parsing_result(text)
-
-
class MockEngine(Engine):
def __init__(self):
self.fitted = False
diff --git a/snips_nlu_metrics/tests/test_metrics.py b/snips_nlu_metrics/tests/test_metrics.py
index 672b45d..dfe71ad 100644
--- a/snips_nlu_metrics/tests/test_metrics.py
+++ b/snips_nlu_metrics/tests/test_metrics.py
@@ -3,33 +3,27 @@
import os
import unittest
-from mock import patch
-
-from snips_nlu_metrics.engine import build_nlu_engine_class
from snips_nlu_metrics.metrics import (compute_cross_val_metrics,
- compute_train_test_metrics,
- compute_cross_val_nlu_metrics,
- compute_train_test_nlu_metrics)
-from snips_nlu_metrics.tests.engine_config import NLU_CONFIG
-from snips_nlu_metrics.tests.mock_engine import (MockTrainingEngine,
- MockInferenceEngine)
+ compute_train_test_metrics)
+from snips_nlu_metrics.tests.mock_engine import MockEngine
from snips_nlu_metrics.utils.constants import METRICS, PARSING_ERRORS
class TestMetrics(unittest.TestCase):
- def test_cross_val_nlu_metrics(self):
+ def test_compute_cross_val_metrics(self):
# Given
dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"resources", "beverage_dataset.json")
- # When
+ with io.open(dataset_path, encoding="utf8") as f:
+ dataset = json.load(f)
+
+ # When/Then
try:
- res = compute_cross_val_nlu_metrics(
- dataset=dataset_path, training_engine_class=MockTrainingEngine,
- inference_engine_class=MockInferenceEngine, nb_folds=2)
+ res = compute_cross_val_metrics(
+ dataset=dataset, engine_class=MockEngine, nb_folds=2)
except Exception as e:
self.fail(e.args[0])
- # Then
expected_metrics = {
"null": {
"intent": {
@@ -97,24 +91,7 @@ def test_cross_val_nlu_metrics(self):
self.assertDictEqual(expected_metrics, res["metrics"])
- def test_cross_val_metrics_should_skip_when_not_enough_data(self):
- # Given
- dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
- "resources", "beverage_dataset.json")
-
- # When
- result = compute_cross_val_nlu_metrics(
- dataset=dataset_path, training_engine_class=MockTrainingEngine,
- inference_engine_class=MockInferenceEngine, nb_folds=11)
-
- # Then
- expected_result = {
- METRICS: None,
- PARSING_ERRORS: []
- }
- self.assertDictEqual(expected_result, result)
-
- def test_end_to_end_cross_val_metrics(self):
+ def test_compute_cross_val_metrics_without_slot_metrics(self):
# Given
dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"resources", "beverage_dataset.json")
@@ -123,35 +100,67 @@ def test_end_to_end_cross_val_metrics(self):
# When/Then
try:
- engine_class = build_nlu_engine_class(MockTrainingEngine,
- MockInferenceEngine)
- compute_cross_val_metrics(dataset=dataset,
- engine_class=engine_class, nb_folds=5)
+ res = compute_cross_val_metrics(
+ dataset=dataset, engine_class=MockEngine, nb_folds=2,
+ include_slot_metrics=False)
except Exception as e:
self.fail(e.args[0])
- @patch("snips_nlu_metrics.metrics.compute_train_test_metrics")
- def test_train_test_nlu_metrics(self, mocked_train_test_metrics):
+ expected_metrics = {
+ "null": {
+ "intent": {
+ "true_positive": 0,
+ "false_positive": 11,
+ "false_negative": 0,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0
+ },
+ "intent_utterances": 0
+ },
+ "MakeCoffee": {
+ "intent": {
+ "true_positive": 0,
+ "false_positive": 0,
+ "false_negative": 7,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0
+ },
+ "intent_utterances": 7
+ },
+ "MakeTea": {
+ "intent": {
+ "true_positive": 0,
+ "false_positive": 0,
+ "false_negative": 4,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0
+ },
+ "intent_utterances": 4
+ }
+ }
+
+ self.assertDictEqual(expected_metrics, res["metrics"])
+
+ def test_cross_val_metrics_should_skip_when_not_enough_data(self):
# Given
- mocked_metrics_result = {"metrics": "ok"}
- mocked_train_test_metrics.return_value = mocked_metrics_result
dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"resources", "beverage_dataset.json")
- with io.open(dataset_path, encoding="utf8") as f:
- dataset = json.load(f)
- # When/Then
- try:
- res = compute_train_test_nlu_metrics(
- train_dataset=dataset, test_dataset=dataset,
- training_engine_class=MockTrainingEngine,
- inference_engine_class=MockInferenceEngine)
- except Exception as e:
- self.fail(e.args[0])
+ # When
+ result = compute_cross_val_metrics(
+ dataset=dataset_path, engine_class=MockEngine, nb_folds=11)
- self.assertDictEqual(mocked_metrics_result, res)
+ # Then
+ expected_result = {
+ METRICS: None,
+ PARSING_ERRORS: []
+ }
+ self.assertDictEqual(expected_result, result)
- def test_end_to_end_train_test_metrics(self):
+ def test_compute_train_test_metrics(self):
# Given
dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"resources", "beverage_dataset.json")
@@ -160,15 +169,78 @@ def test_end_to_end_train_test_metrics(self):
# When/Then
try:
- engine_class = build_nlu_engine_class(MockTrainingEngine,
- MockInferenceEngine)
- compute_train_test_metrics(
+ res = compute_train_test_metrics(
train_dataset=dataset, test_dataset=dataset,
- engine_class=engine_class)
+ engine_class=MockEngine)
except Exception as e:
self.fail(e.args[0])
- def test_end_to_end_train_test_metrics_with_training_config(self):
+ expected_metrics = {
+ "MakeCoffee": {
+ "intent": {
+ "true_positive": 0,
+ "false_positive": 0,
+ "false_negative": 7,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0
+ },
+ "slots": {
+ "number_of_cups": {
+ "true_positive": 0,
+ "false_positive": 0,
+ "false_negative": 0,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0
+ }
+ },
+ "intent_utterances": 7
+ },
+ "null": {
+ "intent": {
+ "true_positive": 0,
+ "false_positive": 11,
+ "false_negative": 0,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0},
+ "slots": {},
+ "intent_utterances": 0
+ }, "MakeTea": {
+ "intent": {
+ "true_positive": 0,
+ "false_positive": 0,
+ "false_negative": 4,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0
+ },
+ "slots": {
+ "number_of_cups": {
+ "true_positive": 0,
+ "false_positive": 0,
+ "false_negative": 0,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0
+ },
+ "beverage_temperature": {
+ "true_positive": 0,
+ "false_positive": 0,
+ "false_negative": 0,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0
+ }
+ },
+ "intent_utterances": 4
+ }
+ }
+
+ self.assertDictEqual(expected_metrics, res["metrics"])
+
+ def test_compute_train_test_metrics_without_slots_metrics(self):
# Given
dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"resources", "beverage_dataset.json")
@@ -177,11 +249,44 @@ def test_end_to_end_train_test_metrics_with_training_config(self):
# When/Then
try:
- engine_class = build_nlu_engine_class(MockTrainingEngine,
- MockInferenceEngine,
- training_config=NLU_CONFIG)
- compute_train_test_metrics(
+ res = compute_train_test_metrics(
train_dataset=dataset, test_dataset=dataset,
- engine_class=engine_class)
+ engine_class=MockEngine, include_slot_metrics=False)
except Exception as e:
self.fail(e.args[0])
+
+ expected_metrics = {
+ "MakeCoffee": {
+ "intent": {
+ "true_positive": 0,
+ "false_positive": 0,
+ "false_negative": 7,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0
+ },
+ "intent_utterances": 7
+ },
+ "null": {
+ "intent": {
+ "true_positive": 0,
+ "false_positive": 11,
+ "false_negative": 0,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0},
+ "intent_utterances": 0
+ }, "MakeTea": {
+ "intent": {
+ "true_positive": 0,
+ "false_positive": 0,
+ "false_negative": 4,
+ "precision": 0.0,
+ "recall": 0.0,
+ "f1": 0.0
+ },
+ "intent_utterances": 4
+ }
+ }
+
+ self.assertDictEqual(expected_metrics, res["metrics"])
diff --git a/snips_nlu_metrics/tests/test_nlu_engine.py b/snips_nlu_metrics/tests/test_nlu_engine.py
deleted file mode 100644
index d72f4eb..0000000
--- a/snips_nlu_metrics/tests/test_nlu_engine.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from __future__ import unicode_literals
-
-import unittest
-
-from snips_nlu_metrics.engine import (
- get_trained_nlu_engine, get_inference_nlu_engine)
-from snips_nlu_metrics.tests.mock_engine import (
- MockTrainingEngine, MockInferenceEngine)
-
-
-class TestNLUEngine(unittest.TestCase):
- def test_get_trained_engine_should_use_provided_engine_class(self):
- # Given
- _dataset = {
- "language": "en",
- "intents": {
- "intent1": {
- "utterances": [
- {"data": [{"text": "text1"}]},
- ]
- },
- "intent2": {
- "utterances": [
- {"data": [{"text": "text2"}]},
- ]
- },
- },
- "entities": {},
- "snips_nlu_version": "0.1.0"
- }
-
- # When
- engine = get_trained_nlu_engine(_dataset, MockTrainingEngine)
-
- # Then
- self.assertTrue(engine.fitted, 1)
-
- def test_get_inference_engine_should_use_provided_engine_class(self):
- # When
- inference_engine = get_inference_nlu_engine(dict(),
- MockInferenceEngine)
-
- # Then
- self.assertIsInstance(inference_engine, MockInferenceEngine)
diff --git a/snips_nlu_metrics/utils/metrics_utils.py b/snips_nlu_metrics/utils/metrics_utils.py
index 5fb6c5c..98f0c32 100644
--- a/snips_nlu_metrics/utils/metrics_utils.py
+++ b/snips_nlu_metrics/utils/metrics_utils.py
@@ -114,8 +114,12 @@ def compute_engine_metrics(engine, test_utterances, intent_list,
predicted_slots = [] if parsing["slots"] is None else parsing["slots"]
- i = intents_idx[actual_intent]
- j = intents_idx[predicted_intent]
+ i = intents_idx.get(actual_intent)
+ j = intents_idx.get(predicted_intent)
+
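+ # an unknown actual or predicted intent has no index in intents_idx;
+ # skip such utterances instead of crashing on the lookup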
+ if i is None or j is None:
+ continue
+
confusion_matrix["matrix"][i][j] += 1
utterance_metrics = compute_utterance_metrics(
@@ -230,9 +234,10 @@ def compute_precision_recall_f1(metrics):
prec_rec_metrics = _compute_precision_recall_f1(
intent_metrics["intent"])
intent_metrics["intent"].update(prec_rec_metrics)
- for slot_metrics in intent_metrics["slots"].values():
- prec_rec_metrics = _compute_precision_recall_f1(slot_metrics)
- slot_metrics.update(prec_rec_metrics)
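+ # slot metrics are absent when metrics were computed with
+ # include_slot_metrics=False, hence the guard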
+ if "slots" in intent_metrics:
+ for slot_metrics in intent_metrics["slots"].values():
+ prec_rec_metrics = _compute_precision_recall_f1(slot_metrics)
+ slot_metrics.update(prec_rec_metrics)
return metrics