diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b19cd2..fe96d88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,18 @@ # Changelog All notable changes to this project will be documented in this file. -## [Unreleased] +## [0.13.0] - 2018-07-25 +### Fixed +- Crash while computing metrics when either actual or predicted intent is unknown + +### Removed +- APIs depending implicitely on Snips NLU: + - `compute_cross_val_nlu_metrics` + - `compute_train_test_nlu_metrics` + +### Changed +- Use flexible version specifiers for dependencies + ## [0.12.0] - 2018-03-29 ### Added @@ -10,5 +21,6 @@ All notable changes to this project will be documented in this file. - New option to exclude slot metrics in the output - Samples -[Unreleased]: https://github.com/snipsco/snips-nlu-metrics/compare/0.12.0...HEAD + +[0.13.0]: https://github.com/snipsco/snips-nlu-metrics/compare/0.12.0...0.13.0 [0.12.0]: https://github.com/snipsco/snips-nlu-metrics/compare/0.11.1...0.12.0 \ No newline at end of file diff --git a/README.rst b/README.rst index 29d123a..72f2283 100644 --- a/README.rst +++ b/README.rst @@ -22,7 +22,7 @@ Install .. code-block:: console - pip install snips_nlu_metrics + $ pip install snips_nlu_metrics NLU Metrics API @@ -40,24 +40,64 @@ The metrics output (json) provides detailed information about: * parsing errors * `confusion matrix`_ +Data +---- + +Some sample datasets, that can be used to compute metrics, are available +`here `_. Alternatively, you can create your own dataset either by +using ``snips-nlu``'s `dataset generation tool`_ or by going on the +`Snips console`_. + Examples -------- -The Snips NLU metrics library can be used either with `Snips NLU`_ or with a -custom intent parsing pipeline. +The Snips NLU metrics library can be used with any NLU pipeline which satisfies +the ``Engine`` API: + +.. code-block:: python + + from builtins import object + + class Engine(object): + def fit(self, dataset): + # Perform training ... + return self + + def parse(self, text): + # extract intent and slots ... + return { + "input": text, + "intent": { + "intentName": intent_name, + "probability": probability + }, + "slots": slots + } + ---------------- Snips NLU Engine ---------------- -Here is how you can use the metrics API to compute metrics for the Snips NLU -pipeline: +This library can be used to benchmark NLU solutions such as `Snips NLU`_. To +install the ``snips-nlu`` python library, and fetch the language resources for +english, run the following commands: + +.. code-block:: bash + + $ pip install snips-nlu + $ snips-nlu download en + + +Then, you can compute metrics for the ``snips-nlu`` pipeline using the metrics +API as follows: .. code-block:: python - from snips_nlu import SnipsNLUEngine + from snips_nlu import load_resources, SnipsNLUEngine from snips_nlu_metrics import compute_train_test_metrics, compute_cross_val_metrics + load_resources("en") tt_metrics = compute_train_test_metrics(train_dataset="samples/train_dataset.json", test_dataset="samples/test_dataset.json", @@ -67,16 +107,6 @@ pipeline: engine_class=SnipsNLUEngine, nb_folds=5) -Some `sample code and datasets `_ are also available, you can have an -overview of the metrics output by running: - -.. code-block:: bash - - git clone https://github.com/snipsco/snips-nlu-metrics.git - cd snips-nlu-metrics - pip install -e ".[samples]" - python samples/sample.py train-test - ----------------- Custom NLU Engine ----------------- @@ -128,4 +158,6 @@ This library is provided by `Snips `_ as Open Source softw .. 
_train/test: https://en.wikipedia.org/wiki/Training,_test,_and_validation_sets .. _Snips NLU: https://github.com/snipsco/snips-nlu .. _precision, recall and f1 scores: https://en.wikipedia.org/wiki/Precision_and_recall -.. _confusion matrix: https://en.wikipedia.org/wiki/Confusion_matrix \ No newline at end of file +.. _confusion matrix: https://en.wikipedia.org/wiki/Confusion_matrix +.. _dataset generation tool: http://snips-nlu.readthedocs.io/en/latest/tutorial.html#snips-dataset-format +.. _Snips console: https://console.snips.ai \ No newline at end of file diff --git a/samples/__init__.py b/samples/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/samples/sample.py b/samples/sample.py deleted file mode 100644 index 49deedc..0000000 --- a/samples/sample.py +++ /dev/null @@ -1,48 +0,0 @@ -import argparse -import json -import os -import sys - -from snips_nlu import SnipsNLUEngine, load_resources - -from snips_nlu_metrics import (compute_train_test_metrics, - compute_cross_val_metrics) - -SAMPLES_DIR = os.path.dirname(os.path.abspath(__file__)) -TRAIN_DATASET_PATH = os.path.join(SAMPLES_DIR, "train_dataset.json") -TEST_DATASET_PATH = os.path.join(SAMPLES_DIR, "test_dataset.json") -CROSS_VAL_DATASET_PATH = os.path.join(SAMPLES_DIR, "cross_val_dataset.json") - - -def compute_sample_train_test_metrics(): - load_resources("en") - return compute_train_test_metrics( - train_dataset=TRAIN_DATASET_PATH, - test_dataset=TEST_DATASET_PATH, - engine_class=SnipsNLUEngine) - - -def compute_sample_cross_val_metrics(): - load_resources("en") - return compute_cross_val_metrics(dataset=CROSS_VAL_DATASET_PATH, - engine_class=SnipsNLUEngine, - nb_folds=5) - - -def main_metrics(): - parser = argparse.ArgumentParser( - description="Compute sample metrics on the Snips NLU parsing pipeline") - parser.add_argument("metrics_type", type=str, - choices=["train-test", "cross-val"], - metavar="metrics_type", - help="Type of metrics to compute") - args = parser.parse_args(sys.argv[1:]) - if args.metrics_type == "train_test": - metrics = compute_sample_train_test_metrics() - else: - metrics = compute_sample_cross_val_metrics() - print(json.dumps(metrics, indent=2)) - - -if __name__ == '__main__': - main_metrics() diff --git a/setup.py b/setup.py index b890fb4..624f9e4 100644 --- a/setup.py +++ b/setup.py @@ -19,17 +19,14 @@ install_requires = [ "future", - "numpy==1.14.0", - "scipy==1.0.0", - "scikit-learn==0.19.1", + "numpy>=1.7,<2.0", + "scipy>=1.0,<2.0", + "scikit-learn>=0.19,<0.20", ] extras_require = { "test": [ - "mock==2.0.0", - ], - "samples": [ - "snips-nlu==0.12.1" + "mock>=2.0,<3.0", ] } diff --git a/snips_nlu_metrics/__init__.py b/snips_nlu_metrics/__init__.py index 1cf231f..9947a1d 100644 --- a/snips_nlu_metrics/__init__.py +++ b/snips_nlu_metrics/__init__.py @@ -2,6 +2,4 @@ from snips_nlu_metrics.engine import Engine from snips_nlu_metrics.metrics import (compute_train_test_metrics, - compute_train_test_nlu_metrics, - compute_cross_val_metrics, - compute_cross_val_nlu_metrics) + compute_cross_val_metrics) diff --git a/snips_nlu_metrics/__version__ b/snips_nlu_metrics/__version__ index d33c3a2..51de330 100644 --- a/snips_nlu_metrics/__version__ +++ b/snips_nlu_metrics/__version__ @@ -1 +1 @@ -0.12.0 \ No newline at end of file +0.13.0 \ No newline at end of file diff --git a/snips_nlu_metrics/engine.py b/snips_nlu_metrics/engine.py index 6df3f51..293a37e 100644 --- a/snips_nlu_metrics/engine.py +++ b/snips_nlu_metrics/engine.py @@ -1,20 +1,10 @@ from __future__ import unicode_literals 
-import io -import json -import os -import zipfile from abc import ABCMeta, abstractmethod from builtins import object -from builtins import str -from copy import deepcopy from future.utils import with_metaclass -from snips_nlu_metrics.utils.temp_utils import tempdir_ctx - -TRAINED_ENGINE_FILENAME = "trained_assistant.json" - class Engine(with_metaclass(ABCMeta, object)): """Abstract class which represents an engine that can be used in the @@ -28,50 +18,3 @@ def fit(self, dataset): @abstractmethod def parse(self, text): pass - - -def build_nlu_engine_class(training_class, inference_class, - training_config=None): - _training_config = deepcopy(training_config) - - class NLUEngine(Engine): - def __init__(self): - self.inference_engine = None - self.training_config = _training_config - - def fit(self, dataset): - if self.training_config is not None: - training_engine = training_class(config=self.training_config) - else: - training_engine = training_class() - training_engine.fit(dataset) - trained_engine_dict = training_engine.to_dict() - self.inference_engine = get_inference_nlu_engine( - trained_engine_dict, inference_class) - - def parse(self, text): - return self.inference_engine.parse(text) - - return NLUEngine - - -def get_trained_nlu_engine(train_dataset, training_engine_class): - language = train_dataset["language"] - engine = training_engine_class(language) - engine.fit(train_dataset) - return engine - - -def get_inference_nlu_engine(trained_engine_dict, inference_engine_class): - with tempdir_ctx() as engine_dir: - trained_engine_path = os.path.join(engine_dir, TRAINED_ENGINE_FILENAME) - archive_path = os.path.join(engine_dir, 'assistant.zip') - - with io.open(trained_engine_path, mode='w', encoding='utf8') as f: - f.write(str(json.dumps(trained_engine_dict))) - with zipfile.ZipFile(archive_path, 'w') as zf: - zf.write(trained_engine_path, arcname=TRAINED_ENGINE_FILENAME) - with io.open(archive_path, mode='rb') as f: - data_zip = bytearray(f.read()) - - return inference_engine_class(data_zip=data_zip) diff --git a/snips_nlu_metrics/metrics.py b/snips_nlu_metrics/metrics.py index 30c1a4c..b46f263 100644 --- a/snips_nlu_metrics/metrics.py +++ b/snips_nlu_metrics/metrics.py @@ -1,65 +1,17 @@ -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals +from __future__ import division, print_function, unicode_literals import io import json from past.builtins import basestring -from snips_nlu_metrics.engine import build_nlu_engine_class from snips_nlu_metrics.utils.constants import ( - INTENTS, UTTERANCES, INTENT_UTTERANCES, PARSING_ERRORS, METRICS, - CONFUSION_MATRIX) + CONFUSION_MATRIX, INTENTS, INTENT_UTTERANCES, METRICS, PARSING_ERRORS, + UTTERANCES) from snips_nlu_metrics.utils.exception import NotEnoughDataError from snips_nlu_metrics.utils.metrics_utils import ( - create_shuffle_stratified_splits, compute_engine_metrics, - aggregate_metrics, compute_precision_recall_f1, aggregate_matrices) - - -def compute_cross_val_nlu_metrics(dataset, training_engine_class, - inference_engine_class, nb_folds=5, - train_size_ratio=1.0, - drop_entities=False, - include_slot_metrics=True, - slot_matching_lambda=None, - progression_handler=None): - """Compute pure NLU metrics on the dataset using cross validation - - Args: - dataset (dict or str): Dataset or path to dataset - training_engine_class: Python class to use for training - inference_engine_class: Python class to use for inference - nb_folds (int, optional): Number of folds to use for cross 
validation - train_size_ratio (float, optional): Ratio of intent utterances to use - for training - drop_entities (bool, false): Specify whether not all entity values - should be removed from training data - include_slot_metrics (bool, true): If false, the slots metrics and the - slots parsing errors will not be reported. - slot_matching_lambda (lambda, optional): - lambda expected_slot, actual_slot -> bool, - if defined, this function will be use to match slots when computing - metrics, otherwise exact match will be used. - `expected_slot` corresponds to the slot as defined in the dataset, - and `actual_slot` corresponds to the slot as returned by the NLU - progression_handler (lambda, optional): handler called at each - progression (%) step - - Returns - dict: Metrics results containing the following data - - - "metrics": the computed metrics - - "parsing_errors": the list of parsing errors - - """ - engine_class = build_nlu_engine_class(training_engine_class, - inference_engine_class) - return compute_cross_val_metrics(dataset, engine_class, nb_folds, - train_size_ratio, drop_entities, - include_slot_metrics, - slot_matching_lambda, - progression_handler) + aggregate_matrices, aggregate_metrics, compute_engine_metrics, + compute_precision_recall_f1, create_shuffle_stratified_splits) def compute_cross_val_metrics(dataset, engine_class, nb_folds=5, @@ -144,42 +96,6 @@ class must inherit from `Engine` } -def compute_train_test_nlu_metrics(train_dataset, test_dataset, - training_engine_class, - inference_engine_class, - include_slot_metrics=True, - slot_matching_lambda=None): - """Compute pure NLU metrics on `test_dataset` after having trained on - `train_dataset` - - Args - train_dataset (dict or str): Dataset or path to dataset used for - training - test_dataset (dict or str): Dataset or path to dataset used for testing - training_engine_class: Python class to use for training - inference_engine_class: Python class to use for inference - include_slot_metrics (bool, true): If false, the slots metrics and the - slots parsing errors will not be reported. - slot_matching_lambda (lambda, optional): - lambda expected_slot, actual_slot -> bool, - if defined, this function will be use to match slots when computing - metrics, otherwise exact match will be used. 
- `expected_slot` corresponds to the slot as defined in the dataset, - and `actual_slot` corresponds to the slot as returned by the NLU - - Returns - dict: Metrics results containing the following data - - - "metrics": the computed metrics - - "parsing_errors": the list of parsing errors - """ - engine_class = build_nlu_engine_class(training_engine_class, - inference_engine_class) - return compute_train_test_metrics(train_dataset, test_dataset, - engine_class, include_slot_metrics, - slot_matching_lambda) - - def compute_train_test_metrics(train_dataset, test_dataset, engine_class, include_slot_metrics=True, slot_matching_lambda=None): diff --git a/snips_nlu_metrics/tests/engine_config.py b/snips_nlu_metrics/tests/engine_config.py deleted file mode 100644 index f79b52c..0000000 --- a/snips_nlu_metrics/tests/engine_config.py +++ /dev/null @@ -1,163 +0,0 @@ -from __future__ import unicode_literals - -NLU_CONFIG = { - "unit_name": "nlu_engine", - "intent_parsers_configs": [ - { - "unit_name": "deterministic_intent_parser", - "max_queries": 50, - "max_entities": 200 - }, - { - "unit_name": "probabilistic_intent_parser", - "intent_classifier_config": { - "data_augmentation_config": { - "min_utterances": 20, - "unknown_words_replacement_string": None, - "noise_factor": 5, - "unknown_word_prob": 0 - }, - "unit_name": "log_reg_intent_classifier", - "featurizer_config": { - "sublinear_tf": False - }, - "random_seed": None, - "log_reg_args": { - "penalty": "l2", - "loss": "log", - "n_iter": 5, - "n_jobs": -1, - "class_weight": "balanced" - } - }, - "slot_filler_config": { - "data_augmentation_config": { - "capitalization_ratio": 0.2, - "min_utterances": 200 - }, - "unit_name": "crf_slot_filler", - "entities_offsets": [ - -2, - -1, - 0 - ], - "crf_args": { - "c2": 0.1, - "c1": 0.1, - "algorithm": "lbfgs" - }, - "tagging_scheme": 1, - "random_seed": None, - "feature_factory_configs": [ - { - "args": { - "common_words_gazetteer_name": None, - "use_stemming": True, - "n": 1 - }, - "factory_name": "ngram", - "offsets": [ - -2, - -1, - 0, - 1, - 2 - ] - }, - { - "args": { - "common_words_gazetteer_name": None, - "use_stemming": True, - "n": 2 - }, - "factory_name": "ngram", - "offsets": [ - -2, - 1 - ] - }, - { - "args": {}, - "factory_name": "is_digit", - "offsets": [ - -1, - 0, - 1 - ] - }, - { - "args": {}, - "factory_name": "is_first", - "offsets": [ - -2, - -1, - 0 - ] - }, - { - "args": {}, - "factory_name": "is_last", - "offsets": [ - 0, - 1, - 2 - ] - }, - { - "args": { - "n": 1 - }, - "factory_name": "shape_ngram", - "offsets": [ - 0 - ] - }, - { - "args": { - "n": 2 - }, - "factory_name": "shape_ngram", - "offsets": [ - -1, - 0 - ] - }, - { - "args": { - "n": 3 - }, - "factory_name": "shape_ngram", - "offsets": [ - -1 - ] - }, - { - "args": { - "tagging_scheme_code": 2, - "use_stemming": True - }, - "factory_name": "entity_match", - "drop_out": 0.1, - "offsets": [ - -2, - -1, - 0 - ] - }, - { - "args": { - "tagging_scheme_code": 1 - }, - "factory_name": "builtin_entity_match", - "offsets": [ - -2, - -1, - 0 - ] - } - ], - "exhaustive_permutations_threshold": 64 - } - } - ] -} diff --git a/snips_nlu_metrics/tests/mock_engine.py b/snips_nlu_metrics/tests/mock_engine.py index c39277c..22ad3f5 100644 --- a/snips_nlu_metrics/tests/mock_engine.py +++ b/snips_nlu_metrics/tests/mock_engine.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -from builtins import object - from snips_nlu_metrics import Engine @@ -13,26 +11,6 @@ def dummy_parsing_result(text): } -class MockTrainingEngine(object): - 
def __init__(self, config=None): - self.training_config = config - self.fitted = False - - def fit(self, dataset): - self.fitted = True - - def to_dict(self): - return dict() - - -class MockInferenceEngine(object): - def __init__(self, data_zip): - pass - - def parse(self, text): - return dummy_parsing_result(text) - - class MockEngine(Engine): def __init__(self): self.fitted = False diff --git a/snips_nlu_metrics/tests/test_metrics.py b/snips_nlu_metrics/tests/test_metrics.py index 672b45d..dfe71ad 100644 --- a/snips_nlu_metrics/tests/test_metrics.py +++ b/snips_nlu_metrics/tests/test_metrics.py @@ -3,33 +3,27 @@ import os import unittest -from mock import patch - -from snips_nlu_metrics.engine import build_nlu_engine_class from snips_nlu_metrics.metrics import (compute_cross_val_metrics, - compute_train_test_metrics, - compute_cross_val_nlu_metrics, - compute_train_test_nlu_metrics) -from snips_nlu_metrics.tests.engine_config import NLU_CONFIG -from snips_nlu_metrics.tests.mock_engine import (MockTrainingEngine, - MockInferenceEngine) + compute_train_test_metrics) +from snips_nlu_metrics.tests.mock_engine import MockEngine from snips_nlu_metrics.utils.constants import METRICS, PARSING_ERRORS class TestMetrics(unittest.TestCase): - def test_cross_val_nlu_metrics(self): + def test_compute_cross_val_metrics(self): # Given dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "beverage_dataset.json") - # When + with io.open(dataset_path, encoding="utf8") as f: + dataset = json.load(f) + + # When/Then try: - res = compute_cross_val_nlu_metrics( - dataset=dataset_path, training_engine_class=MockTrainingEngine, - inference_engine_class=MockInferenceEngine, nb_folds=2) + res = compute_cross_val_metrics( + dataset=dataset, engine_class=MockEngine, nb_folds=2) except Exception as e: self.fail(e.args[0]) - # Then expected_metrics = { "null": { "intent": { @@ -97,24 +91,7 @@ def test_cross_val_nlu_metrics(self): self.assertDictEqual(expected_metrics, res["metrics"]) - def test_cross_val_metrics_should_skip_when_not_enough_data(self): - # Given - dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "resources", "beverage_dataset.json") - - # When - result = compute_cross_val_nlu_metrics( - dataset=dataset_path, training_engine_class=MockTrainingEngine, - inference_engine_class=MockInferenceEngine, nb_folds=11) - - # Then - expected_result = { - METRICS: None, - PARSING_ERRORS: [] - } - self.assertDictEqual(expected_result, result) - - def test_end_to_end_cross_val_metrics(self): + def test_compute_cross_val_metrics_without_slot_metrics(self): # Given dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "beverage_dataset.json") @@ -123,35 +100,67 @@ def test_end_to_end_cross_val_metrics(self): # When/Then try: - engine_class = build_nlu_engine_class(MockTrainingEngine, - MockInferenceEngine) - compute_cross_val_metrics(dataset=dataset, - engine_class=engine_class, nb_folds=5) + res = compute_cross_val_metrics( + dataset=dataset, engine_class=MockEngine, nb_folds=2, + include_slot_metrics=False) except Exception as e: self.fail(e.args[0]) - @patch("snips_nlu_metrics.metrics.compute_train_test_metrics") - def test_train_test_nlu_metrics(self, mocked_train_test_metrics): + expected_metrics = { + "null": { + "intent": { + "true_positive": 0, + "false_positive": 11, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "intent_utterances": 0 + }, + "MakeCoffee": { + "intent": { + "true_positive": 0, 
+ "false_positive": 0, + "false_negative": 7, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "intent_utterances": 7 + }, + "MakeTea": { + "intent": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 4, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "intent_utterances": 4 + } + } + + self.assertDictEqual(expected_metrics, res["metrics"]) + + def test_cross_val_metrics_should_skip_when_not_enough_data(self): # Given - mocked_metrics_result = {"metrics": "ok"} - mocked_train_test_metrics.return_value = mocked_metrics_result dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "beverage_dataset.json") - with io.open(dataset_path, encoding="utf8") as f: - dataset = json.load(f) - # When/Then - try: - res = compute_train_test_nlu_metrics( - train_dataset=dataset, test_dataset=dataset, - training_engine_class=MockTrainingEngine, - inference_engine_class=MockInferenceEngine) - except Exception as e: - self.fail(e.args[0]) + # When + result = compute_cross_val_metrics( + dataset=dataset_path, engine_class=MockEngine, nb_folds=11) - self.assertDictEqual(mocked_metrics_result, res) + # Then + expected_result = { + METRICS: None, + PARSING_ERRORS: [] + } + self.assertDictEqual(expected_result, result) - def test_end_to_end_train_test_metrics(self): + def test_compute_train_test_metrics(self): # Given dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "beverage_dataset.json") @@ -160,15 +169,78 @@ def test_end_to_end_train_test_metrics(self): # When/Then try: - engine_class = build_nlu_engine_class(MockTrainingEngine, - MockInferenceEngine) - compute_train_test_metrics( + res = compute_train_test_metrics( train_dataset=dataset, test_dataset=dataset, - engine_class=engine_class) + engine_class=MockEngine) except Exception as e: self.fail(e.args[0]) - def test_end_to_end_train_test_metrics_with_training_config(self): + expected_metrics = { + "MakeCoffee": { + "intent": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 7, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "slots": { + "number_of_cups": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + } + }, + "intent_utterances": 7 + }, + "null": { + "intent": { + "true_positive": 0, + "false_positive": 11, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0}, + "slots": {}, + "intent_utterances": 0 + }, "MakeTea": { + "intent": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 4, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "slots": { + "number_of_cups": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "beverage_temperature": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + } + }, + "intent_utterances": 4 + } + } + + self.assertDictEqual(expected_metrics, res["metrics"]) + + def test_compute_train_test_metrics_without_slots_metrics(self): # Given dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "beverage_dataset.json") @@ -177,11 +249,44 @@ def test_end_to_end_train_test_metrics_with_training_config(self): # When/Then try: - engine_class = build_nlu_engine_class(MockTrainingEngine, - MockInferenceEngine, - training_config=NLU_CONFIG) - compute_train_test_metrics( + res = compute_train_test_metrics( train_dataset=dataset, 
test_dataset=dataset, - engine_class=engine_class) + engine_class=MockEngine, include_slot_metrics=False) except Exception as e: self.fail(e.args[0]) + + expected_metrics = { + "MakeCoffee": { + "intent": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 7, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "intent_utterances": 7 + }, + "null": { + "intent": { + "true_positive": 0, + "false_positive": 11, + "false_negative": 0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0}, + "intent_utterances": 0 + }, "MakeTea": { + "intent": { + "true_positive": 0, + "false_positive": 0, + "false_negative": 4, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0 + }, + "intent_utterances": 4 + } + } + + self.assertDictEqual(expected_metrics, res["metrics"]) diff --git a/snips_nlu_metrics/tests/test_nlu_engine.py b/snips_nlu_metrics/tests/test_nlu_engine.py deleted file mode 100644 index d72f4eb..0000000 --- a/snips_nlu_metrics/tests/test_nlu_engine.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import unicode_literals - -import unittest - -from snips_nlu_metrics.engine import ( - get_trained_nlu_engine, get_inference_nlu_engine) -from snips_nlu_metrics.tests.mock_engine import ( - MockTrainingEngine, MockInferenceEngine) - - -class TestNLUEngine(unittest.TestCase): - def test_get_trained_engine_should_use_provided_engine_class(self): - # Given - _dataset = { - "language": "en", - "intents": { - "intent1": { - "utterances": [ - {"data": [{"text": "text1"}]}, - ] - }, - "intent2": { - "utterances": [ - {"data": [{"text": "text2"}]}, - ] - }, - }, - "entities": {}, - "snips_nlu_version": "0.1.0" - } - - # When - engine = get_trained_nlu_engine(_dataset, MockTrainingEngine) - - # Then - self.assertTrue(engine.fitted, 1) - - def test_get_inference_engine_should_use_provided_engine_class(self): - # When - inference_engine = get_inference_nlu_engine(dict(), - MockInferenceEngine) - - # Then - self.assertIsInstance(inference_engine, MockInferenceEngine) diff --git a/snips_nlu_metrics/utils/metrics_utils.py b/snips_nlu_metrics/utils/metrics_utils.py index 5fb6c5c..98f0c32 100644 --- a/snips_nlu_metrics/utils/metrics_utils.py +++ b/snips_nlu_metrics/utils/metrics_utils.py @@ -114,8 +114,12 @@ def compute_engine_metrics(engine, test_utterances, intent_list, predicted_slots = [] if parsing["slots"] is None else parsing["slots"] - i = intents_idx[actual_intent] - j = intents_idx[predicted_intent] + i = intents_idx.get(actual_intent) + j = intents_idx.get(predicted_intent) + + if i is None or j is None: + continue + confusion_matrix["matrix"][i][j] += 1 utterance_metrics = compute_utterance_metrics( @@ -230,9 +234,10 @@ def compute_precision_recall_f1(metrics): prec_rec_metrics = _compute_precision_recall_f1( intent_metrics["intent"]) intent_metrics["intent"].update(prec_rec_metrics) - for slot_metrics in intent_metrics["slots"].values(): - prec_rec_metrics = _compute_precision_recall_f1(slot_metrics) - slot_metrics.update(prec_rec_metrics) + if "slots" in intent_metrics: + for slot_metrics in intent_metrics["slots"].values(): + prec_rec_metrics = _compute_precision_recall_f1(slot_metrics) + slot_metrics.update(prec_rec_metrics) return metrics
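
For reference, here is a minimal end-to-end sketch of the ``Engine`` API documented in the README changes above: a toy keyword-matching engine trained and evaluated with ``compute_train_test_metrics``. The ``KeywordEngine`` class and the dataset paths are illustrative placeholders, not part of the library; any class implementing ``fit`` and ``parse`` (here it inherits from ``snips_nlu_metrics.Engine``) can be plugged in the same way.

.. code-block:: python

    from snips_nlu_metrics import Engine, compute_train_test_metrics


    class KeywordEngine(Engine):
        """Toy engine: tags an utterance with the intent of any known keyword."""

        def __init__(self):
            self.keywords = dict()  # word -> intent name

        def fit(self, dataset):
            # Index every word of the training utterances under its intent
            for intent_name, intent in dataset["intents"].items():
                for utterance in intent["utterances"]:
                    for chunk in utterance["data"]:
                        for word in chunk["text"].lower().split():
                            self.keywords[word] = intent_name
            return self

        def parse(self, text):
            intent_name = next(
                (self.keywords[w] for w in text.lower().split()
                 if w in self.keywords), None)
            return {
                "input": text,
                "intent": {"intentName": intent_name, "probability": 1.0},
                "slots": []
            }


    metrics = compute_train_test_metrics(
        train_dataset="path/to/train_dataset.json",  # placeholder paths
        test_dataset="path/to/test_dataset.json",
        engine_class=KeywordEngine)
    print(metrics["metrics"])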
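
Since ``compute_cross_val_nlu_metrics`` and ``compute_train_test_nlu_metrics`` are removed in this release, callers that previously passed separate training and inference classes can wrap the two classes in a single ``Engine`` subclass instead. The sketch below loosely adapts the removed ``build_nlu_engine_class`` helper; it is an assumption, not the library's API: ``build_engine_class``, ``WrappedEngine``, ``MyTrainer`` and ``MyParser`` are hypothetical names, and the trained engine dict is handed to the inference class directly, whereas the removed helper serialized it through a zipped JSON archive, so the handoff must be adapted to the actual engine classes.

.. code-block:: python

    from snips_nlu_metrics import Engine, compute_cross_val_metrics


    def build_engine_class(training_class, inference_class, training_config=None):
        """Wrap separate training/inference classes into a single metrics Engine.

        Loose replacement sketch for the removed ``build_nlu_engine_class``.
        """

        class WrappedEngine(Engine):
            def __init__(self):
                self.inference_engine = None

            def fit(self, dataset):
                if training_config is not None:
                    trainer = training_class(config=training_config)
                else:
                    trainer = training_class()
                trainer.fit(dataset)
                # Assumed handoff: the removed helper dumped ``trainer.to_dict()``
                # into a zipped JSON archive before building the inference
                # engine; adapt this line to your inference class's constructor.
                self.inference_engine = inference_class(trainer.to_dict())
                return self

            def parse(self, text):
                return self.inference_engine.parse(text)

        return WrappedEngine


    # Usage, with MyTrainer / MyParser standing in for your own classes:
    # metrics = compute_cross_val_metrics(
    #     dataset="path/to/dataset.json",
    #     engine_class=build_engine_class(MyTrainer, MyParser),
    #     nb_folds=5)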