From 9871c9007f39578b21152df6469e07e95c6cb9e5 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 20 Feb 2023 20:20:59 +0000 Subject: [PATCH 01/26] Merged PR 27999: Update internal master to public master Pull in changes from public master for sync. No review required. --- .github/workflows/release.yml | 2 +- .github/workflows/windows.yml | 2 +- CHANGELOG.md | 1 + src/common/aliases.cpp | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8a3761e3b..5beab28f0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -158,7 +158,7 @@ jobs: - name: Download MKL run: | - C:\msys64\usr\bin\wget.exe -nv https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip -O mkl.zip + C:\msys64\usr\bin\wget.exe -nv https://data.statmt.org/romang/marian-regression-tests/ci/mkl-2020.1-windows-static.zip -O mkl.zip Expand-Archive -Force mkl.zip ${{ github.workspace }}\mkl # Set the MKLROOT environment variable so that CMake can find MKL. # GITHUB_WORKSPACE is an environment variable available on all GitHub-hosted runners diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index ee85f303d..b1d6b1bd1 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -7,7 +7,7 @@ on: branches: [ master ] env: - MKL_URL: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip" + MKL_URL: "https://data.statmt.org/romang/marian-regression-tests/ci/mkl-2020.1-windows-static.zip" BOOST_ROOT: "C:/hostedtoolcache/windows/Boost/1.72.0/x86_64" BOOST_URL: "https://sourceforge.net/projects/boost/files/boost-binaries/1.72.0/boost_1_72_0-msvc-14.2-64.exe" diff --git a/CHANGELOG.md b/CHANGELOG.md index 53f81397d..f66f456fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
 - Fixed fp16 training/inference with factors-combine concat method
 - Fixed clang 13.0.1 compatibility
 - Fixed potential vulnerabilities from lxml<4.9.1 or mistune<2.0.31
+- Fixed the `--best-deep` RNN alias not setting the s2s model type
 
 ### Changed
 - Parameter synchronization in local sharding model now executes hash checksum before syncing

diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp
index 3db31e515..75d9bdf97 100644
--- a/src/common/aliases.cpp
+++ b/src/common/aliases.cpp
@@ -46,6 +46,7 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) {
   // Options setting the BiDeep architecture proposed in http://www.aclweb.org/anthology/W17-4710
   cli.alias("best-deep", "true", [](YAML::Node& config) {
+    config["type"] = "s2s";
     config["layer-normalization"] = true;
     config["tied-embeddings"] = true;
     config["enc-type"] = "alternating";

From efcd3dae71c63036d2b1d5f5992125dabacd2a92 Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Thu, 23 Feb 2023 06:15:47 +0000
Subject: [PATCH 02/26] Merged PR 28059: Add missing default for factors

This adds a missing default for factors. The error does not manifest on the
command line since the default is set in `config_parser.cpp`.
---
 CHANGELOG.md             |  3 +++
 VERSION                  |  2 +-
 src/common/config.cpp    | 15 ---------------
 src/layers/embedding.cpp |  2 +-
 src/layers/output.cpp    | 30 +++++++++++++++++++++++++++---
 src/models/s2s.h         |  2 +-
 src/models/transformer.h |  2 +-
 7 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3e325e25e..aa6f06bee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
 
+### Fixed
+- Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp)
+
 ## [1.12.0] - 2023-02-20
 
 ### Added

diff --git a/VERSION b/VERSION
index a5effa303..51b86ba24 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v1.12.0
+v1.12.1

diff --git a/src/common/config.cpp b/src/common/config.cpp
index 9878c70b0..a1c4ed5ac 100644
--- a/src/common/config.cpp
+++ b/src/common/config.cpp
@@ -116,21 +116,6 @@ void Config::initialize(ConfigParser const& cp) {
     config_["tsv-fields"] = tsvFields;
   }
 
-  // ensures factors backward compatibility whilst keeping the more user friendly CLI
-  if(get<std::string>("lemma-dependency").empty()) {
-    YAML::Node config;
-    int lemmaDimEmb = get<int>("lemma-dim-emb");
-    if(lemmaDimEmb > 0) {
-      config_["lemma-dependency"] = "re-embedding";
-    } else if(lemmaDimEmb == -1) {
-      config_["lemma-dependency"] = "lemma-dependent-bias";
-    } else if(lemmaDimEmb == -2) {
-      config_["lemma-dependency"] = "soft-transformer-layer";
-    } else if(lemmaDimEmb == -3) {
-      config_["lemma-dependency"] = "hard-transformer-layer";
-    }
-  }
-
   // echo full configuration
   log();

diff --git a/src/layers/embedding.cpp b/src/layers/embedding.cpp
index b60f6cc18..334f0b865 100644
--- a/src/layers/embedding.cpp
+++ b/src/layers/embedding.cpp
@@ -179,7 +179,7 @@ Expr Embedding::applyIndices(const std::vector<WordIndex>& embIdx, const Shape&
           : prefix_ + "_Wemb",
       "fixed", embeddingFix_,
       "dimFactorEmb", opt<int>("factors-dim-emb", 0),  // for factored embeddings
-      "factorsCombine", opt<std::string>("factors-combine", ""),  // for factored embeddings
+      "factorsCombine", opt<std::string>("factors-combine", "sum"),  // for factored embeddings
       "vocab", opt<std::vector<std::string>>("vocabs")[batchIndex_]);  // for factored embeddings
   // clang-format on
   if(options_->hasAndNotEmpty("embedding-vectors")) {

diff --git a/src/layers/output.cpp b/src/layers/output.cpp
index efff58df4..8977464b1
100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -6,6 +6,28 @@ namespace marian { namespace mlp { +// @TODO: get rid of factored code altogether +static std::string getLemmaDependency(int lemmaDimEmb, const std::string& lemmaDependencyIn) { + // ensures factors backward compatibility whilst keeping the more user friendly CLI + std::string lemmaDependencyOut; + if(lemmaDependencyIn.empty()) { + if(lemmaDimEmb > 0) { + lemmaDependencyOut = "re-embedding"; + } else if(lemmaDimEmb == -1) { + lemmaDependencyOut = "lemma-dependent-bias"; + } else if(lemmaDimEmb == -2) { + lemmaDependencyOut = "soft-transformer-layer"; + } else if(lemmaDimEmb == -3) { + lemmaDependencyOut = "hard-transformer-layer"; + } else { + lemmaDependencyOut = ""; + } + } else { + lemmaDependencyOut = lemmaDependencyIn; + } + return lemmaDependencyOut; +} + /*private*/ void Output::lazyConstruct(int inputDim) { // We must construct lazily since we won't know tying nor input dim in constructor. if(Wt_) @@ -36,7 +58,8 @@ namespace mlp { b_ = graph_->param(name + "_b", {1, numOutputClasses}, inits::zeros()); /*const*/ int lemmaDimEmb = options_->get("lemma-dim-emb", 0); - std::string lemmaDependency = options_->get("lemma-dependency", ""); + std::string lemmaDependency = getLemmaDependency(lemmaDimEmb, options_->get("lemma-dependency", "")); + ABORT_IF(lemmaDimEmb && !factoredVocab_, "--lemma-dim-emb requires a factored vocabulary"); if(lemmaDependency == "re-embedding") { // embed the (expected) word with a different embedding matrix ABORT_IF( @@ -112,7 +135,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { Expr Plemma = nullptr; // used for lemmaDependency = lemma-dependent-bias Expr inputLemma = nullptr; // used for lemmaDependency = hard-transformer-layer and soft-transformer-layer - std::string factorsCombine = options_->get("factors-combine", ""); + std::string factorsCombine = options_->get("factors-combine", "sum"); ABORT_IF(factorsCombine == "concat", "Combining lemma and factors embeddings with concatenation on the target side is currently not supported"); for(size_t g = 0; g < numGroups; g++) { @@ -134,7 +157,8 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { factorB = slice(b_, -1, Slice((int)range.first, (int)range.second)); } /*const*/ int lemmaDimEmb = options_->get("lemma-dim-emb", 0); - std::string lemmaDependency = options_->get("lemma-dependency", ""); + std::string lemmaDependency = getLemmaDependency(lemmaDimEmb, options_->get("lemma-dependency", "")); + if((lemmaDependency == "soft-transformer-layer" || lemmaDependency == "hard-transformer-layer") && g > 0) { // this mimics one transformer layer // - attention over two inputs: diff --git a/src/models/s2s.h b/src/models/s2s.h index 104f946c9..8eb2ef8d1 100644 --- a/src/models/s2s.h +++ b/src/models/s2s.h @@ -319,7 +319,7 @@ class DecoderS2S : public DecoderBase { last("vocab", opt>("vocabs")[batchIndex_]); // for factored outputs last("lemma-dim-emb", opt("lemma-dim-emb", 0)); // for factored outputs last("lemma-dependency", opt("lemma-dependency", "")); // for factored outputs - last("factors-combine", opt("factors-combine", "")); // for factored outputs + last("factors-combine", opt("factors-combine", "sum")); // for factored outputs last("output-omit-bias", opt("output-omit-bias", false)); diff --git a/src/models/transformer.h b/src/models/transformer.h index 243d2c7fc..1fed868b6 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -689,7 +689,7 @@ class DecoderTransformer : public 
Transformer { "output-approx-knn", opt>("output-approx-knn", {}), "lemma-dim-emb", opt("lemma-dim-emb", 0), // for factored outputs "lemma-dependency", opt("lemma-dependency", ""), // for factored outputs - "factors-combine", opt("factors-combine", "")); // for factored outputs + "factors-combine", opt("factors-combine", "sum")); // for factored outputs if(opt("tied-embeddings") || opt("tied-embeddings-all")) outputFactory.tieTransposed(opt("tied-embeddings-all") || opt("tied-embeddings-src") ? "Wemb" : prefix_ + "_Wemb"); From a23cc77e5f7132e405e99dcdcf657f5aceace08a Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 27 Feb 2023 21:53:41 +0000 Subject: [PATCH 03/26] Merged PR 27976: Introduce new layer framework into master Introduces the new layer framework into Marian master. This is currently not used anywhere for the existing models unless explicitly asked for. This also shouldn't modify any major existing functionality. The goal of this PR is to have the new code in master and use it for new things instead of the old code. FYI: @ @ the files in `src/layers_new` are the new framework. The rest is mostly unchanged apart from small modifications that allow for interaction with the new code. For now it exists in parallel to the old code. --- .gitignore | 3 +- CHANGELOG.md | 4 + VERSION | 2 +- src/CMakeLists.txt | 2 + src/command/marian_conv.cpp | 1 - src/common/utils.cpp | 30 ++ src/common/utils.h | 16 + src/graph/cached_expression.h | 70 ++++ src/graph/expression_operators.cpp | 7 +- src/graph/expression_operators.h | 2 +- src/graph/node_operators_binary.h | 45 ++- src/layers_new/attention.h | 192 ++++++++++ src/layers_new/decoder.h | 136 +++++++ src/layers_new/embeddings.h | 239 ++++++++++++ src/layers_new/interface.h | 550 ++++++++++++++++++++++++++ src/layers_new/neuralnet.cpp | 24 ++ src/layers_new/neuralnet.h | 300 +++++++++++++++ src/layers_new/rnn.h | 126 ++++++ src/layers_new/transformer.h | 553 +++++++++++++++++++++++++++ src/models/model_factory.cpp | 46 ++- src/models/s2s.h | 8 +- src/models/states.h | 36 +- src/models/transformer.h | 44 +-- src/models/transformer_factory.h | 162 ++++++++ src/models/transformer_new.h | 245 ++++++++++++ src/tensors/cpu/tensor_operators.cpp | 7 +- src/tests/CMakeLists.txt | 1 + src/tests/transformer_new.cpp | 11 + 28 files changed, 2754 insertions(+), 108 deletions(-) create mode 100644 src/graph/cached_expression.h create mode 100644 src/layers_new/attention.h create mode 100644 src/layers_new/decoder.h create mode 100644 src/layers_new/embeddings.h create mode 100644 src/layers_new/interface.h create mode 100644 src/layers_new/neuralnet.cpp create mode 100644 src/layers_new/neuralnet.h create mode 100644 src/layers_new/rnn.h create mode 100644 src/layers_new/transformer.h create mode 100644 src/models/transformer_new.h create mode 100644 src/tests/transformer_new.cpp diff --git a/.gitignore b/.gitignore index 956ce6847..d7f2f4df3 100644 --- a/.gitignore +++ b/.gitignore @@ -61,5 +61,4 @@ examples/mnist/*ubyte /vs/MarianDll.VC.VC.opendb .vs -.vscode - +.vscode diff --git a/CHANGELOG.md b/CHANGELOG.md index aa6f06bee..6a7316be9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +### Added + +- New experimental layer framework for Transformer-like models. 
+ ### Fixed - Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp) diff --git a/VERSION b/VERSION index 51b86ba24..41de27dfa 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.1 +v1.12.2 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f095f2eb8..f9d5a5e5b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -80,6 +80,8 @@ set(MARIAN_SOURCES layers/logits.cpp layers/lsh.cpp + layers_new/neuralnet.cpp + rnn/cells.cpp rnn/attention.cpp diff --git a/src/command/marian_conv.cpp b/src/command/marian_conv.cpp index b4a5f3745..12412a238 100644 --- a/src/command/marian_conv.cpp +++ b/src/command/marian_conv.cpp @@ -8,7 +8,6 @@ int main(int argc, char** argv) { using namespace marian; - createLoggers(); auto options = New(); diff --git a/src/common/utils.cpp b/src/common/utils.cpp index 1f3fd6c07..c058d4874 100644 --- a/src/common/utils.cpp +++ b/src/common/utils.cpp @@ -440,3 +440,33 @@ double parseNumber(std::string param) { } // namespace utils } // namespace marian + + +// Code for demangling gnu g++ type names, closing/re-opening namespaces to keep things local +// This is used to determine Layer type names for display and nameing. +#ifdef __GNUG__ +#include +#endif + +namespace marian { +namespace utils { + +#ifdef __GNUG__ // gnu g++ and clang seem to do this similarly +std::string cxxTypeNameDemangle(const char* name) { + int status = -4; // some arbitrary value to eliminate the compiler warning + // __cxa_demangle allocates a string that has to be freed, we pass the deallocation function + std::unique_ptr res( + abi::__cxa_demangle(name, NULL, NULL, &status), + std::free + ); + return (status == 0) ? res.get() : name; +} +#else +// does nothing if not g++, should be correct for MSVC +std::string cxxTypeNameDemangle(const char* name) { + return name; +} +#endif + +} // namespace utils +} // namespace marian diff --git a/src/common/utils.h b/src/common/utils.h index fbcf672d7..5f3695fc0 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -2,6 +2,7 @@ #include #include +#include #include namespace marian { @@ -66,6 +67,21 @@ std::string findReplace(const std::string& in, const std::string& what, const st double parseDouble(std::string s); double parseNumber(std::string s); +std::string cxxTypeNameDemangle(const char* name); + +// return type name via object of given type +template +std::string cxxTypeName(const T& t) { + return cxxTypeNameDemangle(typeid(t).name()); +} + +// return type name via templated type +template +std::string cxxTypeName() { + return cxxTypeNameDemangle(typeid(T).name()); +} + + // prints vector values with a custom label. template void Debug(const T *arr, size_t size, const std::string &str) { diff --git a/src/graph/cached_expression.h b/src/graph/cached_expression.h new file mode 100644 index 000000000..f7adff8bc --- /dev/null +++ b/src/graph/cached_expression.h @@ -0,0 +1,70 @@ +#include "common/definitions.h" +#include "common/intrusive_ptr.h" +#include "graph/expression_graph.h" + +#include + +namespace marian { + +// This class allows for simpler caching of Expr objects and automatic checking if the +// cached Expr needs to be updated/recreated. +class CachedExpr { + private: + ENABLE_INTRUSIVE_PTR(CachedExpr); + + Expr cachedKey_{nullptr}; + Expr cachedValue_{nullptr}; + + typedef std::function ApplyFunT; + typedef std::function EqualFunT; + + UPtr applyFun_; // function that creates the cached result + UPtr equalFun_; // function that checks if the input changed. 
If yes,
+  // the `apply_` function gets reapplied and the new result
+  // is cached.
+
+ public:
+  // No functors are given; they will have to be supplied when calling `apply`.
+  CachedExpr() {};
+
+  // No apply functor is given; it will have to be supplied when calling `apply`.
+  CachedExpr(EqualFunT equalFun)
+  : equalFun_(new EqualFunT(equalFun)) {};
+
+  // Both functors are given, and will be used by default. They can however be overridden
+  // if supplied directly in `apply`.
+  CachedExpr(ApplyFunT applyFun, EqualFunT equalFun)
+  : applyFun_(new ApplyFunT(applyFun)), equalFun_(new EqualFunT(equalFun)) {};
+
+  // lazily executes the factory `applyFun` if `equalFun` indicates that the input has changed.
+  Expr apply(Expr key, ApplyFunT applyFun, EqualFunT equalFun) {
+    if(!cachedKey_ || !equalFun(cachedKey_, key)) {
+      cachedKey_ = key;
+      cachedValue_ = applyFun(key);
+    }
+    return cachedValue_;
+  }
+
+  // lazily executes the factory `applyFun` if an equality check that has been passed to the constructor
+  // indicates that the input has changed.
+  Expr apply(Expr key, ApplyFunT applyFun) {
+    ABORT_IF(!equalFun_, "Equality check has not been passed to constructor");
+    return apply(key, applyFun, *equalFun_);
+  }
+
+  // lazily executes a factory if an equality check indicates that the input has changed. Both
+  // the factory and the equality check have to have been passed to the constructor.
+  Expr apply(Expr key) {
+    ABORT_IF(!equalFun_, "Equality check has not been passed to constructor");
+    ABORT_IF(!applyFun_, "Apply factory has not been passed to constructor");
+    return apply(key, *applyFun_, *equalFun_);
+  }
+
+  // clears any cached values; calling apply after this will trigger recomputation.
+  void clear() {
+    cachedKey_ = nullptr;
+    cachedValue_ = nullptr;
+  }
+};
+
+}

diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp
index b0d40949b..a6504ebac 100644
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@@ -759,8 +759,7 @@ Expr transpose(Expr a, const std::vector<int>& axes) {
   return Expression<TransposeNodeOp>(a, axes);
 }
 
-Expr swapAxes(Expr x, int axis1, int axis2)
-{
+Expr swapAxes(Expr x, int axis1, int axis2) {
   const auto& shape = x->shape();
   axis1 = shape.axis(axis1);
   axis2 = shape.axis(axis2);
@@ -880,8 +879,8 @@ Expr rmsNorm(Expr x,
   return Expression(nodes, eps);
 }
 
-Expr highway(Expr y, Expr x, Expr t) {
-  std::vector<Expr> nodes = {y, x, t};
+Expr highway(Expr input1, Expr input2, Expr gate) {
+  std::vector<Expr> nodes = {input1, input2, gate};
   return Expression<HighwayNodeOp>(nodes);
 }

diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h
index cc3e6028b..faef5c29e 100644
--- a/src/graph/expression_operators.h
+++ b/src/graph/expression_operators.h
@@ -976,7 +976,7 @@ static inline Expr dropout(Expr x, float dropProb, Shape shape) {
 
 /**
- * Performs dropout with a given probably.
+ * Performs dropout with a given probability.
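+ * (Illustrative sketch: dropout(x, 0.1f) zeroes each element with probability 0.1 during training
+ * and, assuming the usual inverted-dropout convention, scales the surviving elements by 1/0.9.)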
*/ static inline Expr dropout(Expr x, float dropProb) { if(dropProb == 0) diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 292554bd0..2c997d577 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -577,8 +577,8 @@ class DotBatchedNodeOp : public NaryNodeOp { // df/dB += alpha * dot(op(A).T, D) // beta set to 1.0 in gemm, C = alpha * dot(op(A), op(B)) + beta * C // to sum gradients from different graph parts - - if(!transA_ && transB_) + + if(!transA_ && transB_) { return {NodeOp(ProdBatched(child(0)->grad(), graph()->allocator(), adj_, @@ -595,8 +595,7 @@ class DotBatchedNodeOp : public NaryNodeOp { false, 1.0, scalar_))}; - - if(transA_ && !transB_) + } else if(transA_ && !transB_) { return {NodeOp(ProdBatched(child(0)->grad(), graph()->allocator(), child(1)->val(), @@ -613,8 +612,7 @@ class DotBatchedNodeOp : public NaryNodeOp { false, 1.0, scalar_))}; - - if(transA_ && transB_) + } else if(transA_ && transB_) { return {NodeOp(ProdBatched(child(0)->grad(), graph()->allocator(), child(1)->val(), @@ -631,23 +629,24 @@ class DotBatchedNodeOp : public NaryNodeOp { true, 1.0, scalar_))}; - - return {NodeOp(ProdBatched(child(0)->grad(), - graph()->allocator(), - adj_, - child(1)->val(), - false, - true, - 1.0, - scalar_)), - NodeOp(ProdBatched(child(1)->grad(), - graph()->allocator(), - child(0)->val(), - adj_, - true, - false, - 1.0, - scalar_))}; + } else { // !transA && !transB + return {NodeOp(ProdBatched(child(0)->grad(), + graph()->allocator(), + adj_, + child(1)->val(), + false, + true, + 1.0, + scalar_)), + NodeOp(ProdBatched(child(1)->grad(), + graph()->allocator(), + child(0)->val(), + adj_, + true, + false, + 1.0, + scalar_))}; + } } const std::string type() override { return "bdot"; } diff --git a/src/layers_new/attention.h b/src/layers_new/attention.h new file mode 100644 index 000000000..035e6c51d --- /dev/null +++ b/src/layers_new/attention.h @@ -0,0 +1,192 @@ +#pragma once + +#include "graph/cached_expression.h" +#include "layers_new/neuralnet.h" + +namespace marian { +namespace nn { + +// Abstract base class for attention mechanisms +class AttentionLayer : public Layer, + public IQuaternaryLayer { +protected: + using Layer::namedLayers_; + +public: + AttentionLayer(Ptr graph) : Layer(graph) {} + virtual ~AttentionLayer() = default; +}; + +class MultiplicativeAttention : public AttentionLayer { +protected: + using AttentionLayer::namedLayers_; + +public: + Ptr attentionDropout; + + MultiplicativeAttention(Ptr graph, float dropoutProbability) + : AttentionLayer(graph) { + attentionDropout = New(graph, dropoutProbability); + registerLayer(attentionDropout); + } + + virtual ~MultiplicativeAttention() = default; + + virtual Expr apply(Expr query, Expr keys, Expr values, Expr logMask = nullptr) const override { + int dimKeys = keys->shape()[-1]; + + // softmax over batched dot product of query and keys (applied over all + // time steps and batch entries), also add logMask for illegal connections + + // multiplicative attention with flattened softmax + float scale = 1.0f / std::sqrt((float)dimKeys); // scaling to avoid extreme values due to matrix multiplication + + // query, keys and values: [beam depth * batch size, num heads, length, head dim] + auto z = bdot(query, keys, false, true, scale); // [beam depth, batch size * num heads, max tgt length, max src length] + + // mask out garbage beyond end of sequences + if(logMask) + z = z + logMask; + + // take softmax along src sequence axis (-1) + auto weights 
= softmax(z); // [beam depth, batch size * num heads, max tgt length, max src length] + +#if 0 // @TODO: make this work again + if(saveAttentionWeights) + collectOneHead(weights, dimBeam); +#endif + + // optional dropout for attention weights + weights = attentionDropout->apply(weights); + + // apply attention weights to values + // weights: [beam depth, batch size * num heads, max tgt length, max src length] + // values: [beam depth, batch size * num heads, src length, head dim] + auto output = bdot(weights, values); // [beam depth, batch size * num heads, max tgt length, split vector dim] + return output; + } +}; + +template // Currently only used for MultiplicativeAttention +class MultiHeadAttention : public AttentionType { +protected: + using AttentionType::namedLayers_; + +private: + IPtr cachedKh_; // cached result of key projection + IPtr cachedVh_; // cached result of value projection + +public: + Ptr qProj; // query projection layer + Ptr kProj; // key projection layer + Ptr vProj; // value projection layer + Ptr oProj; // output projection layer + + int numHeads; + int attDim; + int modelDim; + + MultiHeadAttention(Ptr graph, + int numHeads, + int attDim, + int modelDim, + float dropoutProbability) + : AttentionType(graph, dropoutProbability), + cachedKh_(new CachedExpr()), + cachedVh_(new CachedExpr()), + numHeads(numHeads), + attDim(attDim), + modelDim(modelDim) { + qProj = New(graph, attDim); + registerLayer(qProj); + kProj = New(graph, attDim); + registerLayer(kProj); + vProj = New(graph, attDim); + registerLayer(vProj); + + oProj = New(graph, modelDim); + registerLayer(oProj); + } + + virtual ~MultiHeadAttention() = default; + +private: + // join beam and batch dimension and split model dimension in to heads and head dimension. We also need to transpose to + // be able to do an efficient batched matmul. 
+ Expr splitHeads(Expr input) const { + int dimSteps = input->shape()[-2]; + int dimBatch = input->shape()[-3]; + int dimBeam = input->shape()[-4]; + int dimDepth = attDim / numHeads; + + auto output = reshape(input, {dimBeam * dimBatch, dimSteps, numHeads, dimDepth}); + output = transpose(output, {0, 2, 1, 3}); // [dimBatch*dimBeam, numHeads, dimSteps, dimDepth] + output = reshape(output, {dimBeam, dimBatch * numHeads, dimSteps, dimDepth}); + return output; + } + + // Undoes the effects of the above function by reversing the transposition and reshaping back to original shape + Expr joinHeads(Expr input) const { + int dimDepth = input->shape()[-1]; + int dimSteps = input->shape()[-2]; + int dimBatchHeads = input->shape()[-3]; + int dimBeam = input->shape()[-4]; + int dimModel = numHeads * dimDepth; + int dimBatch = dimBatchHeads / numHeads; + + auto output = reshape(input, {dimBeam * dimBatch, numHeads, dimSteps, dimDepth}); + output = transpose(output, {0, 2, 1, 3}); + output = reshape(output, {dimBeam, dimBatch, dimSteps, dimModel}); + return output; + } + +public: + virtual Expr apply(Expr query, Expr keys, Expr values, Expr mask) const override { + auto qh = splitHeads(qProj->apply(query)); + + // @TODO: in original implementation we use shape()->elements(), dunno why + auto equal = [](Expr a, Expr b) { return a->shape() == b->shape(); }; + + // these two get conditionally recomputed if their size changes according to criterion above + auto kh = cachedKh_->apply(keys, [this](Expr keys) { + return splitHeads(kProj->apply(keys)); + }, equal); + + auto vh = cachedVh_->apply(values, [this](Expr values) { + return splitHeads(vProj->apply(values)); + }, equal); + + auto output = AttentionType::apply(qh, kh, vh, mask); + + output = joinHeads(output); + output = oProj->apply(output); + + return output; + } + + virtual void clear() override { + Layer::clear(); + cachedKh_->clear(); + cachedVh_->clear(); + } +}; + +static Ptr attentionFromOptions(Ptr graph, Ptr options) { + // @TODO: currently this does nothing as it isn't set anywhere + std::string selfAttentionType = options->get("transformer-encoder-attention", "default"); // currently only default + + // in the future we might add SingleHead or Additive or LSH-based as in Reformer + if(selfAttentionType == "default") { + int numHeads = options->get("transformer-heads"); + int modelDim = options->get("dim-emb"); + float attentionDropoutProbability = options->get("transformer-dropout-attention", 0.f); + + return New>(graph, numHeads, modelDim, modelDim, attentionDropoutProbability); + } + else { + ABORT("Unknown transformer encoder attention type: {}", selfAttentionType); + } +} + +} // namespace nn +} // namespace marian diff --git a/src/layers_new/decoder.h b/src/layers_new/decoder.h new file mode 100644 index 000000000..406017d64 --- /dev/null +++ b/src/layers_new/decoder.h @@ -0,0 +1,136 @@ +#pragma once + +#include "common/utils.h" +#include "graph/expression_graph.h" +#include "graph/expression_operators.h" +#include "graph/node_initializers.h" + +#include "layers_new/interface.h" + +namespace marian { +namespace nn { + +// Interface: decoder state +struct DecoderState : public IClassName, public std::enable_shared_from_this { +protected: + size_t position{0}; + +public: + DecoderState(size_t position) : position(position) {} + virtual ~DecoderState() {} + + virtual void incrementPosition() { + position++; + } + + virtual size_t getPosition() { + return position; + } + + // Dynamic cast to requested layer type. 
Will return nullptr if not possible + template + Ptr as() { + return std::dynamic_pointer_cast(shared_from_this()); + } + + // Dynamic cast to requested layer type. Will return nullptr if not possible + template + Ptr as() const { + return const_cast(this)->as(); + } + + // Dynamic cast to requested layer type. Will abort if the cast is not possible. + template + Ptr cast() { + auto stateCast = as(); + ABORT_IF(!stateCast, "State {} cannot be cast to requested type {}", + className(), + utils::cxxTypeName()); + return stateCast; + } + + template + Ptr cast() const { + return const_cast(this)->cast(); + } +}; + +class DecoderStateItem : public DecoderState { +private: + Expr state_; + +public: + DecoderStateItem(Expr state, size_t position) : DecoderState(position), state_(state) {} + virtual ~DecoderStateItem() = default; + + Expr get() { return state_; } + void set(Expr state) { state_ = state; } +}; + +class DecoderStateList : public DecoderState { +private: + std::vector> items_; + +public: + DecoderStateList(size_t position) : DecoderState(position) {} + virtual ~DecoderStateList() = default; + + void incrementPosition() override { + DecoderState::incrementPosition(); + for(auto& item : items_) { + item->incrementPosition(); + ABORT_IF(position != item->getPosition(), "Positions out of sync??"); + } + } + + void append(Ptr item) { + ABORT_IF(position != item->getPosition(), "DecoderStateList.position ({}) != DecoderStateItem.position ({}) ?", position, item->getPosition()); + items_.push_back(item); + } + + /** + * Retrieve DecoderStateItem at index i + */ + Ptr at(size_t i) const { + return items_[i]; + } + + auto begin() -> decltype(items_.begin()) const { + return items_.begin(); + } + + auto end() -> decltype(items_.end()) const { + return items_.end(); + } + + size_t size() const { return items_.size(); } +}; + + +// Interface: Unary function +struct IUnaryDecoderLayer { + virtual Expr apply(Expr /*input*/, Ptr /*state*/) const = 0; +}; + +// Interface: Binary function +struct IBinaryDecoderLayer { + virtual Expr apply(Expr, Expr, Ptr /*state*/) const = 0; +}; + +// Interface: Ternary function +struct ITernaryDecoderLayer { + virtual Expr apply(Expr, Expr, Expr, Ptr /*state*/) const = 0; +}; + +// Interface: 4ary function +struct IQuaternaryDecoderLayer { + virtual Expr apply(Expr, Expr, Expr, Expr, Ptr /*state*/) const = 0; +}; + +// Interface: N-Ary function +struct INaryLayerDecoderLayer { + virtual Expr apply(const std::vector& /*inputs*/, Ptr /*state*/) const = 0; +}; + +} // namespace nn +} // namespace marian diff --git a/src/layers_new/embeddings.h b/src/layers_new/embeddings.h new file mode 100644 index 000000000..b7d297b63 --- /dev/null +++ b/src/layers_new/embeddings.h @@ -0,0 +1,239 @@ +#pragma once + +#include "layers_new/interface.h" +#include "data/corpus_base.h" +#include "data/factored_vocab.h" + +namespace marian { +namespace nn { + +// Embedding from corpus sub-batch to (emb, mask) +struct IEmbeddingLayer { + virtual std::tuple apply(Ptr subBatch) const = 0; + + virtual Expr apply(const Words& embIdx, const Shape& shape) const = 0; + + // alternative from indices directly + virtual Expr applyIndices(const std::vector& embIdx, const Shape& shape) const = 0; +}; + +struct IPositionEmbeddingLayer { + virtual Expr apply(Expr, int startPosition = 0) = 0; +}; + +// A regular embedding layer. +// Note that this also applies dropout if the option is passed (pass 0 when in inference mode). 
+// It is best to not use Embedding directly, but rather via getEmbeddingLayer() in +// EncoderDecoderLayerBase, which knows to pass on all required parameters from options. +class Embedding : public LayerWithOptions, public IEmbeddingLayer { +public: + Expr embeddings; + + Embedding(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { + std::string name = opt("prefix"); + int dimVoc = opt("dimVocab"); + int dimEmb = opt("dimEmb"); + bool fixed = opt("fixed", false); + + factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get("vocab", "")); + if (factoredVocab_) { + dimVoc = (int)factoredVocab_->factorVocabSize(); + LOG_ONCE(info, "[embedding] Factored embeddings enabled"); + } + + // Embedding layer initialization should depend only on embedding size, hence fanIn=false + auto initFunc = inits::glorotUniform(/*fanIn=*/false, /*fanOut=*/true); // -> embedding vectors have roughly unit length + + if(options_->has("embFile")) { + std::string file = opt("embFile"); + if (!file.empty()) { + bool norm = opt("normalization", false); + initFunc = inits::fromWord2vec(file, dimVoc, dimEmb, norm); + } + } + + registerParameter(embeddings, Shape({dimVoc, dimEmb}), initFunc); + embeddings->setTrainable(!fixed); // @TODO: move into registerParam macro + } + + virtual ~Embedding() = default; + + std::tuple apply(Ptr subBatch) const override final { + auto graph = embeddings->graph(); + int dimBatch = (int)subBatch->batchSize(); + int dimEmb = embeddings->shape()[-1]; + int dimWidth = (int)subBatch->batchWidth(); + + // factored embeddings: + // - regular: + // - y = x @ E x:[B x 1ofV] ; E:[V x D] ; y:[B x D] + // - factored: + // - u = x @ M one-hot to U-dimensional multi-hot (all factors in one concatenated space) + // - each row of M contains the set of factors for one word => we want a CSR matrix + // - y = (x @ M) @ E (x:[B x 1ofV] ; M:[V x U]) ; E:[U x D] ; y:[B x D] + // - first compute x @ M on the CPU + // - (Uvalues, Uindices, Uoffsets) = csr_rows(Mvalues, Mindices, Moffsets, subBatch->data()): + // - shape (U, specifically) not actually needed here + // - foreach input x[i] + // - locate row M[i,*] + // - copy through its index values (std::vector) + // - create a matching ones vector (we can keep growing) + // - convert to GPU-side CSR matrix. CSR matrix now has #rows equal to len(x) + // - CSR matrix product with E + // - csr_dot(Uvalues, Uindices, Uoffsets, embeddings, transposeU) + // - double-check if all dimensions are specified. Probably not for transpose (which would be like csc_dot()). 
+ // - weighting: + // - core factors' gradients are sums over all words that use the factors; + // - core factors' embeddings move very fast + // - words will need to make up for the move; rare words cannot + // - so, we multiply each factor with 1/refCount + // - core factors get weighed down a lot + // - no impact on gradients, as Adam makes up for it; embeddings still move fast just as before + // - but forward pass weighs them down, so that all factors are in a similar numeric range + // - if it is required to be in a different range, the embeddings can still learn that, but more slowly + + auto batchEmbeddings = apply(subBatch->data(), {dimWidth, dimBatch, dimEmb}); + auto batchMask = graph->constant({dimWidth, dimBatch, 1}, + inits::fromVector(subBatch->mask())); + return std::make_tuple(batchEmbeddings, batchMask); + } + + Expr apply(const Words& words, const Shape& shape) const override final { + if (factoredVocab_) { + Expr selectedEmbs = multiRows(words, opt("dropout", 0.0f)); // [(B*W) x E] + selectedEmbs = reshape(selectedEmbs, shape); // [W, B, E] + return selectedEmbs; + } + else + return applyIndices(toWordIndexVector(words), shape); + } + + Expr applyIndices(const std::vector& embIdx, const Shape& shape) const override final { + ABORT_IF(factoredVocab_, "Embedding: applyIndices must not be used with a factored vocabulary"); + auto selectedEmbs = rows(embeddings, embIdx); // [(B*W) x E] + selectedEmbs = reshape(selectedEmbs, shape); // [W, B, E] + // @BUGBUG: We should not broadcast along dimBatch=[-2]. Then we can also dropout before reshape() (test that separately) + selectedEmbs = dropout(selectedEmbs, opt("dropout", 0.0f), { selectedEmbs->shape()[-3], 1, 1 }); + return selectedEmbs; + } + +private: + Ptr factoredVocab_; + + // helper to embed a sequence of words (given as indices) via factored embeddings + Expr multiRows(const Words& data, float dropProb) const { + auto graph = embeddings->graph(); + auto factoredData = factoredVocab_->csr_rows(data); + // multi-hot factor vectors are represented as a sparse CSR matrix + // [row index = word position index] -> set of factor indices for word at this position + ABORT_IF(factoredData.shape != Shape({(int)factoredData.offsets.size()-1/*=rows of CSR*/, embeddings->shape()[0]}), "shape mismatch??"); + // the CSR matrix is passed in pieces + auto weights = graph->constant({ (int)factoredData.weights.size() }, inits::fromVector(factoredData.weights), Type::float32); + auto indices = graph->constant({ (int)factoredData.indices.size() }, inits::fromVector(factoredData.indices), Type::uint32); + auto offsets = graph->constant({ (int)factoredData.offsets.size() }, inits::fromVector(factoredData.offsets), Type::uint32); + // apply dropout + // We apply it to the weights, i.e. factors get dropped out separately, but always as entire vectors. 
+ weights = dropout(weights, dropProb); + // perform the product + return csr_dot(factoredData.shape, weights, indices, offsets, embeddings); + } +}; + +// Abstract base class for position embedding layers +struct PositionEmbeddingLayer : public Layer, + public IPositionEmbeddingLayer { + using Layer::namedLayers_; + using Layer::namedParameters_; + using Layer::param; + + int positionAxis; + int maxLength; + + PositionEmbeddingLayer(Ptr graph, int positionAxis, int maxLength) + : Layer(graph), positionAxis(positionAxis), maxLength(maxLength) {} + + virtual ~PositionEmbeddingLayer() = default; +}; + +struct SinusoidalPositionEmbedding : public PositionEmbeddingLayer { + using PositionEmbeddingLayer::positionAxis; + using PositionEmbeddingLayer::maxLength; + + SinusoidalPositionEmbedding(Ptr graph, int positionAxis) + : PositionEmbeddingLayer(graph, positionAxis, /*maxLength=*/-1) + {} + + virtual ~SinusoidalPositionEmbedding() = default; + + Expr apply(Expr input, int start = 0) override { + int dimEmb = input->shape()[-1]; + int dimWords = input->shape()[positionAxis]; + + input = std::sqrt((float)dimEmb) * input; // input were initialized to unit length; so norms will be in order of sqrt(dimEmb) + + Shape posEmbeddingShape; + posEmbeddingShape.resize(input->shape().size()); // resize to input shape size and fill with 1s + posEmbeddingShape.set(-1, dimEmb); // match embedding size + posEmbeddingShape.set(positionAxis, dimWords); // match number of items to embed on correct axis + + // the node initializer is dimension agnostic for dimensions other than the last + // dimension (embedding dimension) and works with any positionAxis value + auto posEmbeddings = graph()->constant(posEmbeddingShape, + inits::sinusoidalPositionEmbeddings(start)); + + input = input + posEmbeddings; + return input; + } +}; + +struct LearnedPositionEmbedding : public PositionEmbeddingLayer { + using PositionEmbeddingLayer::positionAxis; + using PositionEmbeddingLayer::maxLength; + + Expr embeddings; + + LearnedPositionEmbedding(Ptr graph, int positionAxis, int maxLength) + : PositionEmbeddingLayer(graph, positionAxis, maxLength) + {} + + virtual ~LearnedPositionEmbedding() = default; + + Expr apply(Expr input, int start = 0) override { + int dimEmb = input->shape()[-1]; + int dimWords = input->shape()[positionAxis]; + + registerParameter(embeddings, + Shape({maxLength, dimEmb}), + inits::glorotUniform(/*fanIn=*/false, /*fanOut=*/true)); + + ABORT_IF(start + dimWords > maxLength, + "Number of positions ({}) starting at position {} exceeds maximum length {}", + dimWords, start, maxLength); + + Shape posEmbeddingShape; + posEmbeddingShape.resize(input->shape().size()); // resize to input shape size and fill with 1s + posEmbeddingShape.set(-1, dimEmb); // match embedding size + posEmbeddingShape.set(positionAxis, dimWords); // match number of items to embed on correct axis + + auto posEmbeddings = slice(embeddings, -2, Slice(start, start + dimWords)); + posEmbeddings = reshape(posEmbeddings, posEmbeddingShape); + + input = input + posEmbeddings; + return input; + } +}; + +static Ptr positionEmbeddingFromOptions(Ptr graph, + Ptr options, + int positionAxis) { + bool trainedEmbedding = options->get("transformer-train-position-embeddings", false); + if(trainedEmbedding) { + int maxLength = options->get("max-length"); + return New(graph, positionAxis, maxLength); + } else { + return New(graph, positionAxis); + } +} + +} // namespace nn +} // namespace marian diff --git a/src/layers_new/interface.h 
b/src/layers_new/interface.h new file mode 100644 index 000000000..d8317d610 --- /dev/null +++ b/src/layers_new/interface.h @@ -0,0 +1,550 @@ +#pragma once + +#include "common/utils.h" +#include "graph/expression_graph.h" +#include "graph/expression_operators.h" +#include "graph/node_initializers.h" + +#include + +namespace marian { +namespace nn { + +// Interface: provides a class member to return the class name (type) as a string +struct IClassName { + virtual std::string className() const { + return utils::cxxTypeName(*this); + } +}; + +// Interface: Unary function +struct IUnaryLayer { + virtual Expr apply(Expr) const = 0; +}; + +// Interface: Binary function +struct IBinaryLayer { + virtual Expr apply(Expr, Expr) const = 0; +}; + +// Interface: Ternary function +struct ITernaryLayer { + virtual Expr apply(Expr, Expr, Expr) const = 0; +}; + +// Interface: 4ary function +struct IQuaternaryLayer { + virtual Expr apply(Expr, Expr, Expr, Expr) const = 0; +}; + +// Interface: N-Ary function +struct INaryLayer { + virtual Expr apply(const std::vector& list) const = 0; +}; + +// Interface: implement a clearing function +struct IClearable { + virtual void clear() = 0; +}; + + +// Helper macro to turn parameter C++ variable name into a string. +#define registerParameter(paramArg, shape, init) \ +do { \ + if(!paramArg) { \ + paramArg = this->param(#paramArg, shape, init); \ + } \ +} while(0); + +// Helper macro to turn parameter C++ variable name into a string. +// This version is meant to be used in apply(...) functions for lazy parameter inits +// hence has to cast away constness. +#define registerParameterLazy(paramArg, shape, init) \ +do { \ + using ThisLayerType = std::decay::type; \ + ThisLayerType* thisLayer = const_cast(this); \ + if(!thisLayer->paramArg) { \ + thisLayer->paramArg = thisLayer->param(#paramArg, shape, init); \ + } \ +} while(0); + +// Helper macro to turn a layer C++ variable name into a string and to add the layer as a named sublayer to the parent layer +#define registerLayer(layerArg) \ +do { \ + ABORT_IF(!layerArg, "Layer {} of type {} is not initialized", #layerArg, utils::cxxTypeName(layerArg)); \ + namedLayers_.emplace_back(#layerArg, layerArg); \ + if(!layerArg->registered()) { \ + layerArg->setName(#layerArg); \ + layerArg->setFirstParent(this); \ + } \ +} while(0); + +// Helper macro that adds the layer as a named sublayer to the parent layer and uses the given name. Different from above as +// the C++ variable name itself is not used a name string. +#define registerLayerWithName(layerArg, name) \ +do { \ + ABORT_IF(!layerArg, "Layer {} of type {} with name {} is not initialized", #layerArg, utils::cxxTypeName(layerArg), name); \ + namedLayers_.emplace_back(name, layerArg); \ + if(!layerArg->registered()) { \ + layerArg->setName(name); \ + layerArg->setFirstParent(this); \ + } \ +} while(0); + +class Layer; + +using NamedParameter = std::pair; + +template +using NamedLayer = std::pair>; + +// Base class for all layers. Sub layers should inherit from this class and one or multiple of the interfaces (e.g. IUnaryLayer) +class Layer : public IClassName, public IClearable, public std::enable_shared_from_this { +public: + enum class Mode : int { eval, train }; + +private: + Weak graph_; + + // Using naked pointer as a weak reference. Cannot use shared_ptr or weak_ptr + // as registration happens in constructor of parent layer and shared_from_this() + // cannot be used before parent layer constructor exits. 
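+  // (Sketch of the ordering problem this avoids, assuming a parent that registers a sublayer
+  //  inside its own constructor: no shared_ptr owns the parent yet at that point, so calling
+  //  parent->shared_from_this() there would throw std::bad_weak_ptr.)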
+ Layer* firstParent_{nullptr}; + std::string name_; + + mutable Mode mode_{Mode::train}; // eval or train ? + +protected: + std::vector namedParameters_; // vector of all named parameters belonging to this specific layer (not recurisve) + std::vector> namedLayers_; // vector of all named sublayers for this specific layer (not recursive) + + // Create a layer parameter with a full name composed of the path to this layer and localName + Expr param(const std::string& localName, const Shape& shape, const Ptr& init) { + std::string fullName = fmt::format("{}->{}", path(), localName); + auto parameter = graph()->param(fullName, shape, init); + namedParameters_.emplace_back(localName, parameter); + return parameter; + } + +public: + Layer(Ptr graph) + : graph_(graph) {} + + virtual ~Layer() = default; + + Ptr graph() { + auto graph = graph_.lock(); + ABORT_IF(!graph, "graph in layer {} expired?", path()); + return graph; + } + + const Ptr graph() const { + auto graph = graph_.lock(); + ABORT_IF(!graph, "graph in layer {} expired?", path()); + return graph; + } + +#if 1 + // @TODO: this should be removed, currently hack to init graph. + void setGraph(Ptr graph) { + graph_ = graph; + for(auto& lr: namedLayers()) + lr.second->setGraph(graph); + } +#endif + + // Dynamic cast to requested layer type. Will return nullptr if not possible + template + Ptr as() { + return std::dynamic_pointer_cast(shared_from_this()); + } + + // Dynamic cast to requested layer type. Will return nullptr if not possible + template + Ptr as() const { + return const_cast(this)->as(); + } + + // Dynamic cast to requested layer type. Will abort if the cast is not possible. + template + Ptr cast() { + auto layerCast = as(); + ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}", + className(), + utils::cxxTypeName()); + return layerCast; + } + + template + Ptr cast() const { + return const_cast(this)->cast(); + } + + // Return all named parameters for this specific layer (not descending into sub-layers) + std::vector& namedParameters() { return namedParameters_; } + const std::vector& namedParameters() const { return namedParameters_; } + + // Return all named layers for this specific layer (not descending into sub-layers) + std::vector>& namedLayers() { return namedLayers_; } + const std::vector>& namedLayers() const { return namedLayers_; } + + // Return all named sub-layers for this layer and its sub-layers (descending recursively into sub-layers). + // Can be used with layer type e.g. allNamedLayers() to return only sub-layers of this type. + // Returned layers will then have the given type and do not need to be cast anymore. + template + std::vector> allNamedLayers() { + std::vector> layers; + for(auto& namedLayer : namedLayers()) { + auto castLayer = namedLayer.second->as(); + if(castLayer) + layers.emplace_back(namedLayer.first, castLayer); + + auto subLayers = namedLayer.second->allNamedLayers(); + layers.insert(layers.end(), subLayers.begin(), subLayers.end()); + } + return layers; + } + + template + std::vector> allNamedLayers() const { + return const_cast(this)->allNamedLayers(); + } + + // Returns all sub-layers (only the layers, not the names) for this layer and its sub-layers (descending + // recursively into sub-layers). Can be used with layer type e.g. allLayers() to return only + // sub-layers of this type. Returned layers will then have the given type and do not need to be cast anymore. 
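+  // A hypothetical usage sketch (`model` and its Linear sublayers are assumed, not defined here):
+  //
+  //   for(auto& namedLinear : model->allNamedLayers<Linear>()) // recursively collects only Linear sublayers
+  //     LOG(info, "{} : {}", namedLinear.first, namedLinear.second->path());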
+  template <class LayerType>
+  std::vector<Ptr<LayerType>> allLayers() {
+    std::vector<Ptr<LayerType>> layers;
+    for(auto namedLayer : allNamedLayers<LayerType>())
+      layers.push_back(namedLayer.second);
+    return layers;
+  }
+
+  template <class LayerType>
+  std::vector<Ptr<LayerType>> allLayers() const {
+    return const_cast<Layer*>(this)->allLayers<LayerType>();
+  }
+
+  // Used by parent layers to set the name of a sub-layer.
+  // @TODO: make this private and only allow friend access from layers before merging with master.
+  // Currently misused for top layer that has no parent layer that can set its name.
+  void setName(const std::string& name) { name_ = name; }
+
+  const std::string& name() const { return name_; }
+
+  // This sets the first parent of a sublayer (the layer a sublayer was first registered with).
+  // This is required to generate the correct path/name for layer parameters at saving time.
+  void setFirstParent(Layer* parent) {
+    ABORT_IF(firstParent_ != nullptr, "Parent layer has already been set");
+    ABORT_IF(parent == this, "Parent layer has to be different from child");
+    firstParent_ = parent;
+  }
+
+  // The parent layer of a sublayer is the first layer the sublayer has been registered with.
+  // Subsequent calls to setFirstParent will abort if the parent is already set.
+  bool registered() const {
+    return firstParent_ != nullptr;
+  }
+
+  std::string path() const {
+    std::vector<std::string> path;
+    if(firstParent_)
+      path.push_back(firstParent_->path());
+    path.push_back(name_);
+    return marian::utils::join(path, "->");
+  }
+
+  std::string layerInfo(bool includeChildren=false) const {
+    std::stringstream ss;
+    std::function<void(const Layer*, int)> recurse;
+    recurse = [&](const Layer* layer, int level) {
+      auto indent = utils::join(std::vector<std::string>(level, " "), "");
+      ss << indent << layer->name() << " : " << layer->className() << std::endl;
+      for(auto& pr: layer->namedParameters())
+        ss << indent << " " << pr.first << " : " << pr.second->shape() << std::endl;
+      if(includeChildren)
+        for(auto& lr: layer->namedLayers())
+          recurse(lr.second.get(), level + 1);
+    };
+    recurse(this, 0);
+    return ss.str();
+  }
+
+  // Return Mode::eval or Mode::train. This is used to determine if training-only layer-internal actions
+  // like dropout should be run. This will not affect graph-internal gradient propagation unless somehow
+  // specified in a layer.
+  Mode getMode() const {
+    #if 1
+    if(graph()->isInference()) {
+      return Mode::eval;
+    } else {
+      return Mode::train;
+    }
+    #else
+    return mode_;
+    #endif
+  }
+
+  // Set mode to Mode::eval for this layer and all sub-layers. This will disable dropout and similar actions.
+  void setEvalMode() {
+    mode_ = Mode::eval;
+    for(auto& lr: namedLayers())
+      lr.second->setEvalMode();
+  }
+
+  // Set mode to Mode::train for this layer and all sub-layers. This will enable dropout and similar actions.
+  void setTrainMode() {
+    mode_ = Mode::train;
+    for(auto& lr: namedLayers())
+      lr.second->setTrainMode();
+  }
+
+  virtual void clear() override {
+    for(auto& lr : namedLayers())
+      lr.second->clear();
+  }
+};
+
+class LayerWithOptions : public Layer {
+protected:
+  Ptr<Options> options_;
+
+public:
+  LayerWithOptions(Ptr<ExpressionGraph> graph, Ptr<Options> options)
+  : Layer(graph), options_(options) {}
+
+  virtual ~LayerWithOptions() = default;
+
+  template <typename T>
+  T opt(const std::string key) const {
+    return options_->get<T>(key);
+  }
+
+  template <typename T>
+  T opt(const std::string key, const T& defaultValue) const {
+    return options_->get<T>(key, defaultValue);
+  }
+};
+
+/**
+ * Wrapper to be used exclusively inside LayerList or other similar containers. This allows using the apply(...)
functions
+ * of a layer without having to cast to a specific type (this is done internally based on the number of arguments). Inspired by
+ * boost::any, which allows constructing containers that hold various types.
+ * This should allow using any layer, and interfaces will be added here as required.
+ */
+class AnyLayer final : public IUnaryLayer,
+                       public IBinaryLayer,
+                       public ITernaryLayer,
+                       public IQuaternaryLayer,
+                       public INaryLayer,
+                       public IClearable {
+private:
+  Ptr<Layer> layer_;
+
+protected:
+  // private/protected constructor, should only be created within listed classes with friendship
+  AnyLayer(const Ptr<Layer>& layer)
+  : layer_(layer) {}
+
+  friend class LayerList;
+
+public:
+  // Dynamic cast to requested layer type. Will return nullptr if not possible
+  template <class LayerType>
+  Ptr<LayerType> as() const {
+    return std::dynamic_pointer_cast<LayerType>(layer_);
+  }
+
+  // Dynamic cast to requested layer type. Will abort if the cast is not possible.
+  template <class LayerType>
+  Ptr<LayerType> cast() const {
+    auto layerCast = as<LayerType>();
+    ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}",
+             layer_->className(),
+             utils::cxxTypeName<LayerType>());
+    return layerCast;
+  }
+
+  Expr apply(Expr input) const override {
+    return cast<IUnaryLayer>()->apply(input);
+  }
+
+  Expr apply(Expr input1, Expr input2) const override {
+    return cast<IBinaryLayer>()->apply(input1, input2);
+  }
+
+  Expr apply(Expr input1, Expr input2, Expr input3) const override {
+    return cast<ITernaryLayer>()->apply(input1, input2, input3);
+  }
+
+  Expr apply(Expr input1, Expr input2, Expr input3, Expr input4) const override {
+    return cast<IQuaternaryLayer>()->apply(input1, input2, input3, input4);
+  }
+
+  Expr apply(const std::vector<Expr>& inputs) const override {
+    return cast<INaryLayer>()->apply(inputs);
+  }
+
+  virtual void clear() override {
+    cast<IClearable>()->clear();
+  }
+};
+
+/**
+ * Holds sublayers in a list and performs correct registration of sublayers. Sublayers are indexed
+ * and can be accessed like array elements, including iteration.
+ * `LayerList` -- in contrast to `Sequential` -- does not provide `apply` functions.
+ * You have to define the execution order and information flow in code.
+ *
+ * See TransformerEncoder for an example where we hold the transformer layer stack in a LayerList,
+ * but define a custom apply function (due to masks being external information and shared between layers).
+ */
+class LayerList : public Layer {
+protected:
+  std::vector<Ptr<AnyLayer>> layers_;
+
+  template <typename Last>
+  void recursiveAppend(Last last) {
+    append(last);
+  }
+
+  template <typename First, typename ...Rest>
+  void recursiveAppend(First first, Rest ...rest) {
+    append(first);
+    recursiveAppend(rest...);
+  }
+
+public:
+  LayerList(Ptr<ExpressionGraph> graph)
+  : Layer(graph) {}
+
+  template <class ...Layers>
+  LayerList(Ptr<ExpressionGraph> graph, Layers ...layers)
+  : Layer(graph) {
+    recursiveAppend(layers...);
+  }
+
+  virtual ~LayerList() = default;
+
+  /**
+   * This inserts an already existing sublayer from this or a different container which will result in
+   * parameter sharing if there are parameters.
+   ```
+   auto layers = New<LayerList>(graph);
+   layers->append(New<Linear>(graph, 100)); // <- creates a new sublayer and registers it.
+   layers->append(layers->at(0)); // <- no new sublayer created or registered; reference the first one.
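+   // (hypothetical follow-up) both entries now resolve to the same underlying layer object,
+   // so layers->at(0)->as<Linear>() == layers->at(1)->as<Linear>() and its parameters are shared.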
+ ``` + */ + void append(const Ptr& layer) { + layers_.push_back(layer); + } + + void append(const Ptr& layer) { + std::string name = fmt::format("at({})->as<{}>()", layers_.size(), layer->className()); + registerLayerWithName(layer, name); + layers_.emplace_back(new AnyLayer(layer)); // not using New<...> because of missing friendship + } + + /** + * Retrieve sublayer at index i + */ + Ptr at(size_t i) const { + return layers_[i]; + } + + auto begin() -> decltype(layers_.begin()) const { + return layers_.begin(); + } + + auto end() -> decltype(layers_.end()) const { + return layers_.end(); + } + + size_t size() const { return layers_.size(); } + + virtual void clear() override { + for(auto& layer : layers_) + layer->clear(); + } +}; + +/** + * `Sequential` is a list of layers similar to `LayerList`, but does provide a set of `apply` functions. + * These function assume that the first element in the container can be a unary, binary, ternary + * or n-ary layer, but all subsequent layers have to be unary layers as they will consume the single + * output of their preceding layer. Non-unary layers will fail to execute during runtime if they are + * not the very first layer. + * + * `Sequential` can be used to implement typical feed forward networks: + * + ``` + using namespace marian::nn; + + auto seq = New(graph, + New(graph, 100), + New(graph), + New(graph, 0.1f), + New(graph, 100), + New(graph), + New(graph) + ); + + Expr output = seq->apply(input); + ``` + * For other application patterns use `LayerList` and implement them yourself by traversing the layers. + */ +class Sequential : public LayerList, + public IUnaryLayer, + public IBinaryLayer, + public ITernaryLayer, + public IQuaternaryLayer, + public INaryLayer { +public: + Sequential(Ptr graph) + : LayerList(graph) {} + + template + Sequential(Ptr graph, Layers ...layers) + : LayerList(graph, layers...) {} + + virtual ~Sequential() = default; + + Expr apply(Expr input) const override { + ABORT_IF(layers_.empty(), "Applying empty Sequential layer?"); + return applyTail(layers_[0]->apply(input)); + } + + Expr apply(Expr input1, Expr input2) const override { + ABORT_IF(layers_.empty(), "Applying empty Sequential layer?"); + return applyTail(layers_[0]->apply(input1, input2)); + } + + Expr apply(Expr input1, Expr input2, Expr input3) const override { + ABORT_IF(layers_.empty(), "Applying empty Sequential layer?"); + return applyTail(layers_[0]->apply(input1, input2, input3)); + } + + Expr apply(Expr input1, Expr input2, Expr input3, Expr input4) const override { + ABORT_IF(layers_.empty(), "Applying empty Sequential layer?"); + return applyTail(layers_[0]->apply(input1, input2, input3, input4)); + } + + Expr apply(const std::vector& inputs) const override { + ABORT_IF(layers_.empty(), "Applying empty Sequential layer?"); + return applyTail(layers_[0]->apply(inputs)); + } + +private: + // apply remaining layers after first layer has been applied. + Expr applyTail(Expr input) const { + Expr output = input; + for(int i = 1; i < layers_.size(); ++i) + output = layers_[i]->apply(output); + return output; + } + +}; + +} // namespace nn +} // namespace marian diff --git a/src/layers_new/neuralnet.cpp b/src/layers_new/neuralnet.cpp new file mode 100644 index 000000000..11f9ae63d --- /dev/null +++ b/src/layers_new/neuralnet.cpp @@ -0,0 +1,24 @@ +#include "layers_new/neuralnet.h" + +namespace marian { +namespace nn { + +// Factory for activation function layers from name as string. 
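+// A hypothetical call-site sketch (`graph` and input `x` assumed); any name handled below works:
+//
+//   auto act = activationLayerByName(graph, "relu");
+//   Expr y = act->apply(x); // same result as calling relu(x) directly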
+Ptr<Activation> activationLayerByName(Ptr<ExpressionGraph> graph, const std::string& actName) {
+  // @TODO: lowercase actName first?
+  if(actName == "relu")
+    return New<ReLU>(graph);
+  else if(actName == "gelu")
+    return New<GELU>(graph);
+  else if(actName == "tanh")
+    return New<Tanh>(graph);
+  else if(actName == "sigmoid")
+    return New<Sigmoid>(graph);
+  else if(actName == "swish")
+    return New<Swish>(graph);
+  else
+    ABORT("Unknown activation function: {}", actName);
+}
+
+} // namespace nn
+} // namespace marian
diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h
new file mode 100644
index 000000000..51f2ef4e3
--- /dev/null
+++ b/src/layers_new/neuralnet.h
@@ -0,0 +1,300 @@
+#pragma once
+
+#include "layers_new/interface.h"
+#include "graph/node_initializers.h"
+
+namespace marian {
+namespace nn {
+
+static inline Expr swapTimeBatch(Expr input) { return swapAxes(atleast_4d(input), -2, -3); }
+
+// @TODO: this is an odd function to have here; this should rather be handled somewhere globally?
+// convert multiplicative 1/0 mask to additive 0/-inf log mask, and transpose to match the result of the bdot() op in Attention()
+static inline Expr transposedLogMask(Expr mask, int dimHeads) {
+  if(!mask)
+    return nullptr;
+
+  // LayerAttention expects the mask in a different layout
+  int dimBatch    = mask->shape()[-3];
+  int dimSrcWords = mask->shape()[-2];
+  mask = reshape(mask, {dimBatch, 1, 1, dimSrcWords}); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
+
+  float maskFactor = std::max(NumericLimits<float>(mask->value_type()).lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16
+  auto logMask = (1 - mask) * maskFactor;
+  logMask = reshape(repeat(logMask, dimHeads, -3), {1, dimBatch * dimHeads, 1, dimSrcWords});
+  return logMask;
+}
+
+/**
+ * A generic Activation function layer. Any unary Marian operator or function accepted by
+ * `std::function<Expr(Expr)>` can be turned into an activation function like this:
+ ```
+ auto reluLayer = New<Activation>(graph, (Expr(*)(Expr))relu)
+ ```
+ * The function pointer cast may be required to disambiguate the operator name if operators
+ * of the same name but with different sets of parameters exist; otherwise it can be dropped
+ * or replaced with a more readable lambda function.
+ *
+ * `Activation` will also accept lambdas for more complex activations:
+ ```
+ // a reasonably accurate approximation of GELU
+ auto geluApprox = New<Activation>(graph, [](Expr x) { return x * sigmoid(1.702f * x); });
+ ```
+ */
+class Activation : public Layer, public IUnaryLayer {
+private:
+  std::function<Expr(Expr)> actFn;
+
+public:
+  Activation(Ptr<ExpressionGraph> graph,
+             const std::function<Expr(Expr)>& actFn)
+    : Layer(graph), actFn(actFn) {}
+
+  virtual ~Activation() = default;
+
+  Expr apply(Expr x) const override {
+    return actFn(x);
+  }
+};
+
+// A ReLU activation function layer defined via `Activation`.
+struct ReLU final : public Activation {
+  ReLU(Ptr<ExpressionGraph> graph) : Activation(graph, (Expr(*)(Expr))relu) {}
+};
+
+// A GELU activation function layer defined via `Activation`.
+struct GELU final : public Activation {
+  GELU(Ptr<ExpressionGraph> graph) : Activation(graph, (Expr(*)(Expr))gelu) {}
+};
+
+// A Tanh activation function layer defined via `Activation`.
+struct Tanh final : public Activation {
+  Tanh(Ptr<ExpressionGraph> graph) : Activation(graph, (Expr(*)(Expr))tanh) {}
+};
+
+// A Sigmoid activation function layer defined via `Activation`.
+struct Sigmoid final : public Activation {
+  Sigmoid(Ptr<ExpressionGraph> graph) : Activation(graph, (Expr(*)(Expr))sigmoid) {}
+};
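+
+// Example (a sketch, assuming an `options` object exists): together with the factory
+// `activationLayerByName` declared below, an activation can be selected from a
+// configuration string, e.g.
+//
+//   Ptr<Activation> act = activationLayerByName(graph, options->get<std::string>("transformer-ffn-activation"));
+//   Expr y = act->apply(x);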
+
+// A Swish activation function layer defined via `Activation`.
+struct Swish final : public Activation {
+  Swish(Ptr<ExpressionGraph> graph) : Activation(graph, (Expr(*)(Expr))swish) {}
+};
+
+// Factory for activation function layers from name as string.
+Ptr<Activation> activationLayerByName(Ptr<ExpressionGraph> graph, const std::string& actName);
+
+// Applies a linear transformation to the incoming data: y = xA^T + b
+struct Linear : public Layer, public IUnaryLayer {
+  Expr weight;
+  Expr bias;
+
+  int dimOut;
+  bool useBias{true};
+  bool transposed{false};
+  Ptr<inits::NodeInitializer> init;
+
+  // Typical constructor that can take an initializer function
+  Linear(Ptr<ExpressionGraph> graph,
+         int dimOut,
+         bool useBias = true,
+         bool transposed = false,
+         Ptr<inits::NodeInitializer> init = inits::glorotUniform())
+    : Layer(graph), dimOut(dimOut), useBias(useBias), transposed(transposed), init(init)
+  {}
+
+  // Alternate constructor which takes a weight parameter that will be re-used, e.g. for tied output weights.
+  // Since the weights are already initialized there is no initializer. The output dimension is initialized from
+  // the given weight parameter.
+  Linear(Ptr<ExpressionGraph> graph,
+         Expr tiedWeight,
+         bool useBias = true,
+         bool transposed = false)
+    : Layer(graph), weight(tiedWeight), dimOut(weight->shape()[-1]), useBias(useBias), transposed(transposed), init(nullptr)
+  {}
+
+  virtual ~Linear() = default;
+
+  Expr apply(Expr x) const override {
+    int dimIn = x->shape()[-1];
+
+    // if the weight is already initialized nothing happens here
+    if(transposed) {
+      registerParameterLazy(weight, Shape({ dimOut, dimIn }), init);
+    } else {
+      registerParameterLazy(weight, Shape({ dimIn, dimOut }), init);
+    }
+
+    if(useBias) {
+      registerParameterLazy(bias, Shape({ dimOut }), inits::zeros());
+    }
+
+    if(useBias)
+      return marian::affine(x, weight, bias, /*transA=*/false, /*transB=*/transposed);
+    else
+      return marian::dot(x, weight, /*transA=*/false, /*transB=*/transposed);
+  }
+};
+
+struct Dropout final : public Layer, public IUnaryLayer {
+  float dropoutProbability;
+  UPtr<Shape> dropoutMaskShape;
+
+  Dropout(Ptr<ExpressionGraph> graph,
+          float dropoutProbability,
+          const Shape& dropoutMaskShape)
+    : Layer(graph), dropoutProbability(dropoutProbability), dropoutMaskShape(new Shape(dropoutMaskShape))
+  {}
+
+  Dropout(Ptr<ExpressionGraph> graph,
+          float dropoutProbability)
+    : Layer(graph), dropoutProbability(dropoutProbability), dropoutMaskShape(nullptr)
+  {}
+
+  Expr apply(Expr input) const override {
+    if(getMode() == Mode::eval)
+      return input;
+
+    if(dropoutMaskShape && dropoutProbability > 0.f) {
+      return marian::dropout(input, dropoutProbability, *dropoutMaskShape);
+    } else if(dropoutProbability > 0.f) {
+      return marian::dropout(input, dropoutProbability, {input->shape()[-2], input->shape()[-1]});
+    } else {
+      return input;
+    }
+  }
+
+  virtual void clear() override {}
+};
+
+struct LinearReluDropout final : public Linear {
+  using Linear::weight;
+  using Linear::bias;
+
+  using Linear::dimOut;
+  using Linear::useBias;
+  using Linear::transposed;
+  using Linear::init;
+
+  float dropoutProbability;
+  UPtr<Shape> dropoutMaskShape;
+
+  // Typical constructor that can take an initializer function
+  LinearReluDropout(Ptr<ExpressionGraph> graph,
+                    int dimOut,
+                    float dropoutProbability,
+                    bool useBias = true,
+                    bool transposed = false,
+                    Ptr<inits::NodeInitializer> init = inits::glorotUniform())
+    : Linear(graph, dimOut, useBias, transposed, init),
+      dropoutProbability(dropoutProbability),
+      dropoutMaskShape(nullptr) {}
+
+  // Alternate constructor that additionally takes a fixed shape for the dropout mask
+  LinearReluDropout(Ptr<ExpressionGraph> graph,
+                    int dimOut,
+                    float dropoutProbability,
+                    const Shape& dropoutMaskShape,
+                    bool useBias = true,
+                    bool transposed = false,
+                    Ptr<inits::NodeInitializer> init = inits::glorotUniform())
+    : Linear(graph, dimOut, useBias, transposed, init),
+      dropoutProbability(dropoutProbability),
+      dropoutMaskShape(new Shape(dropoutMaskShape)) {}
+
+  Expr apply(Expr x) const override {
+    int dimIn = x->shape()[-1];
+
+    // if the weight is already initialized nothing happens here
+    if(transposed) {
+      registerParameterLazy(weight, Shape({ dimOut, dimIn }), init);
+    } else {
+      registerParameterLazy(weight, Shape({ dimIn, dimOut }), init);
+    }
+
+    if(useBias) {
+      registerParameterLazy(bias, Shape({ dimOut }), inits::zeros());
+    }
+
+    // @TODO: handle relu inplace for inference etc.
+    Expr output;
+    if(useBias)
+      output = marian::affine(x, weight, bias, /*transA=*/false, /*transB=*/transposed);
+    else
+      output = marian::dot(x, weight, /*transA=*/false, /*transB=*/transposed);
+
+    if(getMode() == Mode::eval)
+      return relu(output);
+
+    if(dropoutMaskShape && dropoutProbability > 0.f) {
+      return marian::dropoutReluInplace(output, dropoutProbability, *dropoutMaskShape);
+    } else if(dropoutProbability > 0.f) {
+      return marian::dropoutReluInplace(output, dropoutProbability, {output->shape()[-2], output->shape()[-1]});
+    } else {
+      return relu(output);
+    }
+  }
+
+  virtual void clear() override {}
+};
+
+struct Norm : public Layer, public IUnaryLayer {
+  Norm(Ptr<ExpressionGraph> graph) : Layer(graph) {}
+  virtual ~Norm() = default;
+
+  Expr apply(Expr x) const override = 0;
+};
+
+struct LayerNorm final : public Norm {
+  Expr weight;
+  Expr bias;
+
+  float eps{1e-5f};
+  bool elementwiseAffine{true};
+
+  LayerNorm(Ptr<ExpressionGraph> graph,
+            float eps = 1e-5f,
+            bool elementwiseAffine = true)
+    : Norm(graph), eps(eps), elementwiseAffine(elementwiseAffine)
+  {}
+
+  Expr apply(Expr x) const override {
+    int dimModel = x->shape()[-1];
+    if(elementwiseAffine) {
+      registerParameterLazy(weight, Shape({ dimModel }), inits::ones());
+      registerParameterLazy(bias, Shape({ dimModel }), inits::zeros());
+      return marian::layerNorm(x, weight, bias, eps);
+    } else {
+      return marian::layerNorm(x, nullptr, nullptr, eps);
+    }
+  }
+
+  virtual void clear() override {}
+};
+
+struct RMSNorm final : public Norm {
+  Expr weight;
+
+  float eps{1e-5f};
+  bool elementwiseAffine{true};
+
+  RMSNorm(Ptr<ExpressionGraph> graph,
+          float eps = 1e-5f,
+          bool elementwiseAffine = true)
+    : Norm(graph), eps(eps), elementwiseAffine(elementwiseAffine)
+  {}
+
+  Expr apply(Expr x) const override {
+    int dimModel = x->shape()[-1];
+    if(elementwiseAffine) {
+      registerParameterLazy(weight, Shape({ dimModel }), inits::ones());
+      return marian::rmsNorm(x, weight, nullptr, eps);
+    } else {
+      return marian::rmsNorm(x, nullptr, nullptr, eps);
+    }
+  }
+};
+
+} // namespace nn
+} // namespace marian
diff --git a/src/layers_new/rnn.h b/src/layers_new/rnn.h
new file mode 100644
index 000000000..da3ac4f94
--- /dev/null
+++ b/src/layers_new/rnn.h
@@ -0,0 +1,126 @@
+#pragma once
+
+#include "layers_new/interface.h"
+#include "layers_new/neuralnet.h"
+
+namespace marian {
+namespace nn {
+
+struct CellState {
+  Expr recurrent;
+};
+
+struct ICell {
+  virtual std::vector<Expr> applyToInput(Expr input) const = 0;
+  virtual Expr applyToState(const std::vector<Expr>& inputs, Expr mask, Ptr<CellState> state) const = 0;
+};
+
+class SSRU final : public Layer, public ICell {
+protected:
+  using Layer::namedLayers_;
+
+public:
+  Ptr<Linear>  iProj;   // input projection
+  Ptr<Linear>  fProj;   // forget gate projection
+  Ptr<Dropout> dropout;
+
+  int dimState; // state dimension
+
+  SSRU(Ptr<ExpressionGraph> graph, int dimState, float dropProb = 0.f) : Layer(graph), dimState(dimState) {
+    iProj = New<Linear>(graph, dimState, /*useBias=*/false);
+    registerLayer(iProj);
+    fProj = New<Linear>(graph, dimState);
+    registerLayer(fProj);
+    dropout = New<Dropout>(graph, dropProb, Shape({dimState}));
+    registerLayer(dropout);
+  }
+
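+  // Sketch of the SSRU recurrence implemented by the two member functions below
+  // (following "From Research to Production and Back: Ludicrously Fast Neural Machine
+  // Translation", Kim et al. 2019):
+  //   f_t = sigmoid(W_f x_t + b_f)                 // fProj, computed once for all steps in applyToInput()
+  //   c_t = f_t * c_{t-1} + (1 - f_t) * (W x_t)    // highway(...) in applyToState(); iProj provides W x_t
+  //   h_t = relu(c_t)
+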
+  std::vector<Expr> applyToInput(Expr input) const override {
+    int dimModel = input->shape()[-1];
+    ABORT_IF(dimModel != dimState, "Model dimension {} has to match state dimension {}", dimModel, dimState);
+
+    input = dropout->apply(input);
+
+    Expr output = iProj->apply(input);
+    Expr forget = fProj->apply(input);
+
+    return {output, forget};
+  }
+
+  Expr applyToState(const std::vector<Expr>& inputs, Expr mask, Ptr<CellState> state) const override {
+    auto prevRecurrent = state->recurrent;
+    auto input         = inputs[0];
+    auto forget        = inputs[1];
+
+    auto nextRecurrent = highway(/*input1=*/prevRecurrent, /*input2=*/input, /*gate=*/forget); // rename to "gate"?
+    auto nextOutput = relu(nextRecurrent);
+
+    // @TODO: not needed? nextRecurrent = mask ? mask * nextRecurrent : nextRecurrent;
+    state->recurrent = nextRecurrent;
+
+    nextOutput = mask ? mask * nextOutput : nextOutput;
+    return nextOutput;
+  }
+};
+
+template <class CellType>
+class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer {
+protected:
+  using Layer::namedLayers_;
+
+public:
+  Ptr<CellType> cell;
+  Ptr<Linear> oProj;
+
+  RNN(Ptr<ExpressionGraph> graph, int dimState, bool outputProjection = false)
+    : Layer(graph) {
+    cell = New<CellType>(graph, dimState);
+    registerLayer(cell);
+
+    if(outputProjection) {
+      oProj = New<Linear>(graph, dimState);
+      registerLayer(oProj);
+    }
+  }
+
+  virtual Expr apply(Expr input, Expr inputMask = nullptr) const override {
+    auto state = New<DecoderStateItem>(graph()->constant({1, 1, 1, cell->dimState}, inits::zeros()), /*position=*/0);
+    return apply(input, inputMask, state);
+  }
+
+  virtual Expr apply(Expr input, Expr inputMask, Ptr<DecoderState> state) const override {
+    auto cellState = New<CellState>();
+    cellState->recurrent = state->as<DecoderStateItem>()->get();
+
+    input = swapTimeBatch(input); // [beam, time, batch, dim]
+    if(inputMask)
+      inputMask = swapTimeBatch(inputMask);
+    int dimTimeAxis = -3;
+
+    std::vector<Expr> inputs = cell->applyToInput(input);
+
+    std::vector<Expr> outputs;
+    for(int i = 0; i < input->shape()[dimTimeAxis]; ++i) {
+      std::vector<Expr> stepInputs(inputs.size());
+      std::transform(inputs.begin(), inputs.end(), stepInputs.begin(),
+                     [i, dimTimeAxis](Expr e) { return slice(e, dimTimeAxis, i); });
+      auto stepMask = inputMask;
+      if(stepMask)
+        stepMask = slice(inputMask, dimTimeAxis, i);
+
+      Expr output = cell->applyToState(stepInputs, stepMask, /*in/out=*/cellState);
+      outputs.push_back(output);
+    }
+
+    state->as<DecoderStateItem>()->set(cellState->recurrent);
+
+    Expr output = swapTimeBatch(concatenate(outputs, dimTimeAxis));
+    if(oProj)
+      output = oProj->apply(output);
+
+    return output;
+  }
+};
+
+}
+}
\ No newline at end of file
diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h
new file mode 100644
index 000000000..3302d9d85
--- /dev/null
+++ b/src/layers_new/transformer.h
@@ -0,0 +1,553 @@
+#pragma once
+
+#include "layers_new/attention.h"
+#include "layers_new/decoder.h"
+#include "layers_new/embeddings.h"
+#include "layers_new/neuralnet.h"
+#include "layers_new/rnn.h"
+
+#include <cmath>
+
+namespace marian {
+namespace nn {
+
+/**
+ * This groups the typical transformer pre/post-processing steps into a class.
+ * Currently these are usually dropout, layer normalization and skip connections.
+ * A transformer block will usually apply one of them.
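+ *
+ * For example, the typical post-norm transformer post-processing "dan" (dropout, add skip
+ * connection, normalize) could be set up as follows (a sketch; `graph`, `blockInput` and
+ * `blockOutput` are assumed to exist):
+ ```
+ auto post = New<TransformerPrePostProcessor>(graph, /*actionDesc=*/"dan", /*dropoutProbability=*/0.1f);
+ Expr output = post->apply(blockOutput, /*previous=*/blockInput);
+ ```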
+ */
+struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer {
+  Ptr<Dropout> dropout;
+  Ptr<Norm> norm;
+  std::string actionDesc;
+
+  TransformerPrePostProcessor(Ptr<ExpressionGraph> graph,
+                              const std::string& actionDesc,
+                              float dropoutProbability)
+    : Layer(graph),
+      actionDesc(actionDesc)
+  {
+    for(char a : actionDesc) {
+      if(a == 'd') {
+        ABORT_IF(dropout, "Dropout layer already initialized? Did you specify 'd' more than once?");
+        dropout = New<Dropout>(graph, dropoutProbability);
+        registerLayer(dropout);
+      } else if(a == 'n') {
+        ABORT_IF(norm, "Norm layer already initialized? Did you specify 'n' or 'r' more than once?");
+        norm = New<LayerNorm>(graph);
+        registerLayer(norm);
+      } else if(a == 'r') {
+        ABORT_IF(norm, "Norm layer already initialized? Did you specify 'n' or 'r' more than once?");
+        norm = New<RMSNorm>(graph);
+        registerLayer(norm);
+      }
+    }
+  }
+
+  Expr apply(Expr input, Expr previous = nullptr) const override {
+    Expr output = input;
+    for(char action : actionDesc) {
+      if(action == 'd')
+        output = dropout->apply(output);
+      else if(action == 'a' && previous)
+        output = output + previous;
+      else if(action == 'a' && !previous)
+        ABORT("Action 'a' (add skip connection) specified but no previous input given");
+      else if(action == 'n' || action == 'r')
+        output = norm->apply(output);
+      else
+        ABORT("Action '{}' in '{}' unknown", action, actionDesc);
+    }
+    return output;
+  }
+};
+
+/**
+ * This is a typical transformer self-attention block. The default configuration will
+ * use a multi-head multiplicative self-attention layer, followed by dropout, the skip
+ * connection and layer normalization (dan) in the post-processor. The pre-processor does
+ * nothing in the default configuration.
+ */
+class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBinaryLayer {
+public:
+  Ptr<TransformerPrePostProcessor> preprocessor;
+  Ptr<AttentionLayer> selfAttention;
+  Ptr<TransformerPrePostProcessor> postprocessor;
+
+  TransformerSelfAttentionBlock(Ptr<ExpressionGraph> graph,
+                                Ptr<Options> options)
+    : LayerWithOptions(graph, options)
+  {
+    preprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-preprocess", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(preprocessor);
+
+    // @TODO: factory to support different attention flavors?
+    selfAttention = attentionFromOptions(graph, options);
+    registerLayer(selfAttention);
+
+    postprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-postprocess", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(postprocessor);
+  }
+
+  Expr apply(Expr input, Expr mask = nullptr) const override {
+    auto output = preprocessor->apply(input);                    // optional preprocessing
+    output = selfAttention->apply(output, output, output, mask); // self attention, @TODO: make this an IBinaryLayer rather than an IQuaternaryLayer
+    output = postprocessor->apply(output, input);                // optional postprocessing, optional skip connection
+    return output;
+  }
+};
+
+/**
+ * This is a typical transformer filter (1-dimensional convolution) block. The default configuration
+ * will scale up to a larger dimension, apply a ReLU activation and scale down again, followed by dropout,
+ * the skip connection and layer normalization (dan) in the post-processor. The pre-processor does
+ * nothing in the default configuration.
+ */
+struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLayer {
+  Ptr<TransformerPrePostProcessor> preprocessor;
+  Ptr<Sequential> layers;
+  Ptr<TransformerPrePostProcessor> postprocessor;
+  bool isDecoder{false};
+
+  TransformerFilterBlock(Ptr<ExpressionGraph> graph,
+                         Ptr<Options> options,
+                         bool isDecoder = false)
+    : LayerWithOptions(graph, options), isDecoder(isDecoder)
+  {
+    preprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-preprocess", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(preprocessor);
+
+    int modelDim = opt<int>("dim-emb");
+    int ffnDim   = opt<int>("transformer-dim-ffn");
+    if(isDecoder && opt<int>("transformer-decoder-dim-ffn") != 0)
+      ffnDim = opt<int>("transformer-decoder-dim-ffn");
+
+    int depth = opt<int>("transformer-ffn-depth", 2);
+    if(isDecoder && opt<int>("transformer-decoder-ffn-depth") != 0)
+      depth = opt<int>("transformer-decoder-ffn-depth");
+
+    auto actName = opt<std::string>("transformer-ffn-activation", "relu");
+    float ffnDropoutProbability = opt<float>("transformer-dropout-ffn", 0.f);
+
+    ABORT_IF(depth < 1, "Filter depth {} is smaller than 1", depth);
+
+    // assemble filter of given depth
+    layers = New<Sequential>(graph);
+    registerLayer(layers);
+
+    if(actName == "relu") {
+      layers->append(New<LinearReluDropout>(graph, ffnDim, ffnDropoutProbability));
+    } else {
+      layers->append(New<Linear>(graph, ffnDim));
+      layers->append(activationLayerByName(graph, actName));
+      layers->append(New<Dropout>(graph, ffnDropoutProbability));
+    }
+    for(int i = 1; i < depth-1; ++i) {
+      if(actName == "relu") {
+        layers->append(New<LinearReluDropout>(graph, ffnDim, ffnDropoutProbability));
+      } else {
+        layers->append(New<Linear>(graph, ffnDim));
+        layers->append(activationLayerByName(graph, actName));
+        layers->append(New<Dropout>(graph, ffnDropoutProbability));
+      }
+    }
+    layers->append(New<Linear>(graph, modelDim));
+
+    postprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-postprocess", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(postprocessor);
+  }
+
+  Expr apply(Expr input) const override {
+    Expr output = preprocessor->apply(input);     // optional preprocessing
+    output = layers->apply(output);               // main FFN
+    output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection
+    return output;
+  }
+};
+
+/**
+ * A full transformer encoder layer consists of a self-attention block followed by
+ * a filter block. Skip connections etc. are handled inside the blocks, see above.
+ */
+struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLayer {
+  Ptr<TransformerSelfAttentionBlock> selfAttentionBlock;
+  Ptr<TransformerFilterBlock> filterBlock;
+
+  TransformerEncoderLayer(Ptr<ExpressionGraph> graph,
+                          Ptr<Options> options)
+    : LayerWithOptions(graph, options)
+  {
+    selfAttentionBlock = New<TransformerSelfAttentionBlock>(graph, options);
+    registerLayer(selfAttentionBlock);
+
+    filterBlock = New<TransformerFilterBlock>(graph, options);
+    registerLayer(filterBlock);
+  }
+
+  Expr apply(Expr input, Expr mask = nullptr) const override {
+    Expr output = selfAttentionBlock->apply(input, mask);
+    output = filterBlock->apply(output);
+
+    checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual)
+
+    return output;
+  }
+};
+
+/**
+ * A full transformer encoder stack. Before applying multiple transformer layers (the depth of the encoder), we
+ * add positional embeddings and apply post-processing actions to the combined embeddings. Due to backward compatibility
+ * with RNN models and for easier beam search we transpose batch and time dimensions on input and output.
+ * @TODO: get rid of these transposes.
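+ *
+ * A usage sketch (assuming `batchEmbedding` and `batchMask` in the same time-major layout that
+ * the legacy RNN-style models produce):
+ ```
+ auto encoder = New<TransformerEncoder>(graph, options);
+ Expr context = encoder->apply(batchEmbedding, batchMask); // same layout on input and output
+ ```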
+ */
+struct TransformerEncoder final : public LayerWithOptions, public IBinaryLayer {
+  Ptr<PositionEmbeddingLayer> positionEmbedding;
+  Ptr<TransformerPrePostProcessor> preprocessor;
+  Ptr<LayerList> layers;
+  Ptr<TransformerPrePostProcessor> postprocessor;
+
+  TransformerEncoder(Ptr<ExpressionGraph> graph,
+                     Ptr<Options> options)
+    : LayerWithOptions(graph, options)
+  {
+    positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2);
+    registerLayer(positionEmbedding);
+
+    preprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-postprocess-emb", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(preprocessor);
+
+    layers = New<LayerList>(graph);
+    registerLayer(layers);
+    for(int i = 0; i < opt<int>("enc-depth"); ++i) {
+      auto transformerEncoderLayer = New<TransformerEncoderLayer>(graph, options);
+      // example of changing linear layer init functions buried deep in the model
+      if(opt<bool>("transformer-depth-scaling", false))
+        for(auto linear : transformerEncoderLayer->allLayers<Linear>())
+          linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1));
+
+      layers->append(transformerEncoderLayer);
+    }
+
+    postprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-postprocess-top", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(postprocessor);
+  }
+
+  Expr apply(Expr input, Expr mask = nullptr) const override {
+    // first and last operations (see at the bottom of this function) switch the time and batch
+    // dimensions. This order is more natural for the transformer, but more difficult to handle
+    // during beam search or when using RNNs. Hence the input/output transpositions here.
+
+    // @TODO: still worth reviewing this whole transpose business across the tool. In the
+    // decoder state, Frank added information about batchMajor/timeMajor orientation. If we
+    // do that everywhere we can detect inconsistencies automatically.
+    // reorganize batch and timestep
+    auto output = swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim]
+    if(mask) {
+      mask = swapTimeBatch(mask);       // [beam depth=1, batch size, max length, vector dim=1]
+      mask = transposedLogMask(mask, opt<int>("transformer-heads"));
+    }
+
+    // apply positional embeddings to contextual input
+    output = positionEmbedding->apply(output);
+
+    // handle for skip connection at top
+    auto prevOutput = output;
+
+    // apply dropout or layer-norm to embeddings if required
+    output = preprocessor->apply(output);
+
+    // traverse the layers, use the same mask for each
+    for(auto layer : *layers)
+      output = layer->apply(output, mask);
+
+    // apply final postprocessor if required, e.g. final layer-norm for pre-norm or final skip connection
+    output = postprocessor->apply(output, prevOutput);
+
+    // restore organization of batch and time steps. This is currently required
+    // to make RNN-based decoders and beam search work with this. We are looking
+    // into making this more natural.
+
+    // @TODO: it might be worth making this optional when the input goes into a
+    // transformer decoder which now has to undo that again -- or even better,
+    // detect idempotent transposes during a process similar to auto-batching.
+    // Or, as other toolkits do it, make the transformer order the default and only transpose for RNNs.
+    output = swapTimeBatch(output); // [beam depth=1, max length, batch size, vector dim]
+    return output;
+  }
+};
+
+/**
+ * This is a typical transformer cross-attention block. The default configuration will
+ * use a multi-head multiplicative cross-attention layer, followed by dropout, the skip
+ * connection and layer normalization (dan) in the post-processor. The pre-processor does
+ * nothing in the default configuration.
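+ *
+ * In `apply(input, context, contextMask)` below, queries are computed from the decoder-side
+ * `input`, while keys and values are computed from the encoder-side `context`.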
+ */
+class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITernaryLayer {
+public:
+  Ptr<TransformerPrePostProcessor> preprocessor;
+  Ptr<AttentionLayer> crossAttention;
+  Ptr<TransformerPrePostProcessor> postprocessor;
+
+  TransformerCrossAttentionBlock(Ptr<ExpressionGraph> graph,
+                                 Ptr<Options> options)
+    : LayerWithOptions(graph, options)
+  {
+    preprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-preprocess", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(preprocessor);
+
+    // @TODO: factory to support different attention flavors?
+    crossAttention = attentionFromOptions(graph, options);
+    registerLayer(crossAttention);
+
+    postprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-postprocess", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(postprocessor);
+  }
+
+  Expr apply(Expr input, Expr context, Expr contextMask = nullptr) const override {
+    auto output = preprocessor->apply(input);                              // optional preprocessing
+    output = crossAttention->apply(output, context, context, contextMask); // cross attention, @TODO: make this an ITernaryLayer rather than an IQuaternaryLayer
+    output = postprocessor->apply(output, input);                          // optional postprocessing, optional skip connection
+    return output;
+  }
+};
+
+#if 1
+
+class TransformerAutoRegressiveBlock : public LayerWithOptions, public IBinaryDecoderLayer {
+public:
+  TransformerAutoRegressiveBlock(Ptr<ExpressionGraph> graph,
+                                 Ptr<Options> options)
+    : LayerWithOptions(graph, options) {}
+
+  virtual ~TransformerAutoRegressiveBlock() = default;
+
+  using IBinaryDecoderLayer::apply;
+};
+
+/**
+ * This is a transformer RNN block.
+ */
+class TransformerRNNBlock final : public TransformerAutoRegressiveBlock {
+public:
+  Ptr<TransformerPrePostProcessor> preprocessor;
+  Ptr<RNN<SSRU>> rnn;
+  Ptr<TransformerPrePostProcessor> postprocessor;
+
+  TransformerRNNBlock(Ptr<ExpressionGraph> graph,
+                      Ptr<Options> options)
+    : TransformerAutoRegressiveBlock(graph, options)
+  {
+    preprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-preprocess", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(preprocessor);
+
+    // @TODO: factory to support different attention flavors?
+    rnn = New<RNN<SSRU>>(graph, opt<int>("dim-emb"), opt<bool>("transformer-rnn-projection", false));
+    registerLayer(rnn);
+
+    postprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-postprocess", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(postprocessor);
+  }
+
+  Expr apply(Expr input, Expr inputMask, Ptr<DecoderState> state) const override {
+    auto output = preprocessor->apply(input);      // optional preprocessing
+    output = rnn->apply(output, inputMask, state); // rnn application with state extension
+    output = postprocessor->apply(output, input);  // optional postprocessing, optional skip connection
+    return output;
+  }
+};
+
+/**
+ * A full transformer decoder layer consists of a self-attention block followed by
+ * a cross-attention block and a filter block. Skip connections etc. are handled inside
+ * the blocks, see above.
+ *
+ * For the self-attention block we need a special mask, usually a triangle mask that
+ * prohibits looking into the future.
+ * @TODO: should the triangle mask be constructed locally here? That would make sense, but is expensive
+ * for many layers.
+ */
+struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaternaryDecoderLayer {
+  Ptr<TransformerAutoRegressiveBlock> autoRegressiveBlock;
+  Ptr<TransformerCrossAttentionBlock> crossAttentionBlock;
+  Ptr<TransformerFilterBlock> filterBlock;
+
+  TransformerDecoderLayer(Ptr<ExpressionGraph> graph,
+                          Ptr<Options> options)
+    : LayerWithOptions(graph, options)
+  {
+    auto autoRegressionType = opt<std::string>("transformer-decoder-autoreg", "self-attention");
+    if(autoRegressionType == "self-attention") {
+      ABORT("Auto-regression block type {} not yet implemented", autoRegressionType);
+    } else if(autoRegressionType == "rnn") {
+      autoRegressiveBlock = New<TransformerRNNBlock>(graph, options);
+    } else {
+      ABORT("Unknown auto-regression block type {}", autoRegressionType);
+    }
+    registerLayer(autoRegressiveBlock);
+
+    crossAttentionBlock = New<TransformerCrossAttentionBlock>(graph, options);
+    registerLayer(crossAttentionBlock);
+
+    filterBlock = New<TransformerFilterBlock>(graph, options, /*isDecoder=*/true);
+    registerLayer(filterBlock);
+  }
+
+  Expr apply(Expr input, Expr inputMask, Expr context, Expr contextMask, Ptr<DecoderState> state) const override {
+    Expr output = autoRegressiveBlock->apply(input, inputMask, state);
+    output = crossAttentionBlock->apply(output, context, contextMask);
+    output = filterBlock->apply(output);
+
+    checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual)
+    return output;
+  }
+};
+
+/**
+ * A full transformer decoder stack. Before applying multiple transformer layers (the depth of the decoder), we
+ * add positional embeddings and apply post-processing actions to the combined embeddings. Due to backward compatibility
+ * with RNN models and for easier beam search we transpose batch and time dimensions on input and output.
+ * @TODO: get rid of these transposes.
+ */
+struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDecoderLayer {
+  Ptr<PositionEmbeddingLayer> positionEmbedding;
+  Ptr<TransformerPrePostProcessor> preprocessor;
+  Ptr<LayerList> layers;
+  Ptr<TransformerPrePostProcessor> postprocessor;
+
+  TransformerDecoder(Ptr<ExpressionGraph> graph,
+                     Ptr<Options> options)
+    : LayerWithOptions(graph, options)
+  {
+    positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2);
+    registerLayer(positionEmbedding);
+
+    preprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-postprocess-emb", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(preprocessor);
+
+    size_t decDepth = opt<size_t>("dec-depth");
+    std::vector<size_t> tiedLayers = opt<std::vector<size_t>>("transformer-tied-layers", std::vector<size_t>());
+    ABORT_IF(!tiedLayers.empty() && tiedLayers.size() != decDepth,
+             "Specified layer tying for {} layers, but decoder has {} layers",
+             tiedLayers.size(),
+             decDepth);
+    // shift to base-0 indexing
+    for(auto& layerNo : tiedLayers)
+      layerNo = layerNo - 1;
+
+    layers = New<LayerList>(graph);
+    registerLayer(layers);
+    for(size_t i = 0; i < decDepth; ++i) {
+      if(tiedLayers.empty() || tiedLayers[i] == i) { // not tied or tied to itself, so needs to be created first
+        auto transformerDecoderLayer = New<TransformerDecoderLayer>(graph, options);
+        layers->append(transformerDecoderLayer);
+      } else {
+        ABORT_IF(tiedLayers[i] > i, "Cannot tie to a layer above this layer??");
+        layers->append(layers->at(tiedLayers[i])); // repeat layer to tie weights
+      }
+
+      auto currentLayer = layers->at(i)->as<TransformerDecoderLayer>();
+      // example of changing linear layer init functions buried deep in the model
+      if(opt<bool>("transformer-depth-scaling", false)) {
+        auto autoRegLayer = currentLayer->autoRegressiveBlock->as<TransformerRNNBlock>();
+        autoRegLayer->rnn->oProj->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1));
+
+        for(auto linear : currentLayer->crossAttentionBlock->allLayers<Linear>())
+          linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1));
+
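+        // the filter block's linear layers receive the same depth-scaled initialization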
+        for(auto linear : currentLayer->filterBlock->allLayers<Linear>())
+          linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1));
+      }
+    }
+
+    postprocessor = New<TransformerPrePostProcessor>(
+      graph,
+      opt<std::string>("transformer-postprocess-top", ""),
+      opt<float>("transformer-dropout", 0.f));
+    registerLayer(postprocessor);
+  }
+
+  Expr apply(Expr input, Expr inputMask, Expr context, Expr contextMask, Ptr<DecoderState> state) const override {
+    // first and last operations (see at the bottom of this function) switch the time and batch
+    // dimensions. This order is more natural for the transformer, but more difficult to handle
+    // during beam search or when using RNNs. Hence the input/output transpositions here.
+    Expr output = swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim]
+    context = swapTimeBatch(context);
+
+    // @TODO: write a function prepareMasks();
+    // @TODO: create the triangle mask here and combine it with inputMask
+    LOG_ONCE(info, "Don't forget the triangle mask if required!");
+    if(inputMask) {
+      inputMask = swapTimeBatch(inputMask); // [beam depth=1, batch size, max length, vector dim=1]
+    }
+
+    if(contextMask) {
+      contextMask = swapTimeBatch(contextMask); // [beam depth=1, max length, batch size, vector dim=1]
+      contextMask = transposedLogMask(contextMask, opt<int>("transformer-heads")); // [beam broadcast=1, batch size * num heads, max length broadcast=1, max length]
+    }
+
+    // apply positional embeddings to contextual input @TODO: remove the need for the conversion to int
+    output = positionEmbedding->apply(output, (int)state->getPosition());
+
+    // handle for skip connection at top
+    auto prevOutput = output;
+
+    // apply dropout or layer-norm to embeddings if required
+    output = preprocessor->apply(output);
+
+    // get an iterator to per-layer states
+    auto layerStateIt = state->as<DecoderStateList>()->begin();
+    // traverse the layers, use the same mask for each
+    for(auto layer : *layers)
+      output = layer->as<TransformerDecoderLayer>()->apply(output, inputMask, context, contextMask, /*in/out=*/*layerStateIt++);
+
+    // apply the final postprocessor if required, e.g. final layer-norm for pre-norm or final skip connection
+    output = postprocessor->apply(output, prevOutput);
+
+    // restore organization of batch and time steps. This is currently required
+    // to make RNN-based decoders and beam search work with this. We are looking
+    // into making this more natural.
+    // @TODO: it might be worth making this optional when the input goes into a
+    // transformer decoder which now has to undo that again -- or even better,
+    // detect idempotent transposes during a process similar to auto-batching.
+    // Or, as other toolkits do it, make the transformer order the default and only transpose for RNNs.
+ output = swapTimeBatch(output); // [beam depth=1, max length, batch size, vector dim] + return output; + } +}; +#endif + +} // namespace nn +} // namespace marian diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 5a317019d..17ee2a4d9 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -12,6 +12,7 @@ #include "models/s2s.h" #include "models/laser.h" #include "models/transformer_factory.h" +#include "models/transformer_new.h" #ifdef CUDNN #include "models/char_s2s.h" @@ -183,20 +184,43 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti .construct(graph); } - else if(type == "transformer") { -#if 1 + else if(type == "transformer-new") { auto newOptions = options->with("usage", use); auto res = New(graph, newOptions); - res->push_back(New(graph, newOptions->with("type", "transformer"))); - res->push_back(New(graph, newOptions->with("type", "transformer"))); + + auto enc = New(graph, newOptions->with("type", "transformer")); + enc->setName("TransformerBatchEncoder"); + res->push_back(enc); + + auto dec = New(graph, newOptions->with("type", "transformer")); + dec->setName("TransformerBatchDecoder"); + res->push_back(dec); + return res; -#else - return models::encoder_decoder(options->with( - "usage", use)) - .push_back(models::encoder()("type", "transformer")) - .push_back(models::decoder()("type", "transformer")) - .construct(graph); -#endif + } + + else if(type == "transformer") { + const char* tflavor = std::getenv("TRANSFORMER_FLAVOR"); + if(tflavor && std::strcmp(tflavor, "experimental") == 0) { + auto newOptions = options->with("usage", use); + auto res = New(graph, newOptions); + + auto enc = New(graph, newOptions->with("type", "transformer")); + enc->setName("TransformerBatchEncoder"); + res->push_back(enc); + + auto dec = New(graph, newOptions->with("type", "transformer")); + dec->setName("TransformerBatchDecoder"); + res->push_back(dec); + + return res; + } else { + auto newOptions = options->with("usage", use); + auto res = New(graph, newOptions); + res->push_back(New(graph, newOptions->with("type", "transformer"))); + res->push_back(New(graph, newOptions->with("type", "transformer"))); + return res; + } } else if(type == "transformer_s2s") { diff --git a/src/models/s2s.h b/src/models/s2s.h index 8eb2ef8d1..cfab3fcae 100644 --- a/src/models/s2s.h +++ b/src/models/s2s.h @@ -246,7 +246,7 @@ class DecoderS2S : public DecoderBase { } rnn::States startStates(opt("dec-depth"), {start, start}); - return New(startStates, Logits(), encStates, batch); + return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); } virtual Ptr step(Ptr graph, @@ -341,8 +341,7 @@ class DecoderS2S : public DecoderBase { logits = output_->applyAsLogits({embeddings, decoderContext}); // return unormalized(!) 
probabilities - auto nextState = New( - decoderStates, logits, state->getEncoderStates(), state->getBatch()); + auto nextState = New(decoderStates, logits, state->getEncoderStates(), state->getBatch(), /*isBatchMajor=*/false); // Advance current target token position by one nextState->setPosition(state->getPosition() + 1); @@ -351,8 +350,7 @@ class DecoderS2S : public DecoderBase { // helper function for guided alignment virtual const std::vector getAlignments(int i = 0) override { - auto att - = rnn_->at(0)->as()->at(i + 1)->as(); + auto att = rnn_->at(0)->as()->at(i + 1)->as(); return att->getAlignments(); } diff --git a/src/models/states.h b/src/models/states.h index 20dd59c95..a4be3795e 100644 --- a/src/models/states.h +++ b/src/models/states.h @@ -21,19 +21,16 @@ class EncoderState { virtual Expr getContext() const { return context_; } virtual Expr getAttended() const { return context_; } - virtual Expr getMask() const { - return mask_; - } // source batch mask; may have additional positions suppressed - + virtual Expr getMask() const { return mask_; } + + // source batch mask; may have additional positions suppressed virtual const Words& getSourceWords() { return batch_->front()->data(); } // Sub-select active batch entries from encoder context and context mask - Ptr select( - const std::vector& batchIndices) { // [batchIndex] indices of active batch entries + Ptr select(const std::vector& batchIndices) { // [batchIndex] indices of active batch entries // Dimension -2 is OK for both, RNN and Transformer models as the encoder context in Transformer // gets transposed to the same dimension layout - return New( - index_select(context_, -2, batchIndices), index_select(mask_, -2, batchIndices), batch_); + return New(index_select(context_, -2, batchIndices), index_select(mask_, -2, batchIndices), batch_); } }; @@ -43,6 +40,7 @@ class DecoderState { Logits logProbs_; std::vector> encStates_; Ptr batch_; + bool isBatchMajor_{false}; Expr targetHistoryEmbeddings_; // decoder history (teacher-forced or from decoding), embedded Expr targetMask_; @@ -55,8 +53,9 @@ class DecoderState { DecoderState(const rnn::States& states, Logits logProbs, const std::vector>& encStates, - Ptr batch) - : states_(states), logProbs_(logProbs), encStates_(encStates), batch_(batch) {} + Ptr batch, + bool isBatchMajor = false) + : states_(states), logProbs_(logProbs), encStates_(encStates), batch_(batch), isBatchMajor_(isBatchMajor) {} virtual ~DecoderState() {} // @TODO: Do we need all these to be virtual? @@ -64,9 +63,9 @@ class DecoderState { virtual Logits getLogProbs() const { return logProbs_; } virtual void setLogProbs(Logits logProbs) { logProbs_ = logProbs; } + virtual bool isBatchMajor() { return isBatchMajor_; } - // @TODO: should this be a constructor? Then derived classes can call this without the New<> in - // the loop + // @TODO: should this be a constructor? Then derived classes can call this without the New<> in the loop virtual Ptr select( const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] const std::vector& batchIndices, // [batchIndex] @@ -75,15 +74,14 @@ class DecoderState { for(auto& es : encStates_) // If the size of the batch dimension of the encoder state context changed, subselect the // correct batch entries - newEncStates.push_back( - es->getContext()->shape()[-2] == batchIndices.size() ? es : es->select(batchIndices)); + newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? 
es : es->select(batchIndices)); // hypindices matches batchIndices in terms of batch dimension, so we only need hypIndices - auto selectedState - = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/false), - logProbs_, - newEncStates, - batch_); + auto selectedState = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/isBatchMajor_), + logProbs_, + newEncStates, + batch_, + isBatchMajor_); // Set positon of new state based on the target token position of current state selectedState->setPosition(getPosition()); diff --git a/src/models/transformer.h b/src/models/transformer.h index 1fed868b6..a3f6d9b53 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -285,6 +285,7 @@ class Transformer : public EncoderOrDecoderBase { auto Wq = graph_->param(prefix + "_Wq", {dimModel, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f)); auto bq = graph_->param(prefix + "_bq", { 1, dimModel}, inits::zeros()); auto qh = affine(q, Wq, bq); + qh = SplitHeads(qh, dimHeads); // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim] Expr kh; @@ -633,35 +634,6 @@ class EncoderTransformer : public Transformer { virtual void clear() override {} }; -class TransformerState : public DecoderState { -public: - TransformerState(const rnn::States& states, - Logits logProbs, - const std::vector>& encStates, - Ptr batch) - : DecoderState(states, logProbs, encStates, batch) {} - - virtual Ptr select(const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] - const std::vector& batchIndices, // [batchIndex] - int beamSize) const override { - - // @TODO: code duplication with DecoderState only because of isBatchMajor=true, should rather be a contructor argument of DecoderState? - - std::vector> newEncStates; - for(auto& es : encStates_) - // If the size of the batch dimension of the encoder state context changed, subselect the correct batch entries - newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? es : es->select(batchIndices)); - - // Create hypothesis-selected state based on current state and hyp indices - auto selectedState = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/true), logProbs_, newEncStates, batch_); - - // Set the same target token position as the current state - // @TODO: This is the same as in base function. 
- selectedState->setPosition(getPosition()); - return selectedState; - } -}; - class DecoderTransformer : public Transformer { typedef Transformer Base; using Base::Base; @@ -718,12 +690,11 @@ class DecoderTransformer : public Transformer { start->set_name("decoder_start_state_" + std::to_string(batchIndex_)); rnn::States startStates(opt("dec-depth"), {start, start}); - // don't use TransformerState for RNN layers - return New(startStates, Logits(), encStates, batch); + return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); } else { rnn::States startStates; - return New(startStates, Logits(), encStates, batch); + return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/true); } } @@ -825,7 +796,7 @@ class DecoderTransformer : public Transformer { rnn::State prevDecoderState; if(prevDecoderStates.size() > 0) prevDecoderState = prevDecoderStates[i]; - + // self-attention std::string layerType = opt("transformer-decoder-autoreg", "self-attention"); rnn::State decoderState; @@ -903,7 +874,6 @@ class DecoderTransformer : public Transformer { auto decoderContext = transposeTimeBatch(query); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] //************************************************************************// - // final feed-forward layer (output) if(shortlist_) output_->setShortlist(shortlist_); @@ -912,11 +882,9 @@ class DecoderTransformer : public Transformer { // return unormalized(!) probabilities Ptr nextState; if (opt("transformer-decoder-autoreg", "self-attention") == "rnn") { - nextState = New( - decoderStates, logits, state->getEncoderStates(), state->getBatch()); + nextState = New(decoderStates, logits, state->getEncoderStates(), state->getBatch(), state->isBatchMajor()); } else { - nextState = New( - decoderStates, logits, state->getEncoderStates(), state->getBatch()); + nextState = New(decoderStates, logits, state->getEncoderStates(), state->getBatch(), state->isBatchMajor()); } nextState->setPosition(state->getPosition() + 1); return nextState; diff --git a/src/models/transformer_factory.h b/src/models/transformer_factory.h index b282d819c..46df741b0 100644 --- a/src/models/transformer_factory.h +++ b/src/models/transformer_factory.h @@ -3,10 +3,172 @@ #include "marian.h" +#include "layers_new/neuralnet.h" #include "models/decoder.h" #include "models/encoder.h" +#include "models/encoder_decoder.h" namespace marian { Ptr NewEncoderTransformer(Ptr graph, Ptr options); Ptr NewDecoderTransformer(Ptr graph, Ptr options); + +class TransformerLegacy : public EncoderDecoder { +public: + TransformerLegacy(Ptr graph, Ptr options) + : EncoderDecoder(graph, options), nameMap_(createNameMap()) { } + + void load(Ptr graph, + const std::vector& items, + bool markedReloaded = true) override { + + for(auto it = items.begin(); it != items.end(); it++) { + auto pair = nameMap_.find(it->name); + if(pair != nameMap_.end()) { + LOG(debug, "Mapping parameter {} to {}", it->name, pair->second); + const_cast(*it).name = pair->second; + + // reduce shape of bias vectors from {1, dimModel} to {dimModel} + int dimModel = it->shape[-1]; + if(it->shape == Shape({1, dimModel})) + const_cast(*it).shape = Shape({dimModel}); + } else { + LOG(debug, "Could not find parameter {}", it->name); + } + } + + // in the new model, linear layers are transposed; we undo that here. 
+ // @TODO: alternatively, we can transpose the item data + auto encoder = std::dynamic_pointer_cast(encoders_[0]); + ABORT_IF(!encoder, "Could not cast to new type of encoder??"); + for(auto& linear : encoder->allLayers()) + linear->transposed = false; + + auto decoder = std::dynamic_pointer_cast(decoders_[0]); + ABORT_IF(!decoder, "Could not cast to new type of decoder??"); + for(auto& linear : decoder->allLayers()) + linear->transposed = false; + + // load items into the graph + graph->load(items); + } + + void load(Ptr graph, + const std::string& name, + bool markReloaded = true) override { + LOG(info, "Loading model from {}", name); + auto items = io::loadItems(name); + load(graph, items, markReloaded); + } + +private: + std::map nameMap_; + + std::map createNameMap() { + std::map nameMap = { + {"Wemb", "Wemb"}, + }; + + // @TODO: This is going to change + std::string prefix = "TransformerBatchEncoder"; + + std::string key, value; + for(int layerNo = 0; layerNo < opt("enc-depth"); ++layerNo) { + // name maps for encoder self-attention blocks + nameMap[fmt::format("encoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->bias", prefix, layerNo); + + nameMap[fmt::format("encoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->bias", prefix, layerNo); + + nameMap[fmt::format("encoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->bias", prefix, layerNo); + + nameMap[fmt::format("encoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->bias", prefix, layerNo); + + nameMap[fmt::format("encoder_l{}_self_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->bias", prefix, layerNo); + + // name maps for encoder FFN blocks + int mult = 3; + for(int ffnLayerNo = 0; ffnLayerNo < opt("transformer-ffn-depth"); ++ffnLayerNo) { + std::string layerType = "Linear"; + // multiplying with 3 since in new model activation and dropout are also layers that are always added + if(opt("transformer-ffn-activation") == "relu" && ffnLayerNo < opt("transformer-ffn-depth") - 1) { + mult = 1; + layerType = "LinearReluDropout"; + } + nameMap[fmt::format("encoder_l{}_ffn_W{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->filterBlock->layers->at({})->as()->weight", prefix, layerNo, mult * ffnLayerNo, layerType); + nameMap[fmt::format("encoder_l{}_ffn_b{}", 
layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->filterBlock->layers->at({})->as()->bias", prefix, layerNo, mult * ffnLayerNo, layerType); + } + nameMap[fmt::format("encoder_l{}_ffn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->filterBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_ffn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->filterBlock->postprocessor->norm->bias", prefix, layerNo); + } + + prefix = "TransformerBatchDecoder"; + for(int layerNo = 0; layerNo < opt("dec-depth"); ++layerNo) { + // name maps for decoder self-attention blocks + nameMap[fmt::format("decoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_self_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->bias", prefix, layerNo); + + // name maps for decoder SSRU + nameMap[fmt::format("decoder_l{}_rnn_W", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->iProj->weight", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_rnn_Wf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_bf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_rnn_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, 
layerNo); + nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); + + // name maps for decoder cross-attention blocks + nameMap[fmt::format("decoder_l{}_context_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_context_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_context_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_context_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_context_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->bias", prefix, layerNo); + + // name maps for decoder FFN blocks + int mult = 3; + for(int ffnLayerNo = 0; ffnLayerNo < opt("transformer-ffn-depth"); ++ffnLayerNo) { + std::string layerType = "Linear"; + // multiplying with 3 since in new model activation and dropout are also layers that are always added + if(opt("transformer-ffn-activation") == "relu" && ffnLayerNo < opt("transformer-ffn-depth") - 1) { + mult = 1; + layerType = "LinearReluDropout"; + } + nameMap[fmt::format("decoder_l{}_ffn_W{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->weight", prefix, layerNo, mult * ffnLayerNo, layerType); + nameMap[fmt::format("decoder_l{}_ffn_b{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->bias", prefix, layerNo, mult * ffnLayerNo, layerType); + } + nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->bias", prefix, layerNo); + } + + return nameMap; + } +}; + } // namespace marian diff --git a/src/models/transformer_new.h b/src/models/transformer_new.h new file mode 100644 index 000000000..cfc3a6b14 --- /dev/null +++ b/src/models/transformer_new.h @@ -0,0 +1,245 @@ 
+#pragma once + +#include "layers_new/transformer.h" + +#include "models/encoder.h" +#include "models/decoder.h" +#include "models/states.h" +#include "layers/constructors.h" + +namespace marian { + +// Wrapper for backwards compatibility that uses current encoder/decoder framework +struct TransformerBatchEncoder : public nn::LayerWithOptions, + public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings + public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? + Ptr encoder; + + TransformerBatchEncoder(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options), + EncoderBase(graph, options) + { + encoder = New(graph, options); + registerLayer(encoder); + } + + // @TODO: subBatch should be of type Expr + virtual std::tuple apply(Ptr subBatch) const override { + // @TODO: this is still using the bad old interface + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + const auto& [batchEmbedding, batchMask] = embeddingLayer->apply(subBatch); + auto batchContext = encoder->apply(batchEmbedding, batchMask); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return std::make_tuple(batchContext, batchMask); + } + + virtual Expr apply(const Words& words, const Shape& shape) const override final { + return applyIndices(toWordIndexVector(words), shape); + } + + // alternative from indices directly + virtual Expr applyIndices(const std::vector& wordIndices, const Shape& shape) const override final { + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + Expr batchEmbedding = embeddingLayer->applyIndices(wordIndices, shape); + auto batchContext = encoder->apply(batchEmbedding, /*mask=*/nullptr); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return batchContext; + } + + // @TODO: currently here for backwards compat, should be replaced with apply() + virtual Ptr build(Ptr graph, + Ptr batch) override { +#if 1 + // @TODO: this should be removed, currently hack to init graph. Should happen in graph groups and constructors + EncoderBase::graph_ = graph; + setGraph(graph); + // This makes sure that the graph passed into the model during construction and now evaluation are identical. + // A good check to have for catching weird situations early. 
+ ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]); + return New(batchEmbedding, batchMask, batch); + } + + virtual void clear() override { + Layer::clear(); + } +}; + +// Wrapper for backwards compatibility that uses current encoder/decoder framework +class TransformerBatchDecoder : public nn::LayerWithOptions, + public DecoderBase { + + Ptr decoder; + Ptr output_; + + void lazyCreateOutputLayer() + { + using db = DecoderBase; + + if(output_) // create it lazily + return; + + int dimTrgVoc = db::opt>("dim-vocabs")[batchIndex_]; + + auto outputFactory = mlp::OutputFactory( + "prefix", prefix_ + "_ff_logit_out", + "dim", dimTrgVoc, + "vocab", db::opt>("vocabs")[batchIndex_], // for factored outputs + "output-omit-bias", db::opt("output-omit-bias", false), + "output-approx-knn", db::opt>("output-approx-knn", {}), + "lemma-dim-emb", db::opt("lemma-dim-emb", 0), + "lemma-dependency", db::opt("lemma-dependency", ""), // for factored outputs + "factors-combine", db::opt("factors-combine", "")); // for factored outputs + + if(db::opt("tied-embeddings") || db::opt("tied-embeddings-all")) + outputFactory.tieTransposed(db::opt("tied-embeddings-all") || db::opt("tied-embeddings-src") ? "Wemb" : prefix_ + "_Wemb"); + + output_ = std::dynamic_pointer_cast(outputFactory.construct(graph())); // (construct() returns only the underlying interface) + } + +public: + TransformerBatchDecoder(Ptr graph, Ptr options) + : LayerWithOptions(graph, options), DecoderBase(graph, options) { + + decoder = New(graph, options); + registerLayer(decoder); + + } + + virtual Ptr startState(Ptr graph, + Ptr batch, + std::vector>& encStates) override { + +#if 1 + // @TODO: this should be removed, currently hack to init graph. Should happen in graph groups and constructors + DecoderBase::graph_ = graph; + setGraph(graph); + // This makes sure that the graph passed into the model during construction and now evaluation are identical. + // A good check to have for catching weird situations early. 
+ ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + std::string layerType = DecoderBase::opt("transformer-decoder-autoreg", "self-attention"); + if (layerType == "rnn") { + int dimBatch = (int)batch->size(); + int dim = DecoderBase::opt("dim-emb"); + + auto start = graph->constant({1, 1, dimBatch, dim}, inits::zeros()); + rnn::States startStates(DecoderBase::opt("dec-depth"), {start, start}); + + // don't use TransformerState for RNN layers + return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); + } + else { + rnn::States startStates; + return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/true); + } + } + + virtual Ptr step(Ptr graph, + Ptr state) override { +#if 1 // Sanity check for as long as we mix legacy code and new code + ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + lazyCreateOutputLayer(); + return step(state); + } + + Ptr step(Ptr state) { + auto embeddings = state->getTargetHistoryEmbeddings(); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] + auto decoderMask = state->getTargetMask(); // [max length, batch size, 1] --this is a hypothesis + + //************************************************************************// + + auto encoderContext = state->getEncoderStates()[0]->getContext(); // encoder output + auto encoderMask = state->getEncoderStates()[0]->getMask(); // note: may differ from Encoder self-attention mask in that additional positions are banned for cross-attention + + // Convert old style decoder state to new decoder state + size_t position = state->getPosition(); + auto nnState = New(position); + for(auto& layerState : state->getStates()) + nnState->as()->append(New(layerState.cell, position)); + + auto decoderContext = decoder->apply(embeddings, decoderMask, encoderContext, encoderMask, nnState); + + // final feed-forward layer (output) + if(shortlist_) + output_->setShortlist(shortlist_); + auto logits = output_->applyAsLogits(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab or shortlist dim] + + // Convert new style decoder state to old decoder state + // @TODO: This is such a mess! + rnn::States decoderStates; + for(auto layerState : *nnState->as()) { + auto cellState = layerState->as()->get(); + decoderStates.push_back(rnn::State({ cellState, cellState })); + } + // return unnormalized(!) probabilities + auto nextState = New(decoderStates, logits, state->getEncoderStates(), state->getBatch(), state->isBatchMajor()); + nextState->setPosition(state->getPosition() + 1); + + return nextState; + } + + // helper function for guided alignment + // @TODO: const vector<> seems wrong. Either make it non-const or a const& (more efficient but dangerous) + virtual const std::vector getAlignments(int /*i*/ = 0) override { + ABORT("Not implemented"); + return {}; + } + + virtual void clear() override { + Layer::clear(); + if (output_) + output_->clear(); + } +}; + +} // namespace marian + +#if 0 // ignore me. To-be-removed once fully functional. 
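+// Editor's note: the disabled smoke test below is driven by src/tests/transformer_new.cpp
+// further down in this patch; its entry in src/tests/CMakeLists.txt is added commented out,
+// so both this block and the CMake entry have to be enabled to build and run it.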
+ +static void testme() { + using namespace marian; + using namespace nn; + + auto options = New( + "enc-depth", 12, + "transformer-heads", 8, + "dim-emb", 512, + "transformer-ffn-depth", 2, + "transformer-dim-ffn", 2048, + "transformer-dropout", 0.1, + "transformer-dropout-attention", 0.0, + "transformer-postprocess", "dan", + "transformer-ffn-activation", "relu", + "transformer-train-position-embeddings", false, + "transformer-depth-scaling", true, + "max-length", 256); + + Config::seed = 1234; + + auto graph = New(/*inference=*/true); + graph->setDevice(CPU0); + graph->reserveWorkspaceMB(1000); + + auto input = graph->constant({10, 1, 512}, inits::glorotUniform()); // [length, batch, dim] + auto mask = graph->constant({10, 1, 1}, inits::ones()); // [length, batch, 1] + + auto encoder = New(graph, options); + encoder->setName("TransformerEncoder"); + encoder->setEvalMode(); + + auto context = encoder->apply(input, mask); + + std::cerr << encoder->layerInfo(/*includeChildren=*/true) << std::endl; + + debug(context); + + graph->forward(); + graph->save("test.npz"); +} + +#endif diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 1e1adc38b..5be3eee26 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -388,12 +388,13 @@ void TransposeGeneric(Tensor out, Tensor in, const std::vector& vAxis) { } void TransposeND(Tensor out, Tensor in, const std::vector& vAxis) { - if(vAxis == std::vector({0, 2, 1, 3})) - Transpose0213(out, in); #if MKL_FOUND - else if(vAxis.size() == 4 && vAxis[3] == 3) + if(vAxis.size() == 4 && vAxis[3] == 3) TransposeFirst3In4(out, in, vAxis); + else #endif // MKL_FOUND + if(vAxis == std::vector({0, 2, 1, 3})) + Transpose0213(out, in); else if(vAxis == std::vector({1, 0}) && in->shape()[-1] % 16 == 0 && in->shape()[-2] % 16 == 0) Transpose10(out, in); diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index ccf8cc72d..0a6c047cd 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -10,6 +10,7 @@ if(NOT MSVC) prod cli pooling + # transformer_new ) foreach(test ${APP_TESTS}) diff --git a/src/tests/transformer_new.cpp b/src/tests/transformer_new.cpp new file mode 100644 index 000000000..2d1e89281 --- /dev/null +++ b/src/tests/transformer_new.cpp @@ -0,0 +1,11 @@ +#include "marian.h" +#include "models/transformer_new.h" + + +int main(int argc, char** argv) { + using namespace marian; + + testme(); + + return 0; +} From d225c24d7fa72372387fc63cbe1c118d14071fcb Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 1 Mar 2023 13:48:09 +0000 Subject: [PATCH 04/26] Merged PR 28128: Comet scoring and training with new layer framework This PR adds: * code for comet scoring and training with the new layer framework * conversion scripts from Unbabel comet to Marian model --- CHANGELOG.md | 8 +- VERSION | 2 +- scripts/bert/contrib/chpt2pt.py | 23 ++ scripts/bert/contrib/hugging2marian.py | 153 ++++++++++++ scripts/bert/contrib/roberta2marian.py | 163 ++++++++++++ scripts/comet/comet2marian.py | 216 ++++++++++++++++ src/common/aliases.cpp | 28 +++ src/common/config_parser.cpp | 11 + src/common/file_stream.cpp | 2 +- src/data/corpus_base.cpp | 8 +- src/data/corpus_base.h | 1 + src/data/sentencepiece_vocab.cpp | 3 +- src/embedder/embedder.h | 20 +- src/embedder/vector_collector.cpp | 20 +- src/embedder/vector_collector.h | 3 +- src/functional/operators.h | 12 +- src/graph/expression_operators.cpp | 2 +- src/graph/node_operators_unary.h | 36 ++- 
src/layers/embedding.cpp | 7 +
 src/layers_new/embeddings.h | 6 +-
 src/layers_new/transformer.h | 4 +-
 src/models/bert.h | 1 +
 src/models/comet_qe.h | 327 +++++++++++++++++++++++++
 src/models/encoder_pooler.h | 7 +-
 src/models/model_factory.cpp | 60 ++++-
 src/tensors/gpu/add.inc | 7 +-
 src/tensors/gpu/add_all.inc | 2 +
 src/tensors/gpu/element.inc | 2 +
 src/training/graph_group.cpp | 9 +-
 src/training/validator.cpp | 113 +++++++++
 src/training/validator.h | 19 ++
 31 files changed, 1236 insertions(+), 39 deletions(-)
 create mode 100644 scripts/bert/contrib/chpt2pt.py
 create mode 100644 scripts/bert/contrib/hugging2marian.py
 create mode 100644 scripts/bert/contrib/roberta2marian.py
 create mode 100644 scripts/comet/comet2marian.py
 create mode 100644 src/models/comet_qe.h

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6a7316be9..6aff5037f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,11 +8,15 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

 ## [Unreleased]

 ### Added
-
+- Re-implementation of COMET-QE for inference and training; conversion scripts from Unbabel-Comet to Marian.
+- Validator that generates embeddings and can be used during COMET training with an external script.
 - New experimental layer framework for Transformer-like models.

 ### Fixed
-- Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp)
+- Only collect batch statistics during mini-batch-fit up to actual max-length.
+- Implemented fully correct version of GELU instead of using a bad approximation via Swish.
+- Handle copying from fp32 or fp16 embeddings in embedder mode correctly.
+- Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp).

 ## [1.12.0] - 2023-02-20

diff --git a/VERSION b/VERSION
index 41de27dfa..00f862625 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v1.12.2
+v1.12.3
diff --git a/scripts/bert/contrib/chpt2pt.py b/scripts/bert/contrib/chpt2pt.py
new file mode 100644
index 000000000..3ca8fee6a
--- /dev/null
+++ b/scripts/bert/contrib/chpt2pt.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+"""
+This script converts *.chpt files to *.pt files, potentially useful for extracting only the weights from larger checkpoints.
+"""
+
+import torch
+import argparse
+
+# Create a parser for command line arguments
+parser = argparse.ArgumentParser()
+
+# Add arguments for the source and target files
+parser.add_argument("--source", type=str, required=True, help="Path to the source *.chpt file")
+parser.add_argument("--target", type=str, required=True, help="Path to the target *.pt file")
+
+# Parse the command line arguments
+args = parser.parse_args()
+
+# Load the model from the source file
+model = torch.load(args.source)
+
+# Save the model to the target file
+torch.save(model, args.target)
\ No newline at end of file
diff --git a/scripts/bert/contrib/hugging2marian.py b/scripts/bert/contrib/hugging2marian.py
new file mode 100644
index 000000000..0ee31414a
--- /dev/null
+++ b/scripts/bert/contrib/hugging2marian.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+"""
+This script converts Huggingface Bert model to Marian weight file.
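+
+Example usage (editor's sketch, assuming a local model directory):
+    python3 hugging2marian.py --bert /path/to/huggingface/model --marian bert.npz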
+""" + +import argparse +import numpy as np +import sys +import yaml + +from transformers import XLMRobertaModel + +parser = argparse.ArgumentParser(description='Convert Huggingface Bert model to Marian weight file.') +parser.add_argument('--bert', help='Path to Huggingface Bert PyTorch model', required=True) +parser.add_argument('--marian', help='Output path for Marian weight file', required=True) +args = parser.parse_args() + +huggingface = XLMRobertaModel.from_pretrained(args.bert) +huggingface.eval() + +print(huggingface.config) + +config = dict() +config["type"] = "bert-classifier" +config["input-types"] = ["sequence"] +config["tied-embeddings-all"] = True +config["tied-embeddings-src"] = False + +config["transformer-ffn-depth"] = 2 +config["transformer-train-position-embeddings"] = True +config["transformer-preprocess"] = "" +config["transformer-postprocess"] = "dan" +config["transformer-postprocess-emb"] = "nd" +config["bert-train-type-embeddings"] = False +# @TODO: figure out if it's worth adding `cometModel.name_or_path` to the end of this version string. +config["version"] = "huggingface2marian.py conversion" + +config["enc-depth"] = 0 +config["transformer-dim-ffn"] = huggingface.config.intermediate_size +config["transformer-heads"] = huggingface.config.num_attention_heads +config["transformer-ffn-activation"] = huggingface.config.hidden_act + +config["bert-sep-symbol"] = "" +config["bert-class-symbol"] = "" + +marianModel = dict() + +def transposeOrder(mat): + matT = np.transpose(mat) # just a view with changed row order + return matT.flatten(order="C").reshape(matT.shape) # force row order change and reshape + + +def convert(pd, srcs, trg, transpose=True, bias=False): + if len(srcs) == 1: + for src in srcs: + num = pd[src].detach().numpy() + if bias: + marianModel[trg] = np.atleast_2d(num) + else: + if transpose: + marianModel[trg] = transposeOrder(num) # transpose with row order change + else: + marianModel[trg] = num + else: # path that joins matrices together for fused self-attention + nums = [pd[src].detach().numpy() for src in srcs] + if bias: + nums = [np.transpose(np.atleast_2d(num)) for num in nums] + marianModel[trg] = np.stack(nums, axis=0) + + +def extract(layer, nth, level): + name = type(layer).__name__ + print(" " * level, nth, name) + if name == "BertLayer": + pd = dict(layer.named_parameters()) + for n in pd: + print(" " * (level + 1), n, pd[n].shape) + + convert(pd, ["attention.self.query.weight"], f"encoder_l{nth + 1}_self_Wq", transpose=True) + convert(pd, ["attention.self.key.weight"], f"encoder_l{nth + 1}_self_Wk") + convert(pd, ["attention.self.value.weight"], f"encoder_l{nth + 1}_self_Wv") + + convert(pd, ["attention.self.query.bias"], f"encoder_l{nth + 1}_self_bq", bias=True) + convert(pd, ["attention.self.key.bias"], f"encoder_l{nth + 1}_self_bk", bias=True) + convert(pd, ["attention.self.value.bias"], f"encoder_l{nth + 1}_self_bv", bias=True) + + convert(pd, ["attention.output.dense.weight"], f"encoder_l{nth + 1}_self_Wo") + convert(pd, ["attention.output.dense.bias"], f"encoder_l{nth + 1}_self_bo", bias=True) + + convert(pd, ["attention.output.LayerNorm.weight"], f"encoder_l{nth + 1}_self_Wo_ln_scale", bias=True) + convert(pd, ["attention.output.LayerNorm.bias"], f"encoder_l{nth + 1}_self_Wo_ln_bias", bias=True) + + convert(pd, ["intermediate.dense.weight"], f"encoder_l{nth + 1}_ffn_W1") + convert(pd, ["intermediate.dense.bias"], f"encoder_l{nth + 1}_ffn_b1", bias=True) + convert(pd, ["output.dense.weight"], f"encoder_l{nth + 1}_ffn_W2") + convert(pd, 
["output.dense.bias"], f"encoder_l{nth + 1}_ffn_b2", bias=True) + + convert(pd, ["output.LayerNorm.weight"], f"encoder_l{nth + 1}_ffn_ffn_ln_scale", bias=True) + convert(pd, ["output.LayerNorm.bias"], f"encoder_l{nth + 1}_ffn_ffn_ln_bias", bias=True) + + config["enc-depth"] += 1 + + elif name == "BertEmbeddings": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + convert(pd, ["word_embeddings.weight"], f"Wemb", transpose=False) + convert(pd, ["position_embeddings.weight"], f"Wpos", transpose=False) + + config["bert-type-vocab-size"] = 0 + if hasattr(layer, "token_type_embeddings"): + convert(pd, ["token_type_embeddings.weight"], f"Wtype", transpose=False) + config["bert-type-vocab-size"] = pd["token_type_embeddings.weight"].shape[0] + config["bert-train-type-embeddings"] = True + + convert(pd, ["LayerNorm.weight"], f"encoder_emb_ln_scale_pre", bias=True) + convert(pd, ["LayerNorm.bias"], f"encoder_emb_ln_bias_pre", bias=True) + + config["dim-emb"] = pd["word_embeddings.weight"].shape[1] + config["dim-vocabs"] = [ pd["word_embeddings.weight"].shape[0] ] + config["max-length"] = pd["position_embeddings.weight"].shape[0] + + elif name == "BertPooler": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + + pd = dict(layer.named_parameters()) + convert(pd, ["dense.weight"], "classifier_ff_logit_l1_W") + convert(pd, ["dense.bias"], "classifier_ff_logit_l1_b", bias=True) + + else: + recurse(layer, level + 1) + +def recurse(parent, level=0): + for i, child in enumerate(parent.children()): + extract(child, i, level) + +recurse(huggingface) + +for m in marianModel: + print(m, marianModel[m].shape) + +configYamlStr = yaml.dump(config, default_flow_style=False) +desc = list(configYamlStr) +npDesc = np.chararray((len(desc),)) +npDesc[:] = desc +npDesc.dtype = np.int8 +marianModel["special:model.yml"] = npDesc + +print("\nMarian config:") +print(configYamlStr) +print("Saving Marian model to %s" % (args.marian,)) +np.savez(args.marian, **marianModel) \ No newline at end of file diff --git a/scripts/bert/contrib/roberta2marian.py b/scripts/bert/contrib/roberta2marian.py new file mode 100644 index 000000000..fb80733f4 --- /dev/null +++ b/scripts/bert/contrib/roberta2marian.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +""" +This script converts Fairseq Roberta model to Marian weight file. 
+""" + +import argparse +import numpy as np +import sys +import torch +import yaml + +from fairseq.models.roberta import RobertaModel + +parser = argparse.ArgumentParser(description='Convert Fairseq Roberta model to Marian weight file.') +parser.add_argument('--roberta', help='Path to Roberta model', required=True) +parser.add_argument('--comet', help='Path to COMET model', required=True) +parser.add_argument('--marian', help='Output path for Marian weight file', required=True) +args = parser.parse_args() + +roberta = RobertaModel.from_pretrained(args.roberta) +model = torch.load(args.comet) +print(model) + +roberta.eval() + +config = dict() +config["type"] = "bert-encoder" +config["input-types"] = ["sequence"] +config["tied-embeddings-all"] = True +config["tied-embeddings-src"] = False + +config["transformer-ffn-depth"] = 2 +config["transformer-ffn-activation"] = "gelu" # figure this out dynamically +config["transformer-train-position-embeddings"] = True +config["transformer-preprocess"] = "" +config["transformer-postprocess"] = "dan" +config["transformer-postprocess-emb"] = "nd" +config["bert-train-type-embeddings"] = False +config["bert-type-vocab-size"] = 0 +# @TODO: figure out if it's worth adding `cometModel.name_or_path` to the end of this version string. +config["version"] = "roberta2marian.py conversion" + +config["enc-depth"] = 0 + +marianModel = dict() + +def convert(pd, srcs, trg, transpose=True, bias=False): + if len(srcs) == 1: + for src in srcs: + num = pd[src].detach().numpy() + if bias: + marianModel[trg] = np.atleast_2d(num).copy() + else: + if transpose: + marianModel[trg] = np.transpose(num).copy() + else: + marianModel[trg] = num + else: # path that joins matrices together for fused self-attention + nums = [pd[src].detach().numpy() for src in srcs] + if bias: + nums = [np.transpose(np.atleast_2d(num)) for num in nums] + marianModel[trg] = np.stack(nums, axis=0).copy() + + +def extract(layer, nth, level): + name = type(layer).__name__ + print(" " * level, nth, name) + if name == "TransformerSentenceEncoderLayer": + pd = dict(layer.named_parameters()) + for n in pd: + print(" " * (level + 1), n, pd[n].shape) + + convert(pd, ["self_attn.q_proj.weight"], f"encoder_l{nth + 1}_self_Wq") + convert(pd, ["self_attn.k_proj.weight"], f"encoder_l{nth + 1}_self_Wk") + convert(pd, ["self_attn.v_proj.weight"], f"encoder_l{nth + 1}_self_Wv") + + convert(pd, ["self_attn.q_proj.bias"], f"encoder_l{nth + 1}_self_bq", bias=True) + convert(pd, ["self_attn.k_proj.bias"], f"encoder_l{nth + 1}_self_bk", bias=True) + convert(pd, ["self_attn.v_proj.bias"], f"encoder_l{nth + 1}_self_bv", bias=True) + + # convert(pd, ["self_attn.q_proj.weight", "self_attn.k_proj.weight", "self_attn.v_proj.weight"], f"encoder_l{nth + 1}_self_Wt") + # convert(pd, ["self_attn.q_proj.bias", "self_attn.k_proj.bias", "self_attn.v_proj.bias"], f"encoder_l{nth + 1}_self_bt", bias=True) + + convert(pd, ["self_attn.out_proj.weight"], f"encoder_l{nth + 1}_self_Wo") + convert(pd, ["self_attn.out_proj.bias"], f"encoder_l{nth + 1}_self_bo", bias=True) + + convert(pd, ["self_attn_layer_norm.weight"], f"encoder_l{nth + 1}_self_Wo_ln_scale", bias=True) + convert(pd, ["self_attn_layer_norm.bias"], f"encoder_l{nth + 1}_self_Wo_ln_bias", bias=True) + + convert(pd, ["fc1.weight"], f"encoder_l{nth + 1}_ffn_W1") + convert(pd, ["fc1.bias"], f"encoder_l{nth + 1}_ffn_b1", bias=True) + convert(pd, ["fc2.weight"], f"encoder_l{nth + 1}_ffn_W2") + convert(pd, ["fc2.bias"], f"encoder_l{nth + 1}_ffn_b2", bias=True) + + convert(pd, 
["final_layer_norm.weight"], f"encoder_l{nth + 1}_ffn_ffn_ln_scale", bias=True) + convert(pd, ["final_layer_norm.bias"], f"encoder_l{nth + 1}_ffn_ffn_ln_bias", bias=True) + + config["transformer-dim-ffn"] = pd["fc1.bias"].shape[-1] + config["transformer-heads"] = layer.self_attn.num_heads + config["enc-depth"] += 1 + + elif name == "Embedding": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + convert(pd, ["weight"], f"Wemb", transpose=False) + + config["dim-emb"] = pd["weight"].shape[1] + config["dim-vocabs"] = [ pd["weight"].shape[0] ] + + elif name == "LearnedPositionalEmbedding": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + convert(pd, ["weight"], f"Wpos", transpose=False) + + config["max-length"] = pd["weight"].shape[0] + + elif name == "RobertaLMHead": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + + pd = dict(layer.named_parameters()) + convert(pd, ["dense.weight"], f"masked-lm_ff_logit_l1_W") + convert(pd, ["dense.bias"], f"masked-lm_ff_logit_l1_b", bias=True) + convert(pd, ["layer_norm.weight"], f"masked-lm_ff_ln_scale", bias=True) + convert(pd, ["layer_norm.bias"], f"masked-lm_ff_ln_bias", bias=True) + + convert(pd, ["bias"], f"masked-lm_ff_logit_l2_b", bias=True) + # reuse Wemb here as weight + # convert(pd, ["weight"], f"masked-lm_ff_logit_l2_b") + + elif name == "LayerNorm": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + + pd = dict(layer.named_parameters()) + convert(pd, ["weight"], f"encoder_emb_ln_scale_pre", bias=True) + convert(pd, ["bias"], f"encoder_emb_ln_bias_pre", bias=True) + + else: + recurse(layer, level + 1) + +def recurse(parent, level=0): + for i, child in enumerate(parent.children()): + extract(child, i, level) + +recurse(roberta) + +for m in marianModel: + print(m, marianModel[m].shape) + +configYamlStr = yaml.dump(config, default_flow_style=False) +desc = list(configYamlStr) +npDesc = np.chararray((len(desc),)) +npDesc[:] = desc +npDesc.dtype = np.int8 +marianModel["special:model.yml"] = npDesc + +print("\nMarian config:") +print(configYamlStr) +print("Saving Marian model to %s" % (args.marian,)) +np.savez(args.marian, **marianModel) \ No newline at end of file diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py new file mode 100644 index 000000000..9ddbb45c1 --- /dev/null +++ b/scripts/comet/comet2marian.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +""" +This script converts Unbabel COMET-QE models to Marian weight file. 
+""" + +import argparse +import yaml +import numpy as np + +parser = argparse.ArgumentParser(description='Convert Unbabel COMET-QE models to Marian weight file.') +inputs = parser.add_mutually_exclusive_group(required=True) +inputs.add_argument('--comet', help='Path to COMET model') +inputs.add_argument('--roberta', help='Initialize with Roberta model', action='store_true') +parser.add_argument('--marian', help='Output path for Marian weight file', required=True) +parser.add_argument('-s', '--add_sigmoid', help='Add final sigmoid if not already present', action='store_true') +args = parser.parse_args() + + +if args.roberta: + from transformers import AutoModel + # Load the model that Unbabel based COMET on: https://huggingface.co/microsoft/infoxlm-large + robertaModel = AutoModel.from_pretrained("microsoft/infoxlm-large", add_pooling_layer=False) + robertaModel.eval() + print(robertaModel) + cometModel = robertaModel +else: + from comet import load_from_checkpoint + cometModel = load_from_checkpoint(args.comet) + cometModel.eval() + print(cometModel) + +marianModel = dict() + +config = dict() +config["type"] = "comet-qe" +config["tied-embeddings-all"] = True +config["tied-embeddings-src"] = False +config["transformer-ffn-depth"] = 2 +config["transformer-ffn-activation"] = "gelu" # figure this out dynamically +config["transformer-train-position-embeddings"] = True +config["transformer-preprocess"] = "" +config["transformer-postprocess"] = "dan" +config["transformer-postprocess-emb"] = "nd" +config["bert-train-type-embeddings"] = False +config["bert-type-vocab-size"] = 0 +config["comet-prepend-zero"] = True +config["comet-final-sigmoid"] = args.add_sigmoid +config["comet-pooler-ffn"] = [2048, 1024] +# @TODO: figure out if it's worth adding `cometModel.name_or_path` to the end of this version string. 
+config["version"] = "comet2marian2.py conversion" +config["enc-depth"] = 0 + +def yaml2np(config): + configYamlStr = yaml.dump(config, default_flow_style=False) + print("\nMarian config:") + print(configYamlStr) + + desc = bytes(configYamlStr, 'ascii') + b'\x00' + npDesc = np.chararray((len(desc),)) + npDesc.dtype = np.int8 + for i, b in enumerate(desc): + npDesc[i] = b + return npDesc + +def convert(pd, srcs, trg, transpose=True, bias=False): + if len(srcs) == 1: + for src in srcs: + num = pd[src].detach().numpy() + if bias: + marianModel[trg] = num.copy() + else: + if transpose: + marianModel[trg] = np.transpose(num).copy() + else: + marianModel[trg] = num + else: # path that joins matrices together for fused self-attention + nums = [pd[src].detach().numpy() for src in srcs] + if bias: + nums = [np.transpose(num) for num in nums] + marianModel[trg] = np.stack(nums, axis=0).copy() + +def extract(layer, nth, level): + name = type(layer).__name__ + print(" " * level, nth, name) + if "RobertaLayer" in name: + pd = dict(layer.named_parameters()) + for n in pd: + print(" " * (level + 1), n, pd[n].shape) + + prefix = "CometEncoder" + + blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->selfAttentionBlock" + + # self-attention + # query transformation + convert(pd, ["attention.self.query.weight"], f"{blockPrefix}->selfAttention->qProj->weight") + convert(pd, ["attention.self.query.bias"], f"{blockPrefix}->selfAttention->qProj->bias", bias=True) + + # key transformation + convert(pd, ["attention.self.key.weight"], f"{blockPrefix}->selfAttention->kProj->weight") + convert(pd, ["attention.self.key.bias"], f"{blockPrefix}->selfAttention->kProj->bias", bias=True) + + # values transformation + convert(pd, ["attention.self.value.weight"], f"{blockPrefix}->selfAttention->vProj->weight") + convert(pd, ["attention.self.value.bias"], f"{blockPrefix}->selfAttention->vProj->bias", bias=True) + + # output transformation + convert(pd, ["attention.output.dense.weight"], f"{blockPrefix}->selfAttention->oProj->weight") + convert(pd, ["attention.output.dense.bias"], f"{blockPrefix}->selfAttention->oProj->bias", bias=True) + + # self-attention layer-norm + convert(pd, ["attention.output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["attention.output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + + # ffn + # first ffn layer + blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->filterBlock" + + convert(pd, ["intermediate.dense.weight"], f"{blockPrefix}->layers->at(0)->as()->weight") + convert(pd, ["intermediate.dense.bias"], f"{blockPrefix}->layers->at(0)->as()->bias", bias=True) + # second ffn layer + convert(pd, ["output.dense.weight"], f"{blockPrefix}->layers->at(3)->as()->weight") + convert(pd, ["output.dense.bias"], f"{blockPrefix}->layers->at(3)->as()->bias", bias=True) + # ffn layer-norm + convert(pd, ["output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + + config["transformer-dim-ffn"] = pd["intermediate.dense.bias"].shape[-1] + config["transformer-heads"] = layer.attention.self.num_attention_heads + config["enc-depth"] += 1 + + elif "RobertaEmbeddings" in name: + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + + # shift word embeddings so that we are back at 250,000 vocab items + npWembTemp = 
pd["word_embeddings.weight"].detach().numpy() + npWemb = npWembTemp[1:-1, :].copy() + npWemb[0, :] = npWembTemp[0, :] + npWemb[2, :] = npWembTemp[2, :] + marianModel["Wemb"] = npWemb + + prefix = "CometEncoder" + + # shift position embeddings so that we are back at 512 items and start at 0 + npPos = pd["position_embeddings.weight"].detach().numpy() + npPos = npPos[2:, :].copy() + marianModel[f"{prefix}->encoder->positionEmbedding->embeddings"] = npPos + + # post-embedding layer normalization + convert(pd, ["LayerNorm.weight"], f"{prefix}->encoder->preprocessor->norm->weight", bias=True) + convert(pd, ["LayerNorm.bias"], f"{prefix}->encoder->preprocessor->norm->bias", bias=True) + + config["dim-emb"] = npWemb.shape[1] + config["dim-vocabs"] = [ npWemb.shape[0] ] + config["max-length"] = npPos.shape[0] + + elif name == "LayerwiseAttention": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + + # mix layers + weights = [] + for i in range(25): + weights.append(pd[f"scalar_parameters.{i}"].detach().numpy()) + marianModel["CometEncoder->encoder->weights"] = np.concatenate(weights).copy() + + # gamma for weird batch/layer-norm step in pooler/encoder of COMET + # @TODO: make optional + marianModel["CometEncoder->encoder->gamma"] = pd["gamma"].detach().numpy().copy() + config["comet-mix"] = True + config["comet-mix-norm"] = True + + + elif name == "FeedForward": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + + if layer.ff[-1].__class__.__name__ == "Sigmoid" or args.add_sigmoid: + config["comet-final-sigmoid"] = True + + config["comet-pooler-ffn"] = [ + pd["ff.0.bias"].shape[0], + pd["ff.3.bias"].shape[0] + ] + + # 3-layer FFN network that computes COMET regression + prefix = "CometQEPooler" + + # @TODO: make final sigmoid optional + convert(pd, ["ff.0.weight"], f"{prefix}->layers->at(0)->as()->weight") + convert(pd, ["ff.0.bias"], f"{prefix}->layers->at(0)->as()->bias", bias=True) + + convert(pd, ["ff.3.weight"], f"{prefix}->layers->at(3)->as()->weight") + convert(pd, ["ff.3.bias"], f"{prefix}->layers->at(3)->as()->bias", bias=True) + + convert(pd, ["ff.6.weight"], f"{prefix}->layers->at(6)->as()->weight") + convert(pd, ["ff.6.bias"], f"{prefix}->layers->at(6)->as()->bias", bias=True) + else: + recurse(layer, level + 1) + +def recurse(parent, level=0): + for i, child in enumerate(parent.children()): + extract(child, i, level) + +recurse(cometModel) +marianModel["special:model.yml"] = yaml2np(config) + +for m in marianModel: + print(m, marianModel[m].shape) + +print("Saving Marian model to %s" % (args.marian,)) +np.savez(args.marian, **marianModel) diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp index 75d9bdf97..653ca6f8a 100644 --- a/src/common/aliases.cpp +++ b/src/common/aliases.cpp @@ -227,6 +227,34 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) { config["valid-mini-batch"] = 8; config["normalize"] = 1.0; }); + + // Model architecture for Unbabel's COMET-QE models + cli.alias("task", "comet-qe", [](YAML::Node& config) { + // Model options + config["bert-train-type-embeddings"] = false; + config["bert-type-vocab-size"] = 0; + config["comet-final-sigmoid"] = true; + config["comet-mix"] = false; + config["comet-mix-norm"] = false; + config["comet-dropout"] = 0.1; + config["comet-pooler-ffn"] = std::vector({2048, 1024}); + config["comet-prepend-zero"] = true; + config["dim-emb"] = 1024; + config["dim-vocabs"] = std::vector({250000}); + 
config["enc-depth"] = 24; + config["max-length"] = 512; + config["valid-max-length"] = 512; + config["tied-embeddings-all"] = true; + config["transformer-dim-ffn"] = 4096; + config["transformer-ffn-activation"] = "gelu"; + config["transformer-ffn-depth"] = 2; + config["transformer-heads"] = 16; + config["transformer-postprocess"] = "dan"; + config["transformer-postprocess-emb"] = "nd"; + config["transformer-preprocess"] = ""; + config["transformer-train-position-embeddings"] = true; + config["type"] = "comet-qe"; + }); } } diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 4cc23f2ca..aaeeb514b 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -323,6 +323,17 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--bert-masking-fraction", "Fraction of masked out tokens during training", 0.15f); cli.add("--bert-train-type-embeddings", "Train bert type embeddings, set to false to use static sinusoidal embeddings", true); cli.add("--bert-type-vocab-size", "Size of BERT type vocab (sentence A and B)", 2); + + // Options specific for the "comet-qe" model type + cli.add("--comet-final-sigmoid", "Add final sigmoid to COMET model"); + cli.add("--comet-mix", "Mix encoder layers to produce embedding"); + cli.add("--comet-mix-norm", "Normalize layers prior to mixing"); + cli.add("--comet-dropout", "Dropout for pooler layers", 0.1f); + cli.add("--comet-mixup", "Alpha parameter for Beta distribution for mixup", 0.0f); + cli.add("--comet-mixup-reg", "Use original and mixed-up samples in training"); + cli.add>("--comet-pooler-ffn", "Hidden sizes for comet pooler", {2048, 1024}); + cli.add("--comet-prepend-zero", "Add a start symbol to batch entries"); + #ifdef CUDNN cli.add("--char-stride", "Width of max-pooling layer after convolution layer in char-s2s model", diff --git a/src/common/file_stream.cpp b/src/common/file_stream.cpp index e1572f62e..e2870b17a 100644 --- a/src/common/file_stream.cpp +++ b/src/common/file_stream.cpp @@ -97,7 +97,7 @@ OutputFileStream::OutputFileStream(const std::string &file) : std::ostream(NULL), file_(file) { streamBuf1_.reset(new std::filebuf()); auto ret = static_cast(streamBuf1_.get())->open(file.c_str(), std::ios::out | std::ios_base::binary); - ABORT_IF(!ret, "File cannot be opened", file); + ABORT_IF(!ret, "Error opening file ({}): {}", errno, file_.string()); ABORT_IF(ret != streamBuf1_.get(), "Return value is not equal to streambuf pointer, that is weird"); if(file_.extension() == marian::filesystem::Path(".gz")) { diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index d276ca6bc..a429ae2f3 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -59,6 +59,7 @@ CorpusBase::CorpusBase(const std::vector& paths, maxLength_(options_->get("max-length")), maxLengthCrop_(options_->get("max-length-crop")), rightLeft_(options_->get("right-left")), + prependZero_(options_->get("comet-prepend-zero", false)), tsv_(options_->get("tsv", false)), tsvNumInputFields_(getNumberOfTSVInputFields(options)) { // TODO: support passing only one vocab file if we have fully-tied embeddings @@ -84,6 +85,7 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) maxLength_(options_->get("max-length")), maxLengthCrop_(options_->get("max-length-crop")), rightLeft_(options_->get("right-left")), + prependZero_(options_->get("comet-prepend-zero", false)), tsv_(options_->get("tsv", false)), tsvNumInputFields_(getNumberOfTSVInputFields(options)) { bool training = !translate; @@ -420,9 
+422,13 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, // on the vocabulary type, this can be non-trivial, e.g. when SentencePiece // is used. Words words = vocabs_[batchIndex]->encode(line, /*addEOS =*/ addEOS_[batchIndex], inference_); - ABORT_IF(words.empty(), "Empty input sequences are presently untested"); + auto inputTypes = options_->get>("input-types", {}); // empty list by default + + if(prependZero_ && inputTypes[batchIndex] == "sequence") + words.insert(words.begin(), Word::fromWordIndex(0)); + if(maxLengthCrop_ && words.size() > maxLength_) { words.resize(maxLength_); if(addEOS_[batchIndex]) diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 2e572ebd8..123250d97 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -638,6 +638,7 @@ class CorpusBase : public DatasetBaseSampleEncode(line, -1, alpha_, &spmIds); - Words words; words.reserve(spmIds.size() + addEOS); + Words words; + words.reserve(spmIds.size() + addEOS); for (auto&& spmId : spmIds) words.push_back(Word::fromWordIndex(spmId)); diff --git a/src/embedder/embedder.h b/src/embedder/embedder.h index d45e14cd3..ebd9782e2 100644 --- a/src/embedder/embedder.h +++ b/src/embedder/embedder.h @@ -19,7 +19,7 @@ using namespace data; /* * The tool is used to create output sentence embeddings from available - * Marian encoders. With --compute-similiarity and can return the cosine + * Marian encoders. With --compute-similiarity it can return the cosine * similarity between two sentences provided from two sources. */ class Embedder { @@ -56,8 +56,7 @@ class Embed : public ModelTask { Embed(Ptr options) : options_(options) { options_ = options_->with("inference", true, - "shuffle", "none", - "input-types", std::vector({"sequence"})); + "shuffle", "none"); // if a similarity is computed then double the input types and vocabs for // the two encoders that are used in the model. @@ -109,7 +108,7 @@ class Embed : public ModelTask { auto batchGenerator = New>(corpus_, options_); batchGenerator->prepare(); - auto output = New(options_); + auto output = New(options_->get("output"), options_->get("binary")); size_t batchId = 0; { @@ -128,8 +127,19 @@ class Embed : public ModelTask { auto embeddings = builder->build(graph, batch); graph->forward(); + // handle copying from fp32 or fp16 embeddings correctly. std::vector sentVectors; - embeddings->val()->get(sentVectors); + if(embeddings->value_type() == Type::float32) { + embeddings->val()->get(sentVectors); + } else if (embeddings->value_type() == Type::float16) { + std::vector sentVectors16; + embeddings->val()->get(sentVectors16); + sentVectors.reserve(sentVectors16.size()); + for(auto& v: sentVectors16) + sentVectors.push_back(v); + } else { + ABORT("Unknown embedding type {}", embeddings->value_type()); + } // collect embedding vector per sentence. // if we compute similarities this is only one similarity per sentence pair. diff --git a/src/embedder/vector_collector.cpp b/src/embedder/vector_collector.cpp index c1caf2f7b..11b07b43b 100644 --- a/src/embedder/vector_collector.cpp +++ b/src/embedder/vector_collector.cpp @@ -11,14 +11,17 @@ namespace marian { // This class manages multi-threaded writing of embedded vectors to stdout or an output file. // It will either output string versions of float vectors or binary equal length versions depending // on its binary_ flag. 
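+// Editor's usage sketch (hypothetical file name), mirroring the call site in embedder.h above:
+//   auto output = New<VectorCollector>("embeddings.bin", /*binary=*/true);
+//   output->Write(id, vec); // writes from concurrent threads are serialized via mutex_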
+VectorCollector::VectorCollector(bool binary) + : nextId_(0), + binary_(binary) {} -VectorCollector::VectorCollector(const Ptr& options) - : nextId_(0), binary_{options->get("binary", false)} { - if(options->get("output") == "stdout") - outStrm_.reset(new std::ostream(std::cout.rdbuf())); - else - outStrm_.reset(new io::OutputFileStream(options->get("output"))); - } +VectorCollector::VectorCollector(std::string outFile, bool binary) + : nextId_(0), + outStrm_(new std::ostream(std::cout.rdbuf())), + binary_(binary) { + if (outFile != "stdout") + outStrm_.reset(new io::OutputFileStream(outFile)); +} void VectorCollector::Write(long id, const std::vector& vec) { std::lock_guard lock(mutex_); @@ -60,8 +63,7 @@ void VectorCollector::WriteVector(const std::vector& vec) { if(binary_) { outStrm_->write((char*)vec.data(), vec.size() * sizeof(float)); } else { - std::stringstream ss; - ss << std::fixed << std::setprecision(8); + *outStrm_ << std::fixed << std::setprecision(4); for(auto v : vec) *outStrm_ << v << " "; *outStrm_ << std::endl; diff --git a/src/embedder/vector_collector.h b/src/embedder/vector_collector.h index 80110958a..fc39ea6ec 100644 --- a/src/embedder/vector_collector.h +++ b/src/embedder/vector_collector.h @@ -14,7 +14,8 @@ namespace marian { // on its binary_ flag. class VectorCollector { public: - VectorCollector(const Ptr& options); + VectorCollector(bool binary=false); + VectorCollector(std::string outFile, bool binary=false); virtual ~VectorCollector() {} virtual void Write(long id, const std::vector& vec); diff --git a/src/functional/operators.h b/src/functional/operators.h index 80b40ff40..3628fdcb9 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -1,6 +1,8 @@ #pragma once #include "common/types.h" + +#define _USE_MATH_DEFINES #include namespace marian { @@ -24,7 +26,8 @@ struct Ops { static HOST_DEVICE_INLINE T sqrt(const T&) { ABORT("Unknown type"); } static HOST_DEVICE_INLINE T neg(const T&) { ABORT("Unknown type"); } static HOST_DEVICE_INLINE T sgn(const T&) { ABORT("Unknown type"); } - + static HOST_DEVICE_INLINE T erf(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T round(const T&) { ABORT("Unknown type"); } static HOST_DEVICE_INLINE T floor(const T&) { ABORT("Unknown type"); } static HOST_DEVICE_INLINE T ceil(const T&) { ABORT("Unknown type"); } @@ -82,6 +85,7 @@ struct Ops { static HOST_DEVICE_INLINE float sqrt(const float& x) { return sqrtf(x); } static HOST_DEVICE_INLINE float neg(const float& x) { return -x; } static HOST_DEVICE_INLINE float sgn(const float& x) { return (float)((0 < x) - (x < 0)); } + static HOST_DEVICE_INLINE float erf(const float& x) { return erff(x); } static HOST_DEVICE_INLINE float round(const float& x) { return roundf(x); } static HOST_DEVICE_INLINE float floor(const float& x) { return floorf(x); } @@ -151,6 +155,7 @@ struct Ops { static HOST_DEVICE_INLINE double sqrt(const double& x) { return std::sqrt(x); } static HOST_DEVICE_INLINE double neg(const double& x) { return -x; } static HOST_DEVICE_INLINE double sgn(const double& x) { return (0 < x) - (x < 0); } + static HOST_DEVICE_INLINE double erf(const double& x) { return std::erf(x); } static HOST_DEVICE_INLINE double round(const double& x) { return std::round(x); } static HOST_DEVICE_INLINE double floor(const double& x) { return std::floor(x); } @@ -265,6 +270,7 @@ struct Ops { // @TODO: get rid of loop4 with proper intrisics static inline float32x4 sgn(const float32x4& x) { return loop4(Ops::sgn, x); } + static inline float32x4 erf(const 
float32x4& x) { return loop4(Ops::erf, x); } static inline float32x4 round(const float32x4& x) { return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT); } static inline float32x4 floor(const float32x4& x) { return _mm_floor_ps(x); } @@ -394,6 +400,7 @@ struct Ops { // @TODO: get rid of loop8 with proper intrisics static inline float32x8 sgn(const float32x8& x) { return loop8(Ops::sgn, x); } + static inline float32x8 erf(const float32x8& x) { return loop8(Ops::erf, x); } static inline float32x8 round(const float32x8& x) { return _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT); } static inline float32x8 floor(const float32x8& x) { return _mm256_floor_ps(x); } @@ -494,6 +501,7 @@ struct Ops { #endif static DEVICE_INLINE half sgn(const half& x) { half zero = 0.f; return (zero < x) - (x < zero); } // @TODO half has this information somewhere in the struct, right? + static DEVICE_INLINE half erf(const half& x) { return erff((float)x); } static DEVICE_INLINE half round(const half& x) { return hrint(x); } static DEVICE_INLINE half floor(const half& x) { return hfloor(x); } @@ -597,6 +605,7 @@ struct Ops { #endif static DEVICE_INLINE halfx2 sgn(const halfx2& x) { halfx2 zero(0.f, 0.f); return __hsub2(__hlt2(zero, x), __hlt2(x, zero)); } + static DEVICE_INLINE halfx2 erf(const halfx2& x) { return {Ops::erf(x[0]), Ops::erf(x[1])}; } static DEVICE_INLINE halfx2 round(const halfx2& x) { return h2rint(x); } static DEVICE_INLINE halfx2 floor(const halfx2& x) { return h2floor(x); } @@ -714,6 +723,7 @@ UNARY(Sqr, sqr, Ops::sqr(x)); UNARY(Sqrt, sqrt, Ops::sqrt(x)); UNARY(Neg, operator-, Ops::neg(x)); UNARY(Sgn, sgn, Ops::sgn(x)); +UNARY(Erf, erf, Ops::erf(x)); UNARY(Round, round, Ops::round(x)); UNARY(Floor, floor, Ops::floor(x)); diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index a6504ebac..c928e8ce0 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -95,7 +95,7 @@ Expr swish(Expr a) { } Expr gelu(Expr a) { - return Expression(a, 1.702f); + return Expression(a); } Expr operator-(Expr a) { diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 27121fa6d..4e78e7166 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -1,16 +1,19 @@ #pragma once -#include "tensors/backend.h" -#include "tensors/tensor.h" - +#include "common/definitions.h" #include "functional/functional.h" #include "graph/node.h" +#include "tensors/backend.h" #include "tensors/tensor_operators.h" +#include "tensors/tensor.h" #ifdef CUDNN #include "tensors/gpu/cudnn_wrappers.h" #endif +#define _USE_MATH_DEFINES // enables math constants. We need M_PI +#include + namespace marian { struct UnaryNodeOp : public NaryNodeOp { @@ -417,6 +420,33 @@ struct SwishNodeOp : public UnaryNodeOp { float b_; }; +/** + * Represents a GELU node + * in an expression graph. 
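+ *
+ * Editor's sketch of the exact GELU used below (replacing the earlier Swish-based
+ * approximation with beta = 1.702):
+ *   forward:  gelu(x)  = 0.5 * x * (1 + erf(x / sqrt(2)))
+ *   backward: gelu'(x) = 0.5 * (1 + erf(x / sqrt(2))) + x * exp(-x^2 / 2) / sqrt(2 * pi)
+ * which matches the `dx` expression in backwardOps().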
+ */ +struct GeluNodeOp : public UnaryNodeOp { + GeluNodeOp(Expr a) : UnaryNodeOp(a) {} + + NodeOps forwardOps() override { + using namespace functional; + return { + NodeOp(Element(_1 = 0.5f * _2 * (1.f + erf(_2 / sqrt(2.f))), val_, child(0)->val())) + }; + } + + NodeOps backwardOps() override { + using namespace functional; + auto erf_prime = (2.f / sqrt((float)M_PI)) * exp(-(_1 * _1) / 2.f); + auto dx = 0.5 * (erf(_1 / sqrt(2.f)) + _1 * erf_prime / sqrt(2.f) + 1.f); + return {NodeOp(Add(dx * _2, + child(0)->grad(), + child(0)->val(), + adj_))}; + } + + const std::string type() override { return "gelu"; } +}; + struct SoftmaxNodeOp : public UnaryNodeOp { SoftmaxNodeOp(Expr a) : UnaryNodeOp(a) {} diff --git a/src/layers/embedding.cpp b/src/layers/embedding.cpp index 334f0b865..85c14f51b 100644 --- a/src/layers/embedding.cpp +++ b/src/layers/embedding.cpp @@ -40,6 +40,13 @@ Embedding::Embedding(Ptr graph, Ptr options) } } +#if 0 + auto emb = graph_->get(name); + if(emb) { + dimVoc = emb->shape()[-2]; + } +#endif + E_ = graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed); } diff --git a/src/layers_new/embeddings.h b/src/layers_new/embeddings.h index b7d297b63..e080906fe 100644 --- a/src/layers_new/embeddings.h +++ b/src/layers_new/embeddings.h @@ -202,9 +202,9 @@ struct LearnedPositionEmbedding : public PositionEmbeddingLayer { int dimEmb = input->shape()[-1]; int dimWords = input->shape()[positionAxis]; - registerParameter(embeddings, - Shape({maxLength, dimEmb}), - inits::glorotUniform(/*fanIn=*/false, /*fanOut=*/true)); + registerParameterLazy(embeddings, + Shape({maxLength, dimEmb}), + inits::glorotUniform(/*fanIn=*/false, /*fanOut=*/true)); ABORT_IF(start + dimWords > maxLength, "Number of positions ({}) starting at position {} exceeds maximum length {}", diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index 3302d9d85..8776820ef 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -212,7 +212,7 @@ struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLa * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. * @TODO: get rid of these transposes. */ -struct TransformerEncoder final : public LayerWithOptions, public IBinaryLayer { +struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { Ptr positionEmbedding; Ptr preprocessor; Ptr layers; @@ -250,6 +250,8 @@ struct TransformerEncoder final : public LayerWithOptions, public IBinaryLayer { registerLayer(postprocessor); } + virtual ~TransformerEncoder() = default; + Expr apply(Expr input, Expr mask = nullptr) const override { // first and last operations (see at the bottom of this function) switch the time and batch // dimensions. 
This order is more natural for the transformer, but more difficult to handle diff --git a/src/models/bert.h b/src/models/bert.h index 99dfae55e..1e0153e6c 100644 --- a/src/models/bert.h +++ b/src/models/bert.h @@ -238,6 +238,7 @@ class BertEncoder : public EncoderTransformer { ("prefix", "Wtype") ("dimVocab", dimTypeVocab) // sentence A or sentence B ("dimEmb", dimEmb) + ("inference", graph_->isInference()) .construct(graph_); signal = sentenceEmbeddings->applyIndices(bertBatch->bertSentenceIndices(), {dimWords, dimBatch, dimEmb}); } else { diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h new file mode 100644 index 000000000..cca18cac7 --- /dev/null +++ b/src/models/comet_qe.h @@ -0,0 +1,327 @@ +#pragma once + +#include "layers_new/transformer.h" + +#include "models/encoder.h" +#include "layers/constructors.h" + +namespace marian { +namespace models { + +struct CometEncoder final : public nn::TransformerEncoder { + Expr weights; + Expr gamma; + + CometEncoder(Ptr graph, + Ptr options) + : TransformerEncoder(graph, options) {} + + Expr apply(Expr input, Expr mask) const override { + auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] + + mask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] + auto binMask = mask; + mask = marian::nn::transposedLogMask(mask, opt("transformer-heads")); + + // apply positional embeddings to contextual input + output = positionEmbedding->apply(output); + + // handle for skip connection at top + auto prevOutput = output; + + // apply dropout or layer-norm to embeddings if required + output = preprocessor->apply(output); + + // This seems to be a mix of LayerNorm and BatchNorm and present in the original Unbabel code. + // It norms over time, not batch, also should be optimized. Seems safe to disable for custom + // models trained by us, but required when doing inference with Unbabel models. + auto cometNorm = [&, this](Expr x, Expr binMask) { + if(opt("comet-mix-norm", false)) { + registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); + int dimModel = x->shape()[-1]; + + // Convert type to fp32 for better accumulation. This is a no-op if things are already fp32. 
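+      // Editor's sketch of the statistics computed below, per sentence, with
+      // M = number of unmasked positions (sum over binMask) and D = dimModel:
+      //   mu    = sum_{t,d} x[t,d] / (D * M)
+      //   sigma = sum_{t,d} (x[t,d] - mu)^2 / (D * M)
+      //   out   = gamma * mean over unmasked t of (x - mu) / sqrt(sigma + 1e-12)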
+ Type origType = x->value_type(); + x = marian::cast(x, Type::float32); + binMask = marian::cast(binMask, Type::float32); + + x = x * binMask; + auto denom = (float)dimModel * sum(binMask, -2); + auto mu = sum(sum(x, -1), -2) / denom; // sum over model and time + auto sigma = sum(sum(square(x - mu), -1), -2) / denom; + + auto normed = (x - mu) / sqrt(sigma + 1e-12f); + auto output = marian::cast(gamma, Type::float32) * sum(normed * binMask, -2) / sum(binMask, -2); + + // Undo conversion to fp32 if not originally fp32 (most likely fp16 then) + return marian::cast(output, origType); + } else { + return sum(x * binMask, -2) / sum(binMask, -2); + } + }; + + std::vector pooler; + if(opt("comet-mix", false)) + pooler.push_back(cometNorm(output, binMask)); + + // traverse the layers, use the same mask for each + for(auto layer : *layers) { + output = layer->apply(output, mask); + if(opt("comet-mix", false)) + pooler.push_back(cometNorm(output, binMask)); // [ batch, time, modelDim ] + } + + if(opt("comet-mix", false)) { + registerParameterLazy(weights, Shape({ opt("enc-depth") + 1 }), inits::ones()); + auto weightsNorm = reshape(softmax(weights), {weights->shape()[-1], 1}); + output = sum(weightsNorm * concatenate(pooler, /*axis=*/-2), -2); // [batch, 1, modelDim] + } else { + // just use last layer, average over time dim + output = cometNorm(output, binMask); // [batch, 1, modelDim] + } + + return output; + } +}; + +// Wrapper for backwards compatibility that uses current encoder/decoder framework +struct CometBatchEncoder final : public nn::LayerWithOptions, + public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings + public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? + Ptr encoder; + + CometBatchEncoder(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options), + EncoderBase(graph, options) + { + encoder = New(graph, options); + registerLayer(encoder); + } + + // @TODO: subBatch should be of type Expr + virtual std::tuple apply(Ptr subBatch) const override { + // @TODO: this is still using the bad old interface + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + const auto& [batchEmbedding, batchMask] = embeddingLayer->apply(subBatch); + + auto batchContext = encoder->apply(batchEmbedding, batchMask); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return std::make_tuple(batchContext, batchMask); + } + + virtual Expr apply(const Words& words, const Shape& shape) const override final { + return applyIndices(toWordIndexVector(words), shape); + } + + // alternative from indices directly + virtual Expr applyIndices(const std::vector& wordIndices, const Shape& shape) const override final { + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + Expr batchEmbedding = embeddingLayer->applyIndices(wordIndices, shape); + auto batchContext = encoder->apply(batchEmbedding, /*mask=*/nullptr); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return batchContext; + } + + // @TODO: currently here for backwards compat, should be replaced with apply() + virtual Ptr build(Ptr graph, + Ptr batch) override { +#if 1 + // @TODO: this should be removed, currently hack to init graph. Should happen in graph groups and constructors + EncoderBase::graph_ = graph; + setGraph(graph); + // This makes sure that the graph passed into the model during construction and now evaluation are identical. 
+    // A good check to have for catching weird situations early.
+    ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match");
+#endif
+
+    const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]);
+    return New(batchEmbedding, batchMask, batch);
+  }
+
+  virtual void clear() override {
+    Layer::clear();
+  }
+};
+
+class CometQEPooler final : public nn::LayerWithOptions,
+                            public PoolerBase {
+private:
+  Ptr layers;
+  std::mt19937 rng{(uint32_t)Config::seed};
+
+public:
+  CometQEPooler(Ptr graph, Ptr options)
+  : LayerWithOptions(graph, options),
+    PoolerBase(graph, options) {
+
+    float dropoutProb = LayerWithOptions::opt("comet-dropout", 0.1f);
+    auto ffnHidden = LayerWithOptions::opt>("comet-pooler-ffn", {2048, 1024});
+    layers = New(
+      graph,
+      New(graph, ffnHidden[0]),
+      New(graph),
+      New(graph, dropoutProb),
+      New(graph, ffnHidden[1]),
+      New(graph),
+      New(graph, dropoutProb),
+      New(graph, 1)
+    );
+
+    if(LayerWithOptions::opt("comet-final-sigmoid"))
+      layers->append(New(graph));
+
+    registerLayer(layers);
+  }
+
+  std::vector apply(Ptr graph, Ptr batch, const std::vector>& encoderStates) override {
+#if 1
+    // @TODO: this should be removed, currently hack to init graph. Should happen in graph groups and constructors
+    PoolerBase::graph_ = graph;
+    setGraph(graph);
+    // This makes sure that the graph passed into the model during construction and now evaluation are identical.
+    // A good check to have for catching weird situations early.
+    ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match");
+#endif
+
+    auto beta = [](float alpha, std::mt19937& gen) {
+      // Draw x and y from Gamma(alpha, 1); x / (x + y) is then distributed as Beta(alpha, alpha)
+      std::gamma_distribution gamma(alpha, 1.f);
+      float x = gamma(gen);
+      float y = gamma(gen);
+      return x / (x + y);
+    };
+
+    auto mixup = [&](Expr x, Expr y, float alpha, bool reg=true) -> Expr2 {
+      if(alpha == 0.f)
+        return {x, y};
+
+      int dimBatch = x->shape()[-3];
+      Type xType = x->value_type();
+
+      std::vector indices(dimBatch);
+      std::iota(indices.begin(), indices.end(), 0);
+
+      // permute the indices and select batch entries accordingly
+      std::shuffle(indices.begin(), indices.end(), rng);
+      auto xPrime = index_select(x, -3, indices);
+      auto yPrime = index_select(y, -3, indices);
+
+      std::vector lambdasVec(dimBatch);
+      std::generate(lambdasVec.begin(), lambdasVec.end(), [&]{ return beta(alpha, rng); });
+      auto lambdas = graph->constant({dimBatch, 1, 1}, inits::fromVector(lambdasVec), Type::float32);
+
+      auto xMixup = (1.f - marian::cast(lambdas, xType)) * x + marian::cast(lambdas, xType) * xPrime;
+      auto yMixup = (1.f - lambdas) * y + lambdas * yPrime;
+
+      if(reg) {
+        // return original and mixed samples
+        xMixup = concatenate({x, xMixup}, /*axis=*/-2);
+        yMixup = concatenate({y, yMixup}, /*axis=*/-2);
+      }
+
+      return {xMixup, yMixup};
+    };
+
+    ABORT_IF(encoderStates.size() != 2, "Pooler expects exactly two encoder states");
+
+    auto src = encoderStates[0]->getContext();
+    auto mt = encoderStates[1]->getContext();
+
+    auto diff = abs(mt - src);
+    auto prod = mt * src;
+
+    Expr output;
+    if(LayerWithOptions::opt("usage") == (int)models::usage::embedding) {
+      auto embFwd = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model]
+      auto embBwd = concatenate({src, mt, prod, diff}, /*axis=*/-1); // [batch, 1, model]
+      auto emb = concatenate({embFwd, embBwd}, /*axis=*/-2);
+      output = layers->apply(emb);
+
+      int dimBatch = output->shape()[-3];
+      output = reshape(output, {dimBatch, 1, 2});
+      return { output };
+    } else {
+      auto emb = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model]
+
+      auto softLabelsWords = batch->front()->data();
+      auto classVocab = batch->front()->vocab();
+
+      int dimBatch = (int)softLabelsWords.size();
+      std::vector softLabels;
+      for(auto w : softLabelsWords) {
+        // @TODO: this is a super-ugly hack to get regression values
+        float score = w != Word::NONE ? std::stof((*classVocab)[w]) : 0.f;
+        softLabels.push_back(score);
+      }
+      auto labels = graph->constant({dimBatch, 1, 1}, inits::fromVector(softLabels), Type::float32);
+
+      if(getMode() == Mode::train) {
+        float mixupAlpha = LayerWithOptions::opt("comet-mixup", 0.f);
+        bool mixupReg = LayerWithOptions::opt("comet-mixup-reg", false);
+        auto xy = mixup(emb, labels, mixupAlpha, mixupReg);
+        emb = get<0>(xy);
+        labels = get<1>(xy);
+      }
+      output = marian::cast(layers->apply(emb), Type::float32);
+      return { output, labels };
+    }
+  }
+
+  void clear() override {}
+};
+
+// Wraps an EncoderPooler so it can produce a cost from raw logits. @TODO: Needs refactoring
+class CometBinaryCE final : public ICost {
+protected:
+  Ptr options_;
+  const bool inference_{false};
+  const bool rescore_{false};
+
+public:
+  CometBinaryCE(Ptr options)
+  : options_(options), inference_(options->get("inference", false)),
+    rescore_(options->get("cost-type", "ce-sum") == "ce-rescore") { }
+
+  Ptr apply(Ptr model,
+            Ptr graph,
+            Ptr batch,
+            bool clearGraph = true) override {
+    auto encpool = std::static_pointer_cast(model);
+    auto corpusBatch = std::static_pointer_cast(batch);
+
+    auto inputTypes = options_->get>("input-types", {});
+    ABORT_IF(inputTypes != std::vector({"class", "sequence", "sequence"}),
+             "Expected input-types to have the fields (class, sequence, sequence)");
+    ABORT_IF(corpusBatch->sets() != 3, "Expected 3 sub-batches, not {}", corpusBatch->sets());
+
+    auto lossFn = [&](Expr x, Expr y) {
+      float eps = 1e-5f;
+      if(!options_->get("comet-final-sigmoid"))
+        x = sigmoid(x);
+      return -(y * log(x + eps) + (1.f - y) * log((1.f + eps) - x));
+    };
+
+    auto encoded = encpool->apply(graph, corpusBatch, clearGraph);
+
+    Expr x = encoded[0];
+    Expr y = encoded[1];
+    auto loss = lossFn(x, y);
+
+    loss = mean(loss, /*axis=*/-2); // this should only do something with mixup regularization
+
+    int dimBatch = loss->shape()[-3];
+    if(rescore_)
+      loss = reshape(loss, {1, dimBatch, 1});
+    else
+      loss = sum(loss, /*axis=*/-3); // [1, 1, 1]
+
+    Ptr multiLoss = New();
+    RationalLoss lossPiece(loss, (float)dimBatch);
+    multiLoss->push_back(lossPiece);
+
+    return multiLoss;
+  }
+};
+
+} // namespace models
+} // namespace marian
+
diff --git a/src/models/encoder_pooler.h b/src/models/encoder_pooler.h
index 124d873c5..0a781c9d5 100644
--- a/src/models/encoder_pooler.h
+++ b/src/models/encoder_pooler.h
@@ -122,7 +122,6 @@ class EncoderPooler : public EncoderPoolerBase {
       "skip",
       "layer-normalization",
       "right-left",
-      "input-types",
      "special-vocab",
      "tied-embeddings",
      "tied-embeddings-src",
@@ -158,6 +157,12 @@ class EncoderPooler : public EncoderPoolerBase {
     modelFeatures_.insert("lemma-dependency");
     modelFeatures_.insert("factors-combine");
     modelFeatures_.insert("factors-dim-emb");
+
+    modelFeatures_.insert("comet-prepend-zero");
+    modelFeatures_.insert("comet-pooler-ffn");
+    modelFeatures_.insert("comet-final-sigmoid");
+    modelFeatures_.insert("comet-mix");
+    modelFeatures_.insert("comet-mix-norm");
   }

   virtual Ptr getOptions() override { return options_;
} diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 17ee2a4d9..40ba122a6 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -1,5 +1,7 @@ #include "marian.h" +#include "common/fastopt.h" + #include "models/model_factory.h" #include "models/encoder_decoder.h" #include "models/encoder_classifier.h" @@ -14,6 +16,8 @@ #include "models/transformer_factory.h" #include "models/transformer_new.h" +#include "models/comet_qe.h" + #ifdef CUDNN #include "models/char_s2s.h" #endif @@ -46,7 +50,7 @@ Ptr EncoderFactory::construct(Ptr graph) { if(options_->get("type") == "bert-encoder") return New(graph, options_); - ABORT("Unknown encoder type"); + ABORT("Unknown encoder type {}", options_->get("type")); } Ptr DecoderFactory::construct(Ptr graph) { @@ -69,7 +73,7 @@ Ptr ClassifierFactory::construct(Ptr graph) { Ptr PoolerFactory::construct(Ptr graph) { if(options_->get("type") == "max-pooler") return New(graph, options_); - if(options_->get("type") == "slice-pooler") + else if(options_->get("type") == "slice-pooler") return New(graph, options_); else if(options_->get("type") == "sim-pooler") return New(graph, options_); @@ -136,6 +140,34 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti size_t fields = trainEmbedderRank ? dimVocabs.size() : 0; int dimVocab = dimVocabs[0]; + if(type == "comet-qe") { + auto newOptions = options->with("usage", use); + auto res = New(newOptions); + + auto inputTypes = options->get>("input-types"); + ABORT_IF(inputTypes.empty(), + "Required option --input-types for COMET-QE not set. " + "For inference that should be --input-types sequence sequence. " + "For training set --input-types class sequence sequence"); + + int shift = 0; + if(inputTypes[0] == "class") + shift = 1; + + auto enc1 = New(graph, newOptions->with("type", "transformer", "index", 0 + shift)); + enc1->setName("CometEncoder"); + res->push_back(enc1); + + auto enc2 = New(graph, newOptions->with("type", "transformer", "index", 1 + shift)); + enc2->setName("CometEncoder"); + res->push_back(enc2); + + auto pooler = New(graph, newOptions); + pooler->setName("CometQEPooler"); + res->push_back(pooler); + return res; + } + Ptr newOptions; if(options->get("compute-similarity", false)) { newOptions = options->with("usage", use, @@ -175,6 +207,28 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti return res; } + if(use == usage::training || use == usage::scoring) { + if(type == "comet-qe") { + auto newOptions = options->with("usage", use); + auto res = New(newOptions); + + // For training, first rank in batch is class! 
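+ // (sub-batch 0 carries the regression score used as the training label, so the two text encoders below read sub-batches at indices 1 and 2)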
+ + auto enc1 = New(graph, newOptions->with("type", "transformer", "index", 1)); + enc1->setName("CometEncoder"); + res->push_back(enc1); + + auto enc2 = New(graph, newOptions->with("type", "transformer", "index", 2)); + enc2->setName("CometEncoder"); + res->push_back(enc2); + + auto pooler = New(graph, newOptions); + pooler->setName("CometQEPooler"); + res->push_back(pooler); + return res; + } + } + if(type == "s2s" || type == "amun" || type == "nematus") { return models::encoder_decoder(options->with( "usage", use, @@ -435,6 +489,8 @@ Ptr createCriterionFunctionFromOptions(Ptr options, return New(baseModel, New()); #endif #endif + else if (type == "comet-qe" && std::dynamic_pointer_cast(baseModel)) + return New(baseModel, New(options)); else if (std::dynamic_pointer_cast(baseModel)) return New(baseModel, New(options)); else diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc index 6d4c4a95d..1b233bb1b 100755 --- a/src/tensors/gpu/add.inc +++ b/src/tensors/gpu/add.inc @@ -36,6 +36,7 @@ template void Add, BinaryFunctor >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::gpu::Add, marian::functional::Assignee<2> > >, IntrusivePtr, IntrusivePtr >(marian::functional::UnaryFunctor, marian::functional::Assignee<2> > >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::gpu::Aggregate >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, IntrusivePtr >(marian::functional::UnaryFunctor >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); -template void marian::gpu::Add,marian::functional::UnaryFunctor > >,class IntrusivePtr,class IntrusivePtr >(marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,class IntrusivePtr,class IntrusivePtr,class IntrusivePtr); -template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); -template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void marian::gpu::Add,marian::functional::UnaryFunctor > >,class IntrusivePtr,class IntrusivePtr >(marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,class IntrusivePtr,class IntrusivePtr,class IntrusivePtr); +template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void marian::gpu::Add, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr 
>(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); \ No newline at end of file diff --git a/src/tensors/gpu/add_all.inc b/src/tensors/gpu/add_all.inc index ba466d895..b983b7b7e 100644 --- a/src/tensors/gpu/add_all.inc +++ b/src/tensors/gpu/add_all.inc @@ -41,6 +41,7 @@ template void marian::AggregateAll, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); #if COMPILE_FP16 template void AggregateAll<__half, float, BinaryFunctor>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); @@ -83,4 +84,5 @@ template void marian::AggregateAll<__half, float, marian::functional::UnaryFunct template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > 
>(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll<__half, float, marian::functional::Assignee<1>, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); #endif diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc index edec0e1a7..730817849 100755 --- a/src/tensors/gpu/element.inc +++ b/src/tensors/gpu/element.inc @@ -73,6 +73,8 @@ template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > > >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::Assignee<3> > >, IntrusivePtr, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::Assignee<3> > >, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr, IntrusivePtr); + // How to add new specializations: // When you use a new specialization, it will cause a link error of this form (example): // .../src/tensors/tensor_operators.h:41: undefined reference to `void marian::gpu::Element ( ... 
)' diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index cb95470f4..c160332e4 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -563,8 +563,7 @@ Ptr GraphGroup::collectStats(Ptr graph, size_t step = options_->get("mini-batch-fit-step"); size_t maxLength = options_->get("max-length"); - maxLength = (size_t)(std::ceil(maxLength / (float)step) * step); - + // this should be only one class label per line on input, hence restricting length to 1 std::vector localMaxes(numFiles, maxLength); auto inputTypes = options_->get>("input-types", {}); @@ -599,7 +598,11 @@ Ptr GraphGroup::collectStats(Ptr graph, // Do a binary search for maxmimum batch size that fits into given workspace memory // for a tested sentence length. - for(size_t i = step; i <= maxLength; i += step) { + // We round the maxLength to the next larger step to avoid a situation where we do not + // collect batch statistics for maximum length between steps. However, we do not exceed + // the actual maxLength even if the rounded value is larger. + size_t maxLengthRounded = (size_t)(std::ceil(maxLength / (float)step) * step); + for(size_t i = step; i <= maxLengthRounded; i += step) { size_t start = 1; size_t end = maxBatch; diff --git a/src/training/validator.cpp b/src/training/validator.cpp index ef1bac3db..cdc5ef5ac 100644 --- a/src/training/validator.cpp +++ b/src/training/validator.cpp @@ -1,4 +1,5 @@ #include "training/validator.h" +#include "embedder/vector_collector.h" namespace marian { @@ -37,6 +38,9 @@ std::vector*/>> Validators( } else if(metric == "bert-sentence-accuracy") { auto validator = New(vocabs, config, false); validators.push_back(validator); + } else if(metric == "embedding") { + auto validator = New(vocabs, config); + validators.push_back(validator); } else { ABORT("Unknown validation metric: {}", metric); } @@ -437,6 +441,115 @@ float TranslationValidator::validate(const std::vector>& gr return val; }; +/////////////////////////////////////////////////////////////////////////////////////// +EmbeddingValidator::EmbeddingValidator(std::vector> vocabs, Ptr options) + : Validator(vocabs, options, false), quiet_(options_->get("quiet-translation")) { + // @TODO: remove, only used for saving? + builder_ = models::createModelFromOptions(options_, models::usage::embedding); + + if(!options_->hasAndNotEmpty("valid-script-path")) + LOG_VALID(warn, "No post-processing script given for validating translator"); + + createBatchGenerator(/*isTranslating=*/true); +} + +float EmbeddingValidator::validate(const std::vector>& graphs, + Ptr state) { + using namespace data; + + // Generate batches + batchGenerator_->prepare(); + + std::vector> models; + for(auto graph : graphs) { + models.push_back(models::createModelFromOptions(options_, models::usage::embedding)); + graph->setInference(true); + } + + // Set up output file + std::string fileName; + Ptr tempFile; + + if(options_->hasAndNotEmpty("valid-translation-output")) { + fileName = options_->get("valid-translation-output"); + // fileName can be a template with fields for training state parameters: + fileName = state->fillTemplate(fileName); + } else { + tempFile.reset(new io::TemporaryFile(options_->get("tempdir"), false)); + fileName = tempFile->getFileName(); + } + + timer::Timer timer; + { + // @TODO: This can be simplified. If there is no "valid-translation-output", fileName already + // contains the name of temporary file that should be used? + auto output = options_->hasAndNotEmpty("valid-translation-output") + ? 
New(fileName) + : New(tempFile->getFileName()); + + std::deque> graphQueue(graphs.begin(), graphs.end()); + std::deque> modelQueue(models.begin(), models.end()); + auto task = [=, &graphQueue, &modelQueue](BatchPtr batch) { + thread_local Ptr graph; + thread_local Ptr builder; + + if(!graph) { + std::unique_lock lock(mutex_); + ABORT_IF(graphQueue.empty(), "Asking for graph, but none left on queue"); + graph = graphQueue.front(); + graphQueue.pop_front(); + + ABORT_IF(modelQueue.empty(), "Asking for scorer, but none left on queue"); + builder = modelQueue.front(); + modelQueue.pop_front(); + } + + auto embedder = std::dynamic_pointer_cast(builder); + auto corpusBatch = std::dynamic_pointer_cast(batch); + auto embeddings = cast(embedder->apply(graph, corpusBatch, /*clearGraph=*/true)[0], Type::float32); + + graph->forward(); + + std::vector sentVectors; + embeddings->val()->get(sentVectors); + + // collect embedding vector per sentence. + // if we compute similarities this is only one similarity per sentence pair. + for(size_t i = 0; i < batch->size(); ++i) { + auto embSize = embeddings->shape()[-1]; + auto beg = i * embSize; + auto end = (i + 1) * embSize; + std::vector sentVector(sentVectors.begin() + beg, sentVectors.begin() + end); + output->Write((long)batch->getSentenceIds()[i], sentVector); + } + }; + + threadPool_.reserve(graphs.size()); + TaskBarrier taskBarrier; + for(auto batch : *batchGenerator_) + taskBarrier.push_back(threadPool_.enqueue(task, batch)); + // ~TaskBarrier waits until all are done + } + + for(auto graph : graphs) + graph->setInference(false); + + float val = 0.0f; + + // Run post-processing script if given + if(options_->hasAndNotEmpty("valid-script-path")) { + // auto command = options_->get("valid-script-path") + " " + fileName; + // auto valStr = utils::exec(command); + auto valStr = utils::exec(options_->get("valid-script-path"), + options_->get>("valid-script-args"), + fileName); + val = (float)std::atof(valStr.c_str()); + updateStalled(graphs, val); + } + + return val; +}; + /////////////////////////////////////////////////////////////////////////////////////// SacreBleuValidator::SacreBleuValidator(std::vector> vocabs, Ptr options, const std::string& metric) : Validator(vocabs, options, /*lowerIsBetter=*/false), diff --git a/src/training/validator.h b/src/training/validator.h index 16bfd2457..d7580a500 100644 --- a/src/training/validator.h +++ b/src/training/validator.h @@ -359,6 +359,25 @@ class SacreBleuValidator : public Validator { bool quiet_{ false }; }; +// Validator that writes embeddings to a file and computes any metric specified with an external script +class EmbeddingValidator : public Validator { +public: + EmbeddingValidator(std::vector> vocabs, Ptr options); + virtual ~EmbeddingValidator() {} + + virtual float validate(const std::vector>& graphs, + Ptr state) override; + + std::string type() override { return "embed"; } + +protected: + bool quiet_{false}; + + virtual float validateBG(const std::vector>& /*graphs*/) override { + return 0; + } +}; + /** * @brief Creates validators from options * From 30f41daf96c1bb3e6c4e346f2f5d5dd7d4ab74bb Mon Sep 17 00:00:00 2001 From: Fai Sigalov Date: Thu, 16 Mar 2023 01:11:47 +0000 Subject: [PATCH 05/26] Merged PR 28460: Revert "Merged PR 26311: [FSM] make model loading lock non-static" locally I see this is causing a 5% regression in startup time, and we see a regression in prod as well. 
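For context, a minimal sketch of the locking pattern this revert restores (illustrative names, not the actual `FactoredVocab` code): one-time vocabulary loading serializes on a single function-local static mutex shared by all instances, instead of a per-instance `loadMtx_` member.

```cpp
#include <cstddef>
#include <mutex>
#include <string>

// Illustrative sketch only (not the actual FactoredVocab code): with a
// function-local static mutex, every one-time vocabulary load in the process
// takes the same lock; the reverted change had given each instance its own
// loadMtx_ member instead.
struct VocabSketch {
  size_t size_ = 0;

  size_t loadOnce(const std::string& modelPath) {
    static std::mutex s_mtx;                  // shared by all instances (post-revert behavior)
    std::lock_guard<std::mutex> criticalSection(s_mtx);
    if(size_ != 0)                            // already loaded; assume shared vocab and skip
      return size_;
    // ... parse the factored vocabulary from modelPath and set size_ ...
    (void)modelPath;
    return size_;
  }
};
```

The trade-off, per the PR description, is coarser locking in exchange for the faster startup profile measured before the member mutex was introduced.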
Revert "Merged PR 26311: [FSM] make model loading lock non-static" This reverts commit 4f145c450f2b4b956d175fbbfe118a90e494acf4. --- src/data/factored_vocab.cpp | 3 ++- src/data/factored_vocab.h | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/factored_vocab.cpp b/src/data/factored_vocab.cpp index f51869d56..caee2e0c3 100644 --- a/src/data/factored_vocab.cpp +++ b/src/data/factored_vocab.cpp @@ -21,7 +21,8 @@ namespace marian { maxSizeUnused; // If model has already been loaded, then assume this is a shared object, and skip loading it again. // This can be multi-threaded, so must run under lock. - std::lock_guard criticalSection(loadMtx_); + static std::mutex s_mtx; + std::lock_guard criticalSection(s_mtx); if (size() != 0) { //LOG(info, "[vocab] Attempting to load model a second time; skipping (assuming shared vocab)"); return size(); diff --git a/src/data/factored_vocab.h b/src/data/factored_vocab.h index edbee1544..b644ce4c4 100644 --- a/src/data/factored_vocab.h +++ b/src/data/factored_vocab.h @@ -110,7 +110,6 @@ class FactoredVocab : public IVocab { Word unkId_{}; WordLUT vocab_; size_t lemmaSize_; - std::mutex loadMtx_; // factors char factorSeparator_ = '|'; // separator symbol for parsing factored words From 26b178c19cf71c3254046688eea66edc61f6ea36 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Fri, 17 Mar 2023 18:57:24 +0000 Subject: [PATCH 06/26] Merged PR 28179: comet2marian.py: download comet models automatically. --comet argument can be either a model path or model ID ``` --comet COMET, -c COMET COMET model path or an ID: wmt20-comet-qe-da, wmt20-comet-qe-da-v2, wmt21-comet-qe-mqm, wmt21-comet-qe-da ``` --- scripts/comet/comet2marian.py | 48 ++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 12 deletions(-) mode change 100644 => 100755 scripts/comet/comet2marian.py diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py old mode 100644 new mode 100755 index 9ddbb45c1..2a2ee7777 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -4,30 +4,54 @@ """ import argparse -import yaml +import logging as log import numpy as np +import yaml + +from pathlib import Path + +## Uncomment to see model names supported by your installed version of unbabel-comet +# from comet.models import available_metrics +# supported_comets = [m for m in available_metrics if 'qe' in m.lower()] +supported_comets = ['wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da'] +log.basicConfig(level=log.INFO) parser = argparse.ArgumentParser(description='Convert Unbabel COMET-QE models to Marian weight file.') inputs = parser.add_mutually_exclusive_group(required=True) -inputs.add_argument('--comet', help='Path to COMET model') -inputs.add_argument('--roberta', help='Initialize with Roberta model', action='store_true') -parser.add_argument('--marian', help='Output path for Marian weight file', required=True) +inputs.add_argument('--roberta', '-r', help='Initialize with Roberta model', action='store_true') +inputs.add_argument('--comet', '-c', help=f'COMET model path or an ID: {", ".join(supported_comets)}') +parser.add_argument('--marian', '-m', help='Output path for Marian weight file', required=True) parser.add_argument('-s', '--add_sigmoid', help='Add final sigmoid if not already present', action='store_true') args = parser.parse_args() -if args.roberta: +def load_from_huggingface(model_id): + log.info(f"Loading COMET model from huggingface {model_id}") from transformers import AutoModel + 
try: + model = AutoModel.from_pretrained(model_id, add_pooling_layer=False) + except: + log.error(f"Could not resolve {model_id} from huggingface") + raise + return model.eval() + + +if args.roberta: # Load the model that Unbabel based COMET on: https://huggingface.co/microsoft/infoxlm-large - robertaModel = AutoModel.from_pretrained("microsoft/infoxlm-large", add_pooling_layer=False) - robertaModel.eval() - print(robertaModel) - cometModel = robertaModel + cometModel = load_from_huggingface("microsoft/infoxlm-large") else: - from comet import load_from_checkpoint - cometModel = load_from_checkpoint(args.comet) + from comet import load_from_checkpoint, download_model + model_path = args.comet + if not Path(model_path).exists(): + if model_path not in supported_comets: + log.info(f"Could not find {model_path}") # maybe it's an invalid path + log.info(f"trying to resolve download {model_path}") + model_path = download_model(model_path) + log.info(f"Loading COMET model from checkpoint {model_path}") + cometModel = load_from_checkpoint(model_path) cometModel.eval() - print(cometModel) + +print(cometModel) marianModel = dict() From cd4d1ec49616288eadb87c15df4ec1566e1e3b1e Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 30 Mar 2023 07:17:55 +0000 Subject: [PATCH 07/26] Merged PR 28674: Add --early-stopping-epsilon param The new option `--early-stopping-epsilon` sets minimum required improvement to consider a consecutive validation score not a stalled one. You must set a single value or a separate epsilon for each validation metric. Negative values are allowed. Regression tests: https://github.com/marian-nmt/marian-regression-tests/pull/90 --- src/common/config_parser.cpp | 14 ++++--- src/common/config_validator.cpp | 5 +++ src/training/training.h | 4 +- src/training/validator.cpp | 68 +++++++++++++++++++-------------- src/training/validator.h | 41 ++++++++++++-------- 5 files changed, 81 insertions(+), 51 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index aaeeb514b..16d090897 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -602,14 +602,18 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { "Multiple metrics can be specified", {"cross-entropy"}); cli.add("--valid-reset-stalled", - "Reset stalled validation metrics when the training is restarted"); + "Reset stalled validation metrics when the training is restarted"); cli.add("--valid-reset-all", - "Reset all validation metrics when the training is restarted"); + "Reset all validation metrics when the training is restarted"); cli.add("--early-stopping", - "Stop if the first validation metric does not improve for arg consecutive validation steps", - 10); + "Stop if the first validation metric does not improve for arg consecutive validation steps", + 10); + cli.add>("--early-stopping-epsilon", + "An improvement lower than or equal to arg does not prevent stalled validation. " + "i-th value corresponds to i-th metric in --valid-metrics", + {0}); cli.add("--early-stopping-on", - "Decide if early stopping should take into account first, all, or any validation metrics" + "Decide if early stopping should take into account first, all, or any validation metrics. 
" "Possible values: first, all, any", "first"); diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index 6c6b002aa..5563b240d 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -141,6 +141,11 @@ void ConfigValidator::validateOptionsTraining() const { ABORT_IF(supportedStops.find(get("early-stopping-on")) == supportedStops.end(), "Supported options for --early-stopping-on are: first, all, any"); + // check if --early-stopping-epsilon is provided for each validation metric or is a single value + auto epsilons = get>("early-stopping-epsilon"); + ABORT_IF(epsilons.size() > 1 && epsilons.size() != get>("valid-metrics").size(), + "--early-stopping-epsilon must have as many values as there is --valid-metrics or only one"); + // validations for learning rate decaying ABORT_IF(get("lr-decay") > 1.f, "Learning rate decay factor greater than 1.0 is unusual"); diff --git a/src/training/training.h b/src/training/training.h index a5723f308..7f6176879 100644 --- a/src/training/training.h +++ b/src/training/training.h @@ -23,10 +23,10 @@ class Train : public ModelTask { void run() override { using namespace data; - + // MPI init should be first thing in training auto mpi = initMPI(/*multiThreaded=*/!options_->get("sync-sgd")); // @TODO: do we need the multiThreaded distinction at all? - + if(mpi) { // if we run MPI, then make sure to sync seed across processes as first action mpi->bCast(&Config::seed, 1, IMPIWrapper::getDataType(&Config::seed)); LOG(info, "Synced seed {}", Config::seed); diff --git a/src/training/validator.cpp b/src/training/validator.cpp index cdc5ef5ac..bd9068acf 100644 --- a/src/training/validator.cpp +++ b/src/training/validator.cpp @@ -1,49 +1,60 @@ #include "training/validator.h" #include "embedder/vector_collector.h" +#include + namespace marian { +static std::vector CE_METRICS + = {"cross-entropy", "ce-mean", "ce-sum", "ce-mean-words", "perplexity"}; + std::vector*/>> Validators( std::vector> vocabs, Ptr config) { std::vector*/>> validators; - auto validMetrics = config->get>("valid-metrics"); - - std::vector ceMetrics - = {"cross-entropy", "ce-mean", "ce-sum", "ce-mean-words", "perplexity"}; + auto epsilonsVec = config->get>("early-stopping-epsilon"); + std::deque epsilons(epsilonsVec.begin(), epsilonsVec.end()); + auto eps = epsilons.front(); + epsilons.pop_front(); + auto validMetrics = config->get>("valid-metrics"); for(auto metric : validMetrics) { - if(std::find(ceMetrics.begin(), ceMetrics.end(), metric) != ceMetrics.end()) { + if(std::find(CE_METRICS.begin(), CE_METRICS.end(), metric) != CE_METRICS.end()) { Ptr opts = New(*config); opts->set("cost-type", metric); - auto validator = New(vocabs, opts); + auto validator = New(vocabs, opts, eps); validators.push_back(validator); } else if(metric == "valid-script") { - auto validator = New(vocabs, config); + auto validator = New(vocabs, config, eps); validators.push_back(validator); } else if(metric == "translation") { - auto validator = New(vocabs, config); + auto validator = New(vocabs, config, eps); validators.push_back(validator); } else if(metric == "bleu" || metric == "bleu-detok" || metric == "bleu-segmented" || metric == "chrf") { - auto validator = New(vocabs, config, metric); + auto validator = New(vocabs, config, metric, eps); validators.push_back(validator); } else if(metric == "accuracy") { - auto validator = New(vocabs, config); + auto validator = New(vocabs, config, eps); validators.push_back(validator); } else if(metric == "bert-lm-accuracy") { - 
auto validator = New(vocabs, config, true); + auto validator = New(vocabs, config, true, eps); validators.push_back(validator); } else if(metric == "bert-sentence-accuracy") { - auto validator = New(vocabs, config, false); + auto validator = New(vocabs, config, false, eps); validators.push_back(validator); } else if(metric == "embedding") { - auto validator = New(vocabs, config); + auto validator = New(vocabs, config, eps); validators.push_back(validator); } else { ABORT("Unknown validation metric: {}", metric); } + + if(!epsilons.empty()) { + eps = epsilons.front(); + epsilons.pop_front(); + } } return validators; @@ -63,8 +74,8 @@ void ValidatorBase::actAfterLoaded(TrainingState& state) { } /////////////////////////////////////////////////////////////////////////////////////// -CrossEntropyValidator::CrossEntropyValidator(std::vector> vocabs, Ptr options) - : Validator(vocabs, options) { +CrossEntropyValidator::CrossEntropyValidator(std::vector> vocabs, Ptr options, float epsilon) + : Validator(vocabs, options, true, epsilon) { createBatchGenerator(/*isTranslating=*/false); auto opts = options_->with("inference", @@ -126,8 +137,8 @@ float CrossEntropyValidator::validateBG(const std::vector>& } /////////////////////////////////////////////////////////////////////////////////////// -AccuracyValidator::AccuracyValidator(std::vector> vocabs, Ptr options) - : Validator(vocabs, options, /*lowerIsBetter=*/false) { +AccuracyValidator::AccuracyValidator(std::vector> vocabs, Ptr options, float epsilon) + : Validator(vocabs, options, /*lowerIsBetter=*/false, epsilon) { createBatchGenerator(/*isTranslating=*/false); // @TODO: remove, only used for saving? @@ -200,8 +211,9 @@ float AccuracyValidator::validateBG(const std::vector>& gra /////////////////////////////////////////////////////////////////////////////////////// BertAccuracyValidator::BertAccuracyValidator(std::vector> vocabs, Ptr options, - bool evalMaskedLM) - : Validator(vocabs, options, /*lowerIsBetter=*/false), evalMaskedLM_(evalMaskedLM) { + bool evalMaskedLM, + float epsilon) + : Validator(vocabs, options, /*lowerIsBetter=*/false, epsilon), evalMaskedLM_(evalMaskedLM) { createBatchGenerator(/*isTranslating=*/false); // @TODO: remove, only used for saving? builder_ = models::createModelFromOptions(options_, models::usage::raw); @@ -295,8 +307,8 @@ float BertAccuracyValidator::validateBG(const std::vector>& } /////////////////////////////////////////////////////////////////////////////////////// -ScriptValidator::ScriptValidator(std::vector> vocabs, Ptr options) - : Validator(vocabs, options, false) { +ScriptValidator::ScriptValidator(std::vector> vocabs, Ptr options, float epsilon) + : Validator(vocabs, options, false, epsilon) { // @TODO: remove, only used for saving? builder_ = models::createModelFromOptions(options_, models::usage::raw); @@ -322,8 +334,8 @@ float ScriptValidator::validate(const std::vector>& graphs, } /////////////////////////////////////////////////////////////////////////////////////// -TranslationValidator::TranslationValidator(std::vector> vocabs, Ptr options) - : Validator(vocabs, options, false), quiet_(options_->get("quiet-translation")) { +TranslationValidator::TranslationValidator(std::vector> vocabs, Ptr options, float epsilon) + : Validator(vocabs, options, false, epsilon), quiet_(options_->get("quiet-translation")) { // @TODO: remove, only used for saving? 
builder_ = models::createModelFromOptions(options_, models::usage::translation); @@ -442,8 +454,8 @@ float TranslationValidator::validate(const std::vector>& gr }; /////////////////////////////////////////////////////////////////////////////////////// -EmbeddingValidator::EmbeddingValidator(std::vector> vocabs, Ptr options) - : Validator(vocabs, options, false), quiet_(options_->get("quiet-translation")) { +EmbeddingValidator::EmbeddingValidator(std::vector> vocabs, Ptr options, float epsilon) + : Validator(vocabs, options, false, epsilon), quiet_(options_->get("quiet-translation")) { // @TODO: remove, only used for saving? builder_ = models::createModelFromOptions(options_, models::usage::embedding); @@ -478,7 +490,7 @@ float EmbeddingValidator::validate(const std::vector>& grap tempFile.reset(new io::TemporaryFile(options_->get("tempdir"), false)); fileName = tempFile->getFileName(); } - + timer::Timer timer; { // @TODO: This can be simplified. If there is no "valid-translation-output", fileName already @@ -551,8 +563,8 @@ float EmbeddingValidator::validate(const std::vector>& grap }; /////////////////////////////////////////////////////////////////////////////////////// -SacreBleuValidator::SacreBleuValidator(std::vector> vocabs, Ptr options, const std::string& metric) - : Validator(vocabs, options, /*lowerIsBetter=*/false), +SacreBleuValidator::SacreBleuValidator(std::vector> vocabs, Ptr options, const std::string& metric, float epsilon) + : Validator(vocabs, options, /*lowerIsBetter=*/false, epsilon), metric_(metric), computeChrF_(metric == "chrf"), useWordIds_(metric == "bleu-segmented"), diff --git a/src/training/validator.h b/src/training/validator.h index d7580a500..aed710778 100644 --- a/src/training/validator.h +++ b/src/training/validator.h @@ -30,16 +30,18 @@ class ValidatorBase : public TrainingObserver { protected: bool lowerIsBetter_{true}; float lastBest_; + float epsilon_{0.f}; size_t stalled_{0}; std::mutex mutex_; ThreadPool threadPool_; public: - ValidatorBase(bool lowerIsBetter) : lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {} + ValidatorBase(bool lowerIsBetter, float epsilon = 0.f) + : lowerIsBetter_(lowerIsBetter), lastBest_(initScore()), epsilon_(epsilon) {} virtual ~ValidatorBase() {} - virtual float validate(const std::vector>& graphs, - Ptr state) = 0; + virtual float validate(const std::vector>& graphs, Ptr state) = 0; + virtual std::string type() = 0; float& lastBest() { return lastBest_; } @@ -53,8 +55,8 @@ template // @TODO: BuilderType doesn't really class Validator : public ValidatorBase { public: virtual ~Validator() {} - Validator(std::vector> vocabs, Ptr options, bool lowerIsBetter = true) - : ValidatorBase(lowerIsBetter), + Validator(std::vector> vocabs, Ptr options, bool lowerIsBetter = true, float epsilon = 0.f) + : ValidatorBase(lowerIsBetter, epsilon), vocabs_(vocabs), // options_ is a clone of global options, so it can be safely modified within the class options_(New(options->clone())) { @@ -119,13 +121,20 @@ class Validator : public ValidatorBase { void updateStalled(const std::vector>& graphs, float val) { - if((lowerIsBetter_ && lastBest_ > val) - || (!lowerIsBetter_ && lastBest_ < val)) { - stalled_ = 0; + if((lowerIsBetter_ && lastBest_ > val) || (!lowerIsBetter_ && lastBest_ < val)) { + // If epsilon is given, reset the stall count only if the improvement is greater than the epsilon + if(epsilon_ != 0.f && ((lowerIsBetter_ && lastBest_ - val < epsilon_) + || (!lowerIsBetter_ && val - lastBest_ < epsilon_))) { + stalled_++; + } else 
{ + stalled_ = 0; + } lastBest_ = val; if(options_->get("keep-best")) keepBest(graphs); - } else /* if (lastBest_ != val) */ { // (special case 0 at start) @TODO: needed? Seems stall count gets reset each time it does improve. If not needed, remove "if(...)" again. + } else /* if (lastBest_ != val) */ { // (special case 0 at start) + // @TODO: needed? Seems stall count gets reset each time it does improve. + // If not needed, remove "if(...)" again. stalled_++; } } @@ -142,7 +151,7 @@ class CrossEntropyValidator : public Validator> vocabs, Ptr options); + CrossEntropyValidator(std::vector> vocabs, Ptr options, float epsilon = 0.f); virtual ~CrossEntropyValidator() {} std::string type() override { return options_->get("cost-type"); } @@ -154,7 +163,7 @@ class CrossEntropyValidator : public Validator { public: - AccuracyValidator(std::vector> vocabs, Ptr options); + AccuracyValidator(std::vector> vocabs, Ptr options, float epsilon = 0.f); virtual ~AccuracyValidator() {} std::string type() override { return "accuracy"; } @@ -168,7 +177,7 @@ class BertAccuracyValidator : public Validator { bool evalMaskedLM_{true}; public: - BertAccuracyValidator(std::vector> vocabs, Ptr options, bool evalMaskedLM); + BertAccuracyValidator(std::vector> vocabs, Ptr options, bool evalMaskedLM, float epsilon = 0.f); virtual ~BertAccuracyValidator() {} std::string type() override { @@ -185,7 +194,7 @@ class BertAccuracyValidator : public Validator { class ScriptValidator : public Validator { public: - ScriptValidator(std::vector> vocabs, Ptr options); + ScriptValidator(std::vector> vocabs, Ptr options, float epsilon = 0.f); virtual ~ScriptValidator() {} virtual float validate(const std::vector>& graphs, @@ -202,7 +211,7 @@ class ScriptValidator : public Validator { // validator that translates and computes BLEU (or any metric) with an external script class TranslationValidator : public Validator { public: - TranslationValidator(std::vector> vocabs, Ptr options); + TranslationValidator(std::vector> vocabs, Ptr options, float epsilon = 0.f); virtual ~TranslationValidator() {} virtual float validate(const std::vector>& graphs, @@ -223,7 +232,7 @@ class TranslationValidator : public Validator { // @TODO: combine with TranslationValidator (above) to avoid code duplication class SacreBleuValidator : public Validator { public: - SacreBleuValidator(std::vector> vocabs, Ptr options, const std::string& metric); + SacreBleuValidator(std::vector> vocabs, Ptr options, const std::string& metric, float epsilon = 0.f); virtual ~SacreBleuValidator() {} virtual float validate(const std::vector>& graphs, @@ -362,7 +371,7 @@ class SacreBleuValidator : public Validator { // Validator that writes embeddings to a file and computes any metric specified with an external script class EmbeddingValidator : public Validator { public: - EmbeddingValidator(std::vector> vocabs, Ptr options); + EmbeddingValidator(std::vector> vocabs, Ptr options, float epsilon = 0.f); virtual ~EmbeddingValidator() {} virtual float validate(const std::vector>& graphs, From a42147675148c89e909135f6325e5bcdde7a5e8f Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Thu, 13 Apr 2023 18:30:45 +0000 Subject: [PATCH 08/26] Merged PR 28502: Comet2Marian: add --spm argument to download vocabulary file Adds --spm argument to download vocabulary file in comet2marian.py conversion script. 
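For reference, the epsilon-gated stall rule that PR 28674 above threads through `Validator::updateStalled` reduces to the following minimal sketch (higher-is-better metric assumed; simplified names, not Marian's actual class):

```cpp
#include <cstddef>

// Minimal sketch of the epsilon-gated stall rule (higher-is-better metric
// assumed; simplified from Validator::updateStalled, not the actual class).
struct StallTracker {
  float lastBest = 0.f;  // best score seen so far
  float epsilon  = 0.f;  // from --early-stopping-epsilon
  size_t stalled = 0;

  void update(float val) {
    if(val > lastBest) {
      // New best, but an improvement below epsilon still counts as stalled.
      if(epsilon != 0.f && val - lastBest < epsilon)
        ++stalled;
      else
        stalled = 0;
      lastBest = val;  // the best score is updated either way
    } else {
      ++stalled;
    }
  }
};
```

Note that `lastBest` is updated even when the improvement falls below epsilon, so small gains still accumulate while the stall counter keeps growing.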
--- scripts/comet/comet2marian.py | 52 +++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index 2a2ee7777..8ef4d29fc 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -22,36 +22,64 @@ inputs.add_argument('--comet', '-c', help=f'COMET model path or an ID: {", ".join(supported_comets)}') parser.add_argument('--marian', '-m', help='Output path for Marian weight file', required=True) parser.add_argument('-s', '--add_sigmoid', help='Add final sigmoid if not already present', action='store_true') +parser.add_argument('--spm', '-spm', type=Path, help='Save tokenizer SPM file here', required=False) args = parser.parse_args() def load_from_huggingface(model_id): - log.info(f"Loading COMET model from huggingface {model_id}") - from transformers import AutoModel + log.info(f"Loading transformer model from huggingface {model_id}") + from transformers import AutoModel, AutoTokenizer try: - model = AutoModel.from_pretrained(model_id, add_pooling_layer=False) + model = AutoModel.from_pretrained(model_id, add_pooling_layer=False) + AutoTokenizer.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + return model.eval(), getattr(tokenizer, 'vocab_file', None) except: log.error(f"Could not resolve {model_id} from huggingface") raise - return model.eval() -if args.roberta: - # Load the model that Unbabel based COMET on: https://huggingface.co/microsoft/infoxlm-large - cometModel = load_from_huggingface("microsoft/infoxlm-large") -else: +def load_comet_model(model_path): from comet import load_from_checkpoint, download_model - model_path = args.comet + from transformers import AutoTokenizer + if not Path(model_path).exists(): if model_path not in supported_comets: log.info(f"Could not find {model_path}") # maybe it's an invalid path log.info(f"trying to resolve download {model_path}") model_path = download_model(model_path) log.info(f"Loading COMET model from checkpoint {model_path}") - cometModel = load_from_checkpoint(model_path) - cometModel.eval() + comet_model = load_from_checkpoint(model_path) + comet_model.eval() + + vocab_file = None + try: + pretrained_model = comet_model.hparams.get('pretrained_model') + log.info(f"comet: {model_path}; pretrained: {pretrained_model}") + if pretrained_model: + tokenizer = AutoTokenizer.from_pretrained(pretrained_model) + vocab_file = getattr(tokenizer, 'vocab_file', None) + except Exception as e: + log.warning(f'Error while locating vocab file: {e}') + pass + return comet_model, vocab_file + +if args.roberta: + # Load the model that Unbabel based COMET on: https://huggingface.co/microsoft/infoxlm-large + cometModel, vocab_file = load_from_huggingface("microsoft/infoxlm-large") +else: + cometModel, vocab_file = load_comet_model(args.comet) + +if args.spm: + vocab_file = vocab_file and Path(vocab_file) + if vocab_file and vocab_file.exists(): + if not args.spm.parent.exists(): + raise Exception(f"Directory {args.spm.parent} does not exist") + log.info(f"Copying {vocab_file} to {args.spm}") + args.spm.write_bytes(vocab_file.read_bytes()) + else: + raise Exception(f"Could not locate or save the vocab file: {vocab_file}; please remove --spm argument and try downloading the file manually") -print(cometModel) marianModel = dict() From 02678ef37a8f4f35fc30c4b21cdd0e31fdd5442e Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 19 Jun 2023 15:51:25 +0000 Subject: [PATCH 09/26] Merged PR 29868: Add 
option to replace current parameters with smoothed version during training Adds option to replace current parameters with smoothed version during training. Could potentially help with convergence and training stability. --- src/common/config_parser.cpp | 3 +++ src/optimizers/optimizers.cpp | 22 +++++++++++++++++++++- src/optimizers/optimizers.h | 4 ++++ src/training/graph_group.cpp | 18 ++++++++++++++++++ src/training/graph_group.h | 6 ++++++ src/training/graph_group_sync.cpp | 6 +++++- src/training/scheduler.h | 7 +++++++ 7 files changed, 64 insertions(+), 2 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 16d090897..e24709f6a 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -517,6 +517,9 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Maintain smoothed version of parameters for validation and saving with smoothing factor. 0 to disable. " "Auto-adjusted to --mini-batch-words-ref if given.", 0.f)->implicit_val("1e-4"); + cli.add("--exponential-smoothing-replace-freq", + "When exponential-smoothing is enabled replace master parameters with smoothed parameters once every n steps (possible units u=updates, t=target labels, e=epochs)", + "0"); cli.add("--guided-alignment", "Path to a file with word alignments. Use guided alignment to guide attention or 'none'. " "If --tsv it specifies the index of a TSV field that contains the alignments (0-based)", diff --git a/src/optimizers/optimizers.cpp b/src/optimizers/optimizers.cpp index f54276e18..d53e46eef 100644 --- a/src/optimizers/optimizers.cpp +++ b/src/optimizers/optimizers.cpp @@ -109,7 +109,7 @@ void OptimizerBase::swapWithSmoothed(Tensor params) { if(castOptimizerType_) { // If true then optimizer type is different from the graph type, // hence a parameter master copy exists and we swap with the master copy. - // We then from optimizer parameter type to graph parameter type + // We then copy and cast from optimizer parameter type to graph parameter type pm_->swap(avg_); CopyCast(params, pm_); } else { @@ -121,6 +121,26 @@ void OptimizerBase::swapWithSmoothed(Tensor params) { } } +void OptimizerBase::replaceWithSmoothed(Tensor params) { + if(!mvAvg_) // no smoothing, don't do anything + return; + + // This function will overwrite the original parameters which are then lost. + if(castOptimizerType_) { + // If true then optimizer type is different from the graph type, + // hence a parameter master copy exists and we copy to the master copy. + // We then copy and cast from optimizer parameter type to graph parameter type + pm_->copyFrom(avg_); + CopyCast(params, pm_); + } else { + // Types are equal hence there is no parameter master copy. This means + // we need to do a proper copy from the graph params to the smoothed + // version. + params->copyFrom(avg_); + } +} + + void OptimizerBase::load(std::vector& items, const std::vector>& opts, const std::vector>& backends, diff --git a/src/optimizers/optimizers.h b/src/optimizers/optimizers.h index e7e8c8ed1..2c7128c51 100644 --- a/src/optimizers/optimizers.h +++ b/src/optimizers/optimizers.h @@ -114,6 +114,10 @@ class OptimizerBase : public TrainingObserver, public ExponentialSmoothing { // This function swaps out the current optimizer parameters with the smoothed version (provided smoothing is enabled). // Usually we will call this twice, to swap in and to swap out. 
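// Typical use: swap in before validation or checkpoint saving so the smoothed weights are evaluated, then swap out again to resume training on the raw parameters.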
void swapWithSmoothed(Tensor params); + + // This function replaces the current optimizer parameters with the smoothed version (provided smoothing is enabled). + // This is different from swapping (swapping twice restores original state) as the original parameters get overwritten. + void replaceWithSmoothed(Tensor params); // return stateful optimizer shards, for base that's only averaged parameters virtual std::vector getShards() { diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index c160332e4..367e47e16 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -526,6 +526,24 @@ void GraphGroup::swapWithSmoothed() { barrier(); } +void GraphGroup::replaceWithSmoothed() { + if(isMainProcess()) + LOG(info, "Replacing master parameters with smoothed parameters"); + + auto replace = [&](size_t i, size_t begin, size_t end) { + auto curParam = graphs_[i]->params()->vals()->subtensor(begin, end-begin); + optimizerShards_[i]->replaceWithSmoothed(curParam); + return true; // dummy success + }; + comm_->foreach(replace); + comm_->allGatherParams(); + + if(shardingMode_ == ShardingMode::local) + comm_->broadcastParams(); + + barrier(); +} + void GraphGroup::validate() { //@TODO: rename this function to something less confusing. ABORT_IF(finalized_, "Training has already finished."); } diff --git a/src/training/graph_group.h b/src/training/graph_group.h index 0895caa77..d7525a102 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -114,8 +114,14 @@ class GraphGroup { const OptimizerBase::GatherStateFunc& gatherFn); public: + // This function swaps out the current optimizer parameters with the smoothed version (provided smoothing is enabled). + // Usually we will call this twice, to swap in and to swap out. void swapWithSmoothed(); + // This function replaces the current optimizer parameters with the smoothed version (provided smoothing is enabled). + // This is different from swapping (swapping twice restores original state) as the original parameters get overwritten. 
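+ // Used by --exponential-smoothing-replace-freq to periodically re-center training on the smoothed parameters.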
+ void replaceWithSmoothed(); + bool isMainProcess() const { return mpi_->isMainProcess(); } // (we need this test a few times) void barrier() const { mpi_->barrier(); } // (we need this several times) diff --git a/src/training/graph_group_sync.cpp b/src/training/graph_group_sync.cpp index a3eee8a7b..b97845814 100644 --- a/src/training/graph_group_sync.cpp +++ b/src/training/graph_group_sync.cpp @@ -348,7 +348,7 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num if(scheduler_->syncing()) { syncParametersAndShards(); } - + // save intermediate model (and optimizer state) to file if(scheduler_->saving()) { save(); @@ -361,6 +361,10 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num scheduler_->validate(graphs_); swapWithSmoothed(); } + + if(scheduler_->replacingWithSmoothed()) { + replaceWithSmoothed(); + } } if(saneGradient) diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 30f8c8de7..b6ac1df79 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -286,6 +286,13 @@ class Scheduler : public TrainingObserver { return state_->enteredNewPeriodOf(options_->get("sync-freq", "0")); } + bool replacingWithSmoothed() { + if(options_->get("exponential-smoothing", 0.f) != 0.f) + return state_->enteredNewPeriodOf(options_->get("exponential-smoothing-replace-freq", "0")); + else + return false; + } + void validate(const std::vector>& graphs, bool isFinal = false) { // Do not validate if already validated (for instance, after the model is loaded) From 7425c0261c56c1dab4a026b6c08a134a063fcaaf Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 27 Jun 2023 19:56:58 +0000 Subject: [PATCH 10/26] Merged PR 30009: Divergence detection and fallback to fp32 if training with fp16 fails This PR adds a do-while loop to training. It should only repeat if a fp16 training run was interrupted via the throwing of a DivergenceException from training/scheduler.h and if --throw-on-divergence and --fp16-fallback-to-fp32 are enabled. The repeated training run will continue from last checkpoint (similar to a manually interrupted training) but attempt training in fp32. If that training run or any other fp32 training happens to diverge, training will exit with an unhandled DivergenceException. This is on purpose to indicate a fatal error. --- CHANGELOG.md | 2 + VERSION | 2 +- src/common/config_parser.cpp | 6 ++ src/training/scheduler.h | 162 ++++++++++++++++++++++------- src/training/training.h | 190 +++++++++++++++++++++------------- src/training/training_state.h | 14 +++ 6 files changed, 264 insertions(+), 112 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6aff5037f..a2a9a9bdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Add --throw-on-divergence and --fp16-fallback-to-fp32 options to detect (fp16 and fp32) and recover (only fp16) + diverged runs. If not recoverable, exception gets rethrown and goes unhandled to force fatal error and shutdown. - Re-implementation of COMET-QE for inference and training; conversion scripts from Unbabel-Comet to Marian. - Validator that generates embeddings and can be used during COMET training with an external script. - New experimental layer framework for Transformer-like models. 
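The heart of the detector added to `scheduler.h` below fits in a few lines. This is a self-contained sketch under simplifying assumptions (fixed windows rather than Marian's warm-up-adjusted ones; a plain `runtime_error` instead of `DivergenceException`), with defaults matching `--throw-on-divergence 100 10 3.0`:

```cpp
#include <cmath>
#include <cstddef>
#include <stdexcept>

// Self-contained sketch of the divergence test added to scheduler.h below;
// defaults mirror --throw-on-divergence 100 10 3.0. Marian additionally
// shrinks the windows during the first updates; that is omitted here.
class DivergenceDetector {
  float avgSlow_ = 0.f, avgFast_ = 0.f, varSlow_ = 0.f;
  size_t updates_ = 0;
  size_t windowSlow_, windowFast_;
  float tolerance_;

public:
  DivergenceDetector(size_t slow = 100, size_t fast = 10, float tol = 3.f)
      : windowSlow_(slow), windowFast_(fast), tolerance_(tol) {}

  void update(float normalizedLoss) {
    ++updates_;
    if(updates_ == 1) // reasonable starting values on a fresh run
      avgSlow_ = avgFast_ = normalizedLoss;

    // Only test once the slow window has seen enough updates.
    if(updates_ > windowSlow_) {
      float delta = avgFast_ - avgSlow_;  // positive delta means the loss is rising
      float sigma = std::sqrt(varSlow_);  // running standard deviation of the slow average
      if(delta > 0.f && sigma > 0.f && delta / sigma > tolerance_)
        throw std::runtime_error("training divergence detected");
    }

    // About 90% of each average's mass comes from the last window-many updates.
    float alphaSlow = 2.f / (float)(windowSlow_ + 1);
    float alphaFast = 2.f / (float)(windowFast_ + 1);

    float deltaSlow = normalizedLoss - avgSlow_;
    avgSlow_ += alphaSlow * deltaSlow;
    varSlow_  = (1.f - alphaSlow) * (varSlow_ + alphaSlow * deltaSlow * deltaSlow);
    avgFast_ += alphaFast * (normalizedLoss - avgFast_);
  }
};
```

The fast average reacts within roughly ten updates while the slow average and its variance summarize the last hundred, so a genuine loss explosion crosses the tolerance quickly but a single noisy batch gets smoothed away.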
diff --git a/VERSION b/VERSION index 00f862625..21decde5d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.3 +v1.12.4 diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index e24709f6a..d70048fe9 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -559,6 +559,12 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Dynamic cost scaling for mixed precision training: " "scaling factor, frequency, multiplier, minimum factor") ->implicit_val("8.f 10000 1.f 8.f"); + cli.add>("--throw-on-divergence", + "Throw exception if training diverges. Divergence is detected if the running average loss over arg1 steps " + "is exceeded by the running average loss over arg2 steps (arg1 >> arg2) by arg3 standard deviations") + ->implicit_val("100 10 3.0f"); + cli.add("--fp16-fallback-to-fp32", + "If fp16 training diverges and throws try to continue training with fp32 precision"); cli.add("--gradient-norm-average-window", "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). " "After this many updates about 90% of the mass of the exponential average comes from these updates", diff --git a/src/training/scheduler.h b/src/training/scheduler.h index b6ac1df79..9c84d1593 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -9,6 +9,18 @@ namespace marian { +/** + * This exception gets thrown when a training run divergence was detected. See below in main update function. +*/ +class DivergenceException : public std::runtime_error { +public: + DivergenceException(float averageSlow, float averageFast, float sigmas) + : std::runtime_error(fmt::format( + "Detected training divergence: slow-moving average loss {:.4f} exceeded by fast-moving average loss {:.4f} by {:.4f} = {:.4f} * sigmas", + averageSlow, averageFast, averageFast - averageSlow, sigmas)) + {} +}; + class Scheduler : public TrainingObserver { private: Ptr options_; @@ -17,6 +29,12 @@ class Scheduler : public TrainingObserver { Ptr mpi_; bool first_{true}; // true if this is the first update after renewing the training + + bool throwOnDivergence_{false}; // throw an exception if training divergence is detected + size_t lossAvgWindowSlow_{100}; // window size for slow-moving average loss for divergence detection + size_t lossAvgWindowFast_{10}; // window size for fast-moving average loss for divergence detection + float divergenceTolerance_{3.f}; // tolerance for divergence detection as multiples of standard deviation + size_t gradientNormAvgWindow_{100}; // window size for recording the exponential average of gradient norms, after this many updates about 90% of the mass comes from this many last updates SchedulingParameter logicalEpoch_; size_t logicalEpochWidth_{0}; @@ -134,6 +152,21 @@ class Scheduler : public TrainingObserver { : options_(options), state_(state), mpi_(mpi), gradientNormAvgWindow_(options_->get("gradient-norm-average-window", 100)) { + auto throwParameters = options_->get>("throw-on-divergence"); + if(!throwParameters.empty()) { + throwOnDivergence_ = true; + if(throwParameters.size() > 0) + lossAvgWindowSlow_ = std::stoul(throwParameters[0]); + if(throwParameters.size() > 1) + lossAvgWindowFast_ = std::stoul(throwParameters[1]); + if(throwParameters.size() > 2) + divergenceTolerance_ = std::stof(throwParameters[2]); + LOG(info, + "[scheduler] Divergence detection is enabled for slow-moving averaging window over {} steps " + "vs fast-moving window over {} steps with tolerance of {} sigmas", + 
lossAvgWindowSlow_, lossAvgWindowFast_, divergenceTolerance_);
+    }
+
     // parse logical-epoch parameters
     auto logicalEpochStr = options->get>("logical-epoch", {"1e", "0"});
     ABORT_IF(logicalEpochStr.empty(), "Logical epoch information is missing?");
@@ -405,27 +438,84 @@ class Scheduler : public TrainingObserver {
     // -freq parameters do not support epoch units
     state_->validated = false;
 
-    // Since batchLabels is counted across all MPI processes, we also should temporarily
-    // extrapolate cost across MPI processes, to have numbers in the right range.
-    // When doing the actual log, we then aggregate across MPI processes to get the accurate number.
+    // collect costs from all nodes if training with MPI
     if(mpi_) {
-      rationalLoss.loss *= mpi_->numMPIProcesses();
-      rationalLoss.count *= mpi_->numMPIProcesses();
+      mpi_->allReduce(&rationalLoss.loss, &rationalLoss.loss, 1, MPI_FLOAT, MPI_SUM);
+      mpi_->allReduce(&rationalLoss.count, &rationalLoss.count, 1, MPI_FLOAT, MPI_SUM);
     }
+    float currentNormalizedLoss = rationalLoss.loss / rationalLoss.count;
 
-    // @BUGBUG: rationalLoss.count is float, not a count. Possible solution: make (costSum, costCount) a StaticLoss object as well
-    state_->costSum += rationalLoss.loss; // aggregate sum cost since last display
-    state_->costCount += rationalLoss.count; // cost gets normalized w.r.t. this in display
+    state_->costSum += rationalLoss.loss;
+    state_->costCount += rationalLoss.count;
 
     state_->updatesDisp += 1;
     state_->samplesDisp += batchSize;
     state_->wordsDisp += batchLabels; // words at given input processed since last display, for speed display
 
     state_->samplesEpoch += batchSize; // sentences processed in this epoch
-    state_->labelsTotal += batchLabels; // total labels processed
+    state_->labelsTotal += batchLabels; // total labels processed
 
     state_->newUpdate(numReadBatches);
 
+    // true if --throw-on-divergence [lossAvgWindowSlow_] [lossAvgWindowFast_] [divergenceTolerance_] is enabled, false otherwise
+    if(throwOnDivergence_) {
+      size_t windowSlow = std::min(lossAvgWindowSlow_, state_->batches); // we compare the running exponential average over a longer window
+      size_t windowFast = std::min(lossAvgWindowFast_, state_->batches); // with the running exponential average over a shorter window (for smoothing)
+
+      // By default we set windowSlow = 100 and windowFast = 10, so if values diverge the average from the shorter window should pick this up quickly
+      // vs the longer window while still smoothing over multiple updates avoiding detecting random single spikes as divergence.
+      float alphaSlow = 2.f / (float)(windowSlow + 1); // about 90% of the mass will come from the windowSlow last steps
+      float alphaFast = 2.f / (float)(windowFast + 1); // about 90% of the mass will come from the windowFast last steps
+
+      // set some reasonable defaults during training start.
Cost shouldn't be zero unless fresh start without *.progress.yml + if(state_->lossAvgSlow == 0) { + state_->lossAvgSlow = currentNormalizedLoss; + state_->lossAvgFast = currentNormalizedLoss; + state_->lossVarSlow = 0; + } + + // allow statistics to see at least lossAvgWindowSlow_ updates before using for divergence detection + if(state_->batches > lossAvgWindowSlow_) { + // we compare the faster moving average against the slower moving exponential loss average + float delta = state_->lossAvgFast - state_->lossAvgSlow; + // running standard deviation + float sigma = std::sqrt(state_->lossVarSlow); + + // negative delta is always safe (indicates convergence) and sigma should always be larger than zero (safe for division) after a few first steps + if(delta > 0 && sigma > 0) { + // how many standard deviations (sigmas) above slow-moving average? + float sigmasDiverged = delta / sigma; + if(sigmasDiverged > divergenceTolerance_) { // uh-oh - by default assume training diverged if slow-moving average is exceeded by e.g. 3 sigmas + LOG(warn, + "Detected training divergence: slow-moving average loss {:.4f} exceeded by fast-moving average loss {:.4f} by {:.4f} = {:.4f} * sigmas", + state_->lossAvgSlow, state_->lossAvgFast, delta, sigmasDiverged); + + // this gets propagated to the main training loop in training/training.h and will either fail the whole training process with + // an unhandled exception (thus exiting with error code) or trigger another training run with fallback to fp32 if we were + // training with fp16 and --fp16-fallback-to-fp32 is enabled. + throw DivergenceException(state_->lossAvgSlow, state_->lossAvgFast, sigmasDiverged); + } + } + + if(state_->enteredNewPeriodOf(options_->get("disp-freq")) || state_->batches <= options_->get("disp-first")) { + if(!mpi_ || mpi_->isMainProcess()) { + LOG(debug, + "delta(={:.4f}) = avgFast(={:.4f}) - avgSlow(={:.4f}) = {:.4f} * sigma(={:.4f}) < {:.4f} * sigma", + delta, state_->lossAvgFast, state_->lossAvgSlow, delta / sigma, sigma, divergenceTolerance_); + } + } + } + + // log slow-moving exponential average and variance of training cost stats + float deltaSlow = currentNormalizedLoss - state_->lossAvgSlow; + state_->lossAvgSlow = state_->lossAvgSlow + alphaSlow * deltaSlow; + state_->lossVarSlow = (1.0f - alphaSlow) * (state_->lossVarSlow + alphaSlow * deltaSlow * deltaSlow); + + // log fast-moving exponential average of training cost stats + float deltaFast = currentNormalizedLoss - state_->lossAvgFast; + state_->lossAvgFast = state_->lossAvgFast + alphaFast * deltaFast; + } + if(gradientNorm) { size_t range = std::min(gradientNormAvgWindow_, state_->batches); float alpha = 2.f / (float)(range + 1); @@ -445,38 +535,30 @@ class Scheduler : public TrainingObserver { if(state_->enteredNewPeriodOf(options_->get("disp-freq")) || state_->batches <= options_->get("disp-first")) { // if MPI then aggregate precise cost across workers - if(mpi_) { - state_->costSum /= mpi_->numMPIProcesses(); // undo the extra scaling - state_->costCount /= mpi_->numMPIProcesses(); // undo the extra scaling - mpi_->allReduce(&state_->costSum, &state_->costSum, 1, MPI_FLOAT, MPI_SUM); - mpi_->allReduce(&state_->costCount, &state_->costCount, 1, MPI_FLOAT, MPI_SUM); - } - - if(mpi_ && mpi_->myMPIRank() != 0) { - // skip the report on alternate worker processes - } else if(options_->get("lr-report")) { - LOG(info, - "Ep. {} : Up. {} : Sen. {} : {} : Time {:.2f}s : {:.2f} words/s : gNorm {:.4f} : L.r. 
{:.4e}", - formatLogicalEpoch(), - state_->batches, - utils::withCommas(state_->samplesEpoch), - formatLoss(lossType, dispLabelCounts, batchLabels, state_), - timer_.elapsed(), - state_->wordsDisp / timer_.elapsed(), - state_->gradientNormAvg, - state_->eta); - } else { - LOG(info, - "Ep. {} : Up. {} : Sen. {} : {} : Time {:.2f}s : {:.2f} words/s : gNorm {:.4f}", - formatLogicalEpoch(), - state_->batches, - utils::withCommas(state_->samplesEpoch), - formatLoss(lossType, dispLabelCounts, batchLabels, state_), - timer_.elapsed(), - state_->wordsDisp / timer_.elapsed(), - state_->gradientNormAvg); + if(!mpi_ || mpi_->isMainProcess()) { + if(options_->get("lr-report")) { + LOG(info, + "Ep. {} : Up. {} : Sen. {} : {} : Time {:.2f}s : {:.2f} words/s : gNorm {:.4f} : L.r. {:.4e}", + formatLogicalEpoch(), + state_->batches, + utils::withCommas(state_->samplesEpoch), + formatLoss(lossType, dispLabelCounts, batchLabels, state_), + timer_.elapsed(), + state_->wordsDisp / timer_.elapsed(), + state_->gradientNormAvg, + state_->eta); + } else { + LOG(info, + "Ep. {} : Up. {} : Sen. {} : {} : Time {:.2f}s : {:.2f} words/s : gNorm {:.4f}", + formatLogicalEpoch(), + state_->batches, + utils::withCommas(state_->samplesEpoch), + formatLoss(lossType, dispLabelCounts, batchLabels, state_), + timer_.elapsed(), + state_->wordsDisp / timer_.elapsed(), + state_->gradientNormAvg); + } } - timer_.start(); state_->costSum = 0; state_->costCount = 0; diff --git a/src/training/training.h b/src/training/training.h index 7f6176879..cbca3eff2 100644 --- a/src/training/training.h +++ b/src/training/training.h @@ -45,78 +45,126 @@ class Train : public ModelTask { dataset->prepare(); - Ptr stats; - if(options_->get("mini-batch-fit")) { - LOG(info, - "[batching] Collecting statistics for batch fitting with step size {}", - options_->get("mini-batch-fit-step")); - // @TODO this should receive a function object that can generate a fake batch; - // that way vocabs would not be exposed. - auto model = New(options_, mpi); - - // use temporary scheduler to make sure everything gets destroyed properly - // otherwise the scheduler believes that registered objects still exist - auto tempTrainState = New(options_->get("learn-rate")); - auto tempScheduler = New(options_, tempTrainState, mpi); - - model->setScheduler(tempScheduler); // collectStats() needs to know about dynamic MB scaling - stats = model->collectStats(dataset->getVocabs()); - LOG(info, "[batching] Done. Typical MB size is {} target words", utils::withCommas(stats->estimateTypicalTrgWords())); - } - - auto trainState = New(options_->get("learn-rate")); - auto scheduler = New(options_, trainState, mpi); - - if((options_->hasAndNotEmpty("valid-sets") || options_->hasAndNotEmpty("valid-script-path")) - && SchedulingParameter::parse(options_->get("valid-freq"))) { - for(auto validator : Validators(dataset->getVocabs(), options_)) - scheduler->addValidator(validator); - } - - auto batchGenerator = New(dataset, options_, stats); - - scheduler->registerTrainingObserver(batchGenerator); - - auto model = New(options_, mpi); - model->setScheduler(scheduler); - model->setTypicalTrgBatchWords(batchGenerator->estimateTypicalTrgBatchWords()); // needed for dynamic MB scaling - model->load(); - - bool restored = !options_->get("no-restore-corpus") - && batchGenerator->restore(trainState); - - // We only want custom behavior once training starts. 
- installCustomSignalHandlers(); - - // -- main training loop - scheduler->started(); - while(scheduler->keepGoing()) { - if(!restored) - batchGenerator->prepare(); - restored = false; - - // main training loop for one epoch - for(auto batch : *batchGenerator) { - if (!scheduler->keepGoing()) - break; - model->update(batch); + // We run training in a do-while loop. It should only restart if a fp16 training run was interrupted + // via the throwing of a DivergenceException from training/scheduler.h and if --throw-on-divergence and + // --fp16-fallback-to-fp32 are enabled. + // The repeated training run will continue from last checkpoint (similar to a manually interrupted training) + // but attempt training in fp32. If that training run or any other fp32 training happens to diverge, + // training will exit with an unhandled DivergenceException. This is on purpose to indicate a fatal error. + bool restartTraining; + do { + try { + // there will be only one training loop execution unless in special situations, + // for example, when fp16 training diverges and it is restarted with fp32 + restartTraining = false; + + Ptr stats; + if(options_->get("mini-batch-fit")) { + LOG(info, + "[batching] Collecting statistics for batch fitting with step size {}", + options_->get("mini-batch-fit-step")); + // @TODO this should receive a function object that can generate a fake batch; + // that way vocabs would not be exposed. + auto model = New(options_, mpi); + + // use temporary scheduler to make sure everything gets destroyed properly + // otherwise the scheduler believes that registered objects still exist + auto tempTrainState = New(options_->get("learn-rate")); + auto tempScheduler = New(options_, tempTrainState, mpi); + + model->setScheduler(tempScheduler); // collectStats() needs to know about dynamic MB scaling + stats = model->collectStats(dataset->getVocabs()); + LOG(info, "[batching] Done. Typical MB size is {} target words", utils::withCommas(stats->estimateTypicalTrgWords())); + } + + auto trainState = New(options_->get("learn-rate")); + auto scheduler = New(options_, trainState, mpi); + + if((options_->hasAndNotEmpty("valid-sets") || options_->hasAndNotEmpty("valid-script-path")) + && SchedulingParameter::parse(options_->get("valid-freq"))) { + for(auto validator : Validators(dataset->getVocabs(), options_)) + scheduler->addValidator(validator); + } + + auto batchGenerator = New(dataset, options_, stats); + + scheduler->registerTrainingObserver(batchGenerator); + + auto model = New(options_, mpi); + model->setScheduler(scheduler); + model->setTypicalTrgBatchWords(batchGenerator->estimateTypicalTrgBatchWords()); // needed for dynamic MB scaling + model->load(); + + bool restored = !options_->get("no-restore-corpus") + && batchGenerator->restore(trainState); + + // We only want custom behavior once training starts. 
+ installCustomSignalHandlers(); + + // -- main training loop + scheduler->started(); + while(scheduler->keepGoing()) { + if(!restored) + batchGenerator->prepare(); + restored = false; + + // main training loop for one epoch + for(auto batch : *batchGenerator) { + if (!scheduler->keepGoing()) + break; + model->update(batch); + } + + if(scheduler->keepGoing()) + scheduler->increaseEpoch(); + } + scheduler->finished(); + + model->finalize(); // allow async to sync before final save --@TODO: rename, or move into save() + + // Avoid saving the model twice if it has been loaded and training did not progress + if(!trainState->loaded) + model->save(true); + + // Signal success to a potential MPI runner + model = nullptr; // release any reference to MPI that model may hold + scheduler = nullptr; // as above + finalizeMPI(std::move(mpi)); + + } catch(DivergenceException& e) { // handling divergent training if scheduler is configured + // to throw via --throw-on-divergence + if(options_->get("fp16-fallback-to-fp32", false)) { + auto precisions = options_->get>("precision"); + Type parameterType = typeFromString(precisions[0]); + if(parameterType == Type::float16) { + // we diverged, but we were apparently training with fp16 and fallback to fp32 + // is enabled. There is a chance we can rescue the training run by restarting + // from the last checkpoint but using fp32 precision training. + LOG(warn, "Training diverged, but --fp16-fallback-to-fp32 is enabled. " + "Attempting restart from the last checkpoint with fp32 precision."); + + // undo all options that would be set for fp16 training + options_ = options_->with( + "fp16", false, + "precision", std::vector({"float32", "float32"}), + "cost-scaling", std::vector({}) + ); + + // this gets checked at final do-while condition + restartTraining = true; + } else { + // We diverged and fallback is enabled, but we are already training with fp32, + // hence rethrow and let training die with error. + LOG(warn, "Training diverged, rethrowing divergence exception"); + throw e; + } + } else { + // We diverged and no fallback enabled, hence rethrow and let training die with error. 
+        LOG(warn, "Training diverged, rethrowing divergence exception");
+        throw e;
+      }
+    }
-
-    if(scheduler->keepGoing())
-      scheduler->increaseEpoch();
-    }
-    scheduler->finished();
-
-    model->finalize(); // allow async to sync before final save --@TODO: rename, or move into save()
-
-    // Avoid saving the model twice if it has been loaded and training did not progress
-    if(!trainState->loaded)
-      model->save(true);
-
-    // Signal success to a potential MPI runner
-    model = nullptr; // release any reference to MPI that model may hold
-    scheduler = nullptr; // as above
-    finalizeMPI(std::move(mpi));
+    } while(restartTraining);
   }
 };
diff --git a/src/training/training_state.h b/src/training/training_state.h
index 2fb9209fa..800dd60c7 100644
--- a/src/training/training_state.h
+++ b/src/training/training_state.h
@@ -73,6 +73,12 @@ class TrainingState {
   // Number of updates seen since last display
   size_t updatesDisp{0};
 
+  // Running average of training cost per label
+  float lossAvgSlow{0};
+  float lossAvgFast{0};
+  // Running variance of training cost per label
+  float lossVarSlow{0};
+
   // Running average of gradient norm
   float gradientNormAvg{0};
   // Running variance of gradient norm
@@ -230,6 +236,10 @@ class TrainingState {
     samplesDisp = config["disp-samples"].as();
     updatesDisp = config["disp-updates"].as();
 
+    lossAvgSlow = config["loss-avg-slow"].as();
+    lossAvgFast = config["loss-avg-fast"].as();
+    lossVarSlow = config["loss-var-slow"].as();
+
     gradientNormAvg = config["gradient-norm-avg"].as();
     gradientNormVar = config["gradient-norm-var"].as();
 
@@ -277,6 +287,10 @@ class TrainingState {
     config["disp-samples"] = samplesDisp;
     config["disp-words"] = wordsDisp;
 
+    config["loss-avg-slow"] = lossAvgSlow;
+    config["loss-avg-fast"] = lossAvgFast;
+    config["loss-var-slow"] = lossVarSlow;
+
     config["gradient-norm-avg"] = gradientNormAvg;
     config["gradient-norm-var"] = gradientNormVar;
 
From ea8a2db445310ead64df7c3ffbf401819307c0f6 Mon Sep 17 00:00:00 2001
From: Roman Grundkiewicz
Date: Wed, 28 Jun 2023 15:55:03 +0000
Subject: [PATCH 11/26] Merged PR 30038: Add a comment that automatic builds are disabled

---
 azure-pipelines.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 3b1bfff3f..0f19a0f8d 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -13,8 +13,11 @@ parameters:
   type: boolean
   default: true
 
-# The pipeline CI trigger is set on the branch master only and PR trigger on a
-# (non-draft) pull request to any branch
+# Warning: the current branch policies disable the automatic triggering to
+# minimize VM usage!
+# The configuration below specifies that the pipeline CI trigger is set on the
+# branch master only and a PR trigger is on a (non-draft) pull request to any
+# branch.
 trigger:
 # This minimizes the number of parallel pipeline runs. When a pipeline is
 # running, the CI waits until it is completed before starting another one.
From 0fa11f5cb4461857ea34f08a85168cdd683bb86f Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Wed, 28 Jun 2023 16:07:02 +0000
Subject: [PATCH 12/26] Merged PR 30034: Automatically create marian-YYYY-MM-DD-GIT_REV.tgz

Small simplification to create the correctly named tarball via `make marian_tgz`, resulting in e.g. `marian-2023-06-28-8390b1d.tgz`.

This will be executed every time `make marian_tgz` is invoked, but it depends on the correct targets and will pick up changed commit revisions etc. Uses PST time zone.
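For illustration, the naming scheme the new CMake script implements, as a Python sketch (not part
of the build; assumes git is on the path and Python 3.9+ for zoneinfo):

    import subprocess
    from datetime import datetime
    from zoneinfo import ZoneInfo

    # mirrors `TZ=America/Los_Angeles date +%Y-%m-%d` and `git rev-parse --short=7 HEAD` in cmake/Tarball.cmake
    date = datetime.now(ZoneInfo("America/Los_Angeles")).strftime("%Y-%m-%d")
    rev = subprocess.check_output(["git", "rev-parse", "--short=7", "HEAD"], text=True).strip()
    print(f"marian-{date}-{rev}.tgz")  # e.g. marian-2023-06-28-8390b1d.tgz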
---
 cmake/Tarball.cmake | 30 ++++++++++++++++++++++++++++++
 src/CMakeLists.txt  | 29 +++--------------------------
 2 files changed, 33 insertions(+), 26 deletions(-)
 create mode 100644 cmake/Tarball.cmake

diff --git a/cmake/Tarball.cmake b/cmake/Tarball.cmake
new file mode 100644
index 000000000..8611f5553
--- /dev/null
+++ b/cmake/Tarball.cmake
@@ -0,0 +1,30 @@
+# marian-YYYY-MM-DD-revision.tgz
+# This combines marian, marian_decoder in a single TAR file for
+# execution in MSFT internal tools FLO and Singularity.
+
+execute_process(
+  COMMAND bash -c "TZ=America/Los_Angeles date +%Y-%m-%d"
+  OUTPUT_VARIABLE TGZ_DATE
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+execute_process(
+  COMMAND git rev-parse --short=7 HEAD
+  OUTPUT_VARIABLE TGZ_REV
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+message("Generating ${CWD}/marian-${TGZ_DATE}-${TGZ_REV}.tgz")
+
+# check if pigz is available for faster compression
+execute_process(
+  COMMAND bash -c "which pigz || which gzip"
+  OUTPUT_VARIABLE COMPRESS
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+execute_process(
+  COMMAND tar -I ${COMPRESS} -cvvf "${CWD}/marian-${TGZ_DATE}-${TGZ_REV}.tgz" -C "${CWD}"
+    marian
+    marian-decoder
+    marian-scorer
+    marian-vocab
+    marian-conv
+  WORKING_DIRECTORY "${CWD}")
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f9d5a5e5b..d1f119335 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -235,33 +235,10 @@ if (NOT COMPILE_LIBRARY_ONLY)
     set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab marian_conv)
 
-    # marian.zip and marian.tgz
-    # This combines marian, marian_decoder in a single ZIP or TAR file for
-    # execution in MSFT internal tools FLO and Philly.
-    # For Philly submission, we need statically-linked versions to deal with
-    # library dependencies, so this target is only enabled for static builds.
-    add_custom_command(
-      OUTPUT "${CMAKE_BINARY_DIR}/marian.zip"
-      COMMAND zip -v -0 -j "${CMAKE_BINARY_DIR}/marian.zip"
-      "${CMAKE_BINARY_DIR}/marian"
-      "${CMAKE_BINARY_DIR}/marian-decoder"
-      "${CMAKE_BINARY_DIR}/marian-scorer"
-      "${CMAKE_BINARY_DIR}/marian-vocab"
-      "${CMAKE_BINARY_DIR}/marian-conv"
+    # generate the tgz file via a custom script. This will always re-create the tarball
+    add_custom_target(marian_tgz
+      COMMAND ${CMAKE_COMMAND} -DCWD=${CMAKE_BINARY_DIR} -P ${CMAKE_SOURCE_DIR}/cmake/Tarball.cmake
       DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv)
-    add_custom_target(marian_zip DEPENDS "${CMAKE_BINARY_DIR}/marian.zip")
-
-    add_custom_command(
-      OUTPUT "${CMAKE_BINARY_DIR}/marian.tgz"
-      COMMAND tar -cvvzf "${CMAKE_BINARY_DIR}/marian.tgz" -C "${CMAKE_BINARY_DIR}"
-      "marian"
-      "marian-decoder"
-      "marian-scorer"
-      "marian-vocab"
-      "marian-conv"
-      DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv)
-    add_custom_target(marian_tgz DEPENDS "${CMAKE_BINARY_DIR}/marian.tgz")
-    add_custom_target(philly DEPENDS marian_tgz marian_zip)
 
     if(COMPILE_SERVER)
       add_executable(marian_server command/marian_server.cpp)
From 0df870c12b87b5c43634de1498192261dfb1f6f8 Mon Sep 17 00:00:00 2001
From: Hieu Hoang
Date: Thu, 29 Jun 2023 23:26:10 +0000
Subject: [PATCH 13/26] Merged PR 28958: LSH for GPU

LSH vocab filtering for GPU. Speed is not competitive with non-LSH. Checking in for completeness and possible future use of LSH on GPU for non-filtering stuff, e.g. decoding

$22k sentences, mini-batch 256, maxi-batch 10 using production SSRU model:
Without LSH: 53.86 sec.
With LSH: 108.27 --- CHANGELOG.md | 1 + VERSION | 2 +- src/data/shortlist.cpp | 4 - src/layers/lsh.cpp | 123 ++++++++++------ src/tensors/gpu/tensor_operators.cu | 220 ++++++++++++++++++++++++++++ src/tensors/tensor_operators.h | 5 + 6 files changed, 304 insertions(+), 51 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2a9a9bdd..8778abeed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- LSH vocab filtering for GPU. Speed is not competitive with non-LSH. Checking in for completeness and possible future use of LSH on GPU for non-filtering stuff - Add --throw-on-divergence and --fp16-fallback-to-fp32 options to detect (fp16 and fp32) and recover (only fp16) diverged runs. If not recoverable, exception gets rethrown and goes unhandled to force fatal error and shutdown. - Re-implementation of COMET-QE for inference and training; conversion scripts from Unbabel-Comet to Marian. diff --git a/VERSION b/VERSION index 21decde5d..97cc69d7f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.4 +v1.12.5 diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index da5a6572f..909734ea6 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -95,10 +95,6 @@ Expr LSHShortlist::getIndicesExpr() const { } void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { - - ABORT_IF(input->graph()->getDeviceId().type == DeviceType::gpu, - "LSH index (--output-approx-knn) currently not implemented for GPU"); - indicesExpr_ = callback(lsh::search(input, weights, k_, nbits_, (int)lemmaSize_, abortIfDynamic_), [this](Expr node) { node->val()->get(indices_); // set the value of the field indices_ whenever the graph traverses this node diff --git a/src/layers/lsh.cpp b/src/layers/lsh.cpp index eedf227ee..7dfe83d22 100644 --- a/src/layers/lsh.cpp +++ b/src/layers/lsh.cpp @@ -51,7 +51,14 @@ void fillRandomRotationMatrix(Tensor output, Ptr allocator) { void encode(Tensor output, Tensor input) { int nBits = input->shape()[-1]; // number of bits is equal last dimension of float matrix int nRows = input->shape().elements() / nBits; - faiss::fvecs2bitvecs(input->data(), output->data(), (size_t)nBits, (size_t)nRows); + if (input->getDeviceId().type == DeviceType::cpu) { + faiss::fvecs2bitvecs(input->data(), output->data(), (size_t)nBits, (size_t)nRows); + } + else { +#ifdef CUDA_FOUND + marian::gpu::Float2Bit(output, input); +#endif +} } void encodeWithRotation(Tensor output, Tensor input, Tensor rotation, Ptr allocator) { @@ -123,56 +130,80 @@ Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int dimK, int firstNR Expr encodedQuery = inputs[0]; Expr encodedWeights = inputs[1]; - int bytesPerVector = encodedWeights->shape()[-1]; - int wRows = encodedWeights->shape().elements() / bytesPerVector; - - // we use this with Factored Segmenter to skip the factor embeddings at the end - if(firstNRows != 0) - wRows = firstNRows; + if (encodedQuery->val()->getDeviceId().type == DeviceType::cpu) { + int bytesPerVector = encodedWeights->shape()[-1]; + int wRows = encodedWeights->shape().elements() / bytesPerVector; + + // we use this with Factored Segmenter to skip the factor embeddings at the end + if(firstNRows != 0) + wRows = firstNRows; - ABORT_IF(dimK > wRows, "k is larger than number of candidate values?"); // @TODO: use min(k, wRows) silently? 
+ ABORT_IF(dimK > wRows, "k is larger than number of candidate values?"); // @TODO: use min(k, wRows) silently? #if _MSC_VER // unfortunately MSVC is horrible at loop unrolling, so we fall back to the old code (hrmph!) @TODO: figure this out one day - int qRows = encodedQuery->shape().elements() / bytesPerVector; - - uint8_t* qCodes = encodedQuery->val()->data(); - uint8_t* wCodes = encodedWeights->val()->data(); - - // use actual faiss code for performing the hamming search. - std::vector distances(qRows * dimK); - std::vector ids(qRows * dimK); - faiss::int_maxheap_array_t res = {(size_t)qRows, (size_t)dimK, ids.data(), distances.data()}; - faiss::hammings_knn_hc(&res, qCodes, wCodes, (size_t)wRows, (size_t)bytesPerVector, 0); - - // Copy int64_t indices to Marian index type and sort by increasing index value per hypothesis. - // The sorting is required as we later do a binary search on those values for reverse look-up. - uint32_t* outData = out->val()->data(); - - int numHypos = out->shape().elements() / dimK; - for (size_t hypoIdx = 0; hypoIdx < numHypos; ++hypoIdx) { - size_t startIdx = dimK * hypoIdx; - size_t endIdx = startIdx + dimK; - for(size_t i = startIdx; i < endIdx; ++i) - outData[i] = (uint32_t)ids[i]; - if(!noSort) - std::sort(outData + startIdx, outData + endIdx); - } + int qRows = encodedQuery->shape().elements() / bytesPerVector; + + uint8_t* qCodes = encodedQuery->val()->data(); + uint8_t* wCodes = encodedWeights->val()->data(); + + // use actual faiss code for performing the hamming search. + std::vector distances(qRows * dimK); + std::vector ids(qRows * dimK); + faiss::int_maxheap_array_t res = {(size_t)qRows, (size_t)dimK, ids.data(), distances.data()}; + faiss::hammings_knn_hc(&res, qCodes, wCodes, (size_t)wRows, (size_t)bytesPerVector, 0); + + // Copy int64_t indices to Marian index type and sort by increasing index value per hypothesis. + // The sorting is required as we later do a binary search on those values for reverse look-up. + uint32_t* outData = out->val()->data(); + + int numHypos = out->shape().elements() / dimK; + for (size_t hypoIdx = 0; hypoIdx < numHypos; ++hypoIdx) { + size_t startIdx = dimK * hypoIdx; + size_t endIdx = startIdx + dimK; + for(size_t i = startIdx; i < endIdx; ++i) + outData[i] = (uint32_t)ids[i]; + if(!noSort) + std::sort(outData + startIdx, outData + endIdx); + } #else // this is using the new code for search, other parts of the code, like conversion are fine. 
- IndexType* outData = out->val()->data(); - auto gather = [outData, dimK](IndexType rowId, IndexType k, IndexType kthColId, DistType /*dist*/) { - outData[rowId * dimK + k] = kthColId; - }; - - Parameters params; - params.k = dimK; - params.queryRows = encodedQuery->val()->data(); - params.numQueryRows = encodedQuery->shape().elements() / bytesPerVector; - params.codeRows = encodedWeights->val()->data(); - params.numCodeRows = wRows; - params.bytesPerVector = bytesPerVector; - - hammingTopK(params, gather); + IndexType* outData = out->val()->data(); + auto gather = [outData, dimK](IndexType rowId, IndexType k, IndexType kthColId, DistType /*dist*/) { + outData[rowId * dimK + k] = kthColId; + }; + + Parameters params; + params.k = dimK; + params.queryRows = encodedQuery->val()->data(); + params.numQueryRows = encodedQuery->shape().elements() / bytesPerVector; + params.codeRows = encodedWeights->val()->data(); + params.numCodeRows = wRows; + params.bytesPerVector = bytesPerVector; + + hammingTopK(params, gather); +#endif + } + else { +#ifdef CUDA_FOUND + Ptr backend = out->val()->getBackend(); + + const size_t CHUNK = 128; + const size_t MBYTE = 1024 * 1024; + const size_t GROW = CHUNK * MBYTE; + Ptr alloc = marian::New(backend->getDeviceId(), 0, GROW); + + auto memory = alloc->alloc(requiredBytes(out->shape(), marian::Type::uint32)); + + // not required for calculations. Useful for debugging + Tensor outCounts = nullptr; //marian::TensorBase::New(memory, out->shape(), marian::Type::uint32, backend); + + uint16_t numHash = (uint16_t) encodedWeights->shape()[-1] * 8; + + marian::gpu::HammmingAndSort(out->val(), outCounts, + encodedWeights->val(), encodedQuery->val(), + dimK, 0, numHash, + alloc, backend); #endif + } }; Shape kShape({currBeamSize, batchSize, dimK}); diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 51e6f2f2d..508e1e3e7 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -1,3 +1,9 @@ +# if defined(_MSC_VER) +#define NPP_MAX_32U ( 4294967295U ) /**< Maximum 32-bit unsigned integer */ +#else +#include +#endif + #include "common/types.h" #include "tensors/tensor_operators.h" @@ -3391,5 +3397,219 @@ void PoolingWithMaskingBackward(Tensor adj, width, lastWidth); } + +////////////////////////////////////////////////////////////////////////////////////////// +// Calc sign(x) for vectors of float. GPU counterpart to Faiss' CPU fvecs2bitvecs() +__global__ void Float2Bit(const float *in, uint32_t *out, int batch, int dim, int outDim) +{ + int batchIdx = blockIdx.x; + const float *inBatchOffset = in + batchIdx * dim; + uint32_t *outBatchOffset = out + batchIdx * outDim; + + int outDimIdx = threadIdx.x; + while (outDimIdx < outDim) { + const float *inDimOffset = inBatchOffset + outDimIdx * 32; + uint32_t &outDimOffset = outBatchOffset[outDimIdx]; + uint32_t outVal = 0; + uint32_t mask = 1; + + for (int bitIdx = 0; bitIdx < 32; ++bitIdx) { + if (inDimOffset[bitIdx] >= 0) + outVal |= mask; + + mask <<= 1; + } + //printf("outVal=%lu \n", outVal); + outDimOffset = outVal; + outDimIdx += blockDim.x; + } +} + +// Calc sign(x) for vectors of float. 
GPU counterpart to Faiss' CPU fvecs2bitvecs()
+void Float2Bit(marian::Tensor output, const marian::Tensor input)
+{
+  int dim = input->shape()[-1];
+  assert(dim % 32 == 0);
+  int batch = input->shape().elements() / input->shape()[-1];
+  int outDim = output->shape()[-1] / 4;
+
+  unsigned threads = std::min((unsigned)MAX_THREADS, (unsigned)outDim);
+
+  Float2Bit<<>>(input->data(), output->data(), batch, dim, outDim);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Calc hamming distance between input and weight hash. Return sorted indices and counts according to counting sort algo
+// https://www.geeksforgeeks.org/counting-sort/
+__global__ void HammmingAndSort(const uint32_t *weightHash,
+                  const uint32_t *inputHash,
+                  uint16_t *hamming,
+                  uint32_t *outCounts,
+                  uint32_t *outIdx,
+                  uint32_t kBest, uint16_t minVal, uint16_t maxVal, uint16_t range,
+                  int hashDim, int dim, int batch)
+{
+  extern __shared__ uint32_t sharedCounts[];
+
+  int batchIdx = blockIdx.x;
+
+  uint32_t *stopVal = sharedCounts + range;
+  uint16_t *hammingBatchOffset = hamming
+                  ? hamming + batchIdx * dim
+                  : (uint16_t*) (sharedCounts + range);
+
+  uint32_t *outCountsBatchOffset = outCounts ? outCounts + batchIdx * kBest : nullptr;
+  uint32_t *outIdxBatchOffset = outIdx ? outIdx + batchIdx * kBest : nullptr;
+  const uint32_t *inputHashOffset = inputHash + batchIdx * hashDim;
+
+  // init count array
+  int countsIdx = threadIdx.x;
+  while (countsIdx < range) {
+    sharedCounts[countsIdx] = 0;
+    countsIdx += blockDim.x;
+  }
+
+  __syncthreads();
+  int dimIdx = threadIdx.x;
+  while (dimIdx < dim) {
+    // Hamming distance between input and hashes
+    const uint32_t *weightHashOffset = weightHash + dimIdx * hashDim;
+
+    uint16_t dist = 0;
+    for (int hashDimIdx = 0; hashDimIdx < hashDim; ++hashDimIdx) {
+      const uint32_t &inputHashes = inputHashOffset[hashDimIdx];
+      const uint32_t &weightHashes = weightHashOffset[hashDimIdx];
+      uint32_t diff = inputHashes ^ weightHashes;
+      uint16_t distT = __popc(diff);
+      dist += distT;
+    }
+
+    hammingBatchOffset[dimIdx] = dist;
+
+    // counts
+    uint32_t countIdx = dist - minVal;
+    assert(countIdx < range);
+#if __CUDA_ARCH__ >= 600
+    atomicAdd_block(&sharedCounts[countIdx], 1);
+#endif
+    dimIdx += blockDim.x;
+  }
+
+  // Start counting sort algorithm
+  __syncthreads();
+  // Calc accumulated counts
+  if (threadIdx.x == 0) {
+    if (sharedCounts[0] >= kBest) {
+      (*stopVal) = 0;
+    }
+    else {
+      for (int rangeIdx = 1; rangeIdx < range; ++rangeIdx) {
+        uint32_t preval = sharedCounts[rangeIdx - 1];
+        sharedCounts[rangeIdx] += preval;
+        if (sharedCounts[rangeIdx] >= kBest) {
+          (*stopVal) = rangeIdx;
+          break;
+        }
+      }
+    }
+  }
+
+  // init output - reuse count array
+  __syncthreads();
+  int rangeIdx = (*stopVal) + threadIdx.x + 1;
+  while (rangeIdx < range) {
+    sharedCounts[rangeIdx] = NPP_MAX_32U;
+    rangeIdx += blockDim.x;
+  }
+
+  __syncthreads();
+  // Reduce
+  dimIdx = threadIdx.x;
+  while (dimIdx < dim) {
+    uint16_t val = hammingBatchOffset[dimIdx];
+    assert(val >= minVal);
+    assert(val <= maxVal);
+
+    uint32_t countIdx = val - minVal;
+    assert(countIdx < range);
+    uint32_t &outIdx = sharedCounts[countIdx];
+
+    if (outIdx != NPP_MAX_32U) {
+      uint32_t prevOutIdx;
+// Not supported in Maxwells or older
+#if __CUDA_ARCH__ >= 600
+      prevOutIdx = atomicAdd_block(&outIdx, (uint32_t) -1);
+#else
+      prevOutIdx = 0;
+#endif
+      assert(prevOutIdx > 0);
+      assert(prevOutIdx - 1 < dim);
+
+      if (prevOutIdx - 1 < kBest) {
+        if (outCountsBatchOffset) outCountsBatchOffset[prevOutIdx - 1] = val;
+        if (outIdxBatchOffset) outIdxBatchOffset[prevOutIdx - 1] = dimIdx;
+      }
+    }
+
+    dimIdx += blockDim.x;
+  }
+}
+
+// Calc hamming distance between input and weight hash. Return sorted indices and counts according to counting sort algo
+// https://www.geeksforgeeks.org/counting-sort/
+void HammmingAndSort(marian::Tensor outIdx, marian::Tensor outCounts,
+        const marian::Tensor weightHash,
+        const marian::Tensor inputHash,
+        uint32_t kBest, uint16_t minVal, uint16_t maxVal,
+        marian::Ptr &alloc,
+        marian::Ptr &backend)
+{
+  size_t SHARED_MEM_SIZE = 48000;
+
+  assert(weightHash->shape()[-1] == inputHash->shape()[-1]);
+  int hashDim = weightHash->shape()[-1] / 4;
+
+  int dim = weightHash->shape().elements() / weightHash->shape()[-1];
+  int inputBatch = inputHash->shape().elements() / inputHash->shape()[-1];
+
+  uint16_t range = maxVal - minVal + 1;
+
+  marian::Shape hammingShape = inputHash->shape();
+  hammingShape.set(-1, (int) kBest);
+
+
+  size_t mem = range * sizeof(uint32_t) // counts
+        + sizeof(uint32_t) // stopval
+        + dim * sizeof(uint16_t); // hamming;
+
+  marian::Tensor hamming;
+  if (mem > SHARED_MEM_SIZE) {
+    // shared memory too small. Write hamming distance to global mem instead
+    mem = range *sizeof(uint32_t) + sizeof(uint32_t);
+    assert(mem <= SHARED_MEM_SIZE);
+
+    hammingShape.set(-1, dim);
+    auto memory = alloc->alloc(requiredBytes(hammingShape, marian::Type::uint16));
+
+    hamming = marian::TensorBase::New(memory, hammingShape, marian::Type::uint16, backend);
+  }
+
+  HammmingAndSort<<>>
+        (weightHash->data(),
+        inputHash->data(),
+        hamming ? hamming->data() : nullptr,
+        outCounts ? outCounts->data() : nullptr,
+        outIdx ? outIdx->data() : nullptr,
+        kBest, minVal, maxVal, range,
+        hashDim, dim, inputBatch);
+  CUDA_CHECK(cudaGetLastError());
+
+  if (hamming) {
+    alloc->free(hamming->memory());
+  }
+}
+
 } // namespace gpu
 } // namespace marian
diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h
index 178bb6920..31bd1e14f 100644
--- a/src/tensors/tensor_operators.h
+++ b/src/tensors/tensor_operators.h
@@ -44,6 +44,11 @@ DISPATCH4(IsNaN, const Tensor, Ptr, bool&, bool&);
 #ifdef CUDA_FOUND
 namespace gpu {
 bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf);
+void Float2Bit(marian::Tensor output, const marian::Tensor input);
+void HammmingAndSort(marian::Tensor outIdx, marian::Tensor outCounts,
+          const marian::Tensor weightHash, const marian::Tensor inputHash,
+          uint32_t kBest, uint16_t minVal, uint16_t maxVal,
+          marian::Ptr &alloc, marian::Ptr &backend);
 }
 #endif
 
From cc66cf617e931a60f6f3a05df4524b5d670266ef Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Thu, 29 Jun 2023 23:51:17 +0000
Subject: [PATCH 14/26] Merged PR 29966: More metrics in Marian and MBR scripts

This PR adds:
* An implementation of BLEURT with conversion script
* Some code refactoring for COMET models
* A more cleanly separated "evaluate" and "embed" functionality for COMET/COMET-QE/BLEURT
* A number of MBR-related scripts.
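
Common to all the MBR scripts below is the same selection rule; a minimal Python sketch of that
step, assuming an N x N utility matrix has already been computed for one source sentence
(hypotheses scored against each other as pseudo-references):

    import numpy as np

    def mbr_select(scores: np.ndarray) -> int:
        # scores[i, j] = metric(hyp_i, pseudo_ref_j); higher is better
        expected_utility = scores.mean(axis=1)  # average utility over all pseudo-references
        return int(expected_utility.argmax())   # hypothesis with the highest expected utility

The COMET variant only makes the construction of this matrix cheap: source and hypotheses are
embedded once, and the NxN scores come from the small pooling head on top of the embeddings.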
--- CHANGELOG.md | 7 + VERSION | 2 +- scripts/bleurt/bleurt2marian.py | 223 ++++++++++++++++++ scripts/comet/comet2marian.py | 18 +- scripts/mbr/README.md | 54 +++++ scripts/mbr/comet/comet_mbr.sh | 133 +++++++++++ .../mbr/comet/comet_mbr_with_embeddings.py | 125 ++++++++++ scripts/mbr/generic/explode_collapse.pl | 43 ++++ scripts/mbr/generic/metrics/bleu.sh | 3 + scripts/mbr/generic/metrics/bleurt.sh | 12 + scripts/mbr/generic/metrics/chrf.sh | 3 + scripts/mbr/generic/rescore.pl | 68 ++++++ scripts/mbr/generic/stupid_mbr.sh | 60 +++++ scripts/metrics/.gitignore | 2 + scripts/metrics/Dockerfile | 43 ++++ scripts/metrics/README.md | 36 +++ scripts/metrics/compare.sh | 116 +++++++++ scripts/metrics/docker-run.sh | 20 ++ scripts/metrics/marian-score.sh | 126 ++++++++++ scripts/metrics/setup.sh | 15 ++ src/CMakeLists.txt | 1 + src/command/marian_evaluator.cpp | 15 ++ src/command/marian_main.cpp | 4 + src/common/config.cpp | 51 +--- src/common/config.h | 7 +- src/common/config_parser.cpp | 130 +++++++++- src/common/config_parser.h | 3 +- src/common/config_validator.cpp | 20 +- src/common/config_validator.h | 6 +- src/data/corpus_base.cpp | 21 +- src/data/corpus_base.h | 20 +- src/data/text_input.cpp | 2 +- src/embedder/vector_collector.cpp | 49 +++- src/embedder/vector_collector.h | 37 ++- src/evaluator/evaluator.h | 155 ++++++++++++ src/graph/expression_operators.cpp | 10 +- src/graph/node_operators_binary.h | 2 + src/layers/embedding.cpp | 3 +- src/layers_new/attention.h | 3 +- src/layers_new/neuralnet.h | 12 +- src/layers_new/transformer.h | 5 +- src/models/bleurt.h | 217 +++++++++++++++++ src/models/comet_qe.h | 138 +++++++---- src/models/model_base.h | 12 +- src/models/model_factory.cpp | 134 ++++++----- src/tensors/gpu/gpu_info.cpp | 19 ++ 46 files changed, 1999 insertions(+), 186 deletions(-) create mode 100644 scripts/bleurt/bleurt2marian.py create mode 100644 scripts/mbr/README.md create mode 100755 scripts/mbr/comet/comet_mbr.sh create mode 100644 scripts/mbr/comet/comet_mbr_with_embeddings.py create mode 100755 scripts/mbr/generic/explode_collapse.pl create mode 100755 scripts/mbr/generic/metrics/bleu.sh create mode 100755 scripts/mbr/generic/metrics/bleurt.sh create mode 100755 scripts/mbr/generic/metrics/chrf.sh create mode 100755 scripts/mbr/generic/rescore.pl create mode 100755 scripts/mbr/generic/stupid_mbr.sh create mode 100644 scripts/metrics/.gitignore create mode 100644 scripts/metrics/Dockerfile create mode 100644 scripts/metrics/README.md create mode 100755 scripts/metrics/compare.sh create mode 100755 scripts/metrics/docker-run.sh create mode 100755 scripts/metrics/marian-score.sh create mode 100755 scripts/metrics/setup.sh create mode 100644 src/command/marian_evaluator.cpp create mode 100644 src/evaluator/evaluator.h create mode 100644 src/models/bleurt.h create mode 100644 src/tensors/gpu/gpu_info.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 8778abeed..a436308c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Implementations of COMET-20 (reference-based) and BLEURT-20 for inference with conversion scripts. +- `./marian evaluate` sub command for evaluation with COMET-QE-20, COMET-20 and BLEURT-20 +- A bunch of scripts for metrics use and early MBR experiments - LSH vocab filtering for GPU. Speed is not competitive with non-LSH. 
Checking in for completeness and possible future use of LSH on GPU for non-filtering stuff - Add --throw-on-divergence and --fp16-fallback-to-fp32 options to detect (fp16 and fp32) and recover (only fp16) diverged runs. If not recoverable, exception gets rethrown and goes unhandled to force fatal error and shutdown. @@ -21,6 +24,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Handle copying from fp32 or fp16 embeddings in embedder mode correctly. - Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp). +### Changed +- Removed --num-devices N option that wasn't really used by anyone (I assume). + + ## [1.12.0] - 2023-02-20 ### Added diff --git a/VERSION b/VERSION index 97cc69d7f..f15731572 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.5 +v1.12.6 diff --git a/scripts/bleurt/bleurt2marian.py b/scripts/bleurt/bleurt2marian.py new file mode 100644 index 000000000..25aa8206f --- /dev/null +++ b/scripts/bleurt/bleurt2marian.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 +""" +This script converts Google BLEURT models to Marian weight file. +""" + +import argparse +import logging as log +import numpy as np +import yaml +from pathlib import Path + +BLEURT_LOCATION = 'lucadiliello/BLEURT-20' + +log.basicConfig(level=log.INFO) + +parser = argparse.ArgumentParser(description='Convert Google BLEURT models to Marian weight file.') +parser.add_argument('--marian', '-m', help='Output path for Marian weight file', required=True) +parser.add_argument('--spm', '-spm', type=Path, help='Save tokenizer SPM file here', required=False) +args = parser.parse_args() + +def load_bleurt_model(): + from bleurt_pytorch import BleurtForSequenceClassification, BleurtTokenizer + + bleurt_model = BleurtForSequenceClassification.from_pretrained(BLEURT_LOCATION) + bleurt_model.eval() + tokenizer = BleurtTokenizer.from_pretrained(BLEURT_LOCATION) + vocab_file = None + if tokenizer.vocab_file and Path(tokenizer.vocab_file).exists(): + vocab_file = tokenizer.vocab_file + return bleurt_model, vocab_file + +bleurt_model, vocab_file = load_bleurt_model() + +if args.spm: + vocab_file = vocab_file and Path(vocab_file) + if vocab_file and vocab_file.exists(): + if not args.spm.parent.exists(): + raise Exception(f"Directory {args.spm.parent} does not exist") + log.info(f"Copying {vocab_file} to {args.spm}") + args.spm.write_bytes(vocab_file.read_bytes()) + else: + raise Exception(f"Could not locate or save the vocab file: {vocab_file}; please remove --spm argument and try downloading the file manually") + +marianModel = dict() +config = dict() + +config["type"] = "bleurt" +config["tied-embeddings-all"] = True +config["tied-embeddings-src"] = False +config["transformer-ffn-depth"] = 2 +config["transformer-ffn-activation"] = "gelu" # figure this out dynamically +config["transformer-train-position-embeddings"] = True +config["transformer-preprocess"] = "" +config["transformer-postprocess"] = "dan" +config["transformer-postprocess-emb"] = "nd" +config["bert-train-type-embeddings"] = True +config["bert-type-vocab-size"] = 2 +config["comet-prepend-zero"] = True +config["input-join-fields"] = True +config["version"] = "bleurt2marian.py conversion" +config["enc-depth"] = 0 + +def yaml2np(config): + configYamlStr = yaml.dump(config, default_flow_style=False) + print("\nMarian config:") + print(configYamlStr) + + desc = bytes(configYamlStr, 'ascii') + b'\x00' + npDesc = np.chararray((len(desc),)) + npDesc.dtype = np.int8 + for i, b in enumerate(desc): 
+ npDesc[i] = b + return npDesc + +def convert(pd, srcs, trg, transpose=True, bias=False): + if len(srcs) == 1: + for src in srcs: + num = pd[src].detach().numpy() + if bias: + marianModel[trg] = num.copy() + else: + if transpose: + marianModel[trg] = np.transpose(num).copy() + else: + marianModel[trg] = num + else: # path that joins matrices together for fused self-attention + nums = [pd[src].detach().numpy() for src in srcs] + if bias: + nums = [np.transpose(num) for num in nums] + marianModel[trg] = np.stack(nums, axis=0).copy() + +def extract(layer, nth, level): + name = type(layer).__name__ + print(" " * level, nth, name) + + if "BleurtEncoder" in name: + # embedding projection + prefix = "BleurtEncoder" + + pd = dict(layer.named_parameters()) + for n in pd: + if "embedding_projection" in n: + print(" " * (level + 1), n, pd[n].shape) + + convert(pd, ["embedding_projection.weight"], f"{prefix}->encoder->eProj->weight") + convert(pd, ["embedding_projection.bias"], f"{prefix}->encoder->eProj->bias", bias=True) + + # continue recursing down the model structure + recurse(layer, level + 1) + + elif "BleurtLayer" in name: + pd = dict(layer.named_parameters()) + for n in pd: + print(" " * (level + 1), n, pd[n].shape) + + prefix = "BleurtEncoder" + blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->selfAttentionBlock" + + if not "transformer-dim-model" in config: + query = pd["attention.self.query.weight"].detach().numpy() + config["transformer-dim-model"] = query.shape[1] + + # self-attention + # query transformation + convert(pd, ["attention.self.query.weight"], f"{blockPrefix}->selfAttention->qProj->weight") + convert(pd, ["attention.self.query.bias"], f"{blockPrefix}->selfAttention->qProj->bias", bias=True) + + # key transformation + convert(pd, ["attention.self.key.weight"], f"{blockPrefix}->selfAttention->kProj->weight") + convert(pd, ["attention.self.key.bias"], f"{blockPrefix}->selfAttention->kProj->bias", bias=True) + + # values transformation + convert(pd, ["attention.self.value.weight"], f"{blockPrefix}->selfAttention->vProj->weight") + convert(pd, ["attention.self.value.bias"], f"{blockPrefix}->selfAttention->vProj->bias", bias=True) + + # output transformation + convert(pd, ["attention.output.dense.weight"], f"{blockPrefix}->selfAttention->oProj->weight") + convert(pd, ["attention.output.dense.bias"], f"{blockPrefix}->selfAttention->oProj->bias", bias=True) + + # self-attention layer-norm + convert(pd, ["attention.output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["attention.output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + + # ffn + # first ffn layer + blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->filterBlock" + + convert(pd, ["intermediate.dense.weight"], f"{blockPrefix}->layers->at(0)->as()->weight") + convert(pd, ["intermediate.dense.bias"], f"{blockPrefix}->layers->at(0)->as()->bias", bias=True) + # second ffn layer + convert(pd, ["output.dense.weight"], f"{blockPrefix}->layers->at(3)->as()->weight") + convert(pd, ["output.dense.bias"], f"{blockPrefix}->layers->at(3)->as()->bias", bias=True) + # ffn layer-norm + convert(pd, ["output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + + config["transformer-dim-ffn"] = pd["intermediate.dense.bias"].shape[-1] + config["transformer-heads"] = layer.attention.self.num_attention_heads + config["enc-depth"] 
+= 1 + + elif "BleurtEmbeddings" in name: + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + + # @TODO: this is a dirty trickery and should be solved differently in the future + npWemb = pd["word_embeddings.weight"].detach().numpy() + # put embedding of [CLS] in place of [PAD] (0) + npWemb[0, :] = npWemb[312, :] + # put embedding of [SEP] in place of + npWemb[1, :] = npWemb[313, :] + marianModel["Wemb"] = npWemb + + prefix = "BleurtEncoder" + + npPos = pd["position_embeddings.weight"].detach().numpy() + # this should be moved out of the encoder into a special embedding layer + marianModel[f"{prefix}->encoder->positionEmbedding->embeddings"] = npPos + + npType = pd["token_type_embeddings.weight"].detach().numpy() + marianModel[f"{prefix}->typeEmbedding->embeddings"] = npType + + # post-embedding layer normalization + convert(pd, ["LayerNorm.weight"], f"{prefix}->encoder->preprocessor->norm->weight", bias=True) + convert(pd, ["LayerNorm.bias"], f"{prefix}->encoder->preprocessor->norm->bias", bias=True) + + config["dim-emb"] = npWemb.shape[1] + config["dim-vocabs"] = [ npWemb.shape[0] ] + config["max-length"] = npPos.shape[0] + + # this will be the bleurt pooler right here: + elif name == "BleurtPooler": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + + + prefix = "BleurtPooler" + convert(pd, ["dense.weight"], f"{prefix}->layers->at(0)->as()->weight") + convert(pd, ["dense.bias"], f"{prefix}->layers->at(0)->as()->bias", bias=True) + + else: + recurse(layer, level + 1) + +def recurse(parent, level=0): + for i, child in enumerate(parent.children()): + extract(child, i, level) + +recurse(bleurt_model) + +# last layer +prefix = "BleurtPooler" +pd = dict(bleurt_model.named_parameters()) +convert(pd, ["classifier.weight"], f"{prefix}->layers->at(3)->as()->weight") +convert(pd, ["classifier.bias"], f"{prefix}->layers->at(3)->as()->bias", bias=True) + +marianModel["special:model.yml"] = yaml2np(config) + +for m in marianModel: + print(m, marianModel[m].shape) + +print("Saving Marian model to %s" % (args.marian,)) +np.savez(args.marian, **marianModel) \ No newline at end of file diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index 8ef4d29fc..69c8abf59 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -7,13 +7,15 @@ import logging as log import numpy as np import yaml - from pathlib import Path ## Uncomment to see model names supported by your installed version of unbabel-comet # from comet.models import available_metrics # supported_comets = [m for m in available_metrics if 'qe' in m.lower()] -supported_comets = ['wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da'] +supported_comets = [ + 'wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da', + 'wmt20-comet-da', 'wmt21-comet-da' +] log.basicConfig(level=log.INFO) parser = argparse.ArgumentParser(description='Convert Unbabel COMET-QE models to Marian weight file.') @@ -80,11 +82,17 @@ def load_comet_model(model_path): else: raise Exception(f"Could not locate or save the vocab file: {vocab_file}; please remove --spm argument and try downloading the file manually") - marianModel = dict() - config = dict() -config["type"] = "comet-qe" + +model_type = type(cometModel).__name__ +if model_type == "RegressionMetric": + config["type"] = "comet" +elif model_type == "ReferencelessRegression": + 
config["type"] = "comet-qe" +else: + raise Exception(f'Unknown type of model {model_type}') + config["tied-embeddings-all"] = True config["tied-embeddings-src"] = False config["transformer-ffn-depth"] = 2 diff --git a/scripts/mbr/README.md b/scripts/mbr/README.md new file mode 100644 index 000000000..1ccdb370b --- /dev/null +++ b/scripts/mbr/README.md @@ -0,0 +1,54 @@ +# Some notes on MBR + +All of this is experimental, use at your own risk. + +## MBR with COMET + +This concerns the scipts in the `comet` folder: + +This script is for efficient MBR with COMET. COMET allows to embed source and hypotheses separatly which makes it very easy to optimize. +Only the final embbedings are used to create the NxN scores. + +Example usage: + +### prepare the source and samples +sacrebleu -t wmt21 -l en-de --echo src > wmt21.src +cat wmt21.src | perl -pe '\$_ = \$_ x 128' > wmt21.128.src +cat wmt21.128.src | ~/marian-dev/build/marian-decoder -m translation-model.npz \ + -v translation-model-vocab.{spm,spm} -b1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src \ + --max-length 256 --max-length-crop -d all --output-sampling > wmt21.128.out + +### run MBR with COMET +cat wmt21.128.out | ~/marian-dev/scripts/mbr/comet/comet_mbr.sh -m wmt20-comet-da.npz -n 128 -s wmt21.src -g 8 > wmt21.128.mbr.out +cat wmt21.128.mbr.out | cut -f 4 | sacrebleu -t wmt21 -l en-de --metrics bleu chrf -w 2 --format text + + +## "Stupid" MBR (generic) + +This concerns the scipts in the `generic` folder + +This script can be used to do "stupid" MBR (i.e. all-vs-all MBR with any reference-based metric specfied in the metrics folder). +The subscipt in the metrics folder need to be able to calculate sentence-level results. This should be done as efficiently as possible +in order to score all NxN variants (where N is sample size). The explode_collape.pl script below does some smart deduping as far as +possible, but the complexity will still be close to NxN. 
+
+Example usage:
+
+### prepare the sample
+```
+sacrebleu -t wmt21 -l en-de --echo src | perl -pe '$_ = $_ x 128' > wmt21.128.src
+cat wmt21.128.src | ~/marian-dev/build/marian-decoder -m translation-model.npz \
+   -v translation-model-vocab.{spm,spm} -b1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src \
+   --max-length 256 --max-length-crop -d all --output-sampling > wmt21.128.out
+```
+
+### run MBR, here with ChrF
+```
+cat wmt21.128.out | ~/marian-dev/scripts/mbr/generic/stupid_mbr.sh 128 128 chrf > wmt21.128.sorted.out
+```
+
+### select the top translation according to ChrF MBR and evaluate result
+
+```
+cat wmt21.128.sorted.out | grep ^BEST | cut -f 3 | sacrebleu -t wmt21 -l en-de --metrics bleu chrf -w 2 --format text
+```
\ No newline at end of file
diff --git a/scripts/mbr/comet/comet_mbr.sh b/scripts/mbr/comet/comet_mbr.sh
new file mode 100755
index 000000000..9ba97b4a7
--- /dev/null
+++ b/scripts/mbr/comet/comet_mbr.sh
@@ -0,0 +1,133 @@
+#!/bin/bash
+
+OPTIONS=$(getopt -o t:s:o:n:m:g:h --long hyps:,source:,output:,num_hyps:,model:,num_gpus:,help -- "$@")
+eval set -- "$OPTIONS"
+
+while true; do
+  case "$1" in
+    -t|--hyps)
+      >&2 echo "Option hyps=$2"
+      hyps_file=$2
+      shift 2;;
+    -s|--source)
+      >&2 echo "Option source=$2"
+      source_file=$2
+      shift 2;;
+    -o|--output)
+      >&2 echo "Option output=$2"
+      out_file=$2
+      shift 2;;
+    -n|--num_hyps)
+      >&2 echo "Option num_hyps=$2"
+      num_hyps=$2
+      shift 2;;
+    -m|--model)
+      >&2 echo "Option model=$2"
+      comet_model=$2
+      shift 2;;
+    -g|--num_gpus)
+      >&2 echo "Option num_gpus=$2"
+      num_gpus=$2
+      shift 2;;
+    -h|--help)
+      help=1
+      shift;;
+    --)
+      shift; break;;
+    *)
+      >&2 echo "Internal error!" ; exit 1 ;;
+  esac
+done
+
+if [[ "$help" = "1" ]]
+then
+cat >&2 <<END
+Example usage:
+
+# prepare the source and samples
+sacrebleu -t wmt21 -l en-de --echo src > wmt21.src
+cat wmt21.src | perl -pe '\$_ = \$_ x 128' > wmt21.128.src
+cat wmt21.128.src | ~/marian-dev/build/marian-decoder -m translation-model.npz \
+   -v translation-model-vocab.{spm,spm} -b1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src \
+   --max-length 256 --max-length-crop -d all --output-sampling > wmt21.128.out
+
+# run MBR with COMET
+cat wmt21.128.out | ~/marian-dev/scripts/mbr/comet/comet_mbr.sh -m wmt20-comet-da.npz -n 128 -s wmt21.src -g 8 > wmt21.128.mbr.out
+cat wmt21.128.mbr.out | cut -f 4 | sacrebleu -t wmt21 -l en-de --metrics bleu chrf -w 2 --format text
+
+END
+exit
+fi
+
+
+hyps_file=${hyps_file:-/dev/stdin}
+out_file=${out_file:-/dev/stdout}
+num_hyps=${num_hyps:-128}
+comet_model=${comet_model:-wmt20-comet-da.npz}
+num_gpus=${num_gpus:-8}
+
+script_path=$(dirname $0)
+marian=$script_path/../../../build/marian
+
+comet_path=$(dirname $comet_model)
+devices=$(seq 0 $(($num_gpus-1)))
+
+tmp=/tmp
+
+# create temporary files and delete them right after, use file descriptor instead
+# (will free disk space after script ends, even when interrupted)
+samples=$(mktemp $tmp/samples.XXXXXX)
+exec 3>"$samples"
+rm "$samples"
+samples=/dev/fd/3
+
+source=$(mktemp $tmp/source.XXXXXX)
+exec 4>"$source"
+rm "$source"
+source=/dev/fd/4
+
+source_embeddings=$(mktemp $tmp/source.embeddings.bin.XXXXXX)
+exec 5>"$source_embeddings"
+rm "$source_embeddings"
+source_embeddings=/dev/fd/5
+
+hyps_embeddings=$(mktemp $tmp/sample.embeddings.bin.XXXXXX)
+exec 6>"$hyps_embeddings"
+rm "$hyps_embeddings"
+hyps_embeddings=/dev/fd/6
+
+# done with creating temporary files
+
+lines_hyps=$(cat $hyps_file | tee $samples | wc -l)
+lines_source=$(cat $source_file | tee $source | wc -l)
+
+>&2 echo "Computing source embeddings ($lines_source lines) with $comet_model"
+
+cat $source \
+| pv -ptel -s $lines_source \
+| $marian embed -m $comet_model -v $comet_path/roberta-vocab.spm \
+    --like roberta -d $devices --fp16 --binary --quiet \
+> $source_embeddings
+
+>&2 echo "Computing sample embeddings ($lines_hyps lines, $num_hyps per sentence) with $comet_model"
+
+cat $samples \
+| pv -ptel -s $lines_hyps \
+| $marian embed -m $comet_model -v $comet_path/roberta-vocab.spm \
+    --like roberta -d $devices --fp16 --binary --quiet \
+> $hyps_embeddings
+
+>&2 echo "Computing MBR scores"
+
+cat $samples \
+| pv -ptel -s $lines_hyps \
+| python $script_path/comet_mbr_with_embeddings.py \
+    -m $comet_model -s $source_embeddings -t $hyps_embeddings \
+    --num_source $lines_source --num_hyps $num_hyps \
+    -d $devices --batch_size 128 --fp16 \
+> $out_file
+
+>&2 echo "Done"
diff --git a/scripts/mbr/comet/comet_mbr_with_embeddings.py b/scripts/mbr/comet/comet_mbr_with_embeddings.py
new file mode 100644
index 000000000..f14207af8
--- /dev/null
+++ b/scripts/mbr/comet/comet_mbr_with_embeddings.py
@@ -0,0 +1,125 @@
+import numpy as np
+import cupy as cp
+import sys
+import argparse
+from pathlib import Path
+
+parser = argparse.ArgumentParser(description='Apply MBR with COMET top layers')
+parser.add_argument('-m', '--model', type=Path, help='COMET model path', required=True)
+parser.add_argument('-s', '--source', type=Path, help='Source file embeddings', required=True)
+parser.add_argument('-t', '--hyps', type=Path, help='Sample file embeddings', required=True)
+parser.add_argument('--num_source', type=int, help='Number of sentences', required=True)
+parser.add_argument('--num_hyps', type=int, help='Number of samples per sentence', required=True)
+parser.add_argument('--fp16', help='Use fp16 for computation', action='store_true')
+parser.add_argument('--batch_size', type=int, help='Batch size during MBR', default=32)
+parser.add_argument('-d', '--devices', nargs='+', type=int, help="GPU device ids to use", default=[0, 1, 2, 3, 4, 5, 6, 7])
+args = parser.parse_args()
+
+
+model_path = args.model
+src_emb_path = args.source
+smp_emb_path = args.hyps
+
+num_sents = args.num_source
+num_samps = args.num_hyps
+
+emb_size = 1024
+
+compute_type=cp.float32
+if args.fp16:
+    compute_type=cp.float16
+
+batch_size = args.batch_size
+devices = args.devices
+
+sources = np.memmap(src_emb_path, mode='r', dtype=np.float32, shape=(num_sents, emb_size))
+samples = np.memmap(smp_emb_path, mode='r', dtype=np.float32, shape=(num_sents, num_samps, emb_size))
+
+def mbr_decode_batch(pooler, mt, src, ref):
+    batch_size = mt.shape[0]
+
+    diffRef = abs(mt - ref)
+    prodRef = mt * ref
+
+    diffSrc = cp.repeat(abs(mt - src), repeats=num_samps, axis=-2);
+    prodSrc = cp.repeat(mt * src, repeats=num_samps, axis=-2);
+
+    mt = cp.repeat(mt, repeats=num_samps, axis=-2)
+    ref = cp.repeat(ref, repeats=batch_size, axis=-3)
+
+    emb = cp.concatenate([mt, ref, prodRef, diffRef, prodSrc, diffSrc], axis=-1)
+
+    layer1 = cp.tanh(cp.dot(emb, pooler[0]["weight"]) + pooler[0]["bias"])
+    layer2 = cp.tanh(cp.dot(layer1, pooler[1]["weight"]) + pooler[1]["bias"])
+    comet = cp.dot(layer2, pooler[2]["weight"]) + pooler[2]["bias"]
+
+    mbr_score = cp.reshape(cp.mean(comet, axis=-2), (batch_size,))
+
+    return mbr_score
+
+
+def mbr_decode(pooler, i, batch_size=50):
+    sources_gpu = cp.asarray(sources[i, :], compute_type)
+    samples_gpu = cp.asarray(samples[i, :, :], compute_type)
+
+    src = cp.reshape(sources_gpu, (1, 1, emb_size))
+    mt = cp.reshape(samples_gpu, (num_samps, 1, emb_size))
+    ref = cp.reshape(mt, (1, num_samps, emb_size))
+
+    batches
= cp.array_split(mt, int(num_samps / batch_size))
    scores = []
    for batch in batches:
        mbr_scores_batch = mbr_decode_batch(pooler, batch, src, ref)
        scores.append(mbr_scores_batch)

    mbr_scores = cp.concatenate(scores, axis=-1)
    best_index = cp.argmax(mbr_scores, axis=-1)
    best_score = cp.max(mbr_scores, axis=-1)

    return best_index, best_score

def consume(k):
    j = 0
    candidates = []
    for line in sys.stdin:
        line = line.rstrip()
        candidates.append(line)

        if len(candidates) == num_samps:
            best = best_gpu[k + j]
            best_index = cp.asnumpy(best[0])
            best_score = cp.asnumpy(best[1])
            print(f"{k + j}\t{best_index}\t{best_score:.4f}\t{candidates[best_index]}")
            candidates = []
            j += 1
            if j == step:
                k += step
                break
    return k

#####################################################

model = np.load(model_path)

poolers = []
for id in devices:
    with cp.cuda.Device(id):
        pooler = []
        for i, layerNo in enumerate([0, 3, 6]):
            w = cp.asarray(model[f"CometQEPooler->layers->at({layerNo})->as()->weight"], compute_type)
            b = cp.asarray(model[f"CometQEPooler->layers->at({layerNo})->as()->bias"], compute_type)
            pooler.append({"weight": w, "bias": b})
        poolers.append(pooler)

step = batch_size
best_gpu = []
k = 0
for i in range(num_sents):
    gpu_id = i % len(devices)
    with cp.cuda.Device(devices[gpu_id]):
        best_gpu.append(mbr_decode(poolers[gpu_id], i, batch_size=batch_size))
        if len(best_gpu) % step == 0:
            k = consume(k)

# get the rest
k = consume(k)
diff --git a/scripts/mbr/generic/explode_collapse.pl b/scripts/mbr/generic/explode_collapse.pl
new file mode 100755
index 000000000..df1dbb085
--- /dev/null
+++ b/scripts/mbr/generic/explode_collapse.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl
+
+# Helper script that takes the sample file with N samples and M references (the first M among the N samples)
+# and creates deduped(!) N' x M' pairs (N' is N after deduplication, same for M') for scoring.
+# Creating the pairs is "exploding", deduping is "collapsing", hence the name.
+# Includes ids so that the original order from before deduplication can be restored.
+
+my $N = $ARGV[0];
+my $R = $ARGV[1];
+$R = $N if not defined($R);
+
+sub explodeCollapse {
+  my $id = shift;
+  my @samples = @_;
+
+  my %cnd;
+  foreach(@samples) {
+    $cnd{$_} = scalar keys %cnd if not exists($cnd{$_});
+  }
+
+  my @uniq = sort { $cnd{$a} <=> $cnd{$b} } keys %cnd;
+  foreach my $t (@uniq) {
+    my $c = 0;
+    foreach my $r (@uniq) {
+      last if($c >= $R);
+      # this outputs the pseudo-reference first!
+      printf("%d\t%d\t%d\t%s\t%s\n", $id, $cnd{$r}, $cnd{$t}, $r, $t);
+      $c++;
+    }
+  }
+}
+
+my @samples;
+my $id = 0;
+while(<STDIN>) {
+  chomp;
+  push(@samples, $_);
+  if(@samples == $N) {
+    explodeCollapse($id, @samples);
+    @samples = ();
+    $id++;
+  }
+}
diff --git a/scripts/mbr/generic/metrics/bleu.sh b/scripts/mbr/generic/metrics/bleu.sh
new file mode 100755
index 000000000..e94d74d77
--- /dev/null
+++ b/scripts/mbr/generic/metrics/bleu.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+parallel --cat -k -j32 --block 10M "sacrebleu <(cut -f 1 {}) < <(cut -f 2 {}) -b -w 4 -sl --format text --metrics bleu"
diff --git a/scripts/mbr/generic/metrics/bleurt.sh b/scripts/mbr/generic/metrics/bleurt.sh
new file mode 100755
index 000000000..a7095825d
--- /dev/null
+++ b/scripts/mbr/generic/metrics/bleurt.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+gpus=${1:-8}
+scriptPath=$(dirname $0)
+root=$scriptPath/../../../../.
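+# Input on stdin: fields 4-5 of explode_collapse.pl output, i.e. one tab-separated
+# "pseudo-reference<TAB>hypothesis" pair per line; the pipeline below writes one
+# BLEURT score per input line to stdout.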
+marian=$root/build/marian
+bleurt=$root/scripts/bleurt
+devices=$(seq 0 $(($gpus-1)))
+
+# we reverse the input here since the scorer expects "hyp<TAB>ref" but we output pseudo-references first
+perl -F'\t' -ane 'chomp(@F); print "$F[1]\t$F[0]\n"' \
+| $marian evaluate -m $bleurt/bleurt-20.npz -v $bleurt/bleurt-vocab.{spm,spm} --like bleurt -d $devices --fp16 --quiet
diff --git a/scripts/mbr/generic/metrics/chrf.sh b/scripts/mbr/generic/metrics/chrf.sh
new file mode 100755
index 000000000..05a51de10
--- /dev/null
+++ b/scripts/mbr/generic/metrics/chrf.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+parallel --cat -k -j32 --block 10M "sacrebleu <(cut -f 1 {}) < <(cut -f 2 {}) -b -w 4 -sl --format text --metrics chrf"
diff --git a/scripts/mbr/generic/rescore.pl b/scripts/mbr/generic/rescore.pl
new file mode 100755
index 000000000..7374056ad
--- /dev/null
+++ b/scripts/mbr/generic/rescore.pl
@@ -0,0 +1,68 @@
+#!/usr/bin/perl
+
+# Helper script that takes the pairs created with explode_collapse.pl and the metric scores
+# for each pair, computes MBR scores and sorts by highest score. Restores the original sample
+# numbers (not the order, due to sorting). Grepping for "^BEST" will result in a file with as
+# many hypotheses as original input sentences, in the right order.
+
+my $N = $ARGV[0];
+my $R = $ARGV[1];
+open(IDS, "cat < $ARGV[2] |");
+open(SCORES, "cat < $ARGV[3] |");
+
+$| = 1;
+
+sub score {
+  my $samples = shift;
+  my $scores = shift;
+
+  my %cnd;
+  foreach(@$samples) {
+    $cnd{$_} = scalar keys %cnd if not exists($cnd{$_});
+  }
+
+  my @scored;
+  foreach my $t (@$samples) {
+    my $sum = 0;
+    my $tid = $cnd{$t};
+    my $c = 0;
+    foreach my $r (@$samples) {
+      my $rid = $cnd{$r};
+      if(exists($scores->{$tid}->{$rid}) and $c < $R) {
+        $sum += $scores->{$tid}->{$rid};
+        $c++;
+      }
+    }
+    push(@scored, [$sum / $c, $t]);
+  }
+  my ($best, @rest) = sort { $b->[0] <=> $a->[0] } @scored;
+  printf("BEST\t%.4f\t%s\n", @$best);
+  printf("REST\t%.4f\t%s\n", @$_) foreach(@rest);
+}
+
+my $samples = [];
+my $scores = {};
+my $id1 = 0;
+while(<STDIN>) {
+  chomp;
+  push(@$samples, $_);
+  if(@$samples == $N) {
+    my ($ids, $score);
+    while(($ids = <IDS>) and ($score = <SCORES>)) {
+      chomp($ids, $score);
+      my($id2, $r, $t) = split(/\t/, $ids);
+      if($id1 == $id2) {
+        $scores->{$t}->{$r} = $score;
+      } else {
+        score($samples, $scores);
+        $samples = [];
+        $scores = {};
+        $scores->{$t}->{$r} = $score;
+        last;
+      }
+    }
+    $id1++;
+  }
+}
+score($samples, $scores);
+
+close(SCORES);
diff --git a/scripts/mbr/generic/stupid_mbr.sh b/scripts/mbr/generic/stupid_mbr.sh
new file mode 100755
index 000000000..b19c0d0ae
--- /dev/null
+++ b/scripts/mbr/generic/stupid_mbr.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+if [[ "$1" = "--help" ]]
+then
+cat >&2 <<END
+Example usage:
+
+# prepare the sample
+sacrebleu -t wmt21 -l en-de --echo src | perl -pe '\$_ = \$_ x 128' > wmt21.128.src
+cat wmt21.128.src | ~/marian-dev/build/marian-decoder -m translation-model.npz \
+  -v translation-model-vocab.{spm,spm} -b1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src \
+  --max-length 256 --max-length-crop -d all --output-sampling > wmt21.128.out
+
+# run MBR, here with ChrF
+cat wmt21.128.out | ~/marian-dev/scripts/mbr/generic/stupid_mbr.sh 128 128 chrf > wmt21.128.sorted.out
+
+# select the top translation according to ChrF MBR and evaluate the result
+cat wmt21.128.sorted.out | grep ^BEST | cut -f 3 | sacrebleu -t wmt21 -l en-de --metrics bleu chrf -w 2 --format text
+
+END
+exit
+fi
+
+num_samples=${1:-128}
+num_references=${2:-$num_samples}
+metric=${3:-bleu}
+gpus=${4:-8}
+
+scriptPath=$(dirname $0)
+tmp=$(mktemp -d)
+
+cat \
+| tee >(wc -l > $tmp/lines_input) \
+| pigz > $tmp/input.txt.gz
+
+lines_input=$(cat $tmp/lines_input)
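+
+# Sketch of the two stages below: explode_collapse.pl expands every block of
+# $num_samples input lines into deduplicated "id<TAB>rid<TAB>tid<TAB>pseudo-ref<TAB>hyp"
+# pairs, the metric script scores each pair, and rescore.pl then selects, per sentence,
+#   argmax_t (1/R) * sum_r metric(t, r)
+# printing the winner as BEST and the remaining candidates as REST.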
+
+>&2 echo "Computing $metric scores"
+
+pigz -dc $tmp/input.txt.gz \
+| pv -ptel -s $lines_input \
+| perl $scriptPath/explode_collapse.pl $num_samples $num_references 2>/dev/null \
+| tee >(cut -f 1,2,3 > $tmp/ids) \
+| cut -f 4,5 \
+| $scriptPath/metrics/$metric.sh $gpus \
+> $tmp/scores
+
+>&2 echo "Computing MBR scores"
+
+pigz -dc $tmp/input.txt.gz \
+| pv -ptel -s $lines_input \
+| perl $scriptPath/rescore.pl $num_samples $num_references $tmp/ids $tmp/scores
+
+rm -rf $tmp
+>&2 echo "Done"
diff --git a/scripts/metrics/.gitignore b/scripts/metrics/.gitignore
new file mode 100644
index 000000000..5d66dfcd9
--- /dev/null
+++ b/scripts/metrics/.gitignore
@@ -0,0 +1,2 @@
+bins/
+tmp.*
\ No newline at end of file
diff --git a/scripts/metrics/Dockerfile b/scripts/metrics/Dockerfile
new file mode 100644
index 000000000..4641e6571
--- /dev/null
+++ b/scripts/metrics/Dockerfile
@@ -0,0 +1,43 @@
+FROM nvidia/cuda:11.1.1-devel-ubuntu20.04
+
+LABEL description="Marian image - Ubuntu 20.04"
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG NCPU=24
+ARG MARIAN_REPO="https://github.com/marian-nmt/marian-dev"
+ARG MARIAN_BRANCH=master
+
+RUN apt-get update \
+    && apt-get install -y wget apt-utils python3-pip git cmake build-essential \
+       intel-mkl openmpi-common openmpi-bin libopenmpi-dev pkg-config \
+    && apt-get clean
+
+RUN ln -sf /usr/bin/python3 /usr/bin/python && \
+    ln -sf /usr/bin/pip3 /usr/bin/pip
+
+# install unbabel-comet (requires pytorch) and bleurt (requires tensorflow and cudnn)
+# note: unbabel-comet 2.x is broken, use 1.x; it requires numpy < 1.24
+RUN pip install --upgrade pip \
+    && pip install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html \
+    && pip install sacrebleu unbabel-comet==1.1.3 numpy==1.23.5 nvidia-cudnn-cu11==8.6.0.163 git+https://github.com/google-research/bleurt.git \
+    && rm -rf ~/.cache/pip/
+
+# Install sentencepiece
+RUN pip3 uninstall -y sentencepiece && \
+    mkdir -p src && \
+    cd src && \
+    git clone https://github.com/marian-nmt/sentencepiece && \
+    cd sentencepiece && \
+    mkdir build && \
+    cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release .. && \
+    make -j install && \
+    cd ../python && \
+    python3 setup.py install && \
+    cd ../../.. && \
+    rm -rf src
+
+RUN git clone -b ${MARIAN_BRANCH} ${MARIAN_REPO} /marian \
+    && mkdir /marian/build && cd /marian/build \
+    && cmake .. -DUSE_MPI=on -DUSE_STATIC_LIBS=off -DCOMPILE_PASCAL=on -DCOMPILE_VOLTA=on -DCOMPILE_AMPERE=off -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off \
+    && make -j $NCPU && cp -v marian spm_encode spm_decode /usr/bin/
diff --git a/scripts/metrics/README.md b/scripts/metrics/README.md
new file mode 100644
index 000000000..4d04c20b7
--- /dev/null
+++ b/scripts/metrics/README.md
@@ -0,0 +1,36 @@
+# Marian Evaluate
+The main script is `compare.sh`, but it needs to run in an environment where all three of marian, unbabel-comet (PyTorch), and bleurt (TensorFlow) are available.
+Hence, we 1) build a Docker image with all the necessary libraries and 2) run `compare.sh` inside a container based on that image.
+
+## Setup: build docker image
+
+```bash
+./setup.sh
+```
+
+## Run compare.sh in docker container
+
+```bash
+./docker-run.sh
+```
+The `docker-run.sh` script mounts the cache directories from the host into the container.
+The necessary files (weights and vocabularies) are downloaded and cached automatically for the unbabel-comet and bleurt metrics.
+However, `marian-score.sh` expects the cache to be prepared under `$HOME/.cache/marian/metrics`.
+The structure/format of the cache directory for marian-score.sh looks as follows: +```bash +/home/$USER/.cache/marian/metrics/ +├── bleurt20-ref +│ ├── bleurt-20.model.npz +│ ├── bleurt.vocab.spm +├── comet20-da-src +│ ├── comet20-qe-da.model.npz +│ └── roberta.vocab.spm +└── comet20-da-src+ref + ├── comet20-da.model.npz + └── roberta.vocab.spm +``` +Each metric subdir should have a `*model.npz` and a `*vocab.spm` files, and the name of metric directory should end with `-src|-qe|-ref|-src+ref` suffix to indicate the category of metric. + +> TODO: Upload Marian compatible comet and bleurt models to public blob storage and modify script to automatically download + diff --git a/scripts/metrics/compare.sh b/scripts/metrics/compare.sh new file mode 100755 index 000000000..902258863 --- /dev/null +++ b/scripts/metrics/compare.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +export PATH=$MYDIR:$PATH + +log() { + echo -e "\e[1;32m[$(date '+%Y-%m-%d %H:%M:%S')]\e[0m $@" >&2 +} + +get_sacrebleu_names(){ + # using sacrebleu to get the list of systems + testset=wmt21/systems + while read line; do + pair=$(cut -f1 -d':' <<< $line) + refs=() + mts=() + while read name; do + # skip if name starts with $pair or src or docid + if [[ $name == $pair* || $name == src || $name == docid || $name == origlang ]]; then + continue + fi + if [[ $name == ref* ]]; then + refs+=($name) + else + mts+=($name) + fi + done < <(sed 's/,//g;s/ /\n/g' <<< $line) + + # flatten: ref x mt + for ref in ${refs[@]}; do + for mt in ${mts[@]}; do + echo -e "$testset\t$pair\t$ref\t$mt" + done + done + done < <(sacrebleu -t $testset --list) +} + +unbabel_score(){ + local metric=$1 + local prefix=$2 + log "Running $metric" + local batch_size=64 + comet-score --batch_size $batch_size --model $metric -s $prefix.src -r $prefix.ref -t $prefix.mt \ + | awk -F '[:\t]' 'NF==4{print $NF}' +} + + +bleurt_score() { + local metric_name=$1 + local prefix=$2 + [[ $metric_name == "BLEURT-20" ]] || { + log "ERROR: BLEURT-20 is the only supported metric; given: $metric_name" + exit 1 + } + local cache_dir=$HOME/.cache/bleurt + local metric_path=$cache_dir/$metric_name + [[ -f $metric_path/._OK ]] || { + log "BLEURT model not found in $HOME/.cache/bleurt .. 
Downloading" + mkdir -p $cache_dir + rm -rf $metric_path.zip # remove incomplete file + wget https://storage.googleapis.com/bleurt-oss-21/$metric_name.zip -P $cache_dir \ + && unzip $metric_path.zip -d $cache_dir/ && touch $metric_path/._OK + } + + # to check if cuda libs are configured and GPU is available + # python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))" + export LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/nvidia/cudnn/lib/:$LD_LIBRARY_PATH + python -m bleurt.score_files --bleurt_checkpoint=$metric_path \ + --candidate_file=$prefix.mt --reference_file=$prefix.ref \ + --bleurt_batch_size 64 2> /dev/null +} + +marian_score() { + local metric=$1 + local prefix=$2 + case $metric in + wmt20-comet-qe-da) metric="comet20-da-src" ;; + wmt20-comet-da) metric="comet20-da-src+ref" ;; + BLEURT-20) metric="bleurt20-ref" ;; + *) echo "Unknown metric $metric"; exit 1;; + esac + marian-score.sh -d '0' -n $metric --src $prefix.src --ref $prefix.ref --mt $prefix.mt --seg +} + + +main() { + cd $MYDIR + local metric_names=(BLEURT-20 wmt20-comet-da wmt20-comet-qe-da) + export CUDA_VISIBLE_DEVICES=0 + local max_tests=10 + local max_lines=100 # in each testset + while IFS=$'\t' read tset pair ref mt; do + for mn in ${metric_names[@]}; do + log "Comparing >> $mn << on $tset $pair $ref $mt" + local data=$(sacrebleu -t $tset -l $pair --echo src ref $mt) + local tmp_pref=tmp.testset + rm -rf $tmp_pref.{src,ref,mt} + cut -f1 <<< "$data" | head -n $max_lines > $tmp_pref.src + cut -f2 <<< "$data" | head -n $max_lines > $tmp_pref.ref + cut -f3 <<< "$data" | head -n $max_lines > $tmp_pref.mt + if [[ $mn =~ BLEURT* ]]; then + local orig_out=$(bleurt_score $mn $tmp_pref) + else + local orig_out=$(unbabel_score $mn $tmp_pref 2> /dev/null) + fi + local marian_out=$(marian_score $mn $tmp_pref) + paste <(echo "$marian_out") <(echo "$orig_out") \ + | awk -F '\t' -v OFS='\t' -v mn=$mn \ + 'BEGIN {tot=0.0} {diff=sqrt(($1-$2)^2); tot+=diff; print diff,$0} + END {printf "\n===Avg diff in %s: %f===\n\n", mn, tot/NR}' + #TODO1: extract averages and write to a report file + #TODO2: benchmark speeds + done + done < <(get_sacrebleu_names | head -n $max_tests) +} + +main "$@" \ No newline at end of file diff --git a/scripts/metrics/docker-run.sh b/scripts/metrics/docker-run.sh new file mode 100755 index 000000000..c379c4415 --- /dev/null +++ b/scripts/metrics/docker-run.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd $MYDIR + +IMAGE="marian-dev" + +VISIBLE_GPUS="1" # exlcude 0 for now; run on single GPU + +MOUNTS="-v $PWD:$PWD" +for cache in .sacrebleu .cache/{marian,torch,huggingface,bleurt}; do + MOUNTS+=" -v $HOME/$cache:/root/$cache" +done + + +cmd="docker run --rm -i $MOUNTS --gpus "\"device=$VISIBLE_GPUS\"" -t $IMAGE" + +# uncomment for an interactive shell +# $cmd bash + +$cmd $PWD/compare.sh $@ diff --git a/scripts/metrics/marian-score.sh b/scripts/metrics/marian-score.sh new file mode 100755 index 000000000..873ef5921 --- /dev/null +++ b/scripts/metrics/marian-score.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash +set -eu + +MYDIR=$(realpath $(dirname ${BASH_SOURCE[0]})) + + +METRICS_CACHE=$HOME/.cache/marian/metrics + +log() { + echo -e "[$(date '+%Y-%m-%d %H:%M:%S')] $@" >&2 +} + +which marian > /dev/null || { + log "marian not found in PATH. 
Please add marian binary to \$PATH and rerun" + exit 2 +} + +metric_name= +src_file= +ref_file= +hyp_file= +is_seg= +debug_mode= +batch_size=32 +pool_size=10 +max_length=256 +devices=0 +workspace=-4000 + +usage() { + log " ${BASH_SOURCE##*/} -n METRIC -m HYP [-s SRC] [-r REF] [-d DEVICES] [--seg] [--debug] [-h|--help] + +Args: + -n|--name|--metric NAME Metric name; required. See below for details. + -m|--mt|--hyp FILE MT hypothesis, required for all metrics. + -s|--src FILE Source file, required for source based metrics. + -r|--ref FILE Reference file, required for reference based metrics. + -d|--devices DEV IDs of GPU devices to use. Use quoted string to pass multiple values. Default: '$devices' + --seg Output segment-level scores. Default: print only the corpus-level score (mean of segment scores) + --debug Enable verbose mode (default is quiet) + -h|--help Print this help message + +Metric name (-n|--name) shuld be a subdir name under $METRICS_CACHE. +The metric name should have a suffix (-src|-qe|-ref|-src+ref) indicating the type of metric: + *-src|*-qe Source-based metric and requires --src arg, e.g., comet20-src or comet20-da-qe + *-ref Reference-based metric and requires --ref arg, e.g., bleurt20-ref + *-src+ref Both source and reference based and requires --src and --ref args e.g., comet20-src+ref +" +} + +while [[ $# -gt 0 ]]; do + case $1 in + -s|--src) src_file=$2; shift 2;; + -r|--ref) ref_file=$2; shift 2;; + -m|--mt|--hyp) hyp_file=$2; shift 2;; + -n|--name|--metric) metric_name=$2; shift 2;; + -d|--devices) devices=$2; shift 2;; + --seg) is_seg=1; shift 1;; + --debug) debug_mode=1; shift 1;; + -h|--help) usage; exit 0;; + *) log "ERROR: unknown option $1"; usage; exit 1;; + esac +done + +[[ -n $metric_name ]] || { log "ERROR: metric_name=$metric_name name not provided"; usage; exit 1; } +[[ -e $hyp_file ]] || { log "ERROR: hyp file not provided"; usage; exit 1; } + +metric_dir=$METRICS_CACHE/$metric_name +checkpoint=$(echo $metric_dir/*model.npz) # file model.npz or .model.npz +vocab=$(echo $metric_dir/*vocab.spm) +[[ -f $checkpoint && -f $vocab ]] || { + log "ERROR: metric $metric_name is not valid. See ls $METRICS_CACHE/$metric_name/{*model.npz,*vocab.spm}" + exit 1 +} + +# args common to all models +cmd="marian evaluate -w -4000" +[[ -n $devices ]] && cmd+=" -d $devices" +[[ -n $debug_mode ]] || cmd+=" --quiet" +cmd+=" -m $checkpoint --max-length $max_length --max-length-crop --mini-batch $batch_size --maxi-batch $pool_size -t stdin --tsv" +input= # to be filled later + + +check_file(){ + local name=$1 + local file=$2 + [[ -e $file ]] || { log "ERROR: $name file $file does not exist"; exit 1; } + [[ -s $file ]] || { log "ERROR: $name file $file is empty"; exit 1; } +} + +metric_type=${metric_name##*-} # suffix expected: src, ref, src+ref +case $metric_type in + src|qe) + # two sequences: src, hyp + check_file src $src_file + cmd+=" --like comet-qe -v $vocab $vocab" + input="paste $src_file $hyp_file" + ;; + ref) + check_file ref $ref_file + # two sequences: ref, hyp + cmd+=" --like bleurt -v $vocab $vocab" + input="paste $ref_file $hyp_file" + ;; + src+ref) + # three sequences: src, hyp, ref; three vocabularies + check_file src $src_file + check_file ref $ref_file + cmd+=" --like comet -v $vocab $vocab $vocab" + input="paste $src_file $hyp_file $ref_file" + ;; + *) + log "ERROR: $metric_name is not valid. 
Valid metrics have suffix '-{src|qe|ref|src+ref}'" + exit 3 + ;; +esac + +if [[ -z $is_seg ]]; then + cmd+=" --average only"; +fi +pipeline="$input | $cmd | cut -f1 -d' '" + +# mean (default) or segment-level scores + +log "Running: $pipeline" +eval $pipeline diff --git a/scripts/metrics/setup.sh b/scripts/metrics/setup.sh new file mode 100755 index 000000000..df16563a6 --- /dev/null +++ b/scripts/metrics/setup.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd $MYDIR + +#SSH_KEY=$HOME/.ssh/id_rsa # for git clone inside docker build +IMAGE=marian-dev +echo "Building docker image $IMAGE" +#DOCKER_BUILDKIT=1 docker build --ssh default=$SSH_KEY . -f Dockerfile -t $IMAGE +DOCKER_BUILDKIT=1 docker build . -f Dockerfile -t $IMAGE + + +# Optional build args: +# --build-arg MARIAN_COMMIT=master \ +# --build-arg MARIAN_REPO=https://github.com/marian-nmt/marian-dev.git \ +# --build-arg NCPUS=16 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d1f119335..77c455946 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -61,6 +61,7 @@ set(MARIAN_SOURCES tensors/cpu/tensor_operators.cpp tensors/cpu/integer_common.cpp tensors/cpu/fbgemm/packed_gemm.cpp + tensors/gpu/gpu_info.cpp graph/expression_graph.cpp graph/expression_operators.cpp diff --git a/src/command/marian_evaluator.cpp b/src/command/marian_evaluator.cpp new file mode 100644 index 000000000..bd9617b54 --- /dev/null +++ b/src/command/marian_evaluator.cpp @@ -0,0 +1,15 @@ +#include "marian.h" + +#include "models/model_task.h" +#include "evaluator/evaluator.h" +#include "common/timer.h" + +int main(int argc, char** argv) { + using namespace marian; + + // @TODO: add mode evaluating + auto options = parseOptions(argc, argv, cli::mode::evaluating); + New>(options)->run(); + + return 0; +} diff --git a/src/command/marian_main.cpp b/src/command/marian_main.cpp index dcdea4662..e838fe808 100644 --- a/src/command/marian_main.cpp +++ b/src/command/marian_main.cpp @@ -28,6 +28,9 @@ #define main mainEmbedder #include "marian_embedder.cpp" #undef main +#define main mainEvaluator +#include "marian_evaluator.cpp" +#undef main #define main mainVocab #include "marian_vocab.cpp" #undef main @@ -49,6 +52,7 @@ int main(int argc, char** argv) { else if(cmd == "decode") return mainDecoder(argc, argv); else if (cmd == "score") return mainScorer(argc, argv); else if (cmd == "embed") return mainEmbedder(argc, argv); + else if (cmd == "evaluate") return mainEvaluator(argc, argv); else if (cmd == "vocab") return mainVocab(argc, argv); else if (cmd == "convert") return mainConv(argc, argv); std::cerr << "Command must be train, decode, score, embed, vocab, or convert." 
<< std::endl; diff --git a/src/common/config.cpp b/src/common/config.cpp index a1c4ed5ac..efdd29c12 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -247,48 +247,23 @@ std::vector Config::getDevices(Ptr options, } // GPU: devices[] are interpreted in a more complex way else { - size_t numDevices = options->get("num-devices", 0); std::vector deviceNos; - for(auto d : devicesArg) - deviceNos.push_back((size_t)std::stoull(d)); + for(auto d : devicesArg) { + if(d == "all") { + // on encoutering "all" overwrite all given ids with all available ids + size_t numDevices = gpu::availableDevices(); + deviceNos.resize(numDevices); + std::iota(deviceNos.begin(), deviceNos.end(), 0); + break; + } else { + deviceNos.push_back((size_t)std::stoull(d)); + } + } - // if devices[] is empty then default to 0..N-1, where N = numDevices or 1 if (deviceNos.empty()) { - if(numDevices == 0) // if neither is given, then we default to 1 device, which is device[0] - numDevices = 1; - for(size_t i = 0; i < numDevices; ++i) // default to 0..N-1 - deviceNos.push_back(i); - } - // devices[] is not empty - else if(numDevices == 0) // if device list then num devices defaults to list size - numDevices = deviceNos.size(); // default to #devices - - // If multiple MPI processes then we can either have one set of devices shared across all - // MPI-processes, or the full list across all MPI processes concatenated. E.g. --num-devices 1 - // --devices 0 2 4 5 means 4 processes using devices 0, 2, 4, and 5, respectively. In that - // case, we cut out and return our own slice. In the above example, for MPI process 1, we would - // return {2}. - - // special-case the error message (also caught indirectly below, but with a msg that is - // confusing when one does not run multi-node) - if(numMPIProcesses == 1) - // same as requiring numPerMPIProcessDeviceNos == 1 - // @TODO: improve logging message as devices[] and numDevices are not informative for the user - ABORT_IF(numDevices != deviceNos.size(), "devices[] size must be equal to numDevices"); - - // how many lists concatenated in devices[]? Allowed is either 1 (=shared) or numWorkers - size_t numPerMPIProcessDeviceNos = deviceNos.size() / numDevices; - // @TODO: improve logging message as devices[] and numDevices are not informative for the user - ABORT_IF(numDevices * numPerMPIProcessDeviceNos != deviceNos.size(), - "devices[] size must be equal to or a multiple of numDevices"); // (check that it is a multiple) - - // if multiple concatenated lists are given, slice out the one for myMPIRank - if(numPerMPIProcessDeviceNos != 1) { - ABORT_IF(numPerMPIProcessDeviceNos != numMPIProcesses, - "devices[] must either list a shared set of devices, or one set per MPI process"); - deviceNos.erase(deviceNos.begin(), deviceNos.begin() + myMPIRank * numDevices); - deviceNos.resize(numDevices); + deviceNos.push_back(0); } + // form the final vector for(auto d : deviceNos) devices.push_back({ d, DeviceType::gpu }); diff --git a/src/common/config.h b/src/common/config.h index c5a016e68..c22d7415e 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -14,12 +14,17 @@ namespace marian { +namespace gpu { + // defined in src/tensors/gpu/gpu_info.cpp + size_t availableDevices(); +} + // TODO: Finally refactorize Config, Options, ConfigParser and ConfigValidator // classes. 
// // TODO: The problem is that there are many config classes in here, plus // "configuration" can refer to the high-level concept of the entire program's -// configuration, and/or any of its representations. Avoidthe term "config" and +// configuration, and/or any of its representations. Avoid the term "config" and // always qualify it what kind of config, e.g. new Options instance. // // TODO: What is not clear is the different config levels as there are classes diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index d70048fe9..0d8021bf1 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -94,6 +94,9 @@ ConfigParser::ConfigParser(cli::mode mode) case cli::mode::embedding: addOptionsEmbedding(cli_); break; + case cli::mode::evaluating: + addOptionsEvaluating(cli_); + break; default: ABORT("wrong CLI mode"); break; @@ -563,7 +566,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Throw exception if training diverges. Divergence is detected if the running average loss over arg1 steps " "is exceeded by the running average loss over arg2 steps (arg1 >> arg2) by arg3 standard deviations") ->implicit_val("100 10 3.0f"); - cli.add("--fp16-fallback-to-fp32", + cli.add("--fp16-fallback-to-fp32", "If fp16 training diverges and throws try to continue training with fp32 precision"); cli.add("--gradient-norm-average-window", "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). " @@ -824,7 +827,7 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { } void ConfigParser::addOptionsEmbedding(cli::CLIWrapper& cli) { - auto previous_group = cli.switchGroup("Scorer options"); + auto previous_group = cli.switchGroup("Embedder options"); // clang-format off cli.add("--no-reload", @@ -856,17 +859,122 @@ void ConfigParser::addOptionsEmbedding(cli::CLIWrapper& cli) { "Mixed precision for inference, set parameter type in expression graph. Supported values: float32, float16", {"float32"}); + cli.add("--like", + "Set good defaults for supported embedder types: roberta (works for all COMET flavors)"); + + // Short-cut for Unbabel comet-qe metric + cli.alias("like", "roberta", [](YAML::Node& config) { + // Model options + config["train-sets"] = std::vector({"stdin"}); + config["input-types"] = std::vector({"sequence"}); + config["max-length"] = 512; + config["max-length-crop"] = true; + config["mini-batch"] = 32; + config["maxi-batch"] = 100; + config["maxi-batch-sort"] = "src"; + config["workspace"] = -4000; + config["devices"] = std::vector({"all"}); + }); + + cli.switchGroup(previous_group); + // clang-format on +} + +void ConfigParser::addOptionsEvaluating(cli::CLIWrapper& cli) { + auto previous_group = cli.switchGroup("Evaluator options"); + + cli.add("--no-reload", + "Do not load existing model specified in --model arg"); + // @TODO: move options like vocabs and train-sets to a separate procedure as they are defined twice + cli.add>("--train-sets,-t", + "Paths to corpora to be scored: source target"); + cli.add("--output,-o", + "Path to output file, stdout by default", + "stdout"); + cli.add>("--vocabs,-v", + "Paths to vocabulary files have to correspond to --train-sets. " + "If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}. " + "If these files do not exists they are created"); + cli.add("--width", + "Floating point precision of metric outputs", + 4); + cli.add("--average", + "Report average of all sentence-level values. 
By default the average is appended as the last line. " + "Alternatively, we can provide `--average only` which supresses other values.", + "skip")->implicit_val("append"); + + addSuboptionsInputLength(cli); + addSuboptionsTSV(cli); + addSuboptionsDevices(cli); + addSuboptionsBatching(cli); + + cli.add("--fp16", + "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); + cli.add>("--precision", + "Mixed precision for inference, set parameter type in expression graph. Supported values: float32, float16", + {"float32"}); + + cli.add("--like", + "Set good defaults for supported metric types: comet-qe, comet, bleurt"); + + // Short-cut for Unbabel comet-qe metric + cli.alias("like", "comet-qe", [](YAML::Node& config) { + // Model options + config["train-sets"] = std::vector({"stdin"}); + config["tsv"] = true; + config["tsv-fields"] = 2; + config["input-types"] = std::vector({"sequence", "sequence"}); + config["max-length"] = 512; + config["max-length-crop"] = true; + config["mini-batch"] = 32; + config["maxi-batch"] = 100; + config["maxi-batch-sort"] = "src"; + config["workspace"] = -4000; + config["devices"] = std::vector({"all"}); + }); + + // Short-cut for Unbabel comet metric + cli.alias("like", "comet", [cli](YAML::Node& config) { + // Model options + config["train-sets"] = std::vector({"stdin"}); + config["tsv"] = true; + config["tsv-fields"] = 3; + config["input-types"] = std::vector({"sequence", "sequence", "sequence"}); + config["max-length"] = 512; + config["max-length-crop"] = true; + config["mini-batch"] = 32; + config["maxi-batch"] = 100; + config["maxi-batch-sort"] = "src"; + config["workspace"] = -4000; + config["devices"] = std::vector({"all"}); + }); + + // Short-cut for Google bleurt metric + cli.alias("like", "bleurt", [](YAML::Node& config) { + // Model options + config["train-sets"] = std::vector({"stdin"}); + config["tsv"] = true; + config["tsv-fields"] = 2; + config["input-types"] = std::vector({"sequence", "sequence"}); + config["max-length"] = 512; + config["max-length-crop"] = true; + config["mini-batch"] = 32; + config["maxi-batch"] = 100; + config["maxi-batch-sort"] = "src"; + config["workspace"] = -4000; + config["devices"] = std::vector({"all"}); + }); + cli.switchGroup(previous_group); // clang-format on } + void ConfigParser::addSuboptionsDevices(cli::CLIWrapper& cli) { // clang-format off cli.add>("--devices,-d", - "Specifies GPU ID(s) to use for training. Defaults to 0..num-devices-1", + "Specifies GPU ID(s) (e.g. '0 1 2 3' or 'all') to use for training. Defaults to GPU ID 0", {"0"}); - cli.add("--num-devices", - "Number of GPUs to use for this process. 
Defaults to length(devices) or 1"); #ifdef USE_NCCL if(mode_ == cli::mode::training) { cli.add("--no-nccl", @@ -1093,10 +1201,6 @@ Ptr ConfigParser::parseOptions(int argc, char** argv, bool doValidate) cli_.updateConfig(config, cli::OptionPriority::CommandLine, "A shortcut for STDIN failed."); } - if(doValidate) { - ConfigValidator(config_).validateOptions(mode_); - } - // remove extra config files from the config to avoid redundancy config_.remove("config"); @@ -1109,6 +1213,10 @@ Ptr ConfigParser::parseOptions(int argc, char** argv, bool doValidate) cli_.parseAliases(); } + if(doValidate) { // validate before options are dumped and we exit + ConfigValidator(config_, true).validateOptions(mode_); + } + bool minimal = (dumpMode == "minimal" || dumpMode == "expand"); std::cout << cli_.dumpConfig(minimal) << std::endl; exit(0); @@ -1186,6 +1294,10 @@ Ptr ConfigParser::parseOptions(int argc, char** argv, bool doValidate) #endif cli_.parseAliases(); + if(doValidate) { // validate the options after aliases are expanded + ConfigValidator(config_).validateOptions(mode_); + } + auto opts = New(); opts->merge(Config(*this).get()); return opts; diff --git a/src/common/config_parser.h b/src/common/config_parser.h index 18b6eccb7..617b86e5a 100644 --- a/src/common/config_parser.h +++ b/src/common/config_parser.h @@ -14,7 +14,7 @@ namespace marian { namespace cli { -enum struct mode { training, translation, scoring, server, embedding }; +enum struct mode { training, translation, scoring, server, embedding, evaluating }; } // namespace cli /** @@ -130,6 +130,7 @@ class ConfigParser { void addOptionsTranslation(cli::CLIWrapper&); void addOptionsScoring(cli::CLIWrapper&); void addOptionsEmbedding(cli::CLIWrapper&); + void addOptionsEvaluating(cli::CLIWrapper&); void addAliases(cli::CLIWrapper&); diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index 5563b240d..1b31b96a2 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -17,6 +17,10 @@ ConfigValidator::ConfigValidator(const YAML::Node& config) dumpConfigOnly_(config["dump-config"] && !config["dump-config"].as().empty() && config["dump-config"].as() != "false") {} +ConfigValidator::ConfigValidator(const YAML::Node& config, bool dumpConfigOnly) + : config_(config), + dumpConfigOnly_(dumpConfigOnly) {} + ConfigValidator::~ConfigValidator() {} void ConfigValidator::validateOptions(cli::mode mode) const { @@ -33,6 +37,10 @@ void ConfigValidator::validateOptions(cli::mode mode) const { validateOptionsParallelData(); validateOptionsScoring(); break; + case cli::mode::evaluating: + validateOptionsParallelData(); + validateOptionsScoring(); + break; case cli::mode::training: validateOptionsParallelData(); validateOptionsTraining(); @@ -49,9 +57,13 @@ void ConfigValidator::validateOptions(cli::mode mode) const { void ConfigValidator::validateOptionsTranslation() const { auto models = get>("models"); - auto configs = get>("config"); + bool no_configs = true; + if(has("config")) { + auto configs = get>("config"); + no_configs = configs.empty(); + } - ABORT_IF(models.empty() && configs.empty(), + ABORT_IF(models.empty() && no_configs, "You need to provide at least one model file or a config file"); #ifdef COMPILE_CPU @@ -195,8 +207,8 @@ void ConfigValidator::validateDevices(cli::mode /*mode*/) const { std::string help; // valid strings: '0', '0 1 2 3', '3 2 0 1' - pattern = "[0-9]+( *[0-9]+)*"; - help = "Supported formats: '0 1 2 3'"; + pattern = "([0-9]+|all)( *([0-9]+|all))*"; + help = "Supported 
formats: '0 1 2 3' or 'all'";
 
   ABORT_IF(!regex::regex_match(devices, pattern),
            "the argument '{}' for option '--devices' is invalid. {}",
diff --git a/src/common/config_validator.h b/src/common/config_validator.h
index 0e73a9e39..e5742194c 100644
--- a/src/common/config_validator.h
+++ b/src/common/config_validator.h
@@ -10,13 +10,14 @@ class ConfigValidator {
   const YAML::Node& config_;
 
   bool has(const std::string& key) const;
+
   template <typename T>
   T get(const std::string& key) const {
     return config_[key].as<T>();
   }
 
-  // The option --dump-config is used, so alleviate some constraints, e.g. we don't want to require
-  // --train-sets or --vocabs
+  // When --dump-config is used, alleviate some constraints, for example, do not
+  // require --train-sets or --vocabs
   bool dumpConfigOnly_{false};
 
   void validateOptionsTranslation() const;
@@ -29,6 +30,7 @@ class ConfigValidator {
 
 public:
   ConfigValidator(const YAML::Node& config);
+  ConfigValidator(const YAML::Node& config, bool dumpConfigOnly);
   virtual ~ConfigValidator();
 
   // Validate options according to the given mode. Abort on first validation error
diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp
index a429ae2f3..5fbfe636b 100644
--- a/src/data/corpus_base.cpp
+++ b/src/data/corpus_base.cpp
@@ -61,7 +61,8 @@ CorpusBase::CorpusBase(const std::vector<std::string>& paths,
       rightLeft_(options_->get<bool>("right-left")),
       prependZero_(options_->get<bool>("comet-prepend-zero", false)),
       tsv_(options_->get<bool>("tsv", false)),
-      tsvNumInputFields_(getNumberOfTSVInputFields(options)) {
+      tsvNumInputFields_(getNumberOfTSVInputFields(options)),
+      joinFields_(options_->get<bool>("input-join-fields", false)) {
   // TODO: support passing only one vocab file if we have fully-tied embeddings
   if(tsv_) {
     ABORT_IF(tsvNumInputFields_ != vocabs_.size(),
@@ -87,7 +88,8 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate, size_t seed)
       rightLeft_(options_->get<bool>("right-left")),
       prependZero_(options_->get<bool>("comet-prepend-zero", false)),
       tsv_(options_->get<bool>("tsv", false)),
-      tsvNumInputFields_(getNumberOfTSVInputFields(options)) {
+      tsvNumInputFields_(getNumberOfTSVInputFields(options)),
+      joinFields_(options_->get<bool>("input-join-fields", false)) {
   bool training = !translate;
 
   if(training)
@@ -426,8 +428,12 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line,
 
   auto inputTypes = options_->get<std::vector<std::string>>("input-types", {});  // empty list by default
 
-  if(prependZero_ && inputTypes[batchIndex] == "sequence")
-    words.insert(words.begin(), Word::fromWordIndex(0));
+  // This handles adding start symbols for COMET (<s>) and BERT/BLEURT ([CLS])
+  bool prepend = prependZero_ && (!joinFields_ || (joinFields_ && batchIndex == 0));
+  if(prepend && inputTypes[batchIndex] == "sequence") {
+    auto prependedWord = Word::fromWordIndex(0);
+    words.insert(words.begin(), prependedWord);
+  }
 
   if(maxLengthCrop_ && words.size() > maxLength_) {
     words.resize(maxLength_);
@@ -438,7 +444,12 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line,
   if(rightLeft_)
     std::reverse(words.begin(), words.end() - 1);
 
-  tup.push_back(words);
+  // if true, the numeric indices get joined with the previous sentence; <eos> acts as the separator here
+  // @TODO: make this cleaner.
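+  // Example (made-up token strings, for illustration): joining the fields
+  //   ["Hello", "world", "</s>"] and ["Hallo", "Welt", "</s>"]
+  // produces one sequence ["Hello", "world", "</s>", "Hallo", "Welt", "</s>"],
+  // where the first "</s>" is the separator mentioned above.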
+ if(joinFields_) + tup.appendToBack(words); + else + tup.pushBack(words); } void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 123250d97..7a03414b4 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -72,7 +72,22 @@ class SentenceTupleImpl { * * @param words A vector of word indices. */ - void push_back(const Words& words) { tuple_.push_back(words); } + void pushBack(const Words& words) { tuple_.push_back(words); } + + /** + * @brief Appends mroe words to the last sentence of the tuple. + * + * @param words A vector of word indices. + */ + void appendToBack(const Words& words) { + if(tuple_.empty()) { + tuple_.push_back(words); + } else { + for(auto& w : words) { + tuple_.back().push_back(w); + } + } + } /** * @brief The size of the tuple, e.g. two for parallel data with a source and @@ -644,6 +659,9 @@ class CorpusBase : public DatasetBase& vec) { if(binary_) { outStrm_->write((char*)vec.data(), vec.size() * sizeof(float)); } else { - *outStrm_ << std::fixed << std::setprecision(4); + *outStrm_ << std::fixed << std::setprecision(width_); for(auto v : vec) *outStrm_ << v << " "; *outStrm_ << std::endl; } } +void AveragingVectorCollector::WriteVector(const std::vector& vec) { + if(!onlyLast_) + VectorCollector::WriteVector(vec); + + if(sum_.size() < vec.size()) + sum_.resize(vec.size()); + for(size_t i = 0; i < vec.size(); ++i) + sum_[i] += vec[i]; + count_++; +} + +void AveragingVectorCollector::WriteAverage() { + std::lock_guard lock(mutex_); + auto avg = sum_; + for(auto& val : avg) + val /= (float)count_; + VectorCollector::WriteVector(avg); +} + +Ptr VectorCollector::Create(Ptr options) { + std::string average = options->get("average", "skip"); + std::string output = options->get("output"); + size_t width = options->get("width", DEFAULT_WIDTH); + + Ptr collector; + if(average == "skip") + collector = New(output, /*binary=*/false, width); + else if(average == "append") + collector = New(output, /*binary=*/false, width, /*onlyLast=*/false); + else if(average == "only") + collector = New(output, /*binary=*/false, width, /*onlyLast=*/true); + else + ABORT("Unknown configuration for VectorCollector"); + + return collector; +} + } // namespace marian diff --git a/src/embedder/vector_collector.h b/src/embedder/vector_collector.h index fc39ea6ec..3f1f91e0c 100644 --- a/src/embedder/vector_collector.h +++ b/src/embedder/vector_collector.h @@ -11,19 +11,24 @@ namespace marian { // This class manages multi-threaded writing of embedded vectors to stdout or an output file. // It will either output string versions of float vectors or binary equal length versions depending -// on its binary_ flag. +// on its binary flag. If binary=false, width can be used to set the number of decimal places. 
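+// For example, with binary=false and width=4 the vector {0.1234f, 0.5678f} is written
+// as the text line "0.1234 0.5678 "; with binary=true the raw floats are written instead.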
class VectorCollector { public: - VectorCollector(bool binary=false); - VectorCollector(std::string outFile, bool binary=false); + static const size_t DEFAULT_WIDTH = 4; + + VectorCollector(bool binary=false, size_t width=DEFAULT_WIDTH); + VectorCollector(std::string outFile, bool binary=false, size_t width=DEFAULT_WIDTH); virtual ~VectorCollector() {} virtual void Write(long id, const std::vector& vec); + static Ptr Create(Ptr options); + protected: long nextId_{0}; UPtr outStrm_; bool binary_; // output binary floating point vectors if set + size_t width_{DEFAULT_WIDTH}; std::mutex mutex_; @@ -32,4 +37,30 @@ class VectorCollector { virtual void WriteVector(const std::vector& vec); }; + +// Add a running summation of vector elements and outputs the average vector on destruction. +// Can also be configured to omit line-by-line results. +class AveragingVectorCollector : public VectorCollector { +private: + std::vector sum_; + size_t count_{0}; + bool onlyLast_{false}; + +protected: + virtual void WriteVector(const std::vector& vec) override; + +public: + AveragingVectorCollector(bool binary=false, size_t width=DEFAULT_WIDTH, bool onlyLast=false) + : VectorCollector(binary, width), onlyLast_(onlyLast) {} + + AveragingVectorCollector(std::string outFile, bool binary=false, size_t width=DEFAULT_WIDTH, bool onlyLast=false) + : VectorCollector(outFile, binary, width), onlyLast_(onlyLast) {} + + virtual ~AveragingVectorCollector() { + WriteAverage(); + } + + virtual void WriteAverage(); +}; + } // namespace marian diff --git a/src/evaluator/evaluator.h b/src/evaluator/evaluator.h new file mode 100644 index 000000000..31fe00e87 --- /dev/null +++ b/src/evaluator/evaluator.h @@ -0,0 +1,155 @@ +#pragma once + +#include "marian.h" + +#include "common/config.h" +#include "common/options.h" +#include "data/batch_generator.h" +#include "data/corpus.h" +#include "data/corpus_nbest.h" +#include "models/costs.h" +#include "models/model_task.h" +#include "embedder/vector_collector.h" +#include "training/scheduler.h" +#include "training/validator.h" + +namespace marian { + +using namespace data; + +/* + * The tool is used to calculate metric score for various neural metrics. + * @TODO: add the string-based matrics that we have already implemented like bleu and chrf. + */ +class Evaluator { +private: + Ptr model_; + +public: + Evaluator(Ptr options) + : model_(createModelFromOptions(options, models::usage::evaluating)) {} + + void load(Ptr graph, const std::vector& items) { + model_->load(graph, items); + } + + void load(Ptr graph, const std::string& fileName) { + model_->load(graph, fileName); + } + + Expr build(Ptr graph, Ptr batch) { + auto evaluator = std::dynamic_pointer_cast(model_); + ABORT_IF(!evaluator, "Could not cast to EncoderPooler"); + return evaluator->apply(graph, batch, /*clearGraph=*/true)[0]; + } +}; + +/* + * Actual Evaluate task. @TODO: this should be simplified in the future. 
+ */ +template +class Evaluate : public ModelTask { +private: + Ptr options_; + Ptr corpus_; + std::vector> graphs_; + std::vector> models_; + std::vector ioItems_; + +public: + Evaluate(Ptr options) : options_(options) { + options_ = options_->with("inference", true, + "shuffle", "none"); + + corpus_ = New(options_); + corpus_->prepare(); + + auto devices = Config::getDevices(options_); + + auto modelPath = options_->get("model"); + LOG(info, "Loading model from {}", modelPath); + ioItems_ = io::loadItems(modelPath); + + graphs_.resize(devices.size()); + models_.resize(devices.size()); + + ThreadPool pool(devices.size(), devices.size()); + for(size_t i = 0; i < devices.size(); ++i) { + pool.enqueue( + [=](size_t j) { + auto graph = New(true); + auto precison = options_->get>("precision", {"float32"}); + graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph + graph->setDevice(devices[j]); + graph->reserveWorkspaceMB(options_->get("workspace")); + + auto model = New(options_); + model->load(graph, ioItems_); + + models_[j] = model; + graphs_[j] = graph; + }, + i); + } + } + + void run() override { + LOG(info, "Evaluating"); + timer::Timer timer; + + auto batchGenerator = New>(corpus_, options_); + batchGenerator->prepare(); + + Ptr output = VectorCollector::Create(options_); + + size_t batchId = 0; + { + ThreadPool pool(graphs_.size(), graphs_.size()); + + for(auto batch : *batchGenerator) { + auto task = [=](size_t id) { + thread_local Ptr graph; + thread_local Ptr builder; + + if(!graph) { + graph = graphs_[id % graphs_.size()]; + builder = models_[id % graphs_.size()]; + } + + auto scores = builder->build(graph, batch); + graph->forward(); + + // handle copying from fp32 or fp16 scores correctly. + std::vector sentVectors; + if(scores->value_type() == Type::float32) { + scores->val()->get(sentVectors); + } else if (scores->value_type() == Type::float16) { + std::vector sentVectors16; + scores->val()->get(sentVectors16); + sentVectors.reserve(sentVectors16.size()); + for(auto& v: sentVectors16) + sentVectors.push_back(v); + } else { + ABORT("Unknown value type {}", scores->value_type()); + } + + // collect embedding vector per sentence. + // if we compute similarities this is only one similarity per sentence pair. 
+ for(size_t i = 0; i < batch->size(); ++i) { + auto numScores = scores->shape()[-1]; + auto beg = i * numScores; + auto end = (i + 1) * numScores; + std::vector sentVector(sentVectors.begin() + beg, sentVectors.begin() + end); + output->Write((long)batch->getSentenceIds()[i], sentVector); + } + }; + + pool.enqueue(task, batchId++); + } + } + LOG(info, "Total time: {:.5f}s wall", timer.elapsed()); + } + +}; + +} // namespace marian diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index c928e8ce0..0ec6f7e67 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -582,10 +582,14 @@ Expr bdot_legacy(Expr a, Expr b, bool transA, bool transB, float scale) { Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { // general version, MKL, CBlas or CUDA + std::vector nodes = { a, b, bias }; - int rows = a->shape().elements() / a->shape()[-1]; - Expr ones = a->graph()->ones({ rows, 1 }); - std::vector nodes = { a, b, bias, ones }; + auto graph = a->graph(); + if(!graph->isInference()) { + int rows = a->shape().elements() / a->shape()[-1]; + Expr ones = a->graph()->ones({ rows, 1 }, bias->value_type()); + nodes.push_back(ones); + } return Expression(nodes, transA, transB, scale); } diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 2c997d577..d35ca6fff 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -317,6 +317,8 @@ class AffineNodeOp : public NaryNodeOp { if(!isParameter(child(2)) && computeTypeC == Type::float16) computeTypeC = Type::float32; + ABORT_IF(children().size() != 4, "Did we lose the column of ones required for backprob of bias??"); + // We reduce bias gradients with a matrix multiply if(!transA_ && transB_) return { diff --git a/src/layers/embedding.cpp b/src/layers/embedding.cpp index 85c14f51b..93c6d9b33 100644 --- a/src/layers/embedding.cpp +++ b/src/layers/embedding.cpp @@ -191,8 +191,7 @@ Expr Embedding::applyIndices(const std::vector& embIdx, const Shape& // clang-format on if(options_->hasAndNotEmpty("embedding-vectors")) { auto embFiles = opt>("embedding-vectors"); - options->set( - "embFile", embFiles[batchIndex_], "normalization", opt("embedding-normalization")); + options->set("embFile", embFiles[batchIndex_], "normalization", opt("embedding-normalization")); } return New(graph_, options); } diff --git a/src/layers_new/attention.h b/src/layers_new/attention.h index 035e6c51d..4f4838e48 100644 --- a/src/layers_new/attention.h +++ b/src/layers_new/attention.h @@ -178,7 +178,8 @@ static Ptr attentionFromOptions(Ptr graph, Ptr< // in the future we might add SingleHead or Additive or LSH-based as in Reformer if(selfAttentionType == "default") { int numHeads = options->get("transformer-heads"); - int modelDim = options->get("dim-emb"); + int modelDim = options->get("transformer-dim-model", options->get("dim-emb")); + float attentionDropoutProbability = options->get("transformer-dropout-attention", 0.f); return New>(graph, numHeads, modelDim, modelDim, attentionDropoutProbability); diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index 51f2ef4e3..278758a96 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -130,10 +130,18 @@ struct Linear : public Layer, public IUnaryLayer { registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); } + Type outputType = x->value_type(); if(useBias) - return marian::affine(x, weight, bias, /*transA=*/false, 
/*transB=*/transposed); + return marian::affine(x, + marian::cast(weight, outputType), + marian::cast(bias, outputType), + /*transA=*/false, + /*transB=*/transposed); else - return marian::dot(x, weight, /*transA=*/false, /*transB=*/transposed); + return marian::dot(x, + marian::cast(weight, outputType), + /*transA=*/false, + /*transB=*/transposed); } }; diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index 8776820ef..e808694de 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -126,7 +126,7 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - int modelDim = opt("dim-emb"); + int modelDim = opt("transformer-dim-model", opt("dim-emb")); int ffnDim = opt("transformer-dim-ffn"); if(isDecoder && opt("transformer-decoder-dim-ffn") != 0) ffnDim = opt("transformer-decoder-dim-ffn"); @@ -370,7 +370,8 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { registerLayer(preprocessor); // @TODO: factory to support different attention flavors? - rnn = New>(graph, opt("dim-emb"), opt("transformer-rnn-projection", false)); + int modelDim = opt("transformer-dim-model", opt("dim-emb")); + rnn = New>(graph, modelDim, opt("transformer-rnn-projection", false)); registerLayer(rnn); postprocessor = New( diff --git a/src/models/bleurt.h b/src/models/bleurt.h new file mode 100644 index 000000000..131b675a7 --- /dev/null +++ b/src/models/bleurt.h @@ -0,0 +1,217 @@ +#pragma once + +#include "layers_new/transformer.h" + +#include "models/encoder.h" +#include "layers/constructors.h" + +namespace marian { +namespace models { + +class BleurtTypeEmbeddingLayer : public nn::LayerWithOptions { +public: + Expr embeddings; + + BleurtTypeEmbeddingLayer(Ptr graph, Ptr options) + : LayerWithOptions(graph, options) {} + + virtual ~BleurtTypeEmbeddingLayer() = default; + + Expr apply(Ptr subBatch) const { + int dimEmb = opt("dim-emb"); + int dimTypes = opt("bert-type-vocab-size", 2); + + // Embedding layer initialization should depend only on embedding size, hence fanIn=false + auto initFunc = inits::glorotUniform(/*fanIn=*/false, /*fanOut=*/true); // -> embedding vectors have roughly unit length + registerParameterLazy(embeddings, Shape({dimTypes, dimEmb}), initFunc); + + const auto& words = subBatch->data(); + const auto vocab = subBatch->vocab(); + + // Get word id of special symbols + Word sepId = vocab->getEosId(); + + int dimBatch = (int)subBatch->batchSize(); + int dimTime = (int)subBatch->batchWidth(); + const size_t maxSentPos = dimTypes; + + // create indices for BERT sentence embeddings A and B + std::vector sentenceIndices(dimBatch * dimTime, 0); // each word is either in sentence A or B + std::vector sentPos(dimBatch, 0); // initialize each batch entry with being A [0] + for(int i = 0; i < dimTime; ++i) { // advance word-wise + for(int j = 0; j < dimBatch; ++j) { // scan batch-wise + int k = i * dimBatch + j; + sentenceIndices[k] = sentPos[j]; // set to current sentence position for batch entry, max position 1. 
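+        // Example: for a joined pair "ref-tokens </s> hyp-tokens </s>" every token up to
+        // and including the first separator (the vocabulary's EOS symbol) gets type id 0
+        // (segment A) and all following tokens get type id 1 (segment B), analogous to
+        // BERT's segment embeddings.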
+ if(words[k] == sepId && sentPos[j] < maxSentPos) { // if current word is a separator and not beyond range + sentPos[j]++; // then increase sentence position for batch entry (to B [1]) + } + } + } + + return reshape(rows(embeddings, sentenceIndices), {dimTime, dimBatch, dimEmb}); + } +}; + +struct BleurtEncoder final : public nn::TransformerEncoder { + Ptr eProj; + + BleurtEncoder(Ptr graph, + Ptr options) + : TransformerEncoder(graph, options) { + + eProj = New(graph, opt("transformer-dim-model")); + registerLayer(eProj); + + for(auto norm : allLayers()) + norm->eps = 1e-12f; // hard-coded as in original BLEURT model + } + + Expr apply(Expr input, Expr mask) const override { + auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] + + mask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] + auto binMask = mask; + mask = marian::nn::transposedLogMask(mask, opt("transformer-heads")); + + // apply positional embeddings to contextual input + output = positionEmbedding->apply(output); + + // apply dropout or layer-norm to embeddings if required + output = preprocessor->apply(output); + + // scale from 256 to 1152 + output = eProj->apply(output); + + // traverse the layers, use the same mask for each + for(auto layer : *layers) + output = layer->apply(output, mask); + + return output; + } +}; + +// Wrapper for backwards compatibility that uses current encoder/decoder framework +struct BleurtBatchEncoder final : public nn::LayerWithOptions, + public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings + public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? + Ptr typeEmbedding; + Ptr encoder; + + BleurtBatchEncoder(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options), + EncoderBase(graph, options) + { + typeEmbedding = New(graph, options); + registerLayer(typeEmbedding); + + encoder = New(graph, options); + registerLayer(encoder); + } + + // @TODO: subBatch should be of type Expr + virtual std::tuple apply(Ptr subBatch) const override { + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + const auto& [batchEmbeddings, batchMask] = embeddingLayer->apply(subBatch); + +#if 1 + auto typeEmbeddings = typeEmbedding->apply(subBatch); + auto embeddings = batchEmbeddings + typeEmbeddings; +#else + auto embeddings = batchEmbeddings; +#endif + + auto batchContext = encoder->apply(embeddings, batchMask); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return std::make_tuple(batchContext, batchMask); + } + + virtual Expr apply(const Words& words, const Shape& shape) const override final { + return applyIndices(toWordIndexVector(words), shape); + } + + // alternative from indices directly + virtual Expr applyIndices(const std::vector& wordIndices, const Shape& shape) const override final { + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + Expr batchEmbedding = embeddingLayer->applyIndices(wordIndices, shape); + auto batchContext = encoder->apply(batchEmbedding, /*mask=*/nullptr); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return batchContext; + } + + // @TODO: currently here for backwards compat, should be replaced with apply() + virtual Ptr build(Ptr graph, + Ptr batch) override { +#if 1 + // @TODO: this should be removed, currently hack to init graph. 
Should happen in graph groups and constructors + EncoderBase::graph_ = graph; + setGraph(graph); + // This makes sure that the graph passed into the model during construction and the one passed in now for evaluation are identical. + // A good check to have for catching weird situations early. + ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + // @TODO: this needs to convert to a BERT-batch + + const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]); + return New(batchEmbedding, batchMask, batch); + } + + virtual void clear() override { + Layer::clear(); + } +}; + +class BleurtPooler final : public nn::LayerWithOptions, + public PoolerBase { +private: + Ptr layers; + std::mt19937 rng{(uint32_t)Config::seed}; + +public: + BleurtPooler(Ptr graph, Ptr options) + : LayerWithOptions(graph, options), + PoolerBase(graph, options) { + + float dropoutProb = 0.f; + layers = New( + graph, + New(graph, LayerWithOptions::opt("transformer-dim-model")), // @TODO: get rid of ambiguity + New(graph), + New(graph, dropoutProb), + New(graph, 1) + ); + + registerLayer(layers); + } + + std::vector apply(Ptr graph, Ptr batch, const std::vector>& encoderStates) override { +#if 1 + // @TODO: this should be removed, currently hack to init graph. Should happen in graph groups and constructors + PoolerBase::graph_ = graph; + setGraph(graph); + // This makes sure that the graph passed into the model during construction and the one passed in now for evaluation are identical. + // A good check to have for catching weird situations early. + ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + auto modelType = LayerWithOptions::opt("type"); + + auto emb = slice(encoderStates[0]->getContext(), -2, 0); + emb = marian::cast(emb, Type::float32); + + Expr output; + if(LayerWithOptions::opt("usage") == (int)models::usage::evaluating) { + output = layers->apply(emb); + int dimBatch = output->shape()[-3]; + output = reshape(output, {dimBatch, 1, 1}); + return { output }; + } else { + ABORT("Usage other than evaluating not implemented"); + } + } + + void clear() override {} +}; + +} // namespace models +} // namespace marian + diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h index cca18cac7..658d754e1 100644 --- a/src/models/comet_qe.h +++ b/src/models/comet_qe.h @@ -26,9 +26,6 @@ struct CometEncoder final : public nn::TransformerEncoder { // apply positional embeddings to contextual input output = positionEmbedding->apply(output); - // handle for skip connection at top - auto prevOutput = output; - // apply dropout or layer-norm to embeddings if required output = preprocessor->apply(output); @@ -142,14 +139,34 @@ struct CometBatchEncoder final : public nn::LayerWithOptions, } }; -class CometQEPooler final : public nn::LayerWithOptions, - public PoolerBase { +// Dummy pooler that only returns the encoder context +class CometEmbeddingPooler final : public nn::LayerWithOptions, + public PoolerBase { +public: + CometEmbeddingPooler(Ptr graph, Ptr options) + : LayerWithOptions(graph, options), + PoolerBase(graph, options) {} + + std::vector apply(Ptr graph, Ptr batch, const std::vector>& encoderStates) override { + auto usage = (models::usage)LayerWithOptions::opt("usage"); + ABORT_IF(usage != models::usage::embedding, "This pooler should only be used for generating embeddings??"); + ABORT_IF(encoderStates.size() != 1, "Size of encoderStates {} != 1", encoderStates.size()); + + return { encoderStates[0]->getContext() }; + } + + void
clear() override {} +}; + +// Actual COMET-like pooler, works for COMET-QE and COMET models (prior to WMT22) +class CometMetricPooler final : public nn::LayerWithOptions, + public PoolerBase { private: Ptr layers; std::mt19937 rng{(uint32_t)Config::seed}; public: - CometQEPooler(Ptr graph, Ptr options) + CometMetricPooler(Ptr graph, Ptr options) : LayerWithOptions(graph, options), PoolerBase(graph, options) { @@ -221,49 +238,80 @@ class CometQEPooler final : public nn::LayerWithOptions, return {xMixup, yMixup}; }; - ABORT_IF(encoderStates.size() != 2, "Pooler expects exactly two encoder state"); - - auto src = encoderStates[0]->getContext(); - auto mt = encoderStates[1]->getContext(); + auto usage = (models::usage)LayerWithOptions::opt("usage"); + ABORT_IF(usage == models::usage::embedding, "Wrong pooler for embedding??"); + + auto modelType = LayerWithOptions::opt("type"); + ABORT_IF(modelType == "comet-qe" && encoderStates.size() != 2, "Pooler expects exactly two encoder states for comet-qe"); + ABORT_IF(modelType == "comet" && encoderStates.size() != 3, "Pooler expects exactly three encoder states for comet"); - auto diff = abs(mt - src); - auto prod = mt * src; - - Expr output; - if(LayerWithOptions::opt("usage") == (int)models::usage::embedding) { - auto embFwd = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] - auto embBwd = concatenate({src, mt, prod, diff}, /*axis=*/-1); // [batch, 1, model] - auto emb = concatenate({embFwd, embBwd}, /*axis=*/-2); - output = layers->apply(emb); - - int dimBatch = output->shape()[-3]; - output = reshape(output, {dimBatch, 1, 2}); - return { output }; - } else { - auto emb = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] + if(modelType == "comet-qe") { + auto src = encoderStates[0]->getContext(); + auto mt = encoderStates[1]->getContext(); - auto softLabelsWords = batch->front()->data(); - auto classVocab = batch->front()->vocab(); + auto diff = abs(mt - src); + auto prod = mt * src; + + Expr output; + if(usage == models::usage::evaluating) { + auto embFwd = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] + auto embBwd = concatenate({src, mt, prod, diff}, /*axis=*/-1); // [batch, 1, model] + auto emb = concatenate({embFwd, embBwd}, /*axis=*/-2); + output = layers->apply(emb); + + int dimBatch = output->shape()[-3]; + output = reshape(output, {dimBatch, 1, 2}); + return { output }; + } else { + auto emb = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] + + auto softLabelsWords = batch->front()->data(); + auto classVocab = batch->front()->vocab(); + + int dimBatch = (int)softLabelsWords.size(); + std::vector softLabels; + for(auto w : softLabelsWords) { + // @TODO: this is a super-ugly hack to get regression values + float score = w != Word::NONE ? 
std::stof((*classVocab)[w]) : 0.f; + softLabels.push_back(score); + } + auto labels = graph->constant({dimBatch, 1, 1}, inits::fromVector(softLabels), Type::float32); + + if(getMode() == Mode::train) { + float mixupAlpha = LayerWithOptions::opt("comet-mixup", 0.f); + bool mixupReg = LayerWithOptions::opt("comet-mixup-reg", false); + auto xy = mixup(emb, labels, mixupAlpha, mixupReg); + emb = get<0>(xy); + labels = get<1>(xy); + } + output = marian::cast(layers->apply(emb), Type::float32); + return { output, labels }; + } + } else if(modelType == "comet") { + auto src = encoderStates[0]->getContext(); + auto mt = encoderStates[1]->getContext(); + auto ref = encoderStates[2]->getContext(); - int dimBatch = (int)softLabelsWords.size(); - std::vector softLabels; - for(auto w : softLabelsWords) { - // @TODO: this is a super-ugly hack to get regression values - float score = w != Word::NONE ? std::stof((*classVocab)[w]) : 0.f; - softLabels.push_back(score); - } - auto labels = graph->constant({dimBatch, 1, 1}, inits::fromVector(softLabels), Type::float32); - - if(getMode() == Mode::train) { - float mixupAlpha = LayerWithOptions::opt("comet-mixup", 0.f); - bool mixupReg = LayerWithOptions::opt("comet-mixup-reg", false); - auto xy = mixup(emb, labels, mixupAlpha, mixupReg); - emb = get<0>(xy); - labels = get<1>(xy); + auto diffRef = abs(mt - ref); + auto prodRef = mt * ref; + + auto diffSrc = abs(mt - src); + auto prodSrc = mt * src; + + Expr output; + if(usage == models::usage::evaluating) { + auto emb = concatenate({mt, ref, prodRef, diffRef, prodSrc, diffSrc}, /*axis=*/-1); // [batch, 1, model] + output = layers->apply(emb); + int dimBatch = output->shape()[-3]; + output = reshape(output, {dimBatch, 1, 1}); + return { output }; + } else { + // Currently no training for COMET with reference @TODO: add training + ABORT("Usage other than 'evaluating' not implemented"); } - output = marian::cast(layers->apply(emb), Type::float32); - return { output, labels }; - } + } else { + ABORT("Unknown model type {}", modelType); + } } void clear() override {} diff --git a/src/models/model_base.h b/src/models/model_base.h index 6a327968a..32705bbe7 100644 --- a/src/models/model_base.h +++ b/src/models/model_base.h @@ -9,8 +9,16 @@ namespace marian { namespace models { -enum struct usage { raw, training, scoring, translation, embedding }; -} +enum struct usage { + raw, + training, + scoring, + translation, + embedding, // used for laser and other models to produce embedding vectors + evaluating // evaluating is a special mode for neural metrics, different from (probabilistic) scoring +}; + +} // namespace models } // namespace marian YAML_REGISTER_TYPE(marian::models::usage, int) diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 40ba122a6..707a81ca9 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -17,6 +17,7 @@ #include "models/transformer_new.h" #include "models/comet_qe.h" +#include "models/bleurt.h" #ifdef CUDNN #include "models/char_s2s.h" @@ -133,40 +134,89 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti Ptr graph = nullptr; // graph unknown at this stage // clang-format off + if(type == "comet-qe" || type == "comet") { + if(type == "comet") { + ABORT_IF(use == usage::training, "Usage {} is not supported for model of type {}", (int)use, type); + ABORT_IF(use == usage::scoring, "Usage {} is not supported for model of type {}", (int)use, type); + } + + auto inputTypes = options->get>("input-types"); + ABORT_IF(inputTypes.empty(), 
+ "Required option --input-types for COMET-QE not set. " + "For inference that should be --input-types sequence sequence. " + "For training set --input-types class sequence sequence"); + + int shift = 0; + if(inputTypes[0] == "class") + shift = 1; + + auto newOptions = options->with("usage", use); + auto res = New(newOptions); + + size_t numEncoders = 0; + bool addMetricPooler = false; + bool addEmbeddingPooler = false; + + switch(use) { + case usage::embedding: numEncoders = 1; addEmbeddingPooler = true; break; + case usage::evaluating: + case usage::scoring: + case usage::training: numEncoders = (type == "comet-qe") ? 2 : 3; addMetricPooler = true; break; + default: ABORT("Usage {} is not supported for model of type {}", (int)use, type); + } + + for(size_t i = 0; i < numEncoders; i++) { + auto enc = New(graph, newOptions->with("type", "transformer", "index", i + shift)); + enc->setName("CometEncoder"); // parameters will be shared + res->push_back(enc); + } + + if(addEmbeddingPooler) { + auto pooler = New(graph, newOptions); + pooler->setName("CometEmbeddingPooler"); + res->push_back(pooler); + } + + if(addMetricPooler) { + auto pooler = New(graph, newOptions); + pooler->setName("CometQEPooler"); // @TODO: change name for different models + res->push_back(pooler); + } + + return res; + } + + if(type == "bleurt") { + ABORT_IF(use != usage::evaluating, "Usage other than 'evaluating' is not supported for model of type {}", type); + + auto newOptions = options->with("usage", use); + auto res = New(newOptions); + + auto inputTypes = options->get>("input-types"); + ABORT_IF(inputTypes.empty(), + "Required option --input-types for BLEURT not set. " + "For inference that should be --input-types sequence. " + "For training set --input-types class sequence"); + + int shift = 0; + if(inputTypes[0] == "class") + shift = 1; + + auto enc = New(graph, newOptions->with("type", "transformer", "index", 0 + shift)); + enc->setName("BleurtEncoder"); + res->push_back(enc); + + auto pooler = New(graph, newOptions); + pooler->setName("BleurtPooler"); + res->push_back(pooler); + return res; + } + bool trainEmbedderRank = options->hasAndNotEmpty("train-embedder-rank"); if(use == usage::embedding || trainEmbedderRank) { // hijacking an EncoderDecoder model for embedding only - auto dimVocabs = options->get>("dim-vocabs"); size_t fields = trainEmbedderRank ? dimVocabs.size() : 0; int dimVocab = dimVocabs[0]; - - if(type == "comet-qe") { - auto newOptions = options->with("usage", use); - auto res = New(newOptions); - - auto inputTypes = options->get>("input-types"); - ABORT_IF(inputTypes.empty(), - "Required option --input-types for COMET-QE not set. " - "For inference that should be --input-types sequence sequence. 
" - "For training set --input-types class sequence sequence"); - - int shift = 0; - if(inputTypes[0] == "class") - shift = 1; - - auto enc1 = New(graph, newOptions->with("type", "transformer", "index", 0 + shift)); - enc1->setName("CometEncoder"); - res->push_back(enc1); - - auto enc2 = New(graph, newOptions->with("type", "transformer", "index", 1 + shift)); - enc2->setName("CometEncoder"); - res->push_back(enc2); - - auto pooler = New(graph, newOptions); - pooler->setName("CometQEPooler"); - res->push_back(pooler); - return res; - } Ptr newOptions; if(options->get("compute-similarity", false)) { @@ -207,28 +257,6 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti return res; } - if(use == usage::training || use == usage::scoring) { - if(type == "comet-qe") { - auto newOptions = options->with("usage", use); - auto res = New(newOptions); - - // For training, first rank in batch is class! - - auto enc1 = New(graph, newOptions->with("type", "transformer", "index", 1)); - enc1->setName("CometEncoder"); - res->push_back(enc1); - - auto enc2 = New(graph, newOptions->with("type", "transformer", "index", 2)); - enc2->setName("CometEncoder"); - res->push_back(enc2); - - auto pooler = New(graph, newOptions); - pooler->setName("CometQEPooler"); - res->push_back(pooler); - return res; - } - } - if(type == "s2s" || type == "amun" || type == "nematus") { return models::encoder_decoder(options->with( "usage", use, @@ -462,10 +490,10 @@ Ptr createModelFromOptions(Ptr options, usage use) { else ABORT("'usage' parameter 'translation' cannot be applied to model type: {}", type); } - else if (use == usage::raw || use == usage::embedding) + else if (use == usage::raw || use == usage::embedding || use == usage::evaluating) return baseModel; else - ABORT("'Usage' parameter must be 'translation' or 'raw'"); + ABORT("'Usage' parameter must be 'translation' or 'raw'"); // I am actually not sure what this is supposed to mean any more. } Ptr createCriterionFunctionFromOptions(Ptr options, usage use) { diff --git a/src/tensors/gpu/gpu_info.cpp b/src/tensors/gpu/gpu_info.cpp new file mode 100644 index 000000000..f6a59465f --- /dev/null +++ b/src/tensors/gpu/gpu_info.cpp @@ -0,0 +1,19 @@ +#include "common/definitions.h" + +#if CUDA_FOUND +#include "tensors/gpu/cuda_helpers.h" +#endif + +namespace marian { +namespace gpu { + size_t availableDevices() { +#if CUDA_FOUND + int deviceCount; + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + return (size_t)deviceCount; +#else + return 0; +#endif + } +} +} \ No newline at end of file From d1d10a46bd34b5e5552b8c0ac91313cf0f829dcb Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 1 Jul 2023 08:37:29 +0000 Subject: [PATCH 15/26] Merged PR 30079: Fixes and extends unit test for layer norm Fixes and extends unit test for layer norm. Previous version had a weird usage of Glorot Uniform. --- CHANGELOG.md | 1 + VERSION | 2 +- src/tests/units/operator_tests.cpp | 74 ++++++++++++++++++++++-------- 3 files changed, 57 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a436308c7..0fb1dfd2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. ### Fixed +- Fixed unit test for LayerNorm - Only collect batch statistics during mini-batch-fit up to actual max-length. - Implemented fully correct version of GELU instead of using bad approximatin via Swish. 
- Handle copying from fp32 or fp16 embeddings in embedder mode correctly. diff --git a/VERSION b/VERSION index f15731572..893904681 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.6 +v1.12.7 diff --git a/src/tests/units/operator_tests.cpp b/src/tests/units/operator_tests.cpp index 236823fe4..34a0dd6f5 100644 --- a/src/tests/units/operator_tests.cpp +++ b/src/tests/units/operator_tests.cpp @@ -271,33 +271,69 @@ void tests(DeviceType device, Type floatType = Type::float32) { graph->clear(); values.clear(); -#ifdef CUDA_FOUND - std::vector vLn({ - -1.1962, 1.43061, 0.380288, -0.614697, 0.816638, 0.622649, - -1.69679, 0.257504, -1.12563, -0.151387, 1.61181, -0.334796, - 1.07207, -0.622614, 0.862014, -1.31147 - }); -#else - std::vector vLn({ - -1.49821, -0.152206, 0.394932, 1.25548, -1.51701, -0.28032, - 0.9483, 0.849025, 0.855183, 1.11657, -0.788354, -1.1834, - -0.85939, -1.13109, 0.972076, 1.01841 - }); -#endif + std::vector init = { + 2.88794374, 4.67853451, 3.96257305, 3.28433037, + 0.37778997, 0.67662024, 4.24959183, 1.23910618, + 0.68929380, 2.00369596, 4.38251686, 1.75624943, + 4.96126175, 3.01947117, 4.72057724, 2.23017120 + }; + + auto a1 = graph->param("test1", {2, 2, 4}, inits::fromVector(init)); + auto a2 = graph->param("test2", {2, 2, 4}, inits::fromVector(init)); - auto a = graph->constant({2, 2, 4}, inits::glorotUniform()); - auto gamma = graph->param("gamma", {1, 4}, inits::ones()); - auto beta = graph->param("beta", {1, 4}, inits::zeros()); - auto ln = layerNorm(a, gamma, beta); + std::vector gammaVec({0.1f, -0.2f, 0.3f, -0.4f}); + std::vector betaVec({-0.1f, 0.2f, -0.3f, 0.4f}); + + auto gamma1 = graph->param("gamma1", {4}, inits::fromVector(gammaVec)); + auto beta1 = graph->param("beta1", {4}, inits::fromVector(betaVec)); + + auto gamma2 = graph->param("gamma2", {4}, inits::fromVector(gammaVec)); + auto beta2 = graph->param("beta2", {4}, inits::fromVector(betaVec)); + + // layernorm via special operator + auto ln = layerNorm(a1, gamma1, beta1, 1e-5f); + + // layernorm via elementary operators + auto num = a2 - mean(a2, /*axis=*/-1); + auto den = sqrt(mean(square(num), /*axis=*/-1) + 1e-5f); + auto ln2 = gamma2 * (num / den) + beta2; + + auto top = sum(flatten(ln + ln2)); graph->forward(); + graph->backward(); CHECK(ln->shape() == Shape({2, 2, 4})); + std::vector values2; + + // compare values of ln and ln2 to make sure forward computation is correct ln->val()->get(values); + ln2->val()->get(values2); + CHECK( std::equal(values.begin(), values.end(), - vLn.begin(), floatApprox) ); + values2.begin(), floatApprox2) ); + // compare adjoints of a1 and a2 (parameters) to make sure gradient computation is correct + a1->grad()->get(values); + a2->grad()->get(values2); + + CHECK( std::equal(values.begin(), values.end(), + values2.begin(), floatApprox2) ); + + // compare adjoints of gamma1 and gamma2 (parameters) to make sure gradient computation is correct + gamma1->grad()->get(values); + gamma2->grad()->get(values2); + + CHECK( std::equal(values.begin(), values.end(), + values2.begin(), floatApprox2) ); + + // compare adjoints of beta1 and beta2 (parameters) to make sure gradient computation is correct + beta1->grad()->get(values); + beta2->grad()->get(values2); + + CHECK( std::equal(values.begin(), values.end(), + values2.begin(), floatApprox2) ); } SECTION("RMS normalization") { @@ -313,7 +349,7 @@ void tests(DeviceType device, Type floatType = Type::float32) { auto a1 = graph->param("test1", {2, 2, 4}, inits::fromVector(init)); auto a2 = graph->param("test2", {2, 2, 4},
inits::fromVector(init)); - auto gamma = graph->param("gamma", {1, 4}, inits::ones()); + auto gamma = graph->param("gamma", {4}, inits::ones()); auto rms = rmsNorm(a1, gamma, nullptr, 1e-5f); auto rms2 = gamma * (a2 / sqrt(mean(a2 * a2, /*axis=*/-1) + 1e-5f)); From bd63ccec4ddb919dbbdb9f80f76165d663fcd20d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 3 Jul 2023 04:38:40 +0000 Subject: [PATCH 16/26] Merged PR 28078: Various small improvements Various small improvements, missing operators, missing gradient computations etc. The two most useful ones are probably: * Working backward step (gradient) for scatter operation * Possibility to use LayerNorm and RMSNorm without scale and bias vectors (especially in new layer framework) --- src/common/config_parser.cpp | 5 + src/common/hash.h | 20 ++++ src/common/shape.h | 41 ++++---- src/graph/expression_graph.cpp | 2 +- src/graph/expression_operators.cpp | 69 +++++++++++--- src/graph/expression_operators.h | 37 +++++--- src/graph/node_operators_binary.h | 55 +++++++++-- src/graph/node_operators_tuple.h | 71 +++++++++++++- src/graph/node_operators_unary.h | 17 ++-- src/layers/embedding.cpp | 3 +- src/layers/embedding.h | 2 +- src/layers/generic.h | 4 +- src/layers_new/embeddings.h | 2 +- src/layers_new/neuralnet.h | 136 ++++++++++++++------------- src/layers_new/rnn.h | 2 +- src/layers_new/transformer.h | 22 ++++- src/models/encoder_decoder.cpp | 3 + src/models/transformer.h | 9 +- src/tensors/cpu/tensor_operators.cpp | 9 +- src/tensors/gpu/add.inc | 3 +- src/tensors/gpu/add_all.inc | 4 +- src/tensors/gpu/element.inc | 5 + src/tensors/gpu/tensor_operators.cu | 88 +++++++++-------- src/tensors/tensor_operators.h | 24 ++++- src/tests/units/operator_tests.cpp | 13 ++- src/training/graph_group.cpp | 2 +- 26 files changed, 459 insertions(+), 189 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 0d8021bf1..3b8d50edf 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -320,6 +320,11 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--transformer-depth-scaling", "Scale down weight initialization in transformer layers by 1 / sqrt(depth)"); + cli.add("--transformer-no-bias", + "Don't use any bias vectors in linear layers"); + cli.add("--transformer-no-affine", + "Don't use any scale or bias vectors in layer norm"); + cli.add("--bert-mask-symbol", "Masking symbol for BERT masked-LM training", "[MASK]"); cli.add("--bert-sep-symbol", "Sentence separator symbol for BERT next sentence prediction training", "[SEP]"); cli.add("--bert-class-symbol", "Class symbol BERT classifier training", "[CLS]"); diff --git a/src/common/hash.h b/src/common/hash.h index c2df2a63e..a05ffcfbc 100644 --- a/src/common/hash.h +++ b/src/common/hash.h @@ -24,5 +24,25 @@ inline HashType hashMem(const T* beg, size_t len, HashType seed = 0) { return seed; } +/** + * Base case for template recursion below (an empty argument list hashes to 0) + */ +template +inline HashType hashArgs() { + return 0; +} + +/** + * Hash an arbitrary number of arguments of arbitrary type via template recursion + */ +template +inline HashType hashArgs(T arg, Args...
args) { + // Hash arguments without first arg + HashType seed = hashArgs(args...); + // Hash first arg and combine with the hash above + hash_combine(seed, arg); + return seed; +} + } } diff --git a/src/common/shape.h b/src/common/shape.h index 270b35376..ad2be866f 100644 --- a/src/common/shape.h +++ b/src/common/shape.h @@ -12,28 +12,20 @@ namespace marian { -class ShapeSizeException : public std::exception { -private: - char* message_; - +/** + * This exception gets thrown when the requested shape cannot be allocated due to numeric capacity limitations. +*/ +class ShapeSizeException : public std::runtime_error { public: - ShapeSizeException(size_t available, size_t asked) { - std::string mstr = "Expanded shape size " + std::to_string(asked) - + " exceeds numeric capcacity " + std::to_string(available); - - message_ = new char[mstr.size() + 1]; - std::copy(mstr.begin(), mstr.end(), message_); - message_[mstr.size()] = 0; - } - - ~ShapeSizeException() { delete[] message_; } - - virtual const char* what() const noexcept override { return message_; } + ShapeSizeException(size_t available, size_t asked) + : std::runtime_error(fmt::format("Expanded shape size {} exceeds numeric capacity {}", asked, available)) + {} }; - -struct Slice // Python-like slice/index descriptor -{ +/** + * Python-like slice/index descriptor + */ +struct Slice { Slice(int b, int e, int s) : begin(b), end(e), stride(s) {} Slice(int b, int e) : Slice(b, e, 1) {} Slice() : Slice(0, END) {} @@ -46,6 +38,7 @@ struct Slice // Python-like slice/index descriptor /*const*/ int begin, end, stride; static const int END = INT_MAX; }; + typedef std::vector Slices; /** @@ -61,6 +54,8 @@ struct Shape { std::vector shape_; public: + typedef std::vector Axes; + Shape() : shape_({1}) {} Shape(std::initializer_list il) : Shape() { @@ -254,6 +249,14 @@ struct Shape { return shape; } + Shape fromAxes(const Axes& axes) const { + Shape subShape; + subShape.resize(size()); + for(Axes::value_type axis : axes) + subShape.set(axis, dim(axis)); + return subShape; + } + size_t hash() const { size_t seed = util::hash()(shape_[0]); for(size_t i = 1; i < shape_.size(); ++i) diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index 9e90b5413..ce51b0f2b 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -156,7 +156,7 @@ void ExpressionGraph::forward(std::list& forwardTape, bool finalPass) { if(v->marked_for_debug()) { Logger log = spdlog::get("general"); if(log) { - LOG(info, "Debug: {} op={}", v->debug_message(), v->type()); + LOG(info, "Debug: {} op={} name={}", v->debug_message(), v->type(), v->name()); LOG(info, v->val()->debug()); } else { diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 0ec6f7e67..ad1a4ff19 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -159,6 +159,16 @@ Expr2 topk(Expr a, int k, int axis, bool descending) { return std::make_tuple(swapAxes(topkVal, axis, -1), swapAxes(topkIdx, axis, -1)); // non-op if axes are the same } +Expr topkIndices(Expr a, int k, int axis, bool descending) { + const auto& [values, indices] = topk(a, k, axis, descending); + return choose({values, indices}, 1); +} + +Expr topkValues(Expr a, int k, int axis, bool descending) { + const auto& [values, indices] = topk(a, k, axis, descending); + return choose({values, indices}, 0); +} + Expr2 argmax(Expr a, int axis) { return topk(a, 1, axis, /*descending=*/true); } @@ -353,10 +363,30 @@ Expr flatten_2d(Expr a) { } Expr
stopGradient(Expr a) { +#if 0 + // This is a different implementation which is more reliable than the original, + // but it introduces a full copy which hogs memory. Keeping it around for now + // to decide later which one to use. + + auto fwd = [](Expr output, const std::vector inputs) { + CopyCast(output->val(), inputs[0]->val()); + }; + + auto bwd = [](Expr output, const std::vector inputs) { + /*Dummy*/ + }; + + return lambda({a}, a->shape(), a->value_type(), fwd, bwd, (size_t)&fwd); +#else // implemented as a dummy reshape that is not trainable auto res = Expression(a, a->shape()); res->setTrainable(false); return res; +#endif +} + +Expr choose(std::vector nodes, size_t index) { + return Expression(nodes, index); } // gather() -- gather arbitrary elements along an axis; batched or non-batched @@ -693,21 +723,28 @@ Expr affineWithReluDropout(Expr x, Expr W, Expr bias, float dropProb) { return Expression(x, W, bias); } else { Expr output = affine(x, W, bias); - int dimModel = output->shape()[-1]; - int dimTime = output->shape()[-2]; - output = dropoutReluInplace(output, dropProb, {dimTime, dimModel}); + output = dropoutReluInplace(output, dropProb, Shape::Axes({-2, -1})); return output; } } +Expr dropoutReluInplace(Expr x, Expr mask) { + return Expression(x, mask); +} + Expr dropoutReluInplace(Expr x, float dropProb, Shape shape) { - if(dropProb == 0) { - return relu(x); - } else { - auto graph = x->graph(); - auto mask = graph->dropoutMask(dropProb, shape); - return Expression(x, mask); - } + Expr mask = dropProb ? x->graph()->dropoutMask(dropProb, shape) : nullptr; + return dropoutReluInplace(x, mask); +} + +Expr dropoutReluInplace(Expr x, float dropProb, const Shape::Axes& axes) { + Expr mask = dropProb ? x->graph()->dropoutMask(dropProb, x->shape().fromAxes(axes)) : nullptr; + return dropoutReluInplace(x, mask); +} + +Expr dropoutReluInplace(Expr x, float dropProb) { + Expr mask = dropProb ? x->graph()->dropoutMask(dropProb, x->shape()) : nullptr; + return dropoutReluInplace(x, mask); } // @TODO: Not a great place to check this @@ -860,24 +897,28 @@ Expr square(Expr a) { } Expr layerNorm(Expr x, - Expr gamma, + Expr gamma/*= nullptr*/, Expr beta /*= nullptr*/, float eps /*= 1e-9*/) { // layerNorm accumulates in float, so small eps is fine - std::vector nodes = {x, gamma}; + std::vector nodes = {x}; + if(gamma) + nodes.push_back(gamma); if(beta) nodes.push_back(beta); return Expression(nodes, eps); } Expr rmsNorm(Expr x, - Expr gamma, + Expr gamma /*= nullptr*/, Expr beta /*= nullptr*/, float eps /*= 1e-9*/) { // layerNorm accumulates in float, so small eps is fine - std::vector nodes = {x, gamma}; + std::vector nodes = {x}; + if(gamma) + nodes.push_back(gamma); if(beta) nodes.push_back(beta); return Expression(nodes, eps); diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index faef5c29e..e96d8f7c9 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -386,6 +386,8 @@ Expr get(Expr2 tuple) { return std::get(tuple); } * @returns An ordered 2-tuple of Expressions */ Expr2 topk(Expr a, int k, int axis, bool descending = true); +Expr topkIndices(Expr a, int k, int axis, bool descending = true); +Expr topkValues(Expr a, int k, int axis, bool descending = true); /** * Returns largest elements of an expression along an axis. @@ -683,6 +685,13 @@ Expr flatten_2d(Expr a); */ Expr stopGradient(Expr a); +/** + * Return index-th node from nodes. 
This is a selector which adds `nodes` into the computation graph + * and makes sure they do not end up unattached if not used due to some condition that computes `index` + * for only one of them. This is a no-op similar to `reshape`. +*/ +Expr choose(std::vector nodes, size_t index); + /** * Gathers elements along an axis. * @param a The input expression @@ -924,7 +933,7 @@ Expr weighted_average(Expr in, Expr weights, int ax = 0); * @f] * @see LayerNormalizationOp */ -Expr layerNorm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9); +Expr layerNorm(Expr x, Expr gamma = nullptr, Expr beta = nullptr, float eps = 1e-9); /** * Applies RMS normalization over the last dimension. @@ -936,7 +945,7 @@ Expr layerNorm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9); * @f] * @see RMSNormalizationOp */ -Expr rmsNorm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9); +Expr rmsNorm(Expr x, Expr gamma = nullptr, Expr beta = nullptr, float eps = 1e-9); /** * Highway transformation. @@ -957,7 +966,7 @@ Expr highway(const std::string prefix, Expr x); * Performs dropout using a given mask. */ static inline Expr dropout(Expr x, Expr mask) { - if (mask) + if(mask) return x * mask; else return x; } /** * Performs dropout with a given probability and explicit shape. */ static inline Expr dropout(Expr x, float dropProb, Shape shape) { - if(dropProb == 0) - return x; - auto graph = x->graph(); - auto mask = graph->dropoutMask(dropProb, shape); + auto mask = dropProb ? x->graph()->dropoutMask(dropProb, shape) : nullptr; return dropout(x, mask); } +/** + * Performs dropout with a given probability over explicit axes. + */ +static inline Expr dropout(Expr x, float dropProb, const Shape::Axes& axes) { + auto mask = dropProb ? x->graph()->dropoutMask(dropProb, x->shape().fromAxes(axes)) : nullptr; + return dropout(x, mask); +} /** * Performs dropout with a given probability. */ static inline Expr dropout(Expr x, float dropProb) { - if(dropProb == 0) - return x; - return dropout(x, dropProb, x->shape()); + auto mask = dropProb ?
x->graph()->dropoutMask(dropProb, x->shape()) : nullptr; + return dropout(x, mask); } -Expr dropoutReluInplace(Expr x, float dropProb, Shape shape); +Expr dropoutReluInplace(Expr x, Expr mask=nullptr); +Expr dropoutReluInplace(Expr x, float dropProb, Shape maskShape); +Expr dropoutReluInplace(Expr x, float dropProb, const Shape::Axes& axes); +Expr dropoutReluInplace(Expr x, float dropProb); /** * Shifts the elements of an expression by a per-axis offset @p shift diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index d35ca6fff..29259f983 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -1031,13 +1031,11 @@ struct GatherNodeOp : public NaryNodeOp { NodeOps forwardOps() override { return {NodeOp( - // @TODO: rename to gather - Select(val_, child(0)->val(), child(1)->val(), axis_))}; + Select(val_, child(0)->val(), child(1)->val(), axis_))}; } NodeOps backwardOps() override { return {NodeOp( - // @TODO: rename to scatter Insert(child(0)->grad(), adj_, /*indices=*/child(1)->val(), axis_))}; } @@ -1095,17 +1093,52 @@ struct ScatterNodeOp : public NaryNodeOp { NodeOps forwardOps() override { return {NodeOp( CopyCast(val_, child(0)->val()); // @TODO: use normal copy - Insert(val_, /*source=*/child(2)->val(), /*indices=*/child(1)->val(), axis_) + Insert(val_, /*source=*/child(2)->val(), /*indices=*/child(1)->val(), axis_); )}; } NodeOps backwardOps() override { - ABORT("backward for ScatterNodeOp not yet implemented"); + auto backwardForVal = [this]() { + auto allocator = graph()->allocator(); + + // create temporary tensor of child(0)->grad().shape() == adj_.shape() + // copy adj_ to temporary + auto grad = child(0)->grad(); + auto tempGradMem = allocator->alloc(grad->memory()->size()); + Tensor tempGrad = TensorBase::New(tempGradMem, grad->shape(), grad->type(), grad->getBackend()); + CopyCast(tempGrad, adj_); + + // create temporary tensor of zeros with the shape and type of the source values + auto source = child(2)->val(); + auto tempZeroMem = allocator->alloc(source->memory()->size()); + Tensor tempZero = TensorBase::New(tempZeroMem, source->shape(), source->type(), source->getBackend()); + tempZero->set(0); + + // insert tensor of zeros into temporary + Insert(tempGrad, /*source=*/tempZero, /*indices=*/child(1)->val(), axis_); + + // add temporary to child(0)->grad() + Add(functional::_1, grad, tempGrad); + + // clear temporary memory + allocator->free(tempGradMem); + allocator->free(tempZeroMem); + }; + + return { + // val - add gradients everywhere else to gradient of "a" + NodeOp(backwardForVal()), + + NodeOp(/*no gradient*/[](){}), // indices + + // add gradients on indices to gradient of "source" + NodeOp(Select(/*source*/child(2)->grad(), adj_, /*indices=*/child(1)->val(), axis_)) + }; } Shape newShape(Expr a, int axis, Expr indices, Expr source) { ABORT_IF(axis != -1, "only last dimensions"); - // ABORT_IF(indices->shape() != source->shape(), "Shapes must match"); or broadcast + ABORT_IF(indices->shape() != source->shape(), "Shapes must match"); Shape shape = a->shape(); // @TODO: do proper checking @@ -1152,7 +1185,9 @@ struct ColsNodeOp : public NaryNodeOp { } NodeOps backwardOps() override { - return {NodeOp(PasteCols(child(0)->grad(), adj_, child(1)->val()))}; + return {NodeOp( + PasteCols(child(0)->grad(), adj_, child(1)->val()); + )}; } Shape newShape(Expr a, Expr indices) { @@ -1555,7 +1590,7 @@ struct LayerNormalizationOp : public NaryNodeOp { return {NodeOp( LayerNormalization(val_, child(0)->val(),
child(1)->val(), + (children_.size() >= 2) ? child(1)->val() : nullptr, (children_.size() == 3) ? child(2)->val() : nullptr, eps_))}; } @@ -1566,12 +1601,12 @@ struct LayerNormalizationOp : public NaryNodeOp { LayerNormalizationGrad( graph()->allocator(), child(0)->grad(), - child(1)->grad(), + (children_.size() >= 2) ? child(1)->grad() : nullptr, (children_.size() == 3) ? child(2)->grad() : nullptr, adj_, val_, child(0)->val(), - child(1)->val(), + (children_.size() >= 2) ? child(1)->val() : nullptr, (children_.size() == 3) ? child(2)->val() : nullptr, eps_))}; } diff --git a/src/graph/node_operators_tuple.h b/src/graph/node_operators_tuple.h index 8acb1bc83..4444e2ef8 100644 --- a/src/graph/node_operators_tuple.h +++ b/src/graph/node_operators_tuple.h @@ -1,5 +1,6 @@ #pragma once +#include "graph/node_operators.h" #include "graph/node_operators_unary.h" namespace marian { @@ -133,7 +134,7 @@ struct TopKNodeOp : public UnaryNodeOp, } void backward() override { - Insert(/*out*/child(0)->grad(), adj_, val_, axis_); + Insert(/*out*/child(0)->grad(), adj_, tupleVal_, axis_); } const std::string type() override { return "topk"; } @@ -164,4 +165,72 @@ struct TopKNodeOp : public UnaryNodeOp, } }; +// This node attaches multiple children to a parent node and allows +// selecting one of them via a given index. This is mostly used to avoid +// unattached nodes that might nevertheless get created based on some +// runtime criterion that is not fully clear during construction. +class ChooseNodeOp : public NaryNodeOp { +protected: + friend class SerializationHelpers; + Expr chosen_; + size_t index_; + +public: + ChooseNodeOp(std::vector nodes, size_t index) + : NaryNodeOp(nodes, nodes[index]->shape(), nodes[index]->value_type()), + chosen_(nodes[index]), index_(index) { + Node::destroy_ = false; + } + + ~ChooseNodeOp() {} + + void allocate() override {} + void free() override {} + + void forward() override {} + void backward() override {} + + void init_dependent() override { chosen_->init_dependent(); } + + void set_zero_adjoint() override { chosen_->set_zero_adjoint(); } + + Tensor& val() override { + auto childVal = chosen_->val(); + auto temp = TensorBase::New(childVal->memory(), shape(), childVal->type(), childVal->getBackend()); + val_.swap(temp); + return val_; + }; + + Tensor& grad() override { + auto childGrad = chosen_->grad(); + auto temp = TensorBase::New(childGrad->memory(), shape(), childGrad->type(), childGrad->getBackend()); + adj_.swap(temp); + return adj_; + }; + + const std::string type() override { return "choose"; } + + const std::string color() override { return "grey"; } + + virtual size_t hash() override { + if(!hash_) { + size_t seed = NaryNodeOp::hash(); + util::hash_combine(seed, index_); + hash_ = seed; + } + return hash_; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(index_ != cnode->index_) + return false; + return true; + } +}; + } diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 4e78e7166..6189d3cc9 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -888,8 +888,6 @@ class ReshapeNodeOp : public UnaryNodeOp { } }; - - // @TODO: add version with access to backward step // This allows attaching a lambda function to any node during execution. It is a non-operation otherwise // i.e.
doesn't consume any memory or take any time to execute (it's a reshape onto itself) other than the @@ -934,25 +932,32 @@ class DropoutReluInplaceNodeOp : public ReshapeNodeOp { Expr mask_; public: - DropoutReluInplaceNodeOp(Expr node, Expr mask) + DropoutReluInplaceNodeOp(Expr node, Expr mask = nullptr) : ReshapeNodeOp(node, node->shape()), mask_(mask) {} void forward() override { using namespace marian::functional; - Element(_1 = ReLU(_1 * _2), val(), mask_->val()); + if(mask_) + Element(_1 = ReLU(_1 * _2), val(), mask_->val()); + else + Element(_1 = ReLU(_1), val()); } void backward() override { using namespace marian::functional; - Element(_1 = _1 * ReLUback(_2) * _3, grad(), val(), mask_->val()); + if(mask_) + Element(_1 = _1 * ReLUback(_2) * _3, grad(), val(), mask_->val()); + else + Element(_1 = _1 * ReLUback(_2), grad(), val()); } const std::string type() override { return "dropoutReluInplace"; } virtual size_t hash() override { size_t seed = ReshapeNodeOp::hash(); - util::hash_combine(seed, mask_->hash()); + if(mask_) + util::hash_combine(seed, mask_->hash()); return seed; } diff --git a/src/layers/embedding.cpp b/src/layers/embedding.cpp index 93c6d9b33..377a4010a 100644 --- a/src/layers/embedding.cpp +++ b/src/layers/embedding.cpp @@ -169,8 +169,7 @@ Expr Embedding::applyIndices(const std::vector& embIdx, const Shape& // @BUGBUG: We should not broadcast along dimBatch=[-2]. Then we can also dropout before reshape() // (test that separately) if(!inference_) - selectedEmbs = dropout( - selectedEmbs, options_->get("dropout", 0.0f), {selectedEmbs->shape()[-3], 1, 1}); + selectedEmbs = dropout(selectedEmbs, options_->get("dropout", 0.0f), Shape::Axes({-3})); return selectedEmbs; } diff --git a/src/layers/embedding.h b/src/layers/embedding.h index af22b980a..6895c4ab8 100644 --- a/src/layers/embedding.h +++ b/src/layers/embedding.h @@ -170,7 +170,7 @@ class ULREmbedding : public LayerBase, public IEmbeddingLayer { if(!inference_) batchEmbeddings = dropout(batchEmbeddings, options_->get("dropout-embeddings", 0.0f), - {batchEmbeddings->shape()[-3], 1, 1}); + Shape::Axes({-3})); return std::make_tuple(batchEmbeddings, batchMask); } diff --git a/src/layers/generic.h b/src/layers/generic.h index df11a2337..bd80a09ea 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -239,9 +239,7 @@ static inline Expr denseInline(Expr x, x = affine(x, W, b); x = activationByName(actName)(x); - int dimModel = x->shape()[-1]; - int dimTime = x->shape()[-2]; - x = dropout(x, dropProb, {dimTime, dimModel}); + x = dropout(x, dropProb, Shape::Axes({-2, -1})); } return x; diff --git a/src/layers_new/embeddings.h b/src/layers_new/embeddings.h index e080906fe..bbe971d1b 100644 --- a/src/layers_new/embeddings.h +++ b/src/layers_new/embeddings.h @@ -113,7 +113,7 @@ class Embedding : public LayerWithOptions, public IEmbeddingLayer { auto selectedEmbs = rows(embeddings, embIdx); // [(B*W) x E] selectedEmbs = reshape(selectedEmbs, shape); // [W, B, E] // @BUGBUG: We should not broadcast along dimBatch=[-2]. Then we can also dropout before reshape() (test that separately) - selectedEmbs = dropout(selectedEmbs, opt("dropout", 0.0f), { selectedEmbs->shape()[-3], 1, 1 }); + selectedEmbs = dropout(selectedEmbs, opt("dropout", 0.0f), Shape::Axes({-3})); // @TODO: dropout here seems wrong! 
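+    // Editorial note (not part of the original patch): Shape::Axes({-3}) expands via + // Shape::fromAxes() to a {dimWords, 1, 1} mask, i.e. one dropout draw per time step + // broadcast over the batch and embedding dimensions, equivalent to the previous + // explicit mask shape {selectedEmbs->shape()[-3], 1, 1}.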
return selectedEmbs; } diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index 278758a96..33c089624 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -146,28 +146,26 @@ struct Linear : public Layer, public IUnaryLayer { }; struct Dropout final : public Layer, public IUnaryLayer { - float dropoutProbabilty; - UPtr dropoutMaskShape; + float dropoutProbability; + Shape::Axes dropoutAxes{{-2, -1}}; Dropout(Ptr graph, - float dropoutProbabilty, - const Shape& dropoutMaskShape) - : Layer(graph), dropoutProbabilty(dropoutProbabilty), dropoutMaskShape(new Shape(dropoutMaskShape)) + float dropoutProbability, + const Shape::Axes& dropoutAxes) + : Layer(graph), dropoutProbability(dropoutProbability), dropoutAxes(dropoutAxes) {} Dropout(Ptr graph, - float dropoutProbabilty) - : Layer(graph), dropoutProbabilty(dropoutProbabilty), dropoutMaskShape(nullptr) + float dropoutProbability) + : Layer(graph), dropoutProbability(dropoutProbability) {} Expr apply(Expr input) const override { if(getMode() == Mode::eval) return input; - if(dropoutMaskShape && dropoutProbabilty > 0.f) { - return marian::dropout(input, dropoutProbabilty, *dropoutMaskShape); - } else if(dropoutProbabilty > 0.f) { - return marian::dropout(input, dropoutProbabilty, {input->shape()[-2], input->shape()[-1]}); + if(dropoutProbability > 0.f) { + return marian::dropout(input, dropoutProbability, dropoutAxes); } else { return input; } @@ -185,30 +183,29 @@ struct LinearReluDropout final : public Linear { using Linear::transposed; using Linear::init; - float dropoutProbabilty; - UPtr dropoutMaskShape; + float dropoutProbability; + Shape::Axes dropoutAxes{{-2, -1}}; // Typical constructor that can take an initializer function LinearReluDropout(Ptr graph, int dimOut, - float dropoutProbabilty, + float dropoutProbability, bool useBias = true, bool transposed = false, Ptr init = inits::glorotUniform()) : Linear(graph, dimOut, useBias, transposed, init), - dropoutProbabilty(dropoutProbabilty), - dropoutMaskShape(nullptr) {} + dropoutProbability(dropoutProbability) {} + // Typical constructor that can take an initializer function LinearReluDropout(Ptr graph, int dimOut, - float dropoutProbabilty, - const Shape& dropoutMaskShape, + float dropoutProbability, + const Shape::Axes& dropoutAxes, bool useBias = true, bool transposed = false, Ptr init = inits::glorotUniform()) : Linear(graph, dimOut, useBias, transposed, init), - dropoutProbabilty(dropoutProbabilty), - dropoutMaskShape(new Shape(dropoutMaskShape)) {} + dropoutProbability(dropoutProbability), dropoutAxes(dropoutAxes) {} Expr apply(Expr x) const override { int dimIn = x->shape()[-1]; @@ -224,83 +221,94 @@ struct LinearReluDropout final : public Linear { registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); } - // @TODO: handle relu inplace for inference etc. 
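+  // Editorial note (not part of the original patch): dropoutReluInplace() below fuses + // relu(dropout(x)) into a single in-place node; called without a mask, as in the eval + // branch, DropoutReluInplaceNodeOp reduces to a plain in-place ReLU + // (Element(_1 = ReLU(_1), val()); see node_operators_unary.h above).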
Expr output; if(useBias) output = marian::affine(x, weight, bias, /*transA=*/false, /*transB=*/transposed); else output = marian::dot(x, weight, /*transA=*/false, /*transB=*/transposed); - if(getMode() == Mode::eval) - return relu(output); - - if(dropoutMaskShape && dropoutProbabilty > 0.f) { - return marian::dropoutReluInplace(output, dropoutProbabilty, *dropoutMaskShape); - } else if(dropoutProbabilty > 0.f) { - return marian::dropoutReluInplace(output, dropoutProbabilty, {output->shape()[-2], output->shape()[-1]}); + if(getMode() == Mode::eval) { + return marian::dropoutReluInplace(output); // no dropout } else { - return relu(output); + return marian::dropoutReluInplace(output, dropoutProbability, dropoutAxes); } } virtual void clear() override {} }; - struct Norm : public Layer, public IUnaryLayer { - Norm(Ptr graph) : Layer(graph) {} - virtual ~Norm() = default; + Expr scale{nullptr}; + Expr bias{nullptr}; + + bool useScale{true}; + bool useBias{true}; + bool elementwise{true}; + float eps{1e-5f}; - Expr apply(Expr x) const override = 0; -}; + Norm(Ptr graph, + bool useScale = true, + bool useBias = true, + bool elementwise = true, + float eps = 1e-5f) + : Layer(graph), + useScale(useScale), + useBias(useBias), + elementwise(elementwise), + eps(eps) {} + + virtual Expr getScale(int dimModel) const { + Expr scaleVector = nullptr; + if(useScale) { + registerParameterLazy(scale, Shape({ elementwise ? dimModel : 1 }), inits::ones()); + // if elementwise==false we multiply with a vector of 1s - that's a trick to make gradient computation faster + scaleVector = elementwise ? scale : scale * graph()->ones({dimModel}); // @TODO: make this obsolete + } + return scaleVector; + } -struct LayerNorm final : public Norm { - Expr weight; - Expr bias; + virtual Expr getBias(int dimModel) const { + Expr biasVector = nullptr; + if(useBias) { + registerParameterLazy(bias, Shape({ elementwise ? dimModel : 1 }), inits::zeros()); + // if elementwise==false we multiply with a vector of 1s - that's a trick to make gradient computation faster + biasVector = elementwise ? 
bias : bias * graph()->ones({dimModel}); // @TODO: make this obsolete + } + return biasVector; + } - float eps{1e-5f}; - bool elementwiseAffine{true}; + Expr apply(Expr x) const override = 0; +}; +struct LayerNorm : public Norm { LayerNorm(Ptr graph, - float eps = 1e-5f, - bool elementwiseAffine = true) - : Norm(graph), eps(eps), elementwiseAffine(elementwiseAffine) + bool useScale = true, + bool useBias = true, + bool elementwise = true, + float eps = 1e-5f) + : Norm(graph, useScale, useBias, elementwise, eps) {} Expr apply(Expr x) const override { int dimModel = x->shape()[-1]; - if(elementwiseAffine) { - registerParameterLazy(weight, Shape({ dimModel }), inits::ones()); - registerParameterLazy(bias, Shape({ dimModel }), inits::zeros()); - return marian::layerNorm(x, weight, bias, eps); - } else { - return marian::layerNorm(x, nullptr, nullptr, eps); - } + return marian::layerNorm(x, getScale(dimModel), getBias(dimModel), eps); } virtual void clear() override {} }; -struct RMSNorm final : public Norm { - Expr weight; - - float eps{1e-5f}; - bool elementwiseAffine{true}; - +struct RMSNorm : public Norm { RMSNorm(Ptr graph, - float eps = 1e-5f, - bool elementwiseAffine = true) - : Norm(graph), eps(eps), elementwiseAffine(elementwiseAffine) + bool useScale = true, + bool useBias = true, + bool elementwise = true, + float eps = 1e-5f) + : Norm(graph, useScale, useBias, elementwise, eps) {} Expr apply(Expr x) const override { int dimModel = x->shape()[-1]; - if(elementwiseAffine) { - registerParameterLazy(weight, Shape({ dimModel }), inits::ones()); - return marian::rmsNorm(x, weight, nullptr, eps); - } else { - return marian::rmsNorm(x, nullptr, nullptr, eps); - } + return marian::rmsNorm(x, getScale(dimModel), getBias(dimModel), eps); } }; diff --git a/src/layers_new/rnn.h b/src/layers_new/rnn.h index da3ac4f94..281d2dce9 100644 --- a/src/layers_new/rnn.h +++ b/src/layers_new/rnn.h @@ -31,7 +31,7 @@ class SSRU final : public Layer, public ICell { registerLayer(iProj); fProj = New(graph, dimState); registerLayer(fProj); - dropout = New(graph, dropProb, Shape({dimState})); + dropout = New(graph, dropProb, Shape::Axes({-1})); registerLayer(dropout); } diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index e808694de..ade61a78e 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -239,7 +239,17 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { if(opt("transformer-depth-scaling", false)) for(auto linear : transformerEncoderLayer->allLayers()) linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); - + + if(opt("transformer-no-bias", false)) + for(auto linear : transformerEncoderLayer->allLayers()) + linear->useBias = false; + + if(opt("transformer-no-affine", false)) { + for(auto norm : transformerEncoderLayer->allLayers()) { + norm->useScale = false; + norm->useBias = false; + } + } layers->append(transformerEncoderLayer); } @@ -491,7 +501,17 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); for(auto linear : currentLayer->filterBlock->allLayers()) linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + } + if(opt("transformer-no-bias", false)) + for(auto linear : currentLayer->allLayers()) + linear->useBias = false; + + if(opt("transformer-no-affine", false)) { + for(auto norm : currentLayer->allLayers()) { + norm->useScale 
= false; + norm->useBias = false; + } } } diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index 6a298ed0d..f70353a64 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -68,6 +68,9 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("lemma-dependency"); modelFeatures_.insert("factors-combine"); modelFeatures_.insert("factors-dim-emb"); + + modelFeatures_.insert("transformer-no-bias"); + modelFeatures_.insert("transformer-no-affine"); } std::vector>& EncoderDecoder::getEncoders() { diff --git a/src/models/transformer.h b/src/models/transformer.h index a3f6d9b53..0fa52ff82 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -170,11 +170,8 @@ class Transformer : public EncoderOrDecoderBase { auto output = input; for(auto op : ops) { // dropout - if (op == 'd') { - int dimModel = output->shape()[-1]; - int dimTime = output->shape()[-2]; - output = dropout(output, dropProb, {dimTime, dimModel}); - } + if (op == 'd') + output = dropout(output, dropProb, Shape::Axes({-2, -1})); // layer normalization else if (op == 'n') output = layerNorm(output, prefix, "_pre"); @@ -191,7 +188,7 @@ class Transformer : public EncoderOrDecoderBase { for(auto op : ops) { // dropout if(op == 'd') - output = dropout(output, dropProb); + output = dropout(output, dropProb, Shape::Axes({-2, -1})); // skip connection else if(op == 'a') output = output + prevInput; diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 5be3eee26..6a075e9c5 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -710,6 +710,7 @@ void SelectAxis2(Tensor out, } #endif +template void Select(Tensor out, const Tensor in, const Tensor indices, @@ -736,10 +737,16 @@ void Select(Tensor out, int idxIndex = idxShape.bindex(dims); // return global index for indices based on dimension-specific indices from out, take broadcasting into account; dims[axisCPU] = (int)indices->data()[idxIndex]; // substitute index of out-tensor with corresponding axis-local position from in-tensor; int inIndex = inShape.index(dims); // compute global index from dimension-specific indices, no broadcasting as out and in match in all dimensions apart from axis - out->data()[index] = in->data()[inIndex]; // assign corresponding values. + if(add) + out->data()[index] += in->data()[inIndex]; // add for gradients. + else + out->data()[index] = in->data()[inIndex]; // assign corresponding values. 
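+        // Editorial note (not part of the original patch): the accumulating add=true variant + // makes Select (gather) usable as the backward step of Insert (scatter): the gradient + // w.r.t. the scattered source is the adjoint gathered at the same indices, as used in + // ScatterNodeOp::backwardOps() above.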
} } +template void Select(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Select(Tensor out, const Tensor in, const Tensor indices, int axis); + template void Insert(Tensor out, const Tensor in, diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc index 1b233bb1b..ed1e72553 100755 --- a/src/tensors/gpu/add.inc +++ b/src/tensors/gpu/add.inc @@ -39,4 +39,5 @@ template void marian::gpu::Aggregate,marian::functional::UnaryFunctor > >,class IntrusivePtr,class IntrusivePtr >(marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,class IntrusivePtr,class IntrusivePtr,class IntrusivePtr); template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); -template void marian::gpu::Add, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); \ No newline at end of file +template void marian::gpu::Add >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Add, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); diff --git a/src/tensors/gpu/add_all.inc b/src/tensors/gpu/add_all.inc index b983b7b7e..41da1351b 100644 --- a/src/tensors/gpu/add_all.inc +++ b/src/tensors/gpu/add_all.inc @@ -1,4 +1,4 @@ -// see element.inc for instructions on how to maintain this + // see element.inc for instructions on how to maintain this using namespace functional; 
template void AggregateAll>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); @@ -41,6 +41,7 @@ template void marian::AggregateAll, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); #if COMPILE_FP16 @@ -84,5 +85,6 @@ template void marian::AggregateAll<__half, float, marian::functional::UnaryFunct template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, 
marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll<__half, float, marian::functional::Assignee<1>, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); #endif diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc index 730817849..27cc641da 100755 --- a/src/tensors/gpu/element.inc +++ b/src/tensors/gpu/element.inc @@ -73,6 +73,8 @@ template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > > >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::Assignee<3> > >, IntrusivePtr, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::Assignee<3> > >, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture> >, IntrusivePtr >(marian::functional::Assign, 
marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture> >, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::Capture> > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::Capture> > >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr, IntrusivePtr); // How to add new specializations: @@ -82,3 +84,6 @@ template void marian::gpu::Element' with 'marian::Tensor' + +template void marian::gpu::Element, marian::functional::UnaryFunctor > >>(marian::functional::Assign, marian::functional::UnaryFunctor > >, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr, IntrusivePtr); diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 508e1e3e7..5f8c4c122 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -1304,7 +1304,7 @@ void PasteCols(Tensor out, } } -template +template __global__ void gSelect(T* out, functional::Shape outShape, const T* in, @@ -1322,7 +1322,10 @@ __global__ void gSelect(T* out, int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axis] = (int)d_indices[idxIndex]; int inIndex = inShape.index(dims); - out[index] = in[inIndex]; + if(add) + out[index] += in[inIndex]; + else + out[index] = in[inIndex]; } } } @@ -1353,6 +1356,7 @@ __global__ void gInsert(T* out, } } +template void Select(Tensor out, const Tensor in, const Tensor indices, @@ -1369,36 +1373,39 @@ void Select(Tensor out, int axisGPU = axis + functional::Shape::size() - out->shape().size(); if(out->type() == Type::float32) { - gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gSelect<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { - gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gSelect<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #endif } else if(out->type() == Type::uint32) { - gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gSelect<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); } else { ABORT("Select not implemented for type {}", out->type()); } } +template void Select(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Select(Tensor out, const Tensor in, const Tensor indices, int axis); + template void Insert(Tensor out, const Tensor in, @@ -2152,7 +2159,7 
@@ __global__ void gLNormalization(T* out, for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType xv = (AccType)xRow[id]; AccType betav = beta ? (AccType)beta[id] : (AccType)0.f; AccType lv = (xv - mean) / sigma; @@ -2182,7 +2189,7 @@ void LayerNormalization(Tensor out, if(out->type() == Type::float32) { gLNormalization<<>>(out->data(), in->data(), - gamma->data(), + gamma ? gamma->data() : nullptr, beta ? beta->data() : nullptr, rows, cols, @@ -2191,7 +2198,7 @@ void LayerNormalization(Tensor out, } else if (out->type() == Type::float16) { gLNormalization<<>>(out->data(), in->data(), - gamma->data(), + gamma ? gamma->data() : nullptr, beta ? beta->data() : nullptr, rows, cols, @@ -2241,7 +2248,7 @@ __global__ void gLayerNormalizationGrad(T* gradX, AccType xv = xRow[id]; AccType yv = yRow[id]; AccType betav = beta ? (AccType)beta[id] : (AccType)0.f; - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType adjv = adjRow[id]; AccType lv = (yv - betav) / gammav; // go back to LN(x) from scaled and shifted version for accumulation @@ -2297,7 +2304,7 @@ __global__ void gLayerNormalizationGrad(T* gradX, if(id < cols) { AccType xv = xRow[id]; - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType adjv = adjRow[id]; AccType lv = (xv - mean) / sigma; @@ -2318,10 +2325,12 @@ __global__ void gLayerNormalizationGrad(T* gradX, T* gradXRow = gradX + j * cols; gradXRow[id] += (T)(gradXv); - T* gradGammaRow = gradGamma + j * cols; - // assignment is correct here as this gets summed up - // in the next kernel via matrix product - gradGammaRow[id] = (T)(adjv * lv); + if(gamma) { + T* gradGammaRow = gradGamma + j * cols; + // assignment is correct here as this gets summed up + // in the next kernel via matrix product + gradGammaRow[id] = (T)(adjv * lv); + } } } } @@ -2358,12 +2367,12 @@ void LayerNormalizationGrad(Ptr allocator, int shared = sizeof(float) * threads * 4; gLayerNormalizationGrad<<>>( gradX->data(), - tempGradGamma->data(), + gamma ? tempGradGamma->data() : nullptr, adj->data(), y->data(), x->data(), - gamma->data(), - (beta) ? beta->data() : nullptr, + gamma ? gamma->data() : nullptr, + beta ? beta->data() : nullptr, rows, cols, eps); @@ -2373,12 +2382,12 @@ void LayerNormalizationGrad(Ptr allocator, int shared = sizeof(float) * threads * 4; gLayerNormalizationGrad<<>>( gradX->data(), - tempGradGamma->data(), + gamma ? tempGradGamma->data() : nullptr, adj->data(), y->data(), x->data(), - gamma->data(), - (beta) ? beta->data() : nullptr, + gamma ? gamma->data() : nullptr, + beta ? beta->data() : nullptr, rows, cols, eps); @@ -2392,7 +2401,8 @@ void LayerNormalizationGrad(Ptr allocator, // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. // This preserves precision with larger batches where all batch entries reduce into a single vector. 
// See also AffineNodeOp where we do the same for biases - gpu::Prod(gradGamma, tempOnes, tempGradGamma, false, false, 1, 1, Type::float32); // beta set to one to add + if(gradGamma) + gpu::Prod(gradGamma, tempOnes, tempGradGamma, false, false, 1, 1, Type::float32); // beta set to one to add if(gradBeta) // dC/dbeta = adj - inverse broadcasting (reduction) gpu::Prod(gradBeta, tempOnes, adj, false, false, 1, 1, Type::float32); // beta set to one to add diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index 31bd1e14f..2747a6d66 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -301,8 +301,6 @@ DISPATCH3(PasteRows, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const marian::Tensor) -DISPATCH4(Select, marian::Tensor, const marian::Tensor, const marian::Tensor, int) - #ifdef CUDA_FOUND namespace gpu { template @@ -325,6 +323,28 @@ static inline void Insert(Tensor out, const Tensor in, const Tensor indices, int cpu::Insert(out, in, indices, axis); } +#ifdef CUDA_FOUND +namespace gpu { + template + void Select(Tensor out, const Tensor in, const Tensor indices, int axis); +} +#endif + +namespace cpu { + template + void Select(Tensor out, const Tensor in, const Tensor indices, int axis); +} + +template +static inline void Select(Tensor out, const Tensor in, const Tensor indices, int axis) { +#ifdef CUDA_FOUND + if(out->getBackend()->getDeviceId().type == DeviceType::gpu) + gpu::Select(out, in, indices, axis); + else +#endif + cpu::Select(out, in, indices, axis); +} + DISPATCH7(TopK, marian::Tensor, marian::Tensor, Ptr, const marian::Tensor, int, int, bool); DISPATCH2(LSTMCellForward, marian::Tensor, std::vector) diff --git a/src/tests/units/operator_tests.cpp b/src/tests/units/operator_tests.cpp index 34a0dd6f5..5806e94de 100644 --- a/src/tests/units/operator_tests.cpp +++ b/src/tests/units/operator_tests.cpp @@ -631,8 +631,15 @@ void tests(DeviceType device, Type floatType = Type::float32) { auto aff1 = affine(A, B, bias); auto aff2 = dot(A, B) + bias; - auto affRelu1 = affineWithReluDropout(A, B, bias); - auto affRelu2 = relu(dot(A, B) + bias); + auto A2 = graph->param("A2", {4, 3}, inits::fromVector(vA)); + auto B2 = graph->param("B2", {3, 2}, inits::fromVector(vB)); + + // @TODO: using this operator here is currently dangerous since the inplace + // operator inside might modify values in-place if the same operation is executed + // twice on the same inputs. (Hence the new parameters A2 and B2 here) + // This needs to be fixed in the future. 
+ auto affRelu1 = affineWithReluDropout(A2, B2, bias); + auto affRelu2 = relu(dot(A2, B2) + bias); graph->forward(); @@ -643,7 +650,7 @@ void tests(DeviceType device, Type floatType = Type::float32) { values2.clear(); CHECK(aff2->shape() == aff1->shape()); aff2->val()->get(values2); - CHECK(values2 == values); + CHECK(values == values2); affRelu1->val()->get(values); affRelu2->val()->get(values2); diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 367e47e16..43adddcac 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -638,7 +638,7 @@ Ptr GraphGroup::collectStats(Ptr graph, auto loss = model->build(graph, batch); fits = graph->fits(); } catch(const ShapeSizeException& e) { - LOG(debug, "Exception for maxBatch size {}: {}", maxBatch, e.what()); + LOG(debug, "Exception for maxBatch size {}: {}", current, e.what()); fits = false; } From a5b50f2ddc54759e65bd8616781eba43cc886973 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sun, 16 Jul 2023 23:23:38 +0000 Subject: [PATCH 17/26] Merged PR 30282: Fix parameter name for norms in new layer framework. Renames the scale parameter in the Norm layer back to "weight", undoing its accidental renaming. --- CHANGELOG.md | 1 + VERSION | 2 +- src/layers_new/neuralnet.h | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fb1dfd2d..a40214ad5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. ### Fixed +- Fixed wrong parameter name for norm in new layer framework - Fixed unit test for LayerNorm - Only collect batch statistics during mini-batch-fit up to actual max-length. - Implemented fully correct version of GELU instead of using bad approximation via Swish. diff --git a/VERSION b/VERSION index 893904681..d9d998341 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.7 +v1.12.8 diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index 33c089624..b81728c77 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -238,7 +238,7 @@ struct LinearReluDropout final : public Linear { }; struct Norm : public Layer, public IUnaryLayer { - Expr scale{nullptr}; + Expr weight{nullptr}; // = scale Expr bias{nullptr}; bool useScale{true}; @@ -260,9 +260,9 @@ struct Norm : public Layer, public IUnaryLayer { virtual Expr getScale(int dimModel) const { Expr scaleVector = nullptr; if(useScale) { - registerParameterLazy(scale, Shape({ elementwise ? dimModel : 1 }), inits::ones()); + registerParameterLazy(weight, Shape({ elementwise ? dimModel : 1 }), inits::ones()); // if elementwise==false we multiply with a vector of 1s - that's a trick to make gradient computation faster - scaleVector = elementwise ? scale : scale * graph()->ones({dimModel}); // @TODO: make this obsolete + scaleVector = elementwise ? weight : weight * graph()->ones({dimModel}); // @TODO: make this obsolete } return scaleVector; } From c8f1e03c0a7c80bf1578f90756c885db224c7982 Mon Sep 17 00:00:00 2001 From: Varun Mathur Date: Mon, 17 Jul 2023 12:11:56 +0000 Subject: [PATCH 18/26] Merged PR 30198: [quicksand] cache YAML configs Reusing these YAML configs helps speed up coreleaf loading. The only consumers of this quicksand API are the leaf, and I think this small memory tradeoff of keeping these in cache is worth the speedup.
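The caching scheme in the diff below boils down to a mutex-guarded static map keyed by model path: look the config up first, and only parse and store it on a miss. A minimal stand-alone sketch of that pattern (std::string stands in for the parsed YAML::Node payload; names only loosely mirror the patch):

```
#include <iostream>
#include <mutex>
#include <string>
#include <unordered_map>

// Stand-in for the parsed YAML::Node config; any copyable payload works here.
using Config = std::string;

class ConfigCache {
  std::unordered_map<std::string, Config> cache_;
  std::mutex mutex_;

public:
  // Returns a pointer to the cached entry, or nullptr on a miss. Pointers into
  // std::unordered_map stay valid across later insertions; as in the patch, the
  // pointer is handed out after the lock is released, which assumes no erasure.
  const Config* get(const std::string& key) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = cache_.find(key);
    return it != cache_.end() ? &it->second : nullptr;
  }

  // Stores a freshly parsed config under the model path.
  void put(const std::string& key, Config config) {
    std::lock_guard<std::mutex> lock(mutex_);
    cache_[key] = std::move(config);
  }
};

// Placeholder for the expensive parse (io::getYamlFromModel in the patch).
static Config expensiveParse(const std::string& path) {
  return "config parsed from " + path;
}

int main() {
  ConfigCache cache;
  for(int i = 0; i < 3; ++i) {  // only the first iteration pays the parse cost
    const std::string key = "model.bin";
    const Config* config = cache.get(key);
    if(config == nullptr) {
      cache.put(key, expensiveParse(key));
      config = cache.get(key);
    }
    std::cout << *config << "\n";
  }
  return 0;
}
```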
Related work items: #146810 --- src/microsoft/quicksand.cpp | 40 ++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 316c66d11..2302819eb 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -1,5 +1,7 @@ #include "quicksand.h" #include "marian.h" +#include +#include #if MKL_FOUND #include "mkl.h" @@ -60,6 +62,8 @@ class BeamSearchDecoder : public IBeamSearchDecoder { std::vector> vocabs_; + static inline std::unordered_map configCache_; + static inline std::mutex configCacheMutex_; public: BeamSearchDecoder(Ptr options, const std::vector& ptrs, @@ -87,16 +91,27 @@ class BeamSearchDecoder : public IBeamSearchDecoder { for(int i = 0; i < models.size(); ++i) { Ptr modelOpts = New(); + // serializing this YAML can be costly, so read from cache YAML::Node config; - if(io::isBin(models[i]) && ptrs_[i] != nullptr) - io::getYamlFromModel(config, "special:model.yml", ptrs_[i]); - else - io::getYamlFromModel(config, "special:model.yml", models[i]); + auto cachedConfig = getConfigFromCache(models[i]); + if(cachedConfig != nullptr) { + config = *cachedConfig; + } else { + if(io::isBin(models[i]) && ptrs_[i] != nullptr) + io::getYamlFromModel(config, "special:model.yml", ptrs_[i]); + else + io::getYamlFromModel(config, "special:model.yml", models[i]); + writeConfigToCache(config, models[i]); + } modelOpts->merge(options_); modelOpts->merge(config); - std::cerr << modelOpts->asYamlString() << std::flush; // @TODO: take a look at why this is even here. + // serializing this to YAML is expensive. we only want to do this once + // we can use whether we loaded the config from the cache as a signal + if(cachedConfig == nullptr){ + std::cerr << modelOpts->asYamlString() << std::flush; + } auto encdec = models::createModelFromOptions(modelOpts, models::usage::translation); @@ -119,6 +134,21 @@ class BeamSearchDecoder : public IBeamSearchDecoder { graph_->forward(); } + YAML::Node* getConfigFromCache(std::string key){ + const std::lock_guard lock(configCacheMutex_); + bool inCache = configCache_.find(key) != configCache_.end(); + if (inCache) { + return &configCache_[key]; + } else { + // return null if no cache hit + return nullptr; + } + } + void writeConfigToCache(YAML::Node config, std::string key) { + const std::lock_guard lock(configCacheMutex_); + configCache_[key] = config; + } + void setWorkspace(uint8_t* data, size_t size) override { device_->set(data, size); } QSNBestBatch decode(const QSBatch& qsBatch, From c83d47f1df77c7ad51fc2bacaf903d688a6c9425 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 22 Jul 2023 05:00:42 +0000 Subject: [PATCH 19/26] Merged PR 30283: Save full checkpoints at saving intervals (with iteration number) when requested. This PR adds the option `--overwrite-checkpoint` (by default true to mimic current behavior) which can be set to `false` to force full checkpoint saving and preservation at saving intervals. E.g. for a model named `rus.enu.generalnn.replica_1.model.iter37769.npz`, Marian will then also save `rus.enu.generalnn.replica_1.model.iter37769.npz.optimizer.npz` and `rus.enu.generalnn.replica_1.model.iter37769.npz.progress.yml`.
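To make the intended on-disk effect concrete, here is a hypothetical configuration and the files it would leave behind (the option names are from this patch; the save frequency and iteration numbers are invented for illustration):

```
# hypothetical training config excerpt
model: model.npz
overwrite: false             # keep per-iteration model files (existing behavior)
overwrite-checkpoint: false  # new: also keep full checkpoints per saving interval
save-freq: 5000u

# expected files after two saving intervals (iteration numbers invented):
#   model.iter5000.npz
#   model.iter5000.npz.optimizer.npz
#   model.iter5000.npz.progress.yml
#   model.iter10000.npz
#   model.iter10000.npz.optimizer.npz
#   model.iter10000.npz.progress.yml
#   model.npz (plus its usual checkpoint files, always written)
```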
--- CHANGELOG.md | 3 +- VERSION | 2 +- src/common/config_parser.cpp | 5 +++ src/training/graph_group.cpp | 78 +++++++++++++++++++----------------- src/training/graph_group.h | 14 +++++-- 5 files changed, 60 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a40214ad5..79dd3f673 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,11 +8,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Added --overwrite-checkpoint option that (when set to false) can be used to dump checkpoints with iteration numbers. - Implementations of COMET-20 (reference-based) and BLEURT-20 for inference with conversion scripts. - `./marian evaluate` sub command for evaluation with COMET-QE-20, COMET-20 and BLEURT-20 - A bunch of scripts for metrics use and early MBR experiments - LSH vocab filtering for GPU. Speed is not competitive with non-LSH. Checking in for completeness and possible future use of LSH on GPU for non-filtering stuff -- Add --throw-on-divergence and --fp16-fallback-to-fp32 options to detect (fp16 and fp32) and recover (only fp16) +- Added --throw-on-divergence and --fp16-fallback-to-fp32 options to detect (fp16 and fp32) and recover (only fp16) diverged runs. If not recoverable, exception gets rethrown and goes unhandled to force fatal error and shutdown. - Re-implementation of COMET-QE for inference and training; conversion scripts from Unbabel-Comet to Marian. - Validator that generates embeddings and can be used during COMET training with an external script. diff --git a/VERSION b/VERSION index d9d998341..2fc612cb1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.8 +v1.12.9 \ No newline at end of file diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 3b8d50edf..9b36338c1 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -388,6 +388,11 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { cli.add("--overwrite", "Do not create model checkpoints, only overwrite main model file with last checkpoint. " "Reduces disk usage"); + cli.add("--overwrite-checkpoint", + "When --overwrite=false (default) only model files get written at saving intervals (with iteration numbers). " + "Setting --overwrite-checkpoint=false also saves full checkpoints with optimizer parameters, etc. 
" + "Uses (a lot) more disk space.", + true); cli.add("--no-reload", "Do not load existing model specified in --model arg"); cli.add>("--train-sets,-t", diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 43adddcac..054b0ae76 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -358,19 +358,19 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { scheduler_->load(modelFileName); // we just load it N times from disk (it'll be in disk cache after the first) - // this also allocates memory correctly when calling forward() inside restoreFromCheckPoint + // this also allocates memory correctly when calling forward() inside restoreOptimizerState size_t i = 0; for(auto graph : graphs_) models_[i++]->load(graph, items, markReloaded); // try to restore everything from checkpoint now - restoreFromCheckpoint(modelFileName, scatterFn); + loadOptimizerState(modelFileName, scatterFn); } } } -bool GraphGroup::restoreFromCheckpoint(const std::string& modelFileName, - const OptimizerBase::ScatterStateFunc& scatterFn) { +bool GraphGroup::loadOptimizerState(const std::string& modelFileName, + const OptimizerBase::ScatterStateFunc& scatterFn) { /* if model checkpoint is available: - load model from checkpoint, not from model.npz @@ -436,8 +436,8 @@ bool GraphGroup::restoreFromCheckpoint(const std::string& modelFileName, return true; // succeeded to restore } -void GraphGroup::saveCheckpoint(const std::string& modelFileName, - const OptimizerBase::GatherStateFunc& gatherFn) { +void GraphGroup::saveOptimizerState(const std::string& modelFileName, + const OptimizerBase::GatherStateFunc& gatherFn) { // @TODO: change to .checkpoint.npz, would break backwards compat std::string checkpointName = modelFileName + ".optimizer.npz"; @@ -467,50 +467,56 @@ void GraphGroup::saveCheckpoint(const std::string& modelFileName, } } -void GraphGroup::save(bool isFinal, - const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn) { +void GraphGroup::saveCheckPoint(const std::string& modelFileName, + bool isFinal, + bool doSaveOptimizerState, + const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn) { barrier(); // (for better grouping of log messages) - // bring the smoothed model in // Note that it is sharded. For multi-node, it is sharded over multiple machines, so this is a network access. // Also note that the swap must run on all MPI processes concurrently, although only one actually validates. - swapWithSmoothed(); - - if(isFinal && scheduler_) - scheduler_->validate(graphs_, isFinal); - barrier(); // (for better grouping of log messages) - - std::string modelFileName = options_->get("model"); if(isMainProcess()) { // save main model file - if(options_->get("overwrite")) { - models_[0]->save(graphs_[0], modelFileName, /*saveTranslatorConfig=*/true); - // save scheduler-related state - if(scheduler_) - scheduler_->save(modelFileName); - } else { - if(!isFinal) { // save a model with iteration number - std::string numberOfBatches = scheduler_ ? 
std::to_string(scheduler_->numberOfBatches()) : "unknown"; - std::string nameOverwrite = modelFileName; - nameOverwrite.replace(modelFileName.size() - 4, 4, ".iter" + numberOfBatches + ".npz"); - models_[0]->save(graphs_[0], nameOverwrite); - } - models_[0]->save(graphs_[0], modelFileName, /*saveTranslatorConfig=*/true); - - // save scheduler-related state - if(scheduler_) - scheduler_->save(modelFileName); - } + models_[0]->save(graphs_[0], modelFileName, /*saveTranslatorConfig=*/true); + // save scheduler-related state + if(doSaveOptimizerState && scheduler_) + scheduler_->save(modelFileName); } swapWithSmoothed(); - saveCheckpoint(modelFileName, gatherOptimizerStateFn); - + + if(doSaveOptimizerState) + saveOptimizerState(modelFileName, gatherOptimizerStateFn); + barrier(); // (for better grouping of log messages) } +void GraphGroup::save(bool isFinal, + const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn) { + if(isFinal && scheduler_) { + barrier(); // (for better grouping of log messages) + swapWithSmoothed(); + scheduler_->validate(graphs_, isFinal); + swapWithSmoothed(); + barrier(); // (for better grouping of log messages) + } + + std::string modelFileName = options_->get("model"); + bool overwrite = options_->get("overwrite", false); + + if(!overwrite && !isFinal) { // save a model with iteration number + std::string numberOfBatches = scheduler_ ? std::to_string(scheduler_->numberOfBatches()) : "unknown"; + std::string nameOverwrite = modelFileName; + nameOverwrite.replace(modelFileName.size() - 4, 4, ".iter" + numberOfBatches + ".npz"); + + bool overwriteCheckpoint = options_->get("overwrite-checkpoint", true); + saveCheckPoint(nameOverwrite, isFinal, /*doSaveOptimizerState=*/!overwriteCheckpoint, gatherOptimizerStateFn); + } + saveCheckPoint(modelFileName, isFinal, /*doSaveOptimizerState=*/true, gatherOptimizerStateFn); +} + void GraphGroup::swapWithSmoothed() { auto swap = [&](size_t i, size_t begin, size_t end) { auto curParam = graphs_[i]->params()->vals()->subtensor(begin, end-begin); diff --git a/src/training/graph_group.h b/src/training/graph_group.h index d7525a102..4cfd079aa 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -104,14 +104,20 @@ class GraphGroup { private: void load(const OptimizerBase::ScatterStateFunc& scatterFn); + + bool loadOptimizerState(const std::string& modelFileName, + const OptimizerBase::ScatterStateFunc& scatterFn); + void save(bool isFinal, const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn); - bool restoreFromCheckpoint(const std::string& modelFileName, - const OptimizerBase::ScatterStateFunc& scatterFn); + void saveCheckPoint(const std::string& modelFileName, + bool isFinal, + bool doSaveOptimizerState, + const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn); - void saveCheckpoint(const std::string& modelFileName, - const OptimizerBase::GatherStateFunc& gatherFn); + void saveOptimizerState(const std::string& modelFileName, + const OptimizerBase::GatherStateFunc& gatherFn); public: // This function swaps out the current optimizer parameters with the smoothed version (provided smoothing is enabled). From 9af4740a9524c5611eb7910464f4bb5ab36636e1 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 24 Jul 2023 12:44:21 +0000 Subject: [PATCH 20/26] Merged PR 30415: Fix macOS clang builds This PR explicitly disables server compilation in macOS build with clang. 
It seems an update to the macos-12 environment provided openssl and boost, which when found by cmake, enables compilation of marian-server, which doesn't work with clang. --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0f19a0f8d..29e8e6219 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -371,7 +371,7 @@ stages: -DCOMPILE_CPU=on \ -DCOMPILE_CUDA=off \ -DCOMPILE_EXAMPLES=on \ - -DCOMPILE_SERVER=on \ + -DCOMPILE_SERVER=off \ -DCOMPILE_TESTS=on \ -DUSE_FBGEMM=on \ -DUSE_SENTENCEPIECE=on \ From b67489ec50c7b586dc19258be89dfad2eb947003 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 25 Jul 2023 00:13:18 +0000 Subject: [PATCH 21/26] Merged PR 30419: Fix Python modules in GPU regression tests Set compatible versions of Python modules after Cython 3.0 release. --- azure-regression-tests.yml | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/azure-regression-tests.yml b/azure-regression-tests.yml index cb3730c19..0448b172a 100644 --- a/azure-regression-tests.yml +++ b/azure-regression-tests.yml @@ -64,6 +64,24 @@ stages: displayName: Collect system info workingDirectory: regression-tests + # Always run regression tests from the master branch + # The current SAS token will expire on 12/31/2023 and a new one will need to be set in Marian > Pipelines > Library + # This is run at the beginning for easier debugging of the Python environment + - bash: | + set -x + git checkout master + git pull origin master + # Uninstall Cython because the newest 3.0.0 is incompatible with newest available versions of pyyaml and numpy as of July 2023 + python3 -m pip uninstall -y cython + python3 -m pip install 'cython<3' + # These modules will be installed via `make install` below, but Cython needs to be installed before + python3 -m pip install 'pyyaml<6.0.1' 'numpy>=1.22,<2' websocket-client + make install + displayName: Prepare regression tests + env: + AZURE_STORAGE_SAS_TOKEN: $(marian-pub-tests-blob-sas-token) + workingDirectory: regression-tests + # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - bash: | wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - @@ -106,17 +124,6 @@ stages: displayName: Run unit tests workingDirectory: build - # Always run regression tests from the master branch - # The current SAS token will expire on 12/31/2023 and a new one will need to be set in Marian > Pipelines > Library - - bash: | - git checkout master - git pull origin master - make install - displayName: Prepare regression tests - env: - AZURE_STORAGE_SAS_TOKEN: $(marian-pub-tests-blob-sas-token) - workingDirectory: regression-tests - # Continue on error to be able to collect outputs and publish them as an artifact - bash: MARIAN=../build ./run_mrt.sh continueOnError: true From 717d351ca1165e8f640c3d087a01ee52c4d897c4 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 26 Jul 2023 17:13:22 +0000 Subject: [PATCH 22/26] Merged PR 30406: More general fallbacks for diverged training This PR adds `--custom-fallbacks` and generalizes the previous attempt at handling diverged trainings. Now we can specify any number of fallback options that get used in subsequent diverged trainings. E.g. 
we can restart a training from the last checkpoint by turning off fp16 training and, if we still encounter a divergence, we can also lower the learning rate on the next attempt. This would be achieved by adding the following to a config file:
```
custom-fallbacks:
  - fp16: false
    precision: [float32, float32]
    cost-scaling: []
  - fp16: false
    precision: [float32, float32]
    cost-scaling: []
    learn-rate: 0.0001
```
On the command line we can specify json-style options like `--custom-fallbacks "{fp16: false, precision: [float32, float32], cost-scaling: []}" "{fp16: false, precision: [float32, float32], cost-scaling: [], learn-rate: 0.0001}"` where each string in `"..."` gets parsed into a YAML list entry. The previous option `--fp16-fallback-to-fp32` is now just an alias for the corresponding `--custom-fallbacks` values (first entry above). Any number of fallbacks can be specified. --- CHANGELOG.md | 1 + VERSION | 2 +- src/common/config_parser.cpp | 19 ++++++- src/common/options.cpp | 94 ++++++++++++++++++++++++++++++- src/common/options.h | 73 ++++++++++++++---------- src/embedder/vector_collector.cpp | 4 +- src/embedder/vector_collector.h | 2 +- src/training/scheduler.h | 32 ++++++++--- src/training/training.h | 59 ++++++++++--------- src/training/training_state.h | 19 ++++--- src/training/validator.h | 2 +- src/translator/scorers.cpp | 4 +- src/translator/translator.h | 4 +- 13 files changed, 229 insertions(+), 86 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79dd3f673..f70f73ab2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Added --custom-fallbacks option that allows specifying a list of option sets that get traversed for subsequent fallbacks upon divergence - Added --overwrite-checkpoint option that (when set to false) can be used to dump checkpoints with iteration numbers. - Implementations of COMET-20 (reference-based) and BLEURT-20 for inference with conversion scripts. - `./marian evaluate` sub command for evaluation with COMET-QE-20, COMET-20 and BLEURT-20 diff --git a/VERSION b/VERSION index 2fc612cb1..fe4dae579 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.9 \ No newline at end of file +v1.12.10 \ No newline at end of file diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 9b36338c1..bad9904f9 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -572,12 +572,29 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Dynamic cost scaling for mixed precision training: " "scaling factor, frequency, multiplier, minimum factor") ->implicit_val("8.f 10000 1.f 8.f"); + cli.add>("--throw-on-divergence", "Throw exception if training diverges. Divergence is detected if the running average loss over arg1 steps " "is exceeded by the running average loss over arg2 steps (arg1 >> arg2) by arg3 standard deviations") - ->implicit_val("100 10 3.0f"); + ->implicit_val("1000 10 5.0f"); + cli.add>("--custom-fallbacks", "List of custom fallback options after divergence. Each divergence exception thrown when the --throw-on-divergence conditions are met advances to the next fallback. 
" "If more exceptions are caught than fallbacks were specified, the process will terminate with an uncaught exception."); cli.add("--fp16-fallback-to-fp32", "If fp16 training diverges and throws, try to continue training with fp32 precision"); + cli.alias("fp16-fallback-to-fp32", "true", [](YAML::Node& config) { + // use default custom-fallbacks to handle DivergenceException for fp16 + config["custom-fallbacks"] = std::vector({ + YAML::Load("{fp16 : false, precision: [float32, float32], cost-scaling: []}") + }); + }); + + // @TODO: implement this next: + // cli.add("--recover-from-fallback-after", + // "Attempt to return to default options once the training has progressed in fallback mode by this many units. " + // "Allowed units are the same as for disp-freq (i.e. (u)pdates, (t)okens, (e)pochs)"); + cli.add("--gradient-norm-average-window", "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). " "After this many updates about 90% of the mass of the exponential average comes from these updates", diff --git a/src/common/options.cpp b/src/common/options.cpp index 59e8420a4..18f5b17d4 100644 --- a/src/common/options.cpp +++ b/src/common/options.cpp @@ -2,6 +2,96 @@ namespace marian { +// namespace for helper template specializations +namespace options_helpers { + +// Generic template-based implementation +template +T Get::apply(const Options* opt, const char* const key) { +#if FASTOPT + opt->lazyRebuild(); + ABORT_IF(!opt->has(key), "Required option '{}' has not been set", key); + return opt->fastOptions_[key].as(); +#else + ABORT_IF(!opt->has(key), "Required option '{}' has not been set", key); + return opt->options_[key].as(); +#endif +} + +// Generic template-based implementation +template +T Get::apply(const Options* opt, const char* const key, const T& defaultValue) { +#if FASTOPT + opt->lazyRebuild(); + if(opt->has(key)) + return opt->fastOptions_[key].as(); +#else + if(opt->has(key)) + return opt->options_[key].as(); +#endif + else + return defaultValue; +} + +// specializations for simple types +template struct Get; +template struct Get; +template struct Get; +template struct Get; +template struct Get; +template struct Get; +template struct Get; + +// specialization for vector of simple types +template struct Get>; +template struct Get>; +template struct Get>; +template struct Get>; +template struct Get>; +template struct Get>; +template struct Get>; + +// specializations for std::vector +template <> +std::vector Get>::apply(const Options* opt, const char* const key) { + ABORT_IF(!opt->has(key), "Required option '{}' has not been set", key); + auto vec = opt->options_[key].as>(); + for(auto& node : vec) { + if(node.IsScalar()) + node = YAML::Load(node.as()); + } + return vec; +} + +template <> +std::vector Get>::apply(const Options* opt, const char* const key, const std::vector& defaultValue) { + if(opt->has(key)) + return apply(opt, key); + return defaultValue; +} + +template struct Get>; + +// specializations for YAML::Node +template <> +YAML::Node Get::apply(const Options* opt, const char* const key) { + ABORT_IF(!opt->has(key), "Required option '{}' has not been set", key); + YAML::Node node = opt->options_[key]; + if(node.IsScalar()) + node = YAML::Load(node.as()); + return node; +} + +template <> +YAML::Node Get::apply(const Options* opt, const char* const key, const YAML::Node& defaultValue) { + if(opt->has(key)) + return apply(opt, key); + return defaultValue; +} + +template struct Get; } Options::Options() #if FASTOPT
: fastOptions_(options_) @@ -16,8 +106,8 @@ Options::Options(const Options& other) : options_(YAML::Clone(other.options_)) {} #endif -Options Options::clone() const { - return Options(*this); // fastOptions_ get set in constructor above +Ptr Options::clone() const { + return New(*this); // fastOptions_ get set in constructor above } YAML::Node Options::cloneToYamlNode() const { diff --git a/src/common/options.h b/src/common/options.h index 992be8760..91ef65f2c 100644 --- a/src/common/options.h +++ b/src/common/options.h @@ -30,6 +30,17 @@ namespace YAML { \ namespace marian { +class Options; + +// helper class to enable template specialization in options.cpp +namespace options_helpers { + template + struct Get { + static T apply(const Options* opt, const char* const key); + static T apply(const Options* opt, const char* const key, const T& defaultValue); + }; +} + /** * Container for options stored as key-value pairs. Keys are unique strings. * This is not thread-safe and locking is the responsibility of the caller. @@ -60,6 +71,8 @@ class Options { public: Options(); + + // This creates a proper clone Options(const Options& other); // constructor with one or more key-value pairs @@ -72,20 +85,34 @@ class Options { Options(const YAML::Node& node) : Options() { merge(node); } - - // constructor that clones and zero or more updates + + template + friend struct options_helpers::Get; + + // Clones current set of options + Ptr clone() const; + + // Clones current set of options and performs zero updates (just calls clone()). + Ptr with() const { + return clone(); + } + + // Clones current set of options and performs one or more updates // options->with("var1", val1, "var2", val2, ...) - template - Ptr with(Args&&... args) const { - auto options = New(*this); - options->set(std::forward(args)...); + template + Ptr with(const std::string& key, T value, Args&&... args) const { + auto options = clone(); + options->set(key, value, std::forward(args)...); return options; } - /** - * @brief Return a copy of the object that can be safely modified. - */ - Options clone() const; + // Clones current set of options and performs zero or more updates from a YAML::Node. + // Matching existing options get overwritten with options from the argument node. + Ptr with(const YAML::Node& node) const { + auto options = clone(); + options->merge(node, /*overwrite=*/true); + return options; + } // Do not allow access to internal YAML object as changes on the outside are difficult to track // and mess with the rebuilding of the fast options lookup. Hence only return a clone which guarantees @@ -129,14 +156,8 @@ class Options { template T get(const char* const key) const { -#if FASTOPT - lazyRebuild(); - ABORT_IF(!has(key), "Required option '{}' has not been set", key); - return fastOptions_[key].as(); -#else - ABORT_IF(!has(key), "Required option '{}' has not been set", key); - return options_[key].as(); -#endif + // this way we can add type-based specialization, e.g. use options_ for YAML::Node and fastOptions_ for other types. See options.cpp + return options_helpers::Get::apply(this, key); } template @@ -145,21 +166,13 @@ class Options { } template - T get(const char* const key, T defaultValue) const { -#if FASTOPT - lazyRebuild(); - if(has(key)) - return fastOptions_[key].as(); -#else - if(has(key)) - return options_[key].as(); -#endif - else - return defaultValue; + T get(const char* const key, const T& defaultValue) const { + // As above, this way we can add type-based specialization, e.g. 
use options_ for YAML::Node and fastOptions_ for other types. See options.cpp + return options_helpers::Get::apply(this, key, defaultValue); } template - T get(const std::string& key, T defaultValue) const { + T get(const std::string& key, const T& defaultValue) const { return get(key.c_str(), defaultValue); } diff --git a/src/embedder/vector_collector.cpp b/src/embedder/vector_collector.cpp index eb55779e0..1268de530 100644 --- a/src/embedder/vector_collector.cpp +++ b/src/embedder/vector_collector.cpp @@ -94,7 +94,7 @@ void AveragingVectorCollector::WriteAverage() { Ptr VectorCollector::Create(Ptr options) { std::string average = options->get("average", "skip"); std::string output = options->get("output"); - size_t width = options->get("width", DEFAULT_WIDTH); + size_t width = options->get("width", VectorCollector::DEFAULT_WIDTH); Ptr collector; if(average == "skip") @@ -109,4 +109,6 @@ Ptr VectorCollector::Create(Ptr options) { return collector; } +const size_t VectorCollector::DEFAULT_WIDTH = 4; + } // namespace marian diff --git a/src/embedder/vector_collector.h b/src/embedder/vector_collector.h index 3f1f91e0c..6c727203c 100644 --- a/src/embedder/vector_collector.h +++ b/src/embedder/vector_collector.h @@ -14,7 +14,7 @@ namespace marian { // on its binary flag. If binary=false, width can be used to set the number of decimal places. class VectorCollector { public: - static const size_t DEFAULT_WIDTH = 4; + static const size_t DEFAULT_WIDTH; VectorCollector(bool binary=false, size_t width=DEFAULT_WIDTH); VectorCollector(std::string outFile, bool binary=false, size_t width=DEFAULT_WIDTH); diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 9c84d1593..f0f39330d 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -30,10 +30,11 @@ class Scheduler : public TrainingObserver { bool first_{true}; // true if this is the first update after renewing the training - bool throwOnDivergence_{false}; // throw an exception if training divergence is detected - size_t lossAvgWindowSlow_{100}; // window size for slow-moving average loss for divergence detection - size_t lossAvgWindowFast_{10}; // window size for fast-moving average loss for divergence detection - float divergenceTolerance_{3.f}; // tolerance for divergence detection as multiples of standard deviation + bool throwOnDivergence_{false}; // throw an exception if training divergence is detected + size_t lossAvgWindowSlow_{1000}; // window size for slow-moving average loss for divergence detection + size_t lossAvgWindowFast_{10}; // window size for fast-moving average loss for divergence detection + float divergenceTolerance_{5.f}; // tolerance for divergence detection as multiples of standard deviation + SchedulingParameter throwAfter_; // for diagnostics only; training will throw if non-zero and training has progressed this far size_t gradientNormAvgWindow_{100}; // window size for recording the exponential average of gradient norms, after this many updates about 90% of the mass comes from this many last updates SchedulingParameter logicalEpoch_; @@ -161,10 +162,17 @@ class Scheduler : public TrainingObserver { lossAvgWindowFast_ = std::stoul(throwParameters[1]); if(throwParameters.size() > 2) divergenceTolerance_ = std::stof(throwParameters[2]); - LOG(info, - "[scheduler] Divergence detection is enabled for slow-moving averaging window over {} steps " - "vs fast-moving window over {} steps with tolerance of {} sigmas", - lossAvgWindowSlow_, lossAvgWindowFast_, divergenceTolerance_); + 
if(throwParameters.size() > 3) + throwAfter_ = SchedulingParameter::parse(throwParameters[3]); + + LOG(info, + "[scheduler] Divergence detection is enabled for slow-moving averaging window over {} steps " + "vs fast-moving window over {} steps with tolerance of {} sigmas", + lossAvgWindowSlow_, lossAvgWindowFast_, divergenceTolerance_); + + if(throwAfter_) { + LOG(warn, "[scheduler] Diagnostic DivergenceException will be thrown when training reaches {}", (std::string)throwAfter_); + } } // parse logical-epoch parameters @@ -505,6 +513,14 @@ class Scheduler : public TrainingObserver { } } } + + // purely diagnostic. This will throw a divergence exception once the specified training progress has occurred. + if(throwAfter_) { + if(state_->enteredNewPeriodOf(throwAfter_)) { + LOG(warn, "Training reached {}; throwing diagnostic DivergenceException", (std::string)throwAfter_); + throw DivergenceException(state_->lossAvgSlow, state_->lossAvgFast, 0.f); + } + } // log slow-moving exponential average and variance of training cost stats float deltaSlow = currentNormalizedLoss - state_->lossAvgSlow; diff --git a/src/training/training.h b/src/training/training.h index cbca3eff2..e608cd11a 100644 --- a/src/training/training.h +++ b/src/training/training.h @@ -45,13 +45,18 @@ class Train : public ModelTask { dataset->prepare(); - // We run training in a do-while loop. It should only restart if a fp16 training run was interrupted + // We run training in a do-while loop. It should only restart if a training run was interrupted via the throwing of a DivergenceException from training/scheduler.h and if --throw-on-divergence and - // --fp16-fallback-to-fp32 are enabled. - // The repeated training run will continue from last checkpoint (similar to a manually interrupted training) - // but attempt training in fp32. If that training run or any other fp32 training happens to diverge, - // training will exit with an unhandled DivergenceException. This is on purpose to indicate a fatal error. - bool restartTraining; + custom-fallbacks are specified (directly or via the alias fp16-fallback-to-fp32), otherwise it will die with the rethrown exception. + // The repeated training run will continue from the last checkpoint (similar to a manually interrupted training) + // but attempt training with the options specified in the current fallback. If that training run in turn happens to diverge, + // training will move on to the next defined fallback or exit with an unhandled DivergenceException if there are no more fallbacks. + // The unhandled exception is on purpose to indicate a fatal error. + + auto originalOptions = options_->clone(); // clone in order to keep unaltered option object around + bool restartTraining; // record if training should be restarted after catching a DivergenceException + size_t restartCounter = 0; // count how many restarts occurred. Used to progress through the list of fallbacks 
There is a chance we can rescue the training run by restarting - // from the last checkpoint but using fp32 precision training. - LOG(warn, "Training diverged, but --fp16-fallback-to-fp32 is enabled. " - "Attempting restart from the last checkpoint with fp32 precision."); - - // undo all options that would be set for fp16 training - options_ = options_->with( - "fp16", false, - "precision", std::vector({"float32", "float32"}), - "cost-scaling", std::vector({}) - ); + + // get the list of possible fallback set of options + auto fallbacks = options_->get>("custom-fallbacks", {}); + + // check if we exceeded the number of available fallbacks, if not, take the current one + if(restartCounter < fallbacks.size()) { + auto fallback = fallbacks[restartCounter]; + fallback.SetStyle(YAML::EmitterStyle::Flow); + + // we diverged, but a set of fallback options is specified. There is a chance we can rescue the training run by + // restarting from the last checkpoint with the options from the current fallback. + LOG(warn, "Training diverged, but fallback is enabled. Attempting restart from the last checkpoint with these options: {}", YAML::Dump(fallback)); + + // overwrite all original options with fallback options + options_ = originalOptions->with(fallback); // this gets checked at final do-while condition restartTraining = true; - } else { - // We diverged and fallback is enabled, but we are already training with fp32, - // hence rethrow and let training die with error. - LOG(warn, "Training diverged, rethrowing divergence exception"); - throw e; - } + restartCounter++; } else { - // We diverged and no fallback enabled, hence rethrow and let training die with error. - LOG(warn, "Training diverged, rethrowing divergence exception"); + // we diverged and no fallback is available, hence rethrow and let training die with error. + LOG(warn, "Training diverged and there are either no fallbacks or we exceeded the number of defined fallbacks, rethrowing divergence exception"); throw e; } } diff --git a/src/training/training_state.h b/src/training/training_state.h index 800dd60c7..d034d93a1 100644 --- a/src/training/training_state.h +++ b/src/training/training_state.h @@ -147,15 +147,20 @@ class TrainingState { // between calls to this. We call it from update(). Unfortunately, newEpoch() // is called at the wrong place for this to work, so SchedulingUnit::epoch is forbidden // for periods. 
- bool enteredNewPeriodOf(std::string schedulingParam) const { - auto period = SchedulingParameter::parse(schedulingParam); + bool enteredNewPeriodOf(SchedulingParameter schedulingParam) const { // @TODO: adapt to logical epochs - ABORT_IF(period.unit == SchedulingUnit::epochs, + ABORT_IF(schedulingParam.unit == SchedulingUnit::epochs, "Unit {} is not supported for frequency parameters", - schedulingParam); - auto previousProgress = getPreviousProgressIn(period.unit); - auto progress = getProgressIn(period.unit); - return period && progress / period.n != previousProgress / period.n; + (std::string)schedulingParam); + auto previousProgress = getPreviousProgressIn(schedulingParam.unit); + auto progress = getProgressIn(schedulingParam.unit); + return schedulingParam && progress / schedulingParam.n != previousProgress / schedulingParam.n; + } + + // std::string version of the above function + bool enteredNewPeriodOf(std::string schedulingParam) const { + SchedulingParameter parsedSchedulingParam = SchedulingParameter::parse(schedulingParam); + return enteredNewPeriodOf(parsedSchedulingParam); } void newEpoch() { diff --git a/src/training/validator.h b/src/training/validator.h index aed710778..364c3893d 100644 --- a/src/training/validator.h +++ b/src/training/validator.h @@ -59,7 +59,7 @@ class Validator : public ValidatorBase { : ValidatorBase(lowerIsBetter, epsilon), vocabs_(vocabs), // options_ is a clone of global options, so it can be safely modified within the class - options_(New(options->clone())) { + options_(options->clone()) { // set options common for all validators options_->set("inference", true); options_->set("shuffle", "none"); // don't shuffle validation sets diff --git a/src/translator/scorers.cpp b/src/translator/scorers.cpp index 60ec03dd1..7c9745c22 100644 --- a/src/translator/scorers.cpp +++ b/src/translator/scorers.cpp @@ -60,7 +60,7 @@ std::vector> createScorers(Ptr options, const std::vector(options->clone()); + auto modelOptions = options->clone(); if(!options->get("ignore-model-config")) { YAML::Node modelYaml; io::getYamlFromModel(modelYaml, "special:model.yml", items); @@ -115,7 +115,7 @@ std::vector> createScorers(Ptr options, const std::vector(options->clone()); + auto modelOptions = options->clone(); if(!options->get("ignore-model-config")) { YAML::Node modelYaml; io::getYamlFromModel(modelYaml, "special:model.yml", ptr); diff --git a/src/translator/translator.h b/src/translator/translator.h index 205c213cb..f0fc0b908 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -42,7 +42,7 @@ class Translate : public ModelTask { public: Translate(Ptr options) - : options_(New(options->clone())) { // @TODO: clone should return Ptr same as "with"? 
+      : options_(options->clone()) {
     // This is currently safe as the translator is either created stand-alone
     // or config is created anew from Options in the validator
@@ -252,7 +252,7 @@ class TranslateService : public ModelServiceTask {
   virtual ~TranslateService() {}

   TranslateService(Ptr<Options> options)
-      : options_(New<Options>(options->clone())) {
+      : options_(options->clone()) {
     // initialize vocabs
     options_->set("inference", true);
     options_->set("shuffle", "none");

From e383583ae5b0f2f82cedb06c4cd7c5f036fb90a3 Mon Sep 17 00:00:00 2001
From: Roman Grundkiewicz
Date: Thu, 27 Jul 2023 17:07:28 +0000
Subject: [PATCH 23/26] Merged PR 30482: Fixes for backward compatibility in fine-tuning

This PR fixes fine-tuning a model trained with an older version of Marian by:
- adding the removed option `num-devices` to the list of deprecated options
- checking if `loss-{avg,var}-{slow,fast}` are present in .progress.yml file

---
 VERSION                       |  2 +-
 src/common/cli_wrapper.cpp    |  1 +
 src/training/training_state.h | 10 +++++++---
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/VERSION b/VERSION
index fe4dae579..e47557093 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v1.12.10
\ No newline at end of file
+v1.12.11

diff --git a/src/common/cli_wrapper.cpp b/src/common/cli_wrapper.cpp
index fee50a2cb..343ff2ba2 100644
--- a/src/common/cli_wrapper.cpp
+++ b/src/common/cli_wrapper.cpp
@@ -13,6 +13,7 @@ namespace cli {
 const std::unordered_set<std::string> DEPRECATED_OPTIONS = {
   "version",
   "special-vocab",
+  "num-devices",
   // @TODO: uncomment once we actually deprecate them.
   // "after-batches",
   // "after-epochs"

diff --git a/src/training/training_state.h b/src/training/training_state.h
index d034d93a1..c522caa85 100644
--- a/src/training/training_state.h
+++ b/src/training/training_state.h
@@ -209,6 +209,10 @@ class TrainingState {
   void loadFromString(const std::string& yamlString) {
     YAML::Node config = YAML::Load(yamlString);

+    // WARNING! When adding new options to the training state, make sure to
+    // check for their existence when loading from the progress.yml
+    // file for backward compatibility
+
     epochs = config["epochs"].as<size_t>();
     batches = config["batches"].as<size_t>();
     batchesEpoch = config["batches-epoch"].as<size_t>();
@@ -241,9 +245,9 @@
     samplesDisp = config["disp-samples"].as<size_t>();
     updatesDisp = config["disp-updates"].as<size_t>();
-    lossAvgSlow = config["loss-avg-slow"].as<float>();
-    lossAvgFast = config["loss-avg-fast"].as<float>();
-    lossVarSlow = config["loss-var-slow"].as<float>();
+    lossAvgSlow = config["loss-avg-slow"] ? config["loss-avg-slow"].as<float>() : 0;
+    lossAvgFast = config["loss-avg-fast"] ? config["loss-avg-fast"].as<float>() : 0;
+    lossVarSlow = config["loss-var-slow"] ? config["loss-var-slow"].as<float>() : 0;
     gradientNormAvg = config["gradient-norm-avg"].as<float>();
     gradientNormVar = config["gradient-norm-var"].as<float>();

From 3bd25dd59ed118f7433b2692b314f211c72b578c Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Mon, 31 Jul 2023 08:03:37 +0000
Subject: [PATCH 24/26] Merged PR 30516: Make sure that loss is finite when checking for divergence

Make sure that the averaged loss is actually well-defined and not inf or nan.
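For context, a minimal stand-alone sketch (not part of this patch; the file name and build commands are just an illustration) of the gcc behavior that motivates the `ABORT_IF` self-check inside `isFinite()`: under `-Ofast` (which implies `-ffast-math`), the compiler may assume NaN and Inf never occur and fold `std::isfinite` to a constant, so a divergence check on a NaN loss would silently pass.

```cpp
// isfinite_demo.cpp -- hypothetical demo, not part of the Marian sources.
// Compare: g++ -O2 isfinite_demo.cpp && ./a.out     (prints 0)
//          g++ -Ofast isfinite_demo.cpp && ./a.out  (may print 1)
#include <cmath>
#include <cstdio>

int main() {
  volatile float zero = 0.f;  // volatile so the division happens at runtime
  float nan = zero / zero;    // 0/0 yields NaN under IEEE-754 semantics
  // With -Ofast, gcc may constant-fold std::isfinite() to true because
  // fast-math promises NaN/Inf never appear -- exactly the condition the
  // safeguard added to definitions.h aborts on.
  std::printf("isfinite(0.f/0.f) = %d\n", (int)std::isfinite(nan));
  return 0;
}
```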
---
 VERSION                    |  2 +-
 src/common/definitions.h   | 10 +++++++++-
 src/training/graph_group.h |  8 --------
 src/training/scheduler.h   |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/VERSION b/VERSION
index e47557093..dc5ef6d14 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v1.12.11
+v1.12.12

diff --git a/src/common/definitions.h b/src/common/definitions.h
index e28ea5dcf..37213d37a 100644
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@@ -193,6 +193,14 @@ typedef Ptr<ClipperBase> ClipperBasePtr;
 class RunBase;
 typedef Ptr<RunBase> RunBasePtr;
-
 const float NEMATUS_LN_EPS = 1e-5f;
+
+// With -Ofast enabled gcc will fail to identify NaN or Inf. Safeguard here.
+static inline bool isFinite(float x) {
+#ifdef __GNUC__
+  ABORT_IF(std::isfinite(0.f / 0.f), "NaN detection unreliable. Disable -Ofast compiler option.");
+#endif
+  return std::isfinite(x);
+}
+
 } // namespace marian

diff --git a/src/training/graph_group.h b/src/training/graph_group.h
index 4cfd079aa..b0c98e3ce 100644
--- a/src/training/graph_group.h
+++ b/src/training/graph_group.h
@@ -11,14 +11,6 @@
 namespace marian {

-// With -Ofast enabled gcc will fail to identify NaN or Inf. Safeguard here.
-static inline bool isFinite(float x) {
-#ifdef __GNUC__
-  ABORT_IF(std::isfinite(0.f / 0.f), "NaN detection unreliable. Disable -Ofast compiler option.");
-#endif
-  return std::isfinite(x);
-}
-
 #ifdef _MSC_VER // MS Visual studio insists that this function is not being referenced although it is being referenced by name as an argument
 #pragma warning(push)
 #pragma warning(disable: 4505) // Unreferenced local function has been removed

diff --git a/src/training/scheduler.h b/src/training/scheduler.h
index f0f39330d..df902e6ef 100644
--- a/src/training/scheduler.h
+++ b/src/training/scheduler.h
@@ -466,7 +466,7 @@ class Scheduler : public TrainingObserver {
     state_->newUpdate(numReadBatches);

     // true if --throw-on-divergence [lossAvgWindowSlow_] [lossAvgWindowFast_] [divergenceTolerance_] is enabled, false otherwise
-    if(throwOnDivergence_) {
+    if(throwOnDivergence_ && isFinite(currentNormalizedLoss)) {
       size_t windowSlow = std::min(lossAvgWindowSlow_, state_->batches); // we compare the running exponential average over a longer window
       size_t windowFast = std::min(lossAvgWindowFast_, state_->batches); // with the running exponential average over a shorter window (for smoothing)

From 60aa66bab9e45214fd0f4760bad27f7785ed2ddc Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Mon, 14 Aug 2023 21:41:08 +0000
Subject: [PATCH 25/26] Merged PR 30704: Merge with public master from 20230814

---
 .github/workflows/macos.yml                 |  2 +-
 cmake/FindSSE.cmake                         | 30 ++++++++++-----------
 examples                                    |  2 +-
 regression-tests                            |  2 +-
 src/3rd_party/fbgemm                        |  2 +-
 src/3rd_party/sentencepiece                 |  2 +-
 src/onnx/expression_graph_onnx_exporter.cpp |  2 +-
 7 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index f06eed256..8b992e404 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -30,7 +30,7 @@ jobs:
           -DCOMPILE_CPU=on \
           -DCOMPILE_CUDA=off \
           -DCOMPILE_EXAMPLES=on \
-          -DCOMPILE_SERVER=on \
+          -DCOMPILE_SERVER=off \
          -DCOMPILE_TESTS=on \
           -DUSE_FBGEMM=on \
           -DUSE_SENTENCEPIECE=on

diff --git a/cmake/FindSSE.cmake b/cmake/FindSSE.cmake
index e1c58fbc9..0f1483487 100644
--- a/cmake/FindSSE.cmake
+++ b/cmake/FindSSE.cmake
@@ -4,7 +4,7 @@
 IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
    EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO)

-   STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "sse2" "${SSE_THERE}" SSE2_TRUE)
    IF (SSE2_TRUE)
       set(SSE2_FOUND true CACHE BOOL "SSE2 available on host")
@@ -13,14 +13,14 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
    ENDIF (SSE2_TRUE)

    # /proc/cpuinfo apparently omits sse3 :(
-   STRING(REGEX REPLACE "^.*[^s](sse3).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*[^s](sse3).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "sse3" "${SSE_THERE}" SSE3_TRUE)
    IF (NOT SSE3_TRUE)
-      STRING(REGEX REPLACE "^.*(T2300).*$" "\\1" SSE_THERE ${CPUINFO})
+      STRING(REGEX REPLACE "^.*(T2300).*$" "\\1" SSE_THERE "${CPUINFO}")
       STRING(COMPARE EQUAL "T2300" "${SSE_THERE}" SSE3_TRUE)
    ENDIF (NOT SSE3_TRUE)

-   STRING(REGEX REPLACE "^.*(ssse3).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*(ssse3).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "ssse3" "${SSE_THERE}" SSSE3_TRUE)
    IF (SSE3_TRUE OR SSSE3_TRUE)
       set(SSE3_FOUND true CACHE BOOL "SSE3 available on host")
@@ -33,7 +33,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
       set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host")
    ENDIF (SSSE3_TRUE)

-   STRING(REGEX REPLACE "^.*(sse4_1).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*(sse4_1).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "sse4_1" "${SSE_THERE}" SSE41_TRUE)
    IF (SSE41_TRUE)
       set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host")
@@ -41,7 +41,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
       set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
    ENDIF (SSE41_TRUE)

-   STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "sse4_2" "${SSE_THERE}" SSE42_TRUE)
    IF (SSE42_TRUE)
       set(SSE4_2_FOUND true CACHE BOOL "SSE4.2 available on host")
@@ -49,7 +49,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
       set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host")
    ENDIF (SSE42_TRUE)

-   STRING(REGEX REPLACE "^.*(avx).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*(avx).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "avx" "${SSE_THERE}" AVX_TRUE)
    IF (AVX_TRUE)
       set(AVX_FOUND true CACHE BOOL "AVX available on host")
@@ -57,7 +57,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
       set(AVX_FOUND false CACHE BOOL "AVX available on host")
    ENDIF (AVX_TRUE)

-   STRING(REGEX REPLACE "^.*(avx2).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*(avx2).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "avx2" "${SSE_THERE}" AVX2_TRUE)
    IF (AVX2_TRUE)
       set(AVX2_FOUND true CACHE BOOL "AVX2 available on host")
@@ -65,7 +65,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
       set(AVX2_FOUND false CACHE BOOL "AVX2 available on host")
    ENDIF (AVX2_TRUE)

-   STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "avx512" "${SSE_THERE}" AVX512_TRUE)
    IF (AVX512_TRUE)
       set(AVX512_FOUND true CACHE BOOL "AVX512 available on host")
@@ -76,7 +76,7 @@
 ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
    EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features machdep.cpu.leaf7_features" OUTPUT_VARIABLE CPUINFO)

-   STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "SSE2" "${SSE_THERE}" SSE2_TRUE)
    IF (SSE2_TRUE)
       set(SSE2_FOUND true CACHE BOOL "SSE2 available on host")
@@ -84,7 +84,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
      set(SSE2_FOUND false CACHE BOOL "SSE2 available on host")
    ENDIF (SSE2_TRUE)

-   STRING(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "SSE3" "${SSE_THERE}" SSE3_TRUE)
    IF (SSE3_TRUE)
       set(SSE3_FOUND true CACHE BOOL "SSE3 available on host")
@@ -100,7 +100,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
       set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host")
    ENDIF (SSSE3_TRUE)

-   STRING(REGEX REPLACE "^.*(SSE4.1).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*(SSE4.1).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "SSE4.1" "${SSE_THERE}" SSE41_TRUE)
    IF (SSE41_TRUE)
       set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host")
@@ -108,7 +108,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
       set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
    ENDIF (SSE41_TRUE)

-   STRING(REGEX REPLACE "^.*(AVX).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*(AVX).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "AVX" "${SSE_THERE}" AVX_TRUE)
    IF (AVX_TRUE)
       set(AVX_FOUND true CACHE BOOL "AVX available on host")
@@ -116,7 +116,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
       set(AVX_FOUND false CACHE BOOL "AVX available on host")
    ENDIF (AVX_TRUE)

-   STRING(REGEX REPLACE "^.*(AVX2).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*(AVX2).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "AVX2" "${SSE_THERE}" AVX2_TRUE)
    IF (AVX2_TRUE)
       set(AVX2_FOUND true CACHE BOOL "AVX2 available on host")
@@ -124,7 +124,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
       set(AVX2_FOUND false CACHE BOOL "AVX2 available on host")
    ENDIF (AVX2_TRUE)

-   STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE ${CPUINFO})
+   STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE "${CPUINFO}")
    STRING(COMPARE EQUAL "avx512" "${SSE_THERE}" AVX512_TRUE)
    IF (AVX512_TRUE)
       set(AVX512_FOUND true CACHE BOOL "AVX512 available on host")

diff --git a/examples b/examples
index 58f48a067..6c40475a9 160000
--- a/examples
+++ b/examples
@@ -1 +1 @@
-Subproject commit 58f48a06756c623fe799613134810322e061863f
+Subproject commit 6c40475a9cbdcc219d0b6a8347ae43902204eedc

diff --git a/regression-tests b/regression-tests
index 2a8bed3f0..ab6fd7365 160000
--- a/regression-tests
+++ b/regression-tests
@@ -1 +1 @@
-Subproject commit 2a8bed3f0e937a9de2d6fa92dee3bcf482d3d47b
+Subproject commit ab6fd7365f1b40633a1164dd35c6a15b55f2d4d9

diff --git a/src/3rd_party/fbgemm b/src/3rd_party/fbgemm
index 6f45243cb..0e33146d3 160000
--- a/src/3rd_party/fbgemm
+++ b/src/3rd_party/fbgemm
@@ -1 +1 @@
-Subproject commit 6f45243cb8ab7d7ab921af18d313ae97144618b8
+Subproject commit 0e33146d3e7f070c7de9494efef49147a9d20558

diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece
index 8dc9172f8..fb6f8e408 160000
--- a/src/3rd_party/sentencepiece
+++ b/src/3rd_party/sentencepiece
@@ -1 +1 @@
-Subproject commit 8dc9172f88b1d4054ca38de0e5362b2935e9b53f
+Subproject commit fb6f8e408d2078ebfedc8ccc33985fef03c50b0e

diff --git a/src/onnx/expression_graph_onnx_exporter.cpp b/src/onnx/expression_graph_onnx_exporter.cpp
index d27f1360c..8e6625a42 100644
--- a/src/onnx/expression_graph_onnx_exporter.cpp
+++ b/src/onnx/expression_graph_onnx_exporter.cpp
@@ -5,7 +5,7 @@
 #include "models/model_factory.h"
 #include "models/encoder_decoder.h"
 #include "data/corpus_base.h"
-#include "tensors/cpu/fbgemm/expression_graph_packable.h"
+#include "tensors/cpu/expression_graph_packable.h"
 #include
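A side note on the FindSSE.cmake hunks in the patch above: the quotes added around `${CPUINFO}` matter because an unquoted variable that is empty (or contains semicolons) changes the number of arguments `STRING()` receives. A minimal hedged sketch of the difference (stand-alone script with invented values, not part of the patch):

```cmake
# quoting_demo.cmake -- hypothetical illustration; run with: cmake -P quoting_demo.cmake
set(CPUINFO "")  # e.g. reading /proc/cpuinfo produced no output

# Unquoted, an empty variable expands to *no* argument at all, so CMake
# rejects the call for having too few arguments instead of simply not matching:
#   STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE ${CPUINFO})   # error

# Quoted, the empty string is passed as one argument and the non-matching
# regex just leaves SSE_THERE empty:
string(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE "${CPUINFO}")
message(STATUS "SSE_THERE='${SSE_THERE}'")
```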
From 3f93e656ea6be6f9a8816fa696ba7c435343fc2e Mon Sep 17 00:00:00 2001
From: Hieu Hoang
Date: Tue, 15 Aug 2023 12:55:24 -0700
Subject: [PATCH 26/26] don't include nppdefs.h. Problematic on some machines (#1004)

Co-authored-by: Hieu Hoang
---
 src/tensors/gpu/tensor_operators.cu | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu
index 5f8c4c122..6dbded2a4 100644
--- a/src/tensors/gpu/tensor_operators.cu
+++ b/src/tensors/gpu/tensor_operators.cu
@@ -1,7 +1,5 @@
-# if defined(_MSC_VER)
+# if !defined(NPP_MAX_32U)
 #define NPP_MAX_32U ( 4294967295U )  /**< Maximum 32-bit unsigned integer */
-#else
-#include <nppdefs.h>
 #endif

 #include "common/types.h"
@@ -3548,7 +3546,7 @@ __global__ void HammmingAndSort(const uint32_t *weightHash,
   if (outIdx != NPP_MAX_32U) {
     uint32_t prevOutIdx;
 // Not supported in Maxwells or older
-// Not supported in Maxwells or older
 #if __CUDA_ARCH__ >= 600
     prevOutIdx = atomicAdd_block(&outIdx, (uint32_t) -1);
 #else